daimon_skycrawlers 0.11.3 → 0.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +5 -2
- data/lib/daimon_skycrawlers.rb +14 -0
- data/lib/daimon_skycrawlers/commands/runner.rb +12 -8
- data/lib/daimon_skycrawlers/generator/crawler.rb +13 -0
- data/lib/daimon_skycrawlers/generator/filter.rb +34 -0
- data/lib/daimon_skycrawlers/generator/generate.rb +2 -0
- data/lib/daimon_skycrawlers/generator/new.rb +5 -3
- data/lib/daimon_skycrawlers/generator/processor.rb +12 -0
- data/lib/daimon_skycrawlers/generator/templates/crawler.rb.erb +0 -4
- data/{sample/amazon-ranking/app/filters/sample_filter.rb → lib/daimon_skycrawlers/generator/templates/filter.rb.erb} +1 -1
- data/{sample/amazon-ranking/app/crawlers/sample_crawler.rb → lib/daimon_skycrawlers/generator/templates/new/app/crawler.rb} +2 -0
- data/lib/daimon_skycrawlers/generator/templates/new/app/{processors/sample_processor.rb → processor.rb} +2 -0
- data/lib/daimon_skycrawlers/generator/templates/processor.rb.erb +0 -3
- data/lib/daimon_skycrawlers/processor/spider.rb +1 -1
- data/lib/daimon_skycrawlers/version.rb +1 -1
- data/{lib/daimon_skycrawlers/generator/templates/new/app/crawlers/sample_crawler.rb → sample/amazon-ranking/app/crawler.rb} +0 -0
- data/sample/amazon-ranking/app/crawlers/.gitkeep +0 -0
- data/sample/amazon-ranking/app/processor.rb +25 -0
- data/sample/amazon-ranking/app/processors/amazon_ranking.rb +0 -17
- data/sample/itp-crawler/app/{crawlers/itp_crawler.rb → crawler.rb} +1 -0
- data/sample/itp-crawler/app/crawlers/.gitkeep +0 -0
- data/sample/itp-crawler/app/processor.rb +8 -0
- data/sample/itp-crawler/app/processors/itp_processor.rb +0 -3
- data/sample/spider/app/{crawlers/blog_crawler.rb → crawler.rb} +2 -0
- data/sample/spider/app/crawlers/.gitkeep +0 -0
- data/sample/spider/app/{processors/blog_spider.rb → processor.rb} +2 -0
- data/sample/spider/app/processors/.gitkeep +0 -0
- metadata +16 -11
- data/lib/daimon_skycrawlers/generator/templates/new/app/filters/sample_filter.rb +0 -9
- data/sample/amazon-ranking/app/processors/sample_processor.rb +0 -5
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA1:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: ea06b539d343f1a13e2eb22f3cd2e9662ce3d9da
|
|
4
|
+
data.tar.gz: 39164028e5902f05a86a4c1e1f1890bbe8065dfa
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 22472b47e7a5e87758c2bf8712bd76d227e7c6589afd641060e93e8265fdb628abf8a236dbcf1dab0dc6390f0fd6fb680e96ff4998c4ba479f4a5ff334b15a4f
|
|
7
|
+
data.tar.gz: 8bdf2df5606221dcb984ab3850b8922f524dfbaf12d80cb38c11f65632327d0567b6c17aaff7a51da47e77e88fa27e360045dbdf38af2f3dc02cec3384f15090
|
data/README.md
CHANGED
|
@@ -2,6 +2,9 @@
|
|
|
2
2
|
|
|
3
3
|
# DaimonSkycrawlers
|
|
4
4
|
|
|
5
|
+
[](https://badge.fury.io/rb/daimon_skycrawlers)
|
|
6
|
+
[](https://travis-ci.org/bm-sms/daimon_skycrawlers)
|
|
7
|
+
|
|
5
8
|
DaimonSkycrawlers is a crawler framework.
|
|
6
9
|
|
|
7
10
|
## Requirements
|
|
@@ -63,8 +66,8 @@ Or install it yourself as:
|
|
|
63
66
|
$ daimon_skycrawlers exec processor # on new terminal
|
|
64
67
|
```
|
|
65
68
|
|
|
66
|
-
NOTE: Execute step 5 as soon as possible. Because
|
|
67
|
-
|
|
69
|
+
NOTE: Execute step 5 as soon as possible. Because crawler and
|
|
70
|
+
processor will stop after 10 seconds by default if their
|
|
68
71
|
queues are empty.
|
|
69
72
|
|
|
70
73
|
NOTE: You can change `shutdown_interval` using following code in config/init.rb:
|
data/lib/daimon_skycrawlers.rb
CHANGED
|
@@ -75,6 +75,20 @@ module DaimonSkycrawlers
|
|
|
75
75
|
exit(false)
|
|
76
76
|
end
|
|
77
77
|
|
|
78
|
+
def load_crawlers
|
|
79
|
+
Dir.glob("app/crawlers/**/*.rb") do |path|
|
|
80
|
+
require(File.expand_path(path, Dir.pwd)) &&
|
|
81
|
+
DaimonSkycrawlers.configuration.logger.info("Loaded crawler: #{path}")
|
|
82
|
+
end
|
|
83
|
+
end
|
|
84
|
+
|
|
85
|
+
def load_processors
|
|
86
|
+
Dir.glob("app/processors/**/*.rb") do |path|
|
|
87
|
+
require(File.expand_path(path, Dir.pwd)) &&
|
|
88
|
+
DaimonSkycrawlers.configuration.logger.info("Loaded processor: #{path}")
|
|
89
|
+
end
|
|
90
|
+
end
|
|
91
|
+
|
|
78
92
|
#
|
|
79
93
|
# Return current environment
|
|
80
94
|
#
|
|
@@ -10,10 +10,8 @@ module DaimonSkycrawlers
|
|
|
10
10
|
desc "crawler", "Execute crawler"
|
|
11
11
|
def crawler
|
|
12
12
|
load_init
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
log.info("Loaded crawler: #{path}")
|
|
16
|
-
end
|
|
13
|
+
load_crawlers
|
|
14
|
+
require(File.expand_path("app/crawler.rb", Dir.pwd))
|
|
17
15
|
DaimonSkycrawlers::Crawler.run
|
|
18
16
|
rescue => ex
|
|
19
17
|
puts ex.message
|
|
@@ -23,10 +21,8 @@ module DaimonSkycrawlers
|
|
|
23
21
|
desc "processor", "Execute processor"
|
|
24
22
|
def processor
|
|
25
23
|
load_init
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
log.info("Loaded processor: #{path}")
|
|
29
|
-
end
|
|
24
|
+
load_processors
|
|
25
|
+
require(File.expand_path("app/processor.rb", Dir.pwd))
|
|
30
26
|
DaimonSkycrawlers::Processor.run
|
|
31
27
|
rescue => ex
|
|
32
28
|
puts ex.message
|
|
@@ -39,6 +35,14 @@ module DaimonSkycrawlers
|
|
|
39
35
|
DaimonSkycrawlers.load_init
|
|
40
36
|
end
|
|
41
37
|
|
|
38
|
+
def load_crawlers
|
|
39
|
+
DaimonSkycrawlers.load_crawlers
|
|
40
|
+
end
|
|
41
|
+
|
|
42
|
+
def load_processors
|
|
43
|
+
DaimonSkycrawlers.load_processors
|
|
44
|
+
end
|
|
45
|
+
|
|
42
46
|
def log
|
|
43
47
|
DaimonSkycrawlers.configuration.logger
|
|
44
48
|
end
|
|
@@ -17,6 +17,19 @@ module DaimonSkycrawlers
|
|
|
17
17
|
}
|
|
18
18
|
template("crawler.rb.erb", "app/crawlers/#{name.underscore}.rb", config)
|
|
19
19
|
end
|
|
20
|
+
|
|
21
|
+
def display_post_message
|
|
22
|
+
puts <<MESSAGE
|
|
23
|
+
|
|
24
|
+
You can register your crawler in `app/crawler.rb` to run your crawler.
|
|
25
|
+
Following code snippet is useful:
|
|
26
|
+
|
|
27
|
+
base_url = "https://www.example.com/"
|
|
28
|
+
crawler = #{name.classify}.new(base_url)
|
|
29
|
+
DaimonSkycrawlers.register_crawler(crawler)
|
|
30
|
+
|
|
31
|
+
MESSAGE
|
|
32
|
+
end
|
|
20
33
|
end
|
|
21
34
|
end
|
|
22
35
|
end
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
require "thor"
|
|
2
|
+
|
|
3
|
+
module DaimonSkycrawlers
|
|
4
|
+
module Generator
|
|
5
|
+
class Filter < Thor::Group
|
|
6
|
+
include Thor::Actions
|
|
7
|
+
|
|
8
|
+
argument :name
|
|
9
|
+
|
|
10
|
+
def self.source_root
|
|
11
|
+
File.join(__dir__, "templates")
|
|
12
|
+
end
|
|
13
|
+
|
|
14
|
+
def create_files
|
|
15
|
+
config = {
|
|
16
|
+
class_name: name.classify,
|
|
17
|
+
}
|
|
18
|
+
template("filter.rb.erb", "app/filters/#{name.underscore}.rb", config)
|
|
19
|
+
end
|
|
20
|
+
|
|
21
|
+
def display_post_message
|
|
22
|
+
puts <<MESSAGE
|
|
23
|
+
|
|
24
|
+
You can use this filter with both crawlers and processors.
|
|
25
|
+
|
|
26
|
+
filter = #{name.classify}.new
|
|
27
|
+
crawler = DaimonSkycrawlers::Crawler::Default.new
|
|
28
|
+
crawler.before_process(filter)
|
|
29
|
+
|
|
30
|
+
MESSAGE
|
|
31
|
+
end
|
|
32
|
+
end
|
|
33
|
+
end
|
|
34
|
+
end
|
|
@@ -1,12 +1,14 @@
|
|
|
1
1
|
require "thor"
|
|
2
2
|
require "daimon_skycrawlers/generator/crawler"
|
|
3
3
|
require "daimon_skycrawlers/generator/processor"
|
|
4
|
+
require "daimon_skycrawlers/generator/filter"
|
|
4
5
|
|
|
5
6
|
module DaimonSkycrawlers
|
|
6
7
|
module Generator
|
|
7
8
|
class Generate < Thor
|
|
8
9
|
register(Crawler, "crawler", "crawler NAME", "Generate new crawler")
|
|
9
10
|
register(Processor, "processor", "processor NAME", "Generate new processor")
|
|
11
|
+
register(Filter, "filter", "filter NAME", "Generate new filter")
|
|
10
12
|
end
|
|
11
13
|
end
|
|
12
14
|
end
|
|
@@ -70,9 +70,8 @@ module DaimonSkycrawlers
|
|
|
70
70
|
"Dockerfile.db",
|
|
71
71
|
"Gemfile",
|
|
72
72
|
"Rakefile",
|
|
73
|
-
"app/
|
|
74
|
-
"app/
|
|
75
|
-
"app/processors/sample_processor.rb",
|
|
73
|
+
"app/crawler.rb",
|
|
74
|
+
"app/processor.rb",
|
|
76
75
|
"config/init.rb",
|
|
77
76
|
"services/common/docker-entrypoint.sh",
|
|
78
77
|
"services/db/init-user-db.sh"
|
|
@@ -83,6 +82,9 @@ module DaimonSkycrawlers
|
|
|
83
82
|
|
|
84
83
|
def create_directories
|
|
85
84
|
[
|
|
85
|
+
"app/crawlers",
|
|
86
|
+
"app/filters",
|
|
87
|
+
"app/processors",
|
|
86
88
|
"vendor/bundle",
|
|
87
89
|
"docker-cache/bundle",
|
|
88
90
|
"docker-cache/.bundle"
|
|
@@ -17,6 +17,18 @@ module DaimonSkycrawlers
|
|
|
17
17
|
}
|
|
18
18
|
template("processor.rb.erb", "app/processors/#{name.underscore}.rb", config)
|
|
19
19
|
end
|
|
20
|
+
|
|
21
|
+
def display_post_message
|
|
22
|
+
puts <<MESSAGE
|
|
23
|
+
|
|
24
|
+
You can register your processor in `app/processor.rb` to run your processor.
|
|
25
|
+
Following code snippet is useful:
|
|
26
|
+
|
|
27
|
+
  processor = #{name.classify}.new
|
|
28
|
+
DaimonSkycrawlers.register_processor(processor)
|
|
29
|
+
|
|
30
|
+
MESSAGE
|
|
31
|
+
end
|
|
20
32
|
end
|
|
21
33
|
end
|
|
22
34
|
end
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
require "daimon_skycrawlers/filter/base"
|
|
2
2
|
|
|
3
|
-
class
|
|
3
|
+
class <%= config[:class_name] %> < DaimonSkycrawlers::Filter::Base
|
|
4
4
|
def call(message)
|
|
5
5
|
# Implement your filter here.
|
|
6
6
|
# If you want to crawl `url`, return true otherwise false.
|
|
@@ -147,7 +147,7 @@ module DaimonSkycrawlers
|
|
|
147
147
|
return if urls.empty?
|
|
148
148
|
log.debug("Candidate URLs: #{urls.size}")
|
|
149
149
|
urls = urls.select do |url|
|
|
150
|
-
@link_filters.all? {|filter| filter.call(url) }
|
|
150
|
+
@link_filters.all? {|filter| filter.call(url: url) }
|
|
151
151
|
end
|
|
152
152
|
log.debug("Filtered URLs: #{urls.size}")
|
|
153
153
|
urls
|
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
require "daimon_skycrawlers"
|
|
2
|
+
require "daimon_skycrawlers/processor"
|
|
3
|
+
|
|
4
|
+
DaimonSkycrawlers.load_processors
|
|
5
|
+
|
|
6
|
+
DaimonSkycrawlers.register_processor do |data|
|
|
7
|
+
p "It works with '#{data[:url]}'"
|
|
8
|
+
end
|
|
9
|
+
|
|
10
|
+
spider = DaimonSkycrawlers::Processor::Spider.new
|
|
11
|
+
spider.configure do |s|
|
|
12
|
+
s.link_rules = "ul#zg_browseRoot li a"
|
|
13
|
+
s.link_message = { next_processor: "AmazonRanking" }
|
|
14
|
+
s.before_process do |message|
|
|
15
|
+
message[:next_processor] != "AmazonRanking"
|
|
16
|
+
end
|
|
17
|
+
end
|
|
18
|
+
DaimonSkycrawlers.register_processor(spider)
|
|
19
|
+
|
|
20
|
+
processor = AmazonRanking.new.configure do |s|
|
|
21
|
+
s.before_process do |message|
|
|
22
|
+
message[:next_processor] == "AmazonRanking"
|
|
23
|
+
end
|
|
24
|
+
end
|
|
25
|
+
DaimonSkycrawlers.register_processor(processor)
|
|
@@ -19,20 +19,3 @@ class AmazonRanking < DaimonSkycrawlers::Processor::Base
|
|
|
19
19
|
p ranking
|
|
20
20
|
end
|
|
21
21
|
end
|
|
22
|
-
|
|
23
|
-
spider = DaimonSkycrawlers::Processor::Spider.new
|
|
24
|
-
spider.configure do |s|
|
|
25
|
-
s.link_rules = "ul#zg_browseRoot li a"
|
|
26
|
-
s.link_message = { next_processor: "AmazonRanking" }
|
|
27
|
-
s.before_process do |message|
|
|
28
|
-
message[:next_processor] != "AmazonRanking"
|
|
29
|
-
end
|
|
30
|
-
end
|
|
31
|
-
DaimonSkycrawlers.register_processor(spider)
|
|
32
|
-
|
|
33
|
-
processor = AmazonRanking.new.configure do |s|
|
|
34
|
-
s.before_process do |message|
|
|
35
|
-
message[:next_processor] == "AmazonRanking"
|
|
36
|
-
end
|
|
37
|
-
end
|
|
38
|
-
DaimonSkycrawlers.register_processor(processor)
|
|
File without changes
|
|
File without changes
|
|
@@ -3,6 +3,8 @@ require "daimon_skycrawlers/filter"
|
|
|
3
3
|
require "daimon_skycrawlers/filter/duplicate_checker"
|
|
4
4
|
require "daimon_skycrawlers/filter/update_checker"
|
|
5
5
|
|
|
6
|
+
DaimonSkycrawlers.load_processors
|
|
7
|
+
|
|
6
8
|
default_processor = DaimonSkycrawlers::Processor::Default.new
|
|
7
9
|
spider = DaimonSkycrawlers::Processor::Spider.new
|
|
8
10
|
#spider.enqueue = false
|
|
File without changes
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: daimon_skycrawlers
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.11.3
|
|
4
|
+
version: 0.12.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- daimon developers
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2016-12-
|
|
11
|
+
date: 2016-12-20 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: thor
|
|
@@ -340,18 +340,19 @@ files:
|
|
|
340
340
|
- lib/daimon_skycrawlers/filter/robots_txt_checker.rb
|
|
341
341
|
- lib/daimon_skycrawlers/filter/update_checker.rb
|
|
342
342
|
- lib/daimon_skycrawlers/generator/crawler.rb
|
|
343
|
+
- lib/daimon_skycrawlers/generator/filter.rb
|
|
343
344
|
- lib/daimon_skycrawlers/generator/generate.rb
|
|
344
345
|
- lib/daimon_skycrawlers/generator/new.rb
|
|
345
346
|
- lib/daimon_skycrawlers/generator/processor.rb
|
|
346
347
|
- lib/daimon_skycrawlers/generator/templates/crawler.rb.erb
|
|
348
|
+
- lib/daimon_skycrawlers/generator/templates/filter.rb.erb
|
|
347
349
|
- lib/daimon_skycrawlers/generator/templates/new/Dockerfile
|
|
348
350
|
- lib/daimon_skycrawlers/generator/templates/new/Dockerfile.db
|
|
349
351
|
- lib/daimon_skycrawlers/generator/templates/new/Gemfile
|
|
350
352
|
- lib/daimon_skycrawlers/generator/templates/new/README.md.erb
|
|
351
353
|
- lib/daimon_skycrawlers/generator/templates/new/Rakefile
|
|
352
|
-
- lib/daimon_skycrawlers/generator/templates/new/app/
|
|
353
|
-
- lib/daimon_skycrawlers/generator/templates/new/app/
|
|
354
|
-
- lib/daimon_skycrawlers/generator/templates/new/app/processors/sample_processor.rb
|
|
354
|
+
- lib/daimon_skycrawlers/generator/templates/new/app/crawler.rb
|
|
355
|
+
- lib/daimon_skycrawlers/generator/templates/new/app/processor.rb
|
|
355
356
|
- lib/daimon_skycrawlers/generator/templates/new/config/database.yml.erb
|
|
356
357
|
- lib/daimon_skycrawlers/generator/templates/new/config/init.rb
|
|
357
358
|
- lib/daimon_skycrawlers/generator/templates/new/docker-compose.yml.erb
|
|
@@ -385,10 +386,10 @@ files:
|
|
|
385
386
|
- sample/amazon-ranking/Gemfile.lock
|
|
386
387
|
- sample/amazon-ranking/README.md
|
|
387
388
|
- sample/amazon-ranking/Rakefile
|
|
388
|
-
- sample/amazon-ranking/app/
|
|
389
|
-
- sample/amazon-ranking/app/
|
|
389
|
+
- sample/amazon-ranking/app/crawler.rb
|
|
390
|
+
- sample/amazon-ranking/app/crawlers/.gitkeep
|
|
391
|
+
- sample/amazon-ranking/app/processor.rb
|
|
390
392
|
- sample/amazon-ranking/app/processors/amazon_ranking.rb
|
|
391
|
-
- sample/amazon-ranking/app/processors/sample_processor.rb
|
|
392
393
|
- sample/amazon-ranking/config/database.yml
|
|
393
394
|
- sample/amazon-ranking/config/init.rb
|
|
394
395
|
- sample/amazon-ranking/db/migrate/20161206061241_create_pages.rb
|
|
@@ -400,9 +401,11 @@ files:
|
|
|
400
401
|
- sample/itp-crawler/Gemfile.lock
|
|
401
402
|
- sample/itp-crawler/README.md
|
|
402
403
|
- sample/itp-crawler/Rakefile
|
|
403
|
-
- sample/itp-crawler/app/
|
|
404
|
+
- sample/itp-crawler/app/crawler.rb
|
|
405
|
+
- sample/itp-crawler/app/crawlers/.gitkeep
|
|
404
406
|
- sample/itp-crawler/app/models/itp_base.rb
|
|
405
407
|
- sample/itp-crawler/app/models/itp_shop.rb
|
|
408
|
+
- sample/itp-crawler/app/processor.rb
|
|
406
409
|
- sample/itp-crawler/app/processors/itp_processor.rb
|
|
407
410
|
- sample/itp-crawler/config/database.yml
|
|
408
411
|
- sample/itp-crawler/config/database_itp.yml
|
|
@@ -416,8 +419,10 @@ files:
|
|
|
416
419
|
- sample/spider/Gemfile
|
|
417
420
|
- sample/spider/README.md
|
|
418
421
|
- sample/spider/Rakefile
|
|
419
|
-
- sample/spider/app/
|
|
420
|
-
- sample/spider/app/
|
|
422
|
+
- sample/spider/app/crawler.rb
|
|
423
|
+
- sample/spider/app/crawlers/.gitkeep
|
|
424
|
+
- sample/spider/app/processor.rb
|
|
425
|
+
- sample/spider/app/processors/.gitkeep
|
|
421
426
|
- sample/spider/config/database.yml
|
|
422
427
|
- sample/spider/config/init.rb
|
|
423
428
|
- sample/spider/db/migrate/20160830155803_create_pages.rb
|