daimon_skycrawlers 0.11.3 → 0.12.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +5 -2
- data/lib/daimon_skycrawlers.rb +14 -0
- data/lib/daimon_skycrawlers/commands/runner.rb +12 -8
- data/lib/daimon_skycrawlers/generator/crawler.rb +13 -0
- data/lib/daimon_skycrawlers/generator/filter.rb +34 -0
- data/lib/daimon_skycrawlers/generator/generate.rb +2 -0
- data/lib/daimon_skycrawlers/generator/new.rb +5 -3
- data/lib/daimon_skycrawlers/generator/processor.rb +12 -0
- data/lib/daimon_skycrawlers/generator/templates/crawler.rb.erb +0 -4
- data/{sample/amazon-ranking/app/filters/sample_filter.rb → lib/daimon_skycrawlers/generator/templates/filter.rb.erb} +1 -1
- data/{sample/amazon-ranking/app/crawlers/sample_crawler.rb → lib/daimon_skycrawlers/generator/templates/new/app/crawler.rb} +2 -0
- data/lib/daimon_skycrawlers/generator/templates/new/app/{processors/sample_processor.rb → processor.rb} +2 -0
- data/lib/daimon_skycrawlers/generator/templates/processor.rb.erb +0 -3
- data/lib/daimon_skycrawlers/processor/spider.rb +1 -1
- data/lib/daimon_skycrawlers/version.rb +1 -1
- data/{lib/daimon_skycrawlers/generator/templates/new/app/crawlers/sample_crawler.rb → sample/amazon-ranking/app/crawler.rb} +0 -0
- data/sample/amazon-ranking/app/crawlers/.gitkeep +0 -0
- data/sample/amazon-ranking/app/processor.rb +25 -0
- data/sample/amazon-ranking/app/processors/amazon_ranking.rb +0 -17
- data/sample/itp-crawler/app/{crawlers/itp_crawler.rb → crawler.rb} +1 -0
- data/sample/itp-crawler/app/crawlers/.gitkeep +0 -0
- data/sample/itp-crawler/app/processor.rb +8 -0
- data/sample/itp-crawler/app/processors/itp_processor.rb +0 -3
- data/sample/spider/app/{crawlers/blog_crawler.rb → crawler.rb} +2 -0
- data/sample/spider/app/crawlers/.gitkeep +0 -0
- data/sample/spider/app/{processors/blog_spider.rb → processor.rb} +2 -0
- data/sample/spider/app/processors/.gitkeep +0 -0
- metadata +16 -11
- data/lib/daimon_skycrawlers/generator/templates/new/app/filters/sample_filter.rb +0 -9
- data/sample/amazon-ranking/app/processors/sample_processor.rb +0 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ea06b539d343f1a13e2eb22f3cd2e9662ce3d9da
|
4
|
+
data.tar.gz: 39164028e5902f05a86a4c1e1f1890bbe8065dfa
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 22472b47e7a5e87758c2bf8712bd76d227e7c6589afd641060e93e8265fdb628abf8a236dbcf1dab0dc6390f0fd6fb680e96ff4998c4ba479f4a5ff334b15a4f
|
7
|
+
data.tar.gz: 8bdf2df5606221dcb984ab3850b8922f524dfbaf12d80cb38c11f65632327d0567b6c17aaff7a51da47e77e88fa27e360045dbdf38af2f3dc02cec3384f15090
|
data/README.md
CHANGED
@@ -2,6 +2,9 @@
|
|
2
2
|
|
3
3
|
# DaimonSkycrawlers
|
4
4
|
|
5
|
+
[![Gem Version](https://badge.fury.io/rb/daimon_skycrawlers.svg)](https://badge.fury.io/rb/daimon_skycrawlers)
|
6
|
+
[![Build Status](https://travis-ci.org/bm-sms/daimon_skycrawlers.svg?branch=master)](https://travis-ci.org/bm-sms/daimon_skycrawlers)
|
7
|
+
|
5
8
|
DaimonSkycrawlers is a crawler framework.
|
6
9
|
|
7
10
|
## Requirements
|
@@ -63,8 +66,8 @@ Or install it yourself as:
|
|
63
66
|
$ daimon_skycrawlers exec processor # on new terminal
|
64
67
|
```
|
65
68
|
|
66
|
-
NOTE: Execute step 5 as soon as possible. Because
|
67
|
-
|
69
|
+
NOTE: Execute step 5 as soon as possible. Because crawler and
|
70
|
+
processor will stop after 10 seconds by default if their
|
68
71
|
queues are empty.
|
69
72
|
|
70
73
|
NOTE: You can change `shutdown_interval` using following code in config/init.rb:
|
data/lib/daimon_skycrawlers.rb
CHANGED
@@ -75,6 +75,20 @@ module DaimonSkycrawlers
|
|
75
75
|
exit(false)
|
76
76
|
end
|
77
77
|
|
78
|
+
def load_crawlers
|
79
|
+
Dir.glob("app/crawlers/**/*.rb") do |path|
|
80
|
+
require(File.expand_path(path, Dir.pwd)) &&
|
81
|
+
DaimonSkycrawlers.configuration.logger.info("Loaded crawler: #{path}")
|
82
|
+
end
|
83
|
+
end
|
84
|
+
|
85
|
+
def load_processors
|
86
|
+
Dir.glob("app/processors/**/*.rb") do |path|
|
87
|
+
require(File.expand_path(path, Dir.pwd)) &&
|
88
|
+
DaimonSkycrawlers.configuration.logger.info("Loaded processor: #{path}")
|
89
|
+
end
|
90
|
+
end
|
91
|
+
|
78
92
|
#
|
79
93
|
# Return current environment
|
80
94
|
#
|
@@ -10,10 +10,8 @@ module DaimonSkycrawlers
|
|
10
10
|
desc "crawler", "Execute crawler"
|
11
11
|
def crawler
|
12
12
|
load_init
|
13
|
-
|
14
|
-
|
15
|
-
log.info("Loaded crawler: #{path}")
|
16
|
-
end
|
13
|
+
load_crawlers
|
14
|
+
require(File.expand_path("app/crawler.rb", Dir.pwd))
|
17
15
|
DaimonSkycrawlers::Crawler.run
|
18
16
|
rescue => ex
|
19
17
|
puts ex.message
|
@@ -23,10 +21,8 @@ module DaimonSkycrawlers
|
|
23
21
|
desc "processor", "Execute processor"
|
24
22
|
def processor
|
25
23
|
load_init
|
26
|
-
|
27
|
-
|
28
|
-
log.info("Loaded processor: #{path}")
|
29
|
-
end
|
24
|
+
load_processors
|
25
|
+
require(File.expand_path("app/processor.rb", Dir.pwd))
|
30
26
|
DaimonSkycrawlers::Processor.run
|
31
27
|
rescue => ex
|
32
28
|
puts ex.message
|
@@ -39,6 +35,14 @@ module DaimonSkycrawlers
|
|
39
35
|
DaimonSkycrawlers.load_init
|
40
36
|
end
|
41
37
|
|
38
|
+
def load_crawlers
|
39
|
+
DaimonSkycrawlers.load_crawlers
|
40
|
+
end
|
41
|
+
|
42
|
+
def load_processors
|
43
|
+
DaimonSkycrawlers.load_processors
|
44
|
+
end
|
45
|
+
|
42
46
|
def log
|
43
47
|
DaimonSkycrawlers.configuration.logger
|
44
48
|
end
|
@@ -17,6 +17,19 @@ module DaimonSkycrawlers
|
|
17
17
|
}
|
18
18
|
template("crawler.rb.erb", "app/crawlers/#{name.underscore}.rb", config)
|
19
19
|
end
|
20
|
+
|
21
|
+
def display_post_message
|
22
|
+
puts <<MESSAGE
|
23
|
+
|
24
|
+
You can register your crawler in `app/crawler.rb` to run your crawler.
|
25
|
+
Following code snippet is useful:
|
26
|
+
|
27
|
+
base_url = "https://www.example.com/"
|
28
|
+
crawler = #{name.classify}.new(base_url)
|
29
|
+
DaimonSkycrawlers.register_crawler(crawler)
|
30
|
+
|
31
|
+
MESSAGE
|
32
|
+
end
|
20
33
|
end
|
21
34
|
end
|
22
35
|
end
|
@@ -0,0 +1,34 @@
|
|
1
|
+
require "thor"
|
2
|
+
|
3
|
+
module DaimonSkycrawlers
|
4
|
+
module Generator
|
5
|
+
class Filter < Thor::Group
|
6
|
+
include Thor::Actions
|
7
|
+
|
8
|
+
argument :name
|
9
|
+
|
10
|
+
def self.source_root
|
11
|
+
File.join(__dir__, "templates")
|
12
|
+
end
|
13
|
+
|
14
|
+
def create_files
|
15
|
+
config = {
|
16
|
+
class_name: name.classify,
|
17
|
+
}
|
18
|
+
template("filter.rb.erb", "app/filters/#{name.underscore}.rb", config)
|
19
|
+
end
|
20
|
+
|
21
|
+
def display_post_message
|
22
|
+
puts <<MESSAGE
|
23
|
+
|
24
|
+
You can use this filter with both crawlers and processors.
|
25
|
+
|
26
|
+
filter = #{name.classify}.new
|
27
|
+
crawler = DaimonSkycrawlers::Crawler::Default.new
|
28
|
+
crawler.before_process(filter)
|
29
|
+
|
30
|
+
MESSAGE
|
31
|
+
end
|
32
|
+
end
|
33
|
+
end
|
34
|
+
end
|
@@ -1,12 +1,14 @@
|
|
1
1
|
require "thor"
|
2
2
|
require "daimon_skycrawlers/generator/crawler"
|
3
3
|
require "daimon_skycrawlers/generator/processor"
|
4
|
+
require "daimon_skycrawlers/generator/filter"
|
4
5
|
|
5
6
|
module DaimonSkycrawlers
|
6
7
|
module Generator
|
7
8
|
class Generate < Thor
|
8
9
|
register(Crawler, "crawler", "crawler NAME", "Generate new crawler")
|
9
10
|
register(Processor, "processor", "processor NAME", "Generate new processor")
|
11
|
+
register(Filter, "filter", "filter NAME", "Generate new filter")
|
10
12
|
end
|
11
13
|
end
|
12
14
|
end
|
@@ -70,9 +70,8 @@ module DaimonSkycrawlers
|
|
70
70
|
"Dockerfile.db",
|
71
71
|
"Gemfile",
|
72
72
|
"Rakefile",
|
73
|
-
"app/
|
74
|
-
"app/
|
75
|
-
"app/processors/sample_processor.rb",
|
73
|
+
"app/crawler.rb",
|
74
|
+
"app/processor.rb",
|
76
75
|
"config/init.rb",
|
77
76
|
"services/common/docker-entrypoint.sh",
|
78
77
|
"services/db/init-user-db.sh"
|
@@ -83,6 +82,9 @@ module DaimonSkycrawlers
|
|
83
82
|
|
84
83
|
def create_directories
|
85
84
|
[
|
85
|
+
"app/crawlers",
|
86
|
+
"app/filters",
|
87
|
+
"app/processors",
|
86
88
|
"vendor/bundle",
|
87
89
|
"docker-cache/bundle",
|
88
90
|
"docker-cache/.bundle"
|
@@ -17,6 +17,18 @@ module DaimonSkycrawlers
|
|
17
17
|
}
|
18
18
|
template("processor.rb.erb", "app/processors/#{name.underscore}.rb", config)
|
19
19
|
end
|
20
|
+
|
21
|
+
def display_post_message
|
22
|
+
puts <<MESSAGE
|
23
|
+
|
24
|
+
You can register your processor in `app/processor.rb` to run your processor.
|
25
|
+
Following code snippet is useful:
|
26
|
+
|
27
|
+
processor = #{name.classify}.new
|
28
|
+
DaimonSkycrawlers.register_processor(processor)
|
29
|
+
|
30
|
+
MESSAGE
|
31
|
+
end
|
20
32
|
end
|
21
33
|
end
|
22
34
|
end
|
@@ -1,6 +1,6 @@
|
|
1
1
|
require "daimon_skycrawlers/filter/base"
|
2
2
|
|
3
|
-
class
|
3
|
+
class <%= config[:class_name] %> < DaimonSkycrawlers::Filter::Base
|
4
4
|
def call(message)
|
5
5
|
# Implement your filter here.
|
6
6
|
# If you want to crawl `url`, return true otherwise false.
|
@@ -147,7 +147,7 @@ module DaimonSkycrawlers
|
|
147
147
|
return if urls.empty?
|
148
148
|
log.debug("Candidate URLs: #{urls.size}")
|
149
149
|
urls = urls.select do |url|
|
150
|
-
@link_filters.all? {|filter| filter.call(url) }
|
150
|
+
@link_filters.all? {|filter| filter.call(url: url) }
|
151
151
|
end
|
152
152
|
log.debug("Filtered URLs: #{urls.size}")
|
153
153
|
urls
|
File without changes
|
File without changes
|
@@ -0,0 +1,25 @@
|
|
1
|
+
require "daimon_skycrawlers"
|
2
|
+
require "daimon_skycrawlers/processor"
|
3
|
+
|
4
|
+
DaimonSkycrawlers.load_processors
|
5
|
+
|
6
|
+
DaimonSkycrawlers.register_processor do |data|
|
7
|
+
p "It works with '#{data[:url]}'"
|
8
|
+
end
|
9
|
+
|
10
|
+
spider = DaimonSkycrawlers::Processor::Spider.new
|
11
|
+
spider.configure do |s|
|
12
|
+
s.link_rules = "ul#zg_browseRoot li a"
|
13
|
+
s.link_message = { next_processor: "AmazonRanking" }
|
14
|
+
s.before_process do |message|
|
15
|
+
message[:next_processor] != "AmazonRanking"
|
16
|
+
end
|
17
|
+
end
|
18
|
+
DaimonSkycrawlers.register_processor(spider)
|
19
|
+
|
20
|
+
processor = AmazonRanking.new.configure do |s|
|
21
|
+
s.before_process do |message|
|
22
|
+
message[:next_processor] == "AmazonRanking"
|
23
|
+
end
|
24
|
+
end
|
25
|
+
DaimonSkycrawlers.register_processor(processor)
|
@@ -19,20 +19,3 @@ class AmazonRanking < DaimonSkycrawlers::Processor::Base
|
|
19
19
|
p ranking
|
20
20
|
end
|
21
21
|
end
|
22
|
-
|
23
|
-
spider = DaimonSkycrawlers::Processor::Spider.new
|
24
|
-
spider.configure do |s|
|
25
|
-
s.link_rules = "ul#zg_browseRoot li a"
|
26
|
-
s.link_message = { next_processor: "AmazonRanking" }
|
27
|
-
s.before_process do |message|
|
28
|
-
message[:next_processor] != "AmazonRanking"
|
29
|
-
end
|
30
|
-
end
|
31
|
-
DaimonSkycrawlers.register_processor(spider)
|
32
|
-
|
33
|
-
processor = AmazonRanking.new.configure do |s|
|
34
|
-
s.before_process do |message|
|
35
|
-
message[:next_processor] == "AmazonRanking"
|
36
|
-
end
|
37
|
-
end
|
38
|
-
DaimonSkycrawlers.register_processor(processor)
|
File without changes
|
File without changes
|
@@ -3,6 +3,8 @@ require "daimon_skycrawlers/filter"
|
|
3
3
|
require "daimon_skycrawlers/filter/duplicate_checker"
|
4
4
|
require "daimon_skycrawlers/filter/update_checker"
|
5
5
|
|
6
|
+
DaimonSkycrawlers.load_processors
|
7
|
+
|
6
8
|
default_processor = DaimonSkycrawlers::Processor::Default.new
|
7
9
|
spider = DaimonSkycrawlers::Processor::Spider.new
|
8
10
|
#spider.enqueue = false
|
File without changes
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: daimon_skycrawlers
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.12.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- daimon developers
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-12-
|
11
|
+
date: 2016-12-20 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: thor
|
@@ -340,18 +340,19 @@ files:
|
|
340
340
|
- lib/daimon_skycrawlers/filter/robots_txt_checker.rb
|
341
341
|
- lib/daimon_skycrawlers/filter/update_checker.rb
|
342
342
|
- lib/daimon_skycrawlers/generator/crawler.rb
|
343
|
+
- lib/daimon_skycrawlers/generator/filter.rb
|
343
344
|
- lib/daimon_skycrawlers/generator/generate.rb
|
344
345
|
- lib/daimon_skycrawlers/generator/new.rb
|
345
346
|
- lib/daimon_skycrawlers/generator/processor.rb
|
346
347
|
- lib/daimon_skycrawlers/generator/templates/crawler.rb.erb
|
348
|
+
- lib/daimon_skycrawlers/generator/templates/filter.rb.erb
|
347
349
|
- lib/daimon_skycrawlers/generator/templates/new/Dockerfile
|
348
350
|
- lib/daimon_skycrawlers/generator/templates/new/Dockerfile.db
|
349
351
|
- lib/daimon_skycrawlers/generator/templates/new/Gemfile
|
350
352
|
- lib/daimon_skycrawlers/generator/templates/new/README.md.erb
|
351
353
|
- lib/daimon_skycrawlers/generator/templates/new/Rakefile
|
352
|
-
- lib/daimon_skycrawlers/generator/templates/new/app/
|
353
|
-
- lib/daimon_skycrawlers/generator/templates/new/app/
|
354
|
-
- lib/daimon_skycrawlers/generator/templates/new/app/processors/sample_processor.rb
|
354
|
+
- lib/daimon_skycrawlers/generator/templates/new/app/crawler.rb
|
355
|
+
- lib/daimon_skycrawlers/generator/templates/new/app/processor.rb
|
355
356
|
- lib/daimon_skycrawlers/generator/templates/new/config/database.yml.erb
|
356
357
|
- lib/daimon_skycrawlers/generator/templates/new/config/init.rb
|
357
358
|
- lib/daimon_skycrawlers/generator/templates/new/docker-compose.yml.erb
|
@@ -385,10 +386,10 @@ files:
|
|
385
386
|
- sample/amazon-ranking/Gemfile.lock
|
386
387
|
- sample/amazon-ranking/README.md
|
387
388
|
- sample/amazon-ranking/Rakefile
|
388
|
-
- sample/amazon-ranking/app/
|
389
|
-
- sample/amazon-ranking/app/
|
389
|
+
- sample/amazon-ranking/app/crawler.rb
|
390
|
+
- sample/amazon-ranking/app/crawlers/.gitkeep
|
391
|
+
- sample/amazon-ranking/app/processor.rb
|
390
392
|
- sample/amazon-ranking/app/processors/amazon_ranking.rb
|
391
|
-
- sample/amazon-ranking/app/processors/sample_processor.rb
|
392
393
|
- sample/amazon-ranking/config/database.yml
|
393
394
|
- sample/amazon-ranking/config/init.rb
|
394
395
|
- sample/amazon-ranking/db/migrate/20161206061241_create_pages.rb
|
@@ -400,9 +401,11 @@ files:
|
|
400
401
|
- sample/itp-crawler/Gemfile.lock
|
401
402
|
- sample/itp-crawler/README.md
|
402
403
|
- sample/itp-crawler/Rakefile
|
403
|
-
- sample/itp-crawler/app/
|
404
|
+
- sample/itp-crawler/app/crawler.rb
|
405
|
+
- sample/itp-crawler/app/crawlers/.gitkeep
|
404
406
|
- sample/itp-crawler/app/models/itp_base.rb
|
405
407
|
- sample/itp-crawler/app/models/itp_shop.rb
|
408
|
+
- sample/itp-crawler/app/processor.rb
|
406
409
|
- sample/itp-crawler/app/processors/itp_processor.rb
|
407
410
|
- sample/itp-crawler/config/database.yml
|
408
411
|
- sample/itp-crawler/config/database_itp.yml
|
@@ -416,8 +419,10 @@ files:
|
|
416
419
|
- sample/spider/Gemfile
|
417
420
|
- sample/spider/README.md
|
418
421
|
- sample/spider/Rakefile
|
419
|
-
- sample/spider/app/
|
420
|
-
- sample/spider/app/
|
422
|
+
- sample/spider/app/crawler.rb
|
423
|
+
- sample/spider/app/crawlers/.gitkeep
|
424
|
+
- sample/spider/app/processor.rb
|
425
|
+
- sample/spider/app/processors/.gitkeep
|
421
426
|
- sample/spider/config/database.yml
|
422
427
|
- sample/spider/config/init.rb
|
423
428
|
- sample/spider/db/migrate/20160830155803_create_pages.rb
|