daimon_skycrawlers 0.11.3 → 0.12.0

Files changed (31)
  1. checksums.yaml +4 -4
  2. data/README.md +5 -2
  3. data/lib/daimon_skycrawlers.rb +14 -0
  4. data/lib/daimon_skycrawlers/commands/runner.rb +12 -8
  5. data/lib/daimon_skycrawlers/generator/crawler.rb +13 -0
  6. data/lib/daimon_skycrawlers/generator/filter.rb +34 -0
  7. data/lib/daimon_skycrawlers/generator/generate.rb +2 -0
  8. data/lib/daimon_skycrawlers/generator/new.rb +5 -3
  9. data/lib/daimon_skycrawlers/generator/processor.rb +12 -0
  10. data/lib/daimon_skycrawlers/generator/templates/crawler.rb.erb +0 -4
  11. data/{sample/amazon-ranking/app/filters/sample_filter.rb → lib/daimon_skycrawlers/generator/templates/filter.rb.erb} +1 -1
  12. data/{sample/amazon-ranking/app/crawlers/sample_crawler.rb → lib/daimon_skycrawlers/generator/templates/new/app/crawler.rb} +2 -0
  13. data/lib/daimon_skycrawlers/generator/templates/new/app/{processors/sample_processor.rb → processor.rb} +2 -0
  14. data/lib/daimon_skycrawlers/generator/templates/processor.rb.erb +0 -3
  15. data/lib/daimon_skycrawlers/processor/spider.rb +1 -1
  16. data/lib/daimon_skycrawlers/version.rb +1 -1
  17. data/{lib/daimon_skycrawlers/generator/templates/new/app/crawlers/sample_crawler.rb → sample/amazon-ranking/app/crawler.rb} +0 -0
  18. data/sample/amazon-ranking/app/crawlers/.gitkeep +0 -0
  19. data/sample/amazon-ranking/app/processor.rb +25 -0
  20. data/sample/amazon-ranking/app/processors/amazon_ranking.rb +0 -17
  21. data/sample/itp-crawler/app/{crawlers/itp_crawler.rb → crawler.rb} +1 -0
  22. data/sample/itp-crawler/app/crawlers/.gitkeep +0 -0
  23. data/sample/itp-crawler/app/processor.rb +8 -0
  24. data/sample/itp-crawler/app/processors/itp_processor.rb +0 -3
  25. data/sample/spider/app/{crawlers/blog_crawler.rb → crawler.rb} +2 -0
  26. data/sample/spider/app/crawlers/.gitkeep +0 -0
  27. data/sample/spider/app/{processors/blog_spider.rb → processor.rb} +2 -0
  28. data/sample/spider/app/processors/.gitkeep +0 -0
  29. metadata +16 -11
  30. data/lib/daimon_skycrawlers/generator/templates/new/app/filters/sample_filter.rb +0 -9
  31. data/sample/amazon-ranking/app/processors/sample_processor.rb +0 -5
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: d26cd9bc437f67de677b2fd929b3df34d5715736
-  data.tar.gz: e2a01ec217a9eac95cd23025286782844d8fc30d
+  metadata.gz: ea06b539d343f1a13e2eb22f3cd2e9662ce3d9da
+  data.tar.gz: 39164028e5902f05a86a4c1e1f1890bbe8065dfa
 SHA512:
-  metadata.gz: d34053642471b25522a768507867d2c21ce4ee5a653c5ab886cdb8c42bfd62558770db354b3c74bc57f14bc9241e3a87488ad65c3634f4dc76705fdf9e1e7c31
-  data.tar.gz: 47937c94c61d1d2a71e8b2c6d5844aaa6dc1f15ac48e450453c1ed54941cd45ff900458de10382d74761ca495be10a7cbe8f1c6dc751b84b8a4925621c7ce965
+  metadata.gz: 22472b47e7a5e87758c2bf8712bd76d227e7c6589afd641060e93e8265fdb628abf8a236dbcf1dab0dc6390f0fd6fb680e96ff4998c4ba479f4a5ff334b15a4f
+  data.tar.gz: 8bdf2df5606221dcb984ab3850b8922f524dfbaf12d80cb38c11f65632327d0567b6c17aaff7a51da47e77e88fa27e360045dbdf38af2f3dc02cec3384f15090
data/README.md CHANGED
@@ -2,6 +2,9 @@
 
 # DaimonSkycrawlers
 
+[![Gem Version](https://badge.fury.io/rb/daimon_skycrawlers.svg)](https://badge.fury.io/rb/daimon_skycrawlers)
+[![Build Status](https://travis-ci.org/bm-sms/daimon_skycrawlers.svg?branch=master)](https://travis-ci.org/bm-sms/daimon_skycrawlers)
+
 DaimonSkycrawlers is a crawler framework.
 
 ## Requirements
@@ -63,8 +66,8 @@ Or install it yourself as:
     $ daimon_skycrawlers exec processor # on new terminal
 ```
 
-NOTE: Execute step 5 as soon as possible. Because bin/crawler and
-bin/processor will stop after 10 seconds by default if their
+NOTE: Execute step 5 as soon as possible. Because crawler and
+processor will stop after 10 seconds by default if their
 queues are empty.
 
 NOTE: You can change `shutdown_interval` using following code in config/init.rb:
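The hunk ends before the snippet that NOTE refers to. For reference, it is along these lines (a sketch only; the `DaimonSkycrawlers.configure` block is assumed from the gem's configuration pattern seen elsewhere in this diff, and 300 is an arbitrary value):

```
DaimonSkycrawlers.configure do |config|
  # Shut the worker down after waiting this many seconds on an empty queue.
  config.shutdown_interval = 300
end
```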
data/lib/daimon_skycrawlers.rb CHANGED
@@ -75,6 +75,20 @@ module DaimonSkycrawlers
     exit(false)
   end
 
+  def load_crawlers
+    Dir.glob("app/crawlers/**/*.rb") do |path|
+      require(File.expand_path(path, Dir.pwd)) &&
+        DaimonSkycrawlers.configuration.logger.info("Loaded crawler: #{path}")
+    end
+  end
+
+  def load_processors
+    Dir.glob("app/processors/**/*.rb") do |path|
+      require(File.expand_path(path, Dir.pwd)) &&
+        DaimonSkycrawlers.configuration.logger.info("Loaded processor: #{path}")
+    end
+  end
+
   #
   # Return current environment
   #
data/lib/daimon_skycrawlers/commands/runner.rb CHANGED
@@ -10,10 +10,8 @@ module DaimonSkycrawlers
     desc "crawler", "Execute crawler"
     def crawler
       load_init
-      Dir.glob("app/crawlers/**/*.rb") do |path|
-        require(File.expand_path(path, Dir.pwd))
-        log.info("Loaded crawler: #{path}")
-      end
+      load_crawlers
+      require(File.expand_path("app/crawler.rb", Dir.pwd))
       DaimonSkycrawlers::Crawler.run
     rescue => ex
       puts ex.message
@@ -23,10 +21,8 @@ module DaimonSkycrawlers
     desc "processor", "Execute processor"
     def processor
       load_init
-      Dir.glob("app/processors/**/*.rb") do |path|
-        require(File.expand_path(path, Dir.pwd))
-        log.info("Loaded processor: #{path}")
-      end
+      load_processors
+      require(File.expand_path("app/processor.rb", Dir.pwd))
       DaimonSkycrawlers::Processor.run
     rescue => ex
       puts ex.message
@@ -39,6 +35,14 @@ module DaimonSkycrawlers
       DaimonSkycrawlers.load_init
     end
 
+    def load_crawlers
+      DaimonSkycrawlers.load_crawlers
+    end
+
+    def load_processors
+      DaimonSkycrawlers.load_processors
+    end
+
     def log
       DaimonSkycrawlers.configuration.logger
     end
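Taken together with the new `load_crawlers`/`load_processors` helpers above, the runner now expects a two-level layout: crawler and processor classes live under `app/crawlers/` and `app/processors/`, while registration moves to single `app/crawler.rb` and `app/processor.rb` entry points. A minimal `app/crawler.rb` in the new layout, pieced together from the template and generator message later in this diff (the URL is a placeholder):

```
require "daimon_skycrawlers/crawler"
require "daimon_skycrawlers/crawler/default"

# Require every crawler class defined under app/crawlers/ first.
DaimonSkycrawlers.load_crawlers

base_url = "http://example.com"
crawler = DaimonSkycrawlers::Crawler::Default.new(base_url)
DaimonSkycrawlers.register_crawler(crawler)
```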
data/lib/daimon_skycrawlers/generator/crawler.rb CHANGED
@@ -17,6 +17,19 @@ module DaimonSkycrawlers
         }
         template("crawler.rb.erb", "app/crawlers/#{name.underscore}.rb", config)
       end
+
+      def display_post_message
+        puts <<MESSAGE
+
+You can register your crawler in `app/crawler.rb` to run your crawler.
+Following code snippet is useful:
+
+base_url = "https://www.example.com/"
+crawler = #{name.classify}.new(base_url)
+DaimonSkycrawlers.register_crawler(crawler)
+
+MESSAGE
+      end
     end
   end
 end
data/lib/daimon_skycrawlers/generator/filter.rb ADDED
@@ -0,0 +1,34 @@
+require "thor"
+
+module DaimonSkycrawlers
+  module Generator
+    class Filter < Thor::Group
+      include Thor::Actions
+
+      argument :name
+
+      def self.source_root
+        File.join(__dir__, "templates")
+      end
+
+      def create_files
+        config = {
+          class_name: name.classify,
+        }
+        template("filter.rb.erb", "app/filters/#{name.underscore}.rb", config)
+      end
+
+      def display_post_message
+        puts <<MESSAGE
+
+You can use this filter with both crawlers and processors.
+
+filter = #{name.classify}.new
+crawler = DaimonSkycrawlers::Crawler::Default.new
+crawler.before_process(filter)
+
+MESSAGE
+      end
+    end
+  end
+end
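Once registered in generate.rb below, this runs as `daimon_skycrawlers generate filter NAME` and scaffolds `app/filters/NAME.rb` from filter.rb.erb. A sketch of a filled-in filter (the class name and URL rule are illustrative; the message hash carrying `:url` is assumed from its use elsewhere in this diff):

```
require "daimon_skycrawlers/filter/base"

class BlogOnlyFilter < DaimonSkycrawlers::Filter::Base
  # Return true to crawl the URL, false to skip it.
  def call(message)
    message[:url].to_s.include?("/blog/")
  end
end
```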
data/lib/daimon_skycrawlers/generator/generate.rb CHANGED
@@ -1,12 +1,14 @@
 require "thor"
 require "daimon_skycrawlers/generator/crawler"
 require "daimon_skycrawlers/generator/processor"
+require "daimon_skycrawlers/generator/filter"
 
 module DaimonSkycrawlers
   module Generator
     class Generate < Thor
       register(Crawler, "crawler", "crawler NAME", "Generate new crawler")
       register(Processor, "processor", "processor NAME", "Generate new processor")
+      register(Filter, "filter", "filter NAME", "Generate new filter")
     end
   end
 end
data/lib/daimon_skycrawlers/generator/new.rb CHANGED
@@ -70,9 +70,8 @@ module DaimonSkycrawlers
       "Dockerfile.db",
       "Gemfile",
       "Rakefile",
-      "app/crawlers/sample_crawler.rb",
-      "app/filters/sample_filter.rb",
-      "app/processors/sample_processor.rb",
+      "app/crawler.rb",
+      "app/processor.rb",
       "config/init.rb",
       "services/common/docker-entrypoint.sh",
       "services/db/init-user-db.sh"
@@ -83,6 +82,9 @@ module DaimonSkycrawlers
 
     def create_directories
       [
+        "app/crawlers",
+        "app/filters",
+        "app/processors",
         "vendor/bundle",
         "docker-cache/bundle",
         "docker-cache/.bundle"
data/lib/daimon_skycrawlers/generator/processor.rb CHANGED
@@ -17,6 +17,18 @@ module DaimonSkycrawlers
         }
         template("processor.rb.erb", "app/processors/#{name.underscore}.rb", config)
       end
+
+      def display_post_message
+        puts <<MESSAGE
+
+You can register your processor in `app/processor.rb` to run your processor.
+Following code snippet is useful:
+
+processor = #{name.classify}.new
+DaimonSkycrawlers.register_processor(processor)
+
+MESSAGE
+      end
     end
   end
 end
data/lib/daimon_skycrawlers/generator/templates/crawler.rb.erb CHANGED
@@ -7,7 +7,3 @@ class <%= config[:class_name] %> < DaimonSkycrawlers::Crawler::Base
     # Implement your crawler here
   end
 end
-
-base_url = ""
-crawler = <%= config[:class_name] %>.new(base_url)
-DaimonSkycrawlers.register_crawler(crawler)
data/{sample/amazon-ranking/app/filters/sample_filter.rb → lib/daimon_skycrawlers/generator/templates/filter.rb.erb} RENAMED
@@ -1,6 +1,6 @@
 require "daimon_skycrawlers/filter/base"
 
-class SampleFilter < DaimonSkycrawlers::Filter::Base
+class <%= config[:class_name] %> < DaimonSkycrawlers::Filter::Base
   def call(message)
     # Implement your filter here.
     # If you want to crawl `url`, return true otherwise false.
data/{sample/amazon-ranking/app/crawlers/sample_crawler.rb → lib/daimon_skycrawlers/generator/templates/new/app/crawler.rb} RENAMED
@@ -1,6 +1,8 @@
 require "daimon_skycrawlers/crawler"
 require "daimon_skycrawlers/crawler/default"
 
+DaimonSkycrawlers.load_crawlers
+
 base_url = "http://example.com"
 
 crawler = DaimonSkycrawlers::Crawler::Default.new(base_url)
data/lib/daimon_skycrawlers/generator/templates/new/app/{processors/sample_processor.rb → processor.rb} RENAMED
@@ -1,5 +1,7 @@
 require "daimon_skycrawlers/processor"
 
+DaimonSkycrawlers.load_processors
+
 DaimonSkycrawlers.register_processor do |data|
   p "It works with '#{data[:url]}'"
 end
data/lib/daimon_skycrawlers/generator/templates/processor.rb.erb CHANGED
@@ -7,6 +7,3 @@ class <%= config[:class_name] %> < DaimonSkycrawlers::Processor::Base
     # Implement your processor here
   end
 end
-
-processor = <%= config[:class_name] %>.new
-DaimonSkycrawlers.register_processor(processor)
data/lib/daimon_skycrawlers/processor/spider.rb CHANGED
@@ -147,7 +147,7 @@ module DaimonSkycrawlers
       return if urls.empty?
       log.debug("Candidate URLs: #{urls.size}")
       urls = urls.select do |url|
-        @link_filters.all? {|filter| filter.call(url) }
+        @link_filters.all? {|filter| filter.call(url: url) }
       end
       log.debug("Filtered URLs: #{urls.size}")
       urls
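Note the contract change for Spider's link filters: each candidate link is now passed as a `url:` hash entry instead of a bare string, so a custom link filter must read the URL out of the message. A minimal illustration (the host check is arbitrary):

```
require "uri"

# Spider now invokes each link filter as filter.call(url: url),
# so the callable receives a hash with the candidate URL.
same_host_filter = ->(message) { URI(message[:url]).host == "example.com" }

same_host_filter.call(url: "http://example.com/blog/")  # => true
same_host_filter.call(url: "http://other.example.org/") # => false
```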
data/lib/daimon_skycrawlers/version.rb CHANGED
@@ -1,3 +1,3 @@
 module DaimonSkycrawlers
-  VERSION = "0.11.3"
+  VERSION = "0.12.0"
 end
data/{lib/daimon_skycrawlers/generator/templates/new/app/crawlers/sample_crawler.rb → sample/amazon-ranking/app/crawler.rb} RENAMED
File without changes
data/sample/amazon-ranking/app/crawlers/.gitkeep ADDED
File without changes
data/sample/amazon-ranking/app/processor.rb ADDED
@@ -0,0 +1,25 @@
+require "daimon_skycrawlers"
+require "daimon_skycrawlers/processor"
+
+DaimonSkycrawlers.load_processors
+
+DaimonSkycrawlers.register_processor do |data|
+  p "It works with '#{data[:url]}'"
+end
+
+spider = DaimonSkycrawlers::Processor::Spider.new
+spider.configure do |s|
+  s.link_rules = "ul#zg_browseRoot li a"
+  s.link_message = { next_processor: "AmazonRanking" }
+  s.before_process do |message|
+    message[:next_processor] != "AmazonRanking"
+  end
+end
+DaimonSkycrawlers.register_processor(spider)
+
+processor = AmazonRanking.new.configure do |s|
+  s.before_process do |message|
+    message[:next_processor] == "AmazonRanking"
+  end
+end
+DaimonSkycrawlers.register_processor(processor)
data/sample/amazon-ranking/app/processors/amazon_ranking.rb CHANGED
@@ -19,20 +19,3 @@ class AmazonRanking < DaimonSkycrawlers::Processor::Base
     p ranking
   end
 end
-
-spider = DaimonSkycrawlers::Processor::Spider.new
-spider.configure do |s|
-  s.link_rules = "ul#zg_browseRoot li a"
-  s.link_message = { next_processor: "AmazonRanking" }
-  s.before_process do |message|
-    message[:next_processor] != "AmazonRanking"
-  end
-end
-DaimonSkycrawlers.register_processor(spider)
-
-processor = AmazonRanking.new.configure do |s|
-  s.before_process do |message|
-    message[:next_processor] == "AmazonRanking"
-  end
-end
-DaimonSkycrawlers.register_processor(processor)
data/sample/itp-crawler/app/{crawlers/itp_crawler.rb → crawler.rb} RENAMED
@@ -2,6 +2,7 @@ require "daimon_skycrawlers"
 require "daimon_skycrawlers/crawler"
 require "daimon_skycrawlers/crawler/default"
 
+DaimonSkycrawlers.load_crawlers
 
 base_url = "http://itp.ne.jp/"
 crawler = DaimonSkycrawlers::Crawler::Default.new(base_url)
data/sample/itp-crawler/app/crawlers/.gitkeep ADDED
File without changes
data/sample/itp-crawler/app/processor.rb ADDED
@@ -0,0 +1,8 @@
+require "daimon_skycrawlers"
+require "daimon_skycrawlers/processor"
+require "daimon_skycrawlers/processor/base"
+
+DaimonSkycrawlers.load_processors
+
+processor = ItpProcessor.new
+DaimonSkycrawlers.register_processor(processor)
data/sample/itp-crawler/app/processors/itp_processor.rb CHANGED
@@ -90,6 +90,3 @@ class ItpProcessor < DaimonSkycrawlers::Processor::Base
     end
   end
 end
-
-processor = ItpProcessor.new
-DaimonSkycrawlers.register_processor(processor)
data/sample/spider/app/{crawlers/blog_crawler.rb → crawler.rb} RENAMED
@@ -1,5 +1,7 @@
 require "daimon_skycrawlers/crawler/default"
 
+DaimonSkycrawlers.load_crawlers
+
 base_url = "http://www.clear-code.com/blog/"
 
 crawler = DaimonSkycrawlers::Crawler::Default.new(base_url)
data/sample/spider/app/crawlers/.gitkeep ADDED
File without changes
data/sample/spider/app/{processors/blog_spider.rb → processor.rb} RENAMED
@@ -3,6 +3,8 @@ require "daimon_skycrawlers/filter"
 require "daimon_skycrawlers/filter/duplicate_checker"
 require "daimon_skycrawlers/filter/update_checker"
 
+DaimonSkycrawlers.load_processors
+
 default_processor = DaimonSkycrawlers::Processor::Default.new
 spider = DaimonSkycrawlers::Processor::Spider.new
 #spider.enqueue = false
data/sample/spider/app/processors/.gitkeep ADDED
File without changes
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: daimon_skycrawlers
 version: !ruby/object:Gem::Version
-  version: 0.11.3
+  version: 0.12.0
 platform: ruby
 authors:
 - daimon developers
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-12-08 00:00:00.000000000 Z
+date: 2016-12-20 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: thor
@@ -340,18 +340,19 @@ files:
 - lib/daimon_skycrawlers/filter/robots_txt_checker.rb
 - lib/daimon_skycrawlers/filter/update_checker.rb
 - lib/daimon_skycrawlers/generator/crawler.rb
+- lib/daimon_skycrawlers/generator/filter.rb
 - lib/daimon_skycrawlers/generator/generate.rb
 - lib/daimon_skycrawlers/generator/new.rb
 - lib/daimon_skycrawlers/generator/processor.rb
 - lib/daimon_skycrawlers/generator/templates/crawler.rb.erb
+- lib/daimon_skycrawlers/generator/templates/filter.rb.erb
 - lib/daimon_skycrawlers/generator/templates/new/Dockerfile
 - lib/daimon_skycrawlers/generator/templates/new/Dockerfile.db
 - lib/daimon_skycrawlers/generator/templates/new/Gemfile
 - lib/daimon_skycrawlers/generator/templates/new/README.md.erb
 - lib/daimon_skycrawlers/generator/templates/new/Rakefile
-- lib/daimon_skycrawlers/generator/templates/new/app/crawlers/sample_crawler.rb
-- lib/daimon_skycrawlers/generator/templates/new/app/filters/sample_filter.rb
-- lib/daimon_skycrawlers/generator/templates/new/app/processors/sample_processor.rb
+- lib/daimon_skycrawlers/generator/templates/new/app/crawler.rb
+- lib/daimon_skycrawlers/generator/templates/new/app/processor.rb
 - lib/daimon_skycrawlers/generator/templates/new/config/database.yml.erb
 - lib/daimon_skycrawlers/generator/templates/new/config/init.rb
 - lib/daimon_skycrawlers/generator/templates/new/docker-compose.yml.erb
@@ -385,10 +386,10 @@ files:
 - sample/amazon-ranking/Gemfile.lock
 - sample/amazon-ranking/README.md
 - sample/amazon-ranking/Rakefile
-- sample/amazon-ranking/app/crawlers/sample_crawler.rb
-- sample/amazon-ranking/app/filters/sample_filter.rb
+- sample/amazon-ranking/app/crawler.rb
+- sample/amazon-ranking/app/crawlers/.gitkeep
+- sample/amazon-ranking/app/processor.rb
 - sample/amazon-ranking/app/processors/amazon_ranking.rb
-- sample/amazon-ranking/app/processors/sample_processor.rb
 - sample/amazon-ranking/config/database.yml
 - sample/amazon-ranking/config/init.rb
 - sample/amazon-ranking/db/migrate/20161206061241_create_pages.rb
@@ -400,9 +401,11 @@ files:
 - sample/itp-crawler/Gemfile.lock
 - sample/itp-crawler/README.md
 - sample/itp-crawler/Rakefile
-- sample/itp-crawler/app/crawlers/itp_crawler.rb
+- sample/itp-crawler/app/crawler.rb
+- sample/itp-crawler/app/crawlers/.gitkeep
 - sample/itp-crawler/app/models/itp_base.rb
 - sample/itp-crawler/app/models/itp_shop.rb
+- sample/itp-crawler/app/processor.rb
 - sample/itp-crawler/app/processors/itp_processor.rb
 - sample/itp-crawler/config/database.yml
 - sample/itp-crawler/config/database_itp.yml
@@ -416,8 +419,10 @@ files:
 - sample/spider/Gemfile
 - sample/spider/README.md
 - sample/spider/Rakefile
-- sample/spider/app/crawlers/blog_crawler.rb
-- sample/spider/app/processors/blog_spider.rb
+- sample/spider/app/crawler.rb
+- sample/spider/app/crawlers/.gitkeep
+- sample/spider/app/processor.rb
+- sample/spider/app/processors/.gitkeep
 - sample/spider/config/database.yml
 - sample/spider/config/init.rb
 - sample/spider/db/migrate/20160830155803_create_pages.rb
data/lib/daimon_skycrawlers/generator/templates/new/app/filters/sample_filter.rb DELETED
@@ -1,9 +0,0 @@
-require "daimon_skycrawlers/filter/base"
-
-class SampleFilter < DaimonSkycrawlers::Filter::Base
-  def call(message)
-    # Implement your filter here.
-    # If you want to crawl `url`, return true otherwise false.
-    true
-  end
-end
data/sample/amazon-ranking/app/processors/sample_processor.rb DELETED
@@ -1,5 +0,0 @@
-require "daimon_skycrawlers/processor"
-
-DaimonSkycrawlers.register_processor do |data|
-  p "It works with '#{data[:url]}'"
-end