daimon_skycrawlers 0.11.3 → 0.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. checksums.yaml +4 -4
  2. data/README.md +5 -2
  3. data/lib/daimon_skycrawlers.rb +14 -0
  4. data/lib/daimon_skycrawlers/commands/runner.rb +12 -8
  5. data/lib/daimon_skycrawlers/generator/crawler.rb +13 -0
  6. data/lib/daimon_skycrawlers/generator/filter.rb +34 -0
  7. data/lib/daimon_skycrawlers/generator/generate.rb +2 -0
  8. data/lib/daimon_skycrawlers/generator/new.rb +5 -3
  9. data/lib/daimon_skycrawlers/generator/processor.rb +12 -0
  10. data/lib/daimon_skycrawlers/generator/templates/crawler.rb.erb +0 -4
  11. data/{sample/amazon-ranking/app/filters/sample_filter.rb → lib/daimon_skycrawlers/generator/templates/filter.rb.erb} +1 -1
  12. data/{sample/amazon-ranking/app/crawlers/sample_crawler.rb → lib/daimon_skycrawlers/generator/templates/new/app/crawler.rb} +2 -0
  13. data/lib/daimon_skycrawlers/generator/templates/new/app/{processors/sample_processor.rb → processor.rb} +2 -0
  14. data/lib/daimon_skycrawlers/generator/templates/processor.rb.erb +0 -3
  15. data/lib/daimon_skycrawlers/processor/spider.rb +1 -1
  16. data/lib/daimon_skycrawlers/version.rb +1 -1
  17. data/{lib/daimon_skycrawlers/generator/templates/new/app/crawlers/sample_crawler.rb → sample/amazon-ranking/app/crawler.rb} +0 -0
  18. data/sample/amazon-ranking/app/crawlers/.gitkeep +0 -0
  19. data/sample/amazon-ranking/app/processor.rb +25 -0
  20. data/sample/amazon-ranking/app/processors/amazon_ranking.rb +0 -17
  21. data/sample/itp-crawler/app/{crawlers/itp_crawler.rb → crawler.rb} +1 -0
  22. data/sample/itp-crawler/app/crawlers/.gitkeep +0 -0
  23. data/sample/itp-crawler/app/processor.rb +8 -0
  24. data/sample/itp-crawler/app/processors/itp_processor.rb +0 -3
  25. data/sample/spider/app/{crawlers/blog_crawler.rb → crawler.rb} +2 -0
  26. data/sample/spider/app/crawlers/.gitkeep +0 -0
  27. data/sample/spider/app/{processors/blog_spider.rb → processor.rb} +2 -0
  28. data/sample/spider/app/processors/.gitkeep +0 -0
  29. metadata +16 -11
  30. data/lib/daimon_skycrawlers/generator/templates/new/app/filters/sample_filter.rb +0 -9
  31. data/sample/amazon-ranking/app/processors/sample_processor.rb +0 -5
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: d26cd9bc437f67de677b2fd929b3df34d5715736
-  data.tar.gz: e2a01ec217a9eac95cd23025286782844d8fc30d
+  metadata.gz: ea06b539d343f1a13e2eb22f3cd2e9662ce3d9da
+  data.tar.gz: 39164028e5902f05a86a4c1e1f1890bbe8065dfa
 SHA512:
-  metadata.gz: d34053642471b25522a768507867d2c21ce4ee5a653c5ab886cdb8c42bfd62558770db354b3c74bc57f14bc9241e3a87488ad65c3634f4dc76705fdf9e1e7c31
-  data.tar.gz: 47937c94c61d1d2a71e8b2c6d5844aaa6dc1f15ac48e450453c1ed54941cd45ff900458de10382d74761ca495be10a7cbe8f1c6dc751b84b8a4925621c7ce965
+  metadata.gz: 22472b47e7a5e87758c2bf8712bd76d227e7c6589afd641060e93e8265fdb628abf8a236dbcf1dab0dc6390f0fd6fb680e96ff4998c4ba479f4a5ff334b15a4f
+  data.tar.gz: 8bdf2df5606221dcb984ab3850b8922f524dfbaf12d80cb38c11f65632327d0567b6c17aaff7a51da47e77e88fa27e360045dbdf38af2f3dc02cec3384f15090
data/README.md CHANGED
@@ -2,6 +2,9 @@
 
 # DaimonSkycrawlers
 
+[![Gem Version](https://badge.fury.io/rb/daimon_skycrawlers.svg)](https://badge.fury.io/rb/daimon_skycrawlers)
+[![Build Status](https://travis-ci.org/bm-sms/daimon_skycrawlers.svg?branch=master)](https://travis-ci.org/bm-sms/daimon_skycrawlers)
+
 DaimonSkycrawlers is a crawler framework.
 
 ## Requirements
@@ -63,8 +66,8 @@ Or install it yourself as:
 $ daimon_skycrawlers exec processor # on new terminal
 ```
 
-NOTE: Execute step 5 as soon as possible. Because bin/crawler and
-bin/processor will stop after 10 seconds by default if their
+NOTE: Execute step 5 as soon as possible. Because crawler and
+processor will stop after 10 seconds by default if their
 queues are empty.
 
 NOTE: You can change `shutdown_interval` using following code in config/init.rb:
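
The `shutdown_interval` note above refers to a snippet in `config/init.rb`. A minimal sketch of that configuration, assuming the gem exposes a `DaimonSkycrawlers.configure` block with a `shutdown_interval` setting (neither is shown in this diff):

```ruby
# config/init.rb -- sketch only; `configure` and `shutdown_interval` are
# assumed here, not confirmed by this diff.
require "daimon_skycrawlers"

DaimonSkycrawlers.configure do |config|
  # Wait 5 minutes on an empty queue instead of the 10-second default.
  config.shutdown_interval = 300
end
```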
data/lib/daimon_skycrawlers.rb CHANGED
@@ -75,6 +75,20 @@ module DaimonSkycrawlers
       exit(false)
     end
 
+    def load_crawlers
+      Dir.glob("app/crawlers/**/*.rb") do |path|
+        require(File.expand_path(path, Dir.pwd)) &&
+          DaimonSkycrawlers.configuration.logger.info("Loaded crawler: #{path}")
+      end
+    end
+
+    def load_processors
+      Dir.glob("app/processors/**/*.rb") do |path|
+        require(File.expand_path(path, Dir.pwd)) &&
+          DaimonSkycrawlers.configuration.logger.info("Loaded processor: #{path}")
+      end
+    end
+
     #
     # Return current environment
     #
data/lib/daimon_skycrawlers/commands/runner.rb CHANGED
@@ -10,10 +10,8 @@ module DaimonSkycrawlers
     desc "crawler", "Execute crawler"
     def crawler
       load_init
-      Dir.glob("app/crawlers/**/*.rb") do |path|
-        require(File.expand_path(path, Dir.pwd))
-        log.info("Loaded crawler: #{path}")
-      end
+      load_crawlers
+      require(File.expand_path("app/crawler.rb", Dir.pwd))
       DaimonSkycrawlers::Crawler.run
     rescue => ex
       puts ex.message
@@ -23,10 +21,8 @@ module DaimonSkycrawlers
     desc "processor", "Execute processor"
     def processor
       load_init
-      Dir.glob("app/processors/**/*.rb") do |path|
-        require(File.expand_path(path, Dir.pwd))
-        log.info("Loaded processor: #{path}")
-      end
+      load_processors
+      require(File.expand_path("app/processor.rb", Dir.pwd))
       DaimonSkycrawlers::Processor.run
     rescue => ex
       puts ex.message
@@ -39,6 +35,14 @@ module DaimonSkycrawlers
       DaimonSkycrawlers.load_init
     end
 
+    def load_crawlers
+      DaimonSkycrawlers.load_crawlers
+    end
+
+    def load_processors
+      DaimonSkycrawlers.load_processors
+    end
+
     def log
       DaimonSkycrawlers.configuration.logger
     end
data/lib/daimon_skycrawlers/generator/crawler.rb CHANGED
@@ -17,6 +17,19 @@ module DaimonSkycrawlers
         }
         template("crawler.rb.erb", "app/crawlers/#{name.underscore}.rb", config)
       end
+
+      def display_post_message
+        puts <<MESSAGE
+
+You can register your crawler in `app/crawler.rb` to run your crawler.
+Following code snippet is useful:
+
+base_url = "https://www.example.com/"
+crawler = #{name.classify}.new(base_url)
+DaimonSkycrawlers.register_crawler(crawler)
+
+MESSAGE
+      end
     end
   end
 end
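
With this release the crawler generator no longer appends registration code to the generated file (see the `crawler.rb.erb` change below); it only prints the post-generation message above. A sketch of the `app/crawler.rb` that message asks you to edit, assuming a generated class named `MyCrawler` under `app/crawlers/` (the class name and URL are hypothetical):

```ruby
# app/crawler.rb -- sketch; MyCrawler and base_url are placeholder values.
require "daimon_skycrawlers/crawler"
require "daimon_skycrawlers/crawler/default"

# New in 0.12.0: requires every file under app/crawlers/.
DaimonSkycrawlers.load_crawlers

base_url = "https://www.example.com/"
crawler = MyCrawler.new(base_url)
DaimonSkycrawlers.register_crawler(crawler)
```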
data/lib/daimon_skycrawlers/generator/filter.rb ADDED
@@ -0,0 +1,34 @@
+require "thor"
+
+module DaimonSkycrawlers
+  module Generator
+    class Filter < Thor::Group
+      include Thor::Actions
+
+      argument :name
+
+      def self.source_root
+        File.join(__dir__, "templates")
+      end
+
+      def create_files
+        config = {
+          class_name: name.classify,
+        }
+        template("filter.rb.erb", "app/filters/#{name.underscore}.rb", config)
+      end
+
+      def display_post_message
+        puts <<MESSAGE
+
+You can use this filter with both crawlers and processors.
+
+filter = #{name.classify}.new
+crawler = DaimonSkycrawlers::Crawler::Default.new
+crawler.before_process(filter)
+
+MESSAGE
+      end
+    end
+  end
+end
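
The post-generation message above shows how a generated filter is attached to a crawler. A sketch of a filter produced by `daimon_skycrawlers generate filter blog_filter` together with that hookup; the class name, the URL check, and the assumption that the message hash carries a `:url` key are illustrative, not taken from this diff:

```ruby
# app/filters/blog_filter.rb -- skeleton from filter.rb.erb, body filled in
# as an illustration.
require "daimon_skycrawlers/filter/base"

class BlogFilter < DaimonSkycrawlers::Filter::Base
  def call(message)
    # Return true to crawl the URL, false to skip it.
    # Assumes the message carries a :url entry, as in filter.call(url: url).
    message[:url].to_s.include?("/blog/")
  end
end

# app/crawler.rb -- attach the filter so it runs before each crawl.
require "daimon_skycrawlers/crawler/default"

crawler = DaimonSkycrawlers::Crawler::Default.new("https://www.example.com/")
crawler.before_process(BlogFilter.new)
DaimonSkycrawlers.register_crawler(crawler)
```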
data/lib/daimon_skycrawlers/generator/generate.rb CHANGED
@@ -1,12 +1,14 @@
 require "thor"
 require "daimon_skycrawlers/generator/crawler"
 require "daimon_skycrawlers/generator/processor"
+require "daimon_skycrawlers/generator/filter"
 
 module DaimonSkycrawlers
   module Generator
     class Generate < Thor
       register(Crawler, "crawler", "crawler NAME", "Generate new crawler")
       register(Processor, "processor", "processor NAME", "Generate new processor")
+      register(Filter, "filter", "filter NAME", "Generate new filter")
     end
   end
 end
data/lib/daimon_skycrawlers/generator/new.rb CHANGED
@@ -70,9 +70,8 @@ module DaimonSkycrawlers
         "Dockerfile.db",
         "Gemfile",
         "Rakefile",
-        "app/crawlers/sample_crawler.rb",
-        "app/filters/sample_filter.rb",
-        "app/processors/sample_processor.rb",
+        "app/crawler.rb",
+        "app/processor.rb",
         "config/init.rb",
         "services/common/docker-entrypoint.sh",
         "services/db/init-user-db.sh"
@@ -83,6 +82,9 @@ module DaimonSkycrawlers
 
      def create_directories
        [
+         "app/crawlers",
+         "app/filters",
+         "app/processors",
          "vendor/bundle",
          "docker-cache/bundle",
          "docker-cache/.bundle"
data/lib/daimon_skycrawlers/generator/processor.rb CHANGED
@@ -17,6 +17,18 @@ module DaimonSkycrawlers
         }
         template("processor.rb.erb", "app/processors/#{name.underscore}.rb", config)
       end
+
+      def display_post_message
+        puts <<MESSAGE
+
+You can register your processor in `app/processor.rb` to run your processor.
+Following code snippet is useful:
+
+processor = #{name.classify}.new
+DaimonSkycrawlers.register_processor(processor)
+
+MESSAGE
+      end
     end
   end
 end
data/lib/daimon_skycrawlers/generator/templates/crawler.rb.erb CHANGED
@@ -7,7 +7,3 @@ class <%= config[:class_name] %> < DaimonSkycrawlers::Crawler::Base
     # Implement your crawler here
   end
 end
-
-base_url = ""
-crawler = <%= config[:class_name] %>.new(base_url)
-DaimonSkycrawlers.register_crawler(crawler)
data/{sample/amazon-ranking/app/filters/sample_filter.rb → lib/daimon_skycrawlers/generator/templates/filter.rb.erb} CHANGED
@@ -1,6 +1,6 @@
 require "daimon_skycrawlers/filter/base"
 
-class SampleFilter < DaimonSkycrawlers::Filter::Base
+class <%= config[:class_name] %> < DaimonSkycrawlers::Filter::Base
   def call(message)
     # Imprement your filter here.
     # If you want to crawl `url`, return true otherwise false.
data/{sample/amazon-ranking/app/crawlers/sample_crawler.rb → lib/daimon_skycrawlers/generator/templates/new/app/crawler.rb} CHANGED
@@ -1,6 +1,8 @@
 require "daimon_skycrawlers/crawler"
 require "daimon_skycrawlers/crawler/default"
 
+DaimonSkycrawlers.load_crawlers
+
 base_url = "http://example.com"
 
 crawler = DaimonSkycrawlers::Crawler::Default.new(base_url)
data/lib/daimon_skycrawlers/generator/templates/new/app/{processors/sample_processor.rb → processor.rb} CHANGED
@@ -1,5 +1,7 @@
 require "daimon_skycrawlers/processor"
 
+DaimonSkycrawlers.load_processors
+
 DaimonSkycrawlers.register_processor do |data|
   p "It works with '#{data[:url]}'"
 end
data/lib/daimon_skycrawlers/generator/templates/processor.rb.erb CHANGED
@@ -7,6 +7,3 @@ class <%= config[:class_name] %> < DaimonSkycrawlers::Processor::Base
     # Implement your processor here
   end
 end
-
-processor = <%= config[:class_name] %>.new
-DaimonSkycrawlers.register_processor(processor)
data/lib/daimon_skycrawlers/processor/spider.rb CHANGED
@@ -147,7 +147,7 @@ module DaimonSkycrawlers
       return if urls.empty?
       log.debug("Candidate URLs: #{urls.size}")
       urls = urls.select do |url|
-        @link_filters.all? {|filter| filter.call(url) }
+        @link_filters.all? {|filter| filter.call(url: url) }
       end
       log.debug("Filtered URLs: #{urls.size}")
       urls
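
The spider now hands its link filters a hash (`url: url`) instead of a bare URL string, so anything placed in `@link_filters` should accept a message-style argument. A minimal sketch of a compatible link filter; the class name and host check are hypothetical, and how filters get into `@link_filters` is not shown in this diff:

```ruby
require "uri"

# Any object responding to #call can act as a link filter for the spider.
class SameHostLinkFilter
  def initialize(host)
    @host = host
  end

  def call(message)
    # message is the hash passed as filter.call(url: url) above.
    URI.parse(message[:url]).host == @host
  rescue URI::InvalidURIError
    false
  end
end
```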
data/lib/daimon_skycrawlers/version.rb CHANGED
@@ -1,3 +1,3 @@
 module DaimonSkycrawlers
-  VERSION = "0.11.3"
+  VERSION = "0.12.0"
 end
File without changes
data/sample/amazon-ranking/app/processor.rb ADDED
@@ -0,0 +1,25 @@
+require "daimon_skycrawlers"
+require "daimon_skycrawlers/processor"
+
+DaimonSkycrawlers.load_processors
+
+DaimonSkycrawlers.register_processor do |data|
+  p "It works with '#{data[:url]}'"
+end
+
+spider = DaimonSkycrawlers::Processor::Spider.new
+spider.configure do |s|
+  s.link_rules = "ul#zg_browseRoot li a"
+  s.link_message = { next_processor: "AmazonRanking" }
+  s.before_process do |message|
+    message[:next_processor] != "AmazonRanking"
+  end
+end
+DaimonSkycrawlers.register_processor(spider)
+
+processor = AmazonRanking.new.configure do |s|
+  s.before_process do |message|
+    message[:next_processor] == "AmazonRanking"
+  end
+end
+DaimonSkycrawlers.register_processor(processor)
data/sample/amazon-ranking/app/processors/amazon_ranking.rb CHANGED
@@ -19,20 +19,3 @@ class AmazonRanking < DaimonSkycrawlers::Processor::Base
     p ranking
   end
 end
-
-spider = DaimonSkycrawlers::Processor::Spider.new
-spider.configure do |s|
-  s.link_rules = "ul#zg_browseRoot li a"
-  s.link_message = { next_processor: "AmazonRanking" }
-  s.before_process do |message|
-    message[:next_processor] != "AmazonRanking"
-  end
-end
-DaimonSkycrawlers.register_processor(spider)
-
-processor = AmazonRanking.new.configure do |s|
-  s.before_process do |message|
-    message[:next_processor] == "AmazonRanking"
-  end
-end
-DaimonSkycrawlers.register_processor(processor)
data/sample/itp-crawler/app/{crawlers/itp_crawler.rb → crawler.rb} CHANGED
@@ -2,6 +2,7 @@ require "daimon_skycrawlers"
 require "daimon_skycrawlers/crawler"
 require "daimon_skycrawlers/crawler/default"
 
+DaimonSkycrawlers.load_crawlers
 
 base_url = "http://itp.ne.jp/"
 crawler = DaimonSkycrawlers::Crawler::Default.new(base_url)
File without changes
data/sample/itp-crawler/app/processor.rb ADDED
@@ -0,0 +1,8 @@
+require "daimon_skycrawlers"
+require "daimon_skycrawlers/processor"
+require "daimon_skycrawlers/processor/base"
+
+DaimonSkycrawlers.load_processors
+
+processor = ItpProcessor.new
+DaimonSkycrawlers.register_processor(processor)
data/sample/itp-crawler/app/processors/itp_processor.rb CHANGED
@@ -90,6 +90,3 @@ class ItpProcessor < DaimonSkycrawlers::Processor::Base
     end
   end
 end
-
-processor = ItpProcessor.new
-DaimonSkycrawlers.register_processor(processor)
data/sample/spider/app/{crawlers/blog_crawler.rb → crawler.rb} CHANGED
@@ -1,5 +1,7 @@
 require "daimon_skycrawlers/crawler/default"
 
+DaimonSkycrawlers.load_crawlers
+
 base_url = "http://www.clear-code.com/blog/"
 
 crawler = DaimonSkycrawlers::Crawler::Default.new(base_url)
File without changes
data/sample/spider/app/{processors/blog_spider.rb → processor.rb} CHANGED
@@ -3,6 +3,8 @@ require "daimon_skycrawlers/filter"
 require "daimon_skycrawlers/filter/duplicate_checker"
 require "daimon_skycrawlers/filter/update_checker"
 
+DaimonSkycrawlers.load_processors
+
 default_processor = DaimonSkycrawlers::Processor::Default.new
 spider = DaimonSkycrawlers::Processor::Spider.new
 #spider.enqueue = false
File without changes
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: daimon_skycrawlers
 version: !ruby/object:Gem::Version
-  version: 0.11.3
+  version: 0.12.0
 platform: ruby
 authors:
 - daimon developers
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-12-08 00:00:00.000000000 Z
+date: 2016-12-20 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: thor
@@ -340,18 +340,19 @@ files:
 - lib/daimon_skycrawlers/filter/robots_txt_checker.rb
 - lib/daimon_skycrawlers/filter/update_checker.rb
 - lib/daimon_skycrawlers/generator/crawler.rb
+- lib/daimon_skycrawlers/generator/filter.rb
 - lib/daimon_skycrawlers/generator/generate.rb
 - lib/daimon_skycrawlers/generator/new.rb
 - lib/daimon_skycrawlers/generator/processor.rb
 - lib/daimon_skycrawlers/generator/templates/crawler.rb.erb
+- lib/daimon_skycrawlers/generator/templates/filter.rb.erb
 - lib/daimon_skycrawlers/generator/templates/new/Dockerfile
 - lib/daimon_skycrawlers/generator/templates/new/Dockerfile.db
 - lib/daimon_skycrawlers/generator/templates/new/Gemfile
 - lib/daimon_skycrawlers/generator/templates/new/README.md.erb
 - lib/daimon_skycrawlers/generator/templates/new/Rakefile
-- lib/daimon_skycrawlers/generator/templates/new/app/crawlers/sample_crawler.rb
-- lib/daimon_skycrawlers/generator/templates/new/app/filters/sample_filter.rb
-- lib/daimon_skycrawlers/generator/templates/new/app/processors/sample_processor.rb
+- lib/daimon_skycrawlers/generator/templates/new/app/crawler.rb
+- lib/daimon_skycrawlers/generator/templates/new/app/processor.rb
 - lib/daimon_skycrawlers/generator/templates/new/config/database.yml.erb
 - lib/daimon_skycrawlers/generator/templates/new/config/init.rb
 - lib/daimon_skycrawlers/generator/templates/new/docker-compose.yml.erb
@@ -385,10 +386,10 @@ files:
 - sample/amazon-ranking/Gemfile.lock
 - sample/amazon-ranking/README.md
 - sample/amazon-ranking/Rakefile
-- sample/amazon-ranking/app/crawlers/sample_crawler.rb
-- sample/amazon-ranking/app/filters/sample_filter.rb
+- sample/amazon-ranking/app/crawler.rb
+- sample/amazon-ranking/app/crawlers/.gitkeep
+- sample/amazon-ranking/app/processor.rb
 - sample/amazon-ranking/app/processors/amazon_ranking.rb
-- sample/amazon-ranking/app/processors/sample_processor.rb
 - sample/amazon-ranking/config/database.yml
 - sample/amazon-ranking/config/init.rb
 - sample/amazon-ranking/db/migrate/20161206061241_create_pages.rb
@@ -400,9 +401,11 @@ files:
 - sample/itp-crawler/Gemfile.lock
 - sample/itp-crawler/README.md
 - sample/itp-crawler/Rakefile
-- sample/itp-crawler/app/crawlers/itp_crawler.rb
+- sample/itp-crawler/app/crawler.rb
+- sample/itp-crawler/app/crawlers/.gitkeep
 - sample/itp-crawler/app/models/itp_base.rb
 - sample/itp-crawler/app/models/itp_shop.rb
+- sample/itp-crawler/app/processor.rb
 - sample/itp-crawler/app/processors/itp_processor.rb
 - sample/itp-crawler/config/database.yml
 - sample/itp-crawler/config/database_itp.yml
@@ -416,8 +419,10 @@ files:
 - sample/spider/Gemfile
 - sample/spider/README.md
 - sample/spider/Rakefile
-- sample/spider/app/crawlers/blog_crawler.rb
-- sample/spider/app/processors/blog_spider.rb
+- sample/spider/app/crawler.rb
+- sample/spider/app/crawlers/.gitkeep
+- sample/spider/app/processor.rb
+- sample/spider/app/processors/.gitkeep
 - sample/spider/config/database.yml
 - sample/spider/config/init.rb
 - sample/spider/db/migrate/20160830155803_create_pages.rb
data/lib/daimon_skycrawlers/generator/templates/new/app/filters/sample_filter.rb DELETED
@@ -1,9 +0,0 @@
-require "daimon_skycrawlers/filter/base"
-
-class SampleFilter < DaimonSkycrawlers::Filter::Base
-  def call(message)
-    # Imprement your filter here.
-    # If you want to crawl `url`, return true otherwise false.
-    true
-  end
-end
data/sample/amazon-ranking/app/processors/sample_processor.rb DELETED
@@ -1,5 +0,0 @@
-require "daimon_skycrawlers/processor"
-
-DaimonSkycrawlers.register_processor do |data|
-  p "It works with '#{data[:url]}'"
-end