daimon_skycrawlers 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. checksums.yaml +4 -4
  2. data/.travis.yml +2 -2
  3. data/Gemfile +1 -1
  4. data/README.md +30 -12
  5. data/Rakefile +3 -8
  6. data/bin/daimon-skycrawlers +3 -3
  7. data/daimon_skycrawlers.gemspec +4 -3
  8. data/lib/daimon_skycrawlers/cli.rb +3 -3
  9. data/lib/daimon_skycrawlers/config.rb +8 -0
  10. data/lib/daimon_skycrawlers/consumer/base.rb +16 -0
  11. data/lib/daimon_skycrawlers/consumer/http_response.rb +47 -0
  12. data/lib/daimon_skycrawlers/consumer/url.rb +44 -0
  13. data/lib/daimon_skycrawlers/consumer.rb +4 -0
  14. data/lib/daimon_skycrawlers/crawler/base.rb +75 -0
  15. data/lib/daimon_skycrawlers/crawler/default.rb +33 -0
  16. data/lib/daimon_skycrawlers/crawler.rb +18 -76
  17. data/lib/daimon_skycrawlers/filter/base.rb +24 -0
  18. data/lib/daimon_skycrawlers/filter/duplicate_checker.rb +23 -0
  19. data/lib/daimon_skycrawlers/filter/update_checker.rb +31 -0
  20. data/lib/daimon_skycrawlers/filter.rb +4 -0
  21. data/lib/daimon_skycrawlers/generator/new.rb +3 -2
  22. data/lib/daimon_skycrawlers/generator/templates/new/Gemfile +3 -3
  23. data/lib/daimon_skycrawlers/generator/templates/new/README.md.erb +22 -6
  24. data/lib/daimon_skycrawlers/generator/templates/new/crawler.rb +5 -5
  25. data/lib/daimon_skycrawlers/generator/templates/new/db/migrate/create_pages.rb +6 -6
  26. data/lib/daimon_skycrawlers/generator/templates/new/enqueue.rb +17 -8
  27. data/lib/daimon_skycrawlers/generator/templates/new/init.rb +20 -0
  28. data/lib/daimon_skycrawlers/generator/templates/new/processor.rb +2 -0
  29. data/lib/daimon_skycrawlers/logger.rb +32 -0
  30. data/lib/daimon_skycrawlers/processor/base.rb +19 -0
  31. data/lib/daimon_skycrawlers/processor/default.rb +12 -9
  32. data/lib/daimon_skycrawlers/processor/spider.rb +77 -0
  33. data/lib/daimon_skycrawlers/processor.rb +23 -9
  34. data/lib/daimon_skycrawlers/queue.rb +24 -0
  35. data/lib/daimon_skycrawlers/storage/base.rb +6 -0
  36. data/lib/daimon_skycrawlers/timer.rb +24 -0
  37. data/lib/daimon_skycrawlers/version.rb +1 -1
  38. data/lib/daimon_skycrawlers.rb +24 -4
  39. data/sample/spider/Gemfile +4 -0
  40. data/sample/spider/README.md +50 -0
  41. data/sample/spider/Rakefile +1 -0
  42. data/sample/spider/config/database.yml +26 -0
  43. data/sample/spider/crawler.rb +14 -0
  44. data/sample/spider/db/migrate/20160830155803_create_pages.rb +13 -0
  45. data/sample/spider/db/schema.rb +28 -0
  46. data/sample/spider/enqueue.rb +24 -0
  47. data/sample/spider/init.rb +22 -0
  48. data/sample/spider/processor.rb +34 -0
  49. metadata +47 -12
  50. data/lib/daimon_skycrawlers/configure_songkick_queue.rb +0 -12
  51. data/lib/daimon_skycrawlers/http_response_consumer.rb +0 -38
  52. data/lib/daimon_skycrawlers/parser/base.rb +0 -13
  53. data/lib/daimon_skycrawlers/parser/default.rb +0 -50
  54. data/lib/daimon_skycrawlers/parser.rb +0 -7
  55. data/lib/daimon_skycrawlers/url_consumer.rb +0 -32
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: e9ebc710f18b92107a91a3bb5a4c8972051ede8f
4
- data.tar.gz: 61db84bcdc73557ae9b28cc7894c04a1055dda6d
3
+ metadata.gz: f03168cc7d465dd69da00eabc00c5bd1d4654455
4
+ data.tar.gz: e55f902ab4f78340ee5df80499f581cefb2bad96
5
5
  SHA512:
6
- metadata.gz: 76d7b629eefc04a89d5cdce8939a9e83f35c68a9b8d1be3599e3c7d59b815f506eb1c295d1e3095f18ca7ec12169fa5e365cda981561804bbb4d2d070eef3051
7
- data.tar.gz: 04b326f7a8531d364c3d41d12f99328831b2eda1acefa72869f588750e065abc0a044fd6d87b92f20a6359983dba8b00c42a4ab40d75979f92c6a553ec16f9e9
6
+ metadata.gz: 0bd1e1e832766d27779e91bba3ae70c70d44863373cd5172d2c67e40ad6ded2dd1329a68f9652aa58fd00bac7e38fd75e3c43092fbeaec4bca42fada779b2ffd
7
+ data.tar.gz: 99dc774a495bfec4e8b0b0693333340fc57be9321ef9fb5d9b2d64f61856ab6defa10c1e8292dc13d90bcca4a69e80e5f2a4aeec033c7c1a345e64d2c965eaec
data/.travis.yml CHANGED
@@ -2,8 +2,8 @@ language: ruby
2
2
  sudo: false
3
3
  cache: bundler
4
4
  rvm:
5
- - 2.2.4
6
- - 2.3.0
5
+ - 2.2.5
6
+ - 2.3.1
7
7
 
8
8
  before_install: gem install bundler -v 1.11.2
9
9
 
data/Gemfile CHANGED
@@ -1,3 +1,3 @@
1
- source 'https://rubygems.org'
1
+ source "https://rubygems.org"
2
2
 
3
3
  gemspec
data/README.md CHANGED
@@ -1,8 +1,8 @@
1
- # DaimonSkycrawlers
1
+ ## Caution!! This product is NOT production-ready.
2
2
 
3
- Welcome to your new gem! In this directory, you'll find the files you need to be able to package up your Ruby library into a gem. Put your Ruby code in the file `lib/daimon_skycrawlers`. To experiment with that code, run `bin/console` for an interactive prompt.
3
+ # DaimonSkycrawlers
4
4
 
5
- TODO: Delete this and the text above, and describe your gem
5
+ DaimonSkyCrawlers is a crawler framework.
6
6
 
7
7
  ## Requirements
8
8
 
@@ -33,32 +33,50 @@ Or install it yourself as:
33
33
 
34
34
  1. Create project
35
35
 
36
- $ bundle exec daimon-skycrawlers new mycrawlers
37
- $ cd mycrawlers
36
+ ```
37
+ $ bundle exec daimon-skycrawlers new mycrawlers
38
+ $ cd mycrawlers
39
+ ```
38
40
 
39
41
  2. Install dependencies
40
42
 
41
- $ bundle install
43
+ ```
44
+ $ bundle install
45
+ ```
42
46
 
43
47
  3. Create database
44
48
 
45
- $ bundle exec rake db:create
46
- $ bundle exec rake db:migrate
49
+ ```
50
+ $ bundle exec rake db:create
51
+ $ bundle exec rake db:migrate
52
+ ```
47
53
 
48
54
  4. Open new terminal and run crawler/processor
49
55
 
50
- $ bundle exec ruby crawler.rb # on new terminal
51
- $ bundle exec ruby processor.rb # on new terminal
56
+ ```
57
+ $ bundle exec ruby crawler.rb # on new terminal
58
+ $ bundle exec ruby processor.rb # on new terminal
59
+ ```
52
60
 
53
61
  5. Enqueue task
54
62
 
55
- $ bundle exec ruby enqueue.rb http://example.com/
63
+ ```
64
+ $ bundle exec ruby enqueue.rb url http://example.com/
65
+ ```
56
66
 
57
67
  6. You'll see `It works with 'http://example.com'` on your terminal which runs your processor!
58
68
 
69
+ 7. You can re-enqueue task for processor
70
+
71
+ ```
72
+ $ bundle exec ruby enqueue.rb response http://example.com/
73
+ ```
74
+
75
+ Display `It works with 'http://example.com'` again on your terminal which runs your processor.
76
+
59
77
  ## Development
60
78
 
61
- After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake test` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
79
+ After checking out the repo, run `bundle install` to install dependencies. Then, run `bundle exec rake test` to run the tests. You can also run `bundle console` for an interactive prompt that will allow you to experiment.
62
80
 
63
81
  To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
64
82
 
data/Rakefile CHANGED
@@ -1,17 +1,12 @@
1
1
  require "bundler/setup"
2
2
  require "bundler/gem_tasks"
3
3
  require "rake/testtask"
4
+ require "daimon_skycrawlers/tasks"
4
5
 
5
6
  Rake::TestTask.new(:test) do |t|
6
7
  t.libs << "test"
7
8
  t.libs << "lib"
8
- t.test_files = FileList['test/**/*_test.rb']
9
+ t.test_files = FileList["test/**/*_test.rb"]
9
10
  end
10
11
 
11
- require "cucumber/rake/task"
12
-
13
- Cucumber::Rake::Task.new(:features) do |t|
14
- t.cucumber_opts = "features --format pretty"
15
- end
16
-
17
- task :default => [:test, :features]
12
+ task :default => [:test]
@@ -1,9 +1,9 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
- if File.exist?(File.expand_path('../.git', __dir__))
4
- $LOAD_PATH << File.expand_path('../lib', __dir__)
3
+ if File.exist?(File.expand_path("../.git", __dir__))
4
+ $LOAD_PATH << File.expand_path("../lib", __dir__)
5
5
  end
6
6
 
7
- require 'daimon_skycrawlers/cli'
7
+ require "daimon_skycrawlers/cli"
8
8
 
9
9
  DaimonSkycrawlers::CLI.start
@@ -1,7 +1,7 @@
1
1
  # coding: utf-8
2
- lib = File.expand_path('../lib', __FILE__)
2
+ lib = File.expand_path("../lib", __FILE__)
3
3
  $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
- require 'daimon_skycrawlers/version'
4
+ require "daimon_skycrawlers/version"
5
5
 
6
6
  Gem::Specification.new do |spec|
7
7
  spec.name = "daimon_skycrawlers"
@@ -25,13 +25,14 @@ Gem::Specification.new do |spec|
25
25
  spec.add_dependency "nokogiri"
26
26
  spec.add_dependency "activerecord"
27
27
  spec.add_dependency "pg"
28
+ spec.add_dependency "timers"
28
29
 
29
30
  spec.add_development_dependency "bundler", "~> 1.11"
30
31
  spec.add_development_dependency "rake", "~> 10.0"
31
32
  spec.add_development_dependency "test-unit"
32
33
  spec.add_development_dependency "test-unit-rr"
33
34
  spec.add_development_dependency "test-unit-notify"
34
- spec.add_development_dependency "cucumber"
35
35
  spec.add_development_dependency "pry"
36
36
  spec.add_development_dependency "tapp"
37
+ spec.add_development_dependency "sqlite3"
37
38
  end
@@ -1,6 +1,6 @@
1
- require 'thor'
2
- require 'daimon_skycrawlers/generator/new'
3
- require 'daimon_skycrawlers/version'
1
+ require "thor"
2
+ require "daimon_skycrawlers/generator/new"
3
+ require "daimon_skycrawlers/version"
4
4
 
5
5
  module DaimonSkycrawlers
6
6
  class CLI < Thor
@@ -0,0 +1,8 @@
1
+ module DaimonSkycrawlers
2
+ module ConfigMixin
3
+ def initialize
4
+ super
5
+ @log = DaimonSkycrawlers.configuration.logger
6
+ end
7
+ end
8
+ end
@@ -0,0 +1,16 @@
1
+ require "songkick_queue"
2
+ require "daimon_skycrawlers/logger"
3
+ require "daimon_skycrawlers/config"
4
+
5
+ module DaimonSkycrawlers
6
+ module Consumer
7
+ class Base
8
+ include DaimonSkycrawlers::LoggerMixin
9
+ include DaimonSkycrawlers::ConfigMixin
10
+
11
+ def process(message)
12
+ raise NotImplementedError, "Must implement in subclass"
13
+ end
14
+ end
15
+ end
16
+ end
@@ -0,0 +1,47 @@
1
+ require "songkick_queue"
2
+ require "daimon_skycrawlers"
3
+ require "daimon_skycrawlers/consumer/base"
4
+ require "daimon_skycrawlers/processor/default"
5
+
6
+ module DaimonSkycrawlers
7
+ module Consumer
8
+ class HTTPResponse < Base
9
+ include SongkickQueue::Consumer
10
+
11
+ class << self
12
+ def register(processor = nil, &block)
13
+ if block_given?
14
+ processors << block
15
+ else
16
+ processors << processor
17
+ end
18
+ end
19
+
20
+ def processors
21
+ @processors ||= []
22
+ end
23
+
24
+ def default_processor
25
+ DaimonSkycrawlers::Processor::Default.new
26
+ end
27
+
28
+ def queue_name
29
+ "#{DaimonSkycrawlers.configuration.queue_name_prefix}.http-response"
30
+ end
31
+ end
32
+
33
+ consume_from_queue queue_name
34
+
35
+ def process(message)
36
+ if self.class.processors.empty?
37
+ processors = [self.class.default_processor]
38
+ else
39
+ processors = self.class.processors
40
+ end
41
+ processors.each do |processor|
42
+ processor.call(message)
43
+ end
44
+ end
45
+ end
46
+ end
47
+ end
@@ -0,0 +1,44 @@
1
+ require "songkick_queue"
2
+ require "daimon_skycrawlers"
3
+ require "daimon_skycrawlers/consumer/base"
4
+
5
+ module DaimonSkycrawlers
6
+ module Consumer
7
+ class URL < Base
8
+ include SongkickQueue::Consumer
9
+
10
+ class << self
11
+ def register(crawler)
12
+ crawlers << crawler
13
+ end
14
+
15
+ def crawlers
16
+ @crawlers ||= []
17
+ end
18
+
19
+ def queue_name
20
+ "#{DaimonSkycrawlers.configuration.queue_name_prefix}.url"
21
+ end
22
+ end
23
+
24
+ consume_from_queue queue_name
25
+
26
+ def process(message)
27
+ url = message[:url]
28
+ depth = Integer(message[:depth] || 0)
29
+
30
+ crawler_interval = DaimonSkycrawlers.configuration.crawler_interval
31
+
32
+ # XXX When several crawlers are registered, how should they behave?
33
+ self.class.crawlers.each do |crawler|
34
+ crawler.fetch(url, depth: depth)
35
+ if crawler.skipped?
36
+ sleep(crawler_interval) if crawler.n_processed_urls % 50 == 0
37
+ else
38
+ sleep(crawler_interval)
39
+ end
40
+ end
41
+ end
42
+ end
43
+ end
44
+ end
@@ -0,0 +1,4 @@
1
+ module DaimonSkycrawlers
2
+ module Consumer
3
+ end
4
+ end
@@ -0,0 +1,75 @@
1
+ require "uri"
2
+ require "faraday"
3
+
4
+ require "daimon_skycrawlers/logger"
5
+ require "daimon_skycrawlers/config"
6
+ require "daimon_skycrawlers/storage"
7
+ require "daimon_skycrawlers/processor"
8
+
9
+ module DaimonSkycrawlers
10
+ module Crawler
11
+ class Base
12
+ include DaimonSkycrawlers::LoggerMixin
13
+ include DaimonSkycrawlers::ConfigMixin
14
+
15
+ attr_writer :storage
16
+
17
+ def initialize(base_url = nil, options = {})
18
+ super()
19
+ @base_url = base_url
20
+ @options = options
21
+ @prepare = ->(connection) {}
22
+ @skipped = false
23
+ @n_processed_urls = 0
24
+ end
25
+
26
+ def setup_connection(options = {})
27
+ @connection = Faraday.new(@base_url, @options.merge(options)) do |faraday|
28
+ yield faraday
29
+ end
30
+ end
31
+
32
+ #
33
+ # Call this method before DaimonSkycrawlers.register_crawler
34
+ # For example, you can login before fetch URL
35
+ #
36
+ def prepare(&block)
37
+ @prepare = block
38
+ end
39
+
40
+ def storage
41
+ @storage ||= Storage::RDB.new
42
+ end
43
+
44
+ def skipped?
45
+ @skipped
46
+ end
47
+
48
+ def connection
49
+ @connection ||= Faraday.new(@base_url, @options)
50
+ end
51
+
52
+ def fetch(path, params = {}, **kw)
53
+ raise NotImplementedError, "Must implement this method in subclass"
54
+ end
55
+
56
+ def get(path, params = {})
57
+ @connection.get(path, params)
58
+ end
59
+
60
+ def post(path, params = {})
61
+ @connection.post(path, params)
62
+ end
63
+
64
+ def n_processed_urls
65
+ @n_processed_urls
66
+ end
67
+
68
+ private
69
+
70
+ def schedule_to_process(url, message = {})
71
+ DaimonSkycrawlers::Processor.enqueue_http_response(url, message)
72
+ end
73
+ end
74
+ end
75
+ end
@@ -0,0 +1,33 @@
1
+ require "daimon_skycrawlers/crawler/base"
2
+ require "daimon_skycrawlers/filter/update_checker"
3
+
4
+ module DaimonSkycrawlers
5
+ module Crawler
6
+ class Default < Base
7
+ def fetch(path, depth: 3, **kw)
8
+ @n_processed_urls += 1
9
+ @skipped = false
10
+ url = connection.url_prefix + path
11
+ update_checker = DaimonSkycrawlers::Filter::UpdateChecker.new(storage: storage)
12
+ unless update_checker.call(url.to_s, connection: connection)
13
+ log.info("Skip #{url}")
14
+ @skipped = true
15
+ schedule_to_process(url.to_s, heartbeat: true)
16
+ return
17
+ end
18
+ @prepare.call(connection)
19
+ response = get(path)
20
+ data = [url.to_s, response.headers, response.body]
21
+
22
+ yield(*data) if block_given?
23
+
24
+ storage.save(*data)
25
+ message = {
26
+ depth: depth
27
+ }
28
+ message = message.merge(kw)
29
+ schedule_to_process(url.to_s, message)
30
+ end
31
+ end
32
+ end
33
+ end
@@ -1,89 +1,31 @@
1
- require 'uri'
2
-
3
- require 'daimon_skycrawlers'
4
- require 'daimon_skycrawlers/version'
5
- require 'daimon_skycrawlers/configure_songkick_queue'
6
- require 'daimon_skycrawlers/url_consumer'
7
- require 'daimon_skycrawlers/storage'
8
- require 'daimon_skycrawlers/parser'
9
-
10
- require 'faraday'
1
+ require "daimon_skycrawlers"
2
+ require "daimon_skycrawlers/queue"
3
+ require "daimon_skycrawlers/timer"
4
+ require "daimon_skycrawlers/consumer/url"
11
5
 
12
6
  module DaimonSkycrawlers
13
- class Crawler
7
+ module Crawler
14
8
  class << self
15
- def run(process_name: 'daimon-skycrawler:url')
16
- SongkickQueue::Worker.new(process_name, [URLConsumer]).run
9
+ def run(process_name: default_process_name)
10
+ DaimonSkycrawlers::Timer.setup_shutdown_timer(config.queue_name_prefix, interval: config.shutdown_interval)
11
+ SongkickQueue::Worker.new(process_name, [DaimonSkycrawlers::Consumer::URL]).run
17
12
  end
18
13
 
19
- def enqueue_url(url, depth: 3, interval: 1)
20
- SongkickQueue.publish('daimon-skycrawler.url', url: url, depth: depth, interval: interval)
14
+ def enqueue_url(url, message = {})
15
+ message[:url] = url
16
+ SongkickQueue.publish(queue_name, message)
21
17
  end
22
- end
23
-
24
- attr_writer :storage
25
- attr_writer :parser
26
18
 
27
- def initialize(base_url, options = {})
28
- @base_url = base_url
29
- @options = options
30
- end
31
-
32
- def setup_connection(options = {})
33
- @connection = Faraday.new(@base_url, options) do |faraday|
34
- yield faraday
19
+ def config
20
+ DaimonSkycrawlers.configuration
35
21
  end
36
- end
37
-
38
- def storage
39
- @storage ||= Storage::RDB.new
40
- end
41
-
42
- def parser
43
- @parser ||= Parser::Default.new
44
- end
45
-
46
- # TODO Support POST when we need
47
- # TODO `params` should be a part of `path`. such as `path == "/hoi?hi=yoyo"`.
48
- def fetch(path, params = {}, depth: 3)
49
- @connection ||= Faraday.new(@base_url)
50
- response = get(path)
51
-
52
- url = @connection.url_prefix + path
53
-
54
- data = [url.to_s, response.headers, response.body]
55
22
 
56
- yield(*data) if block_given?
57
-
58
- storage.save(*data)
59
-
60
- schedule_to_process(url.to_s)
61
-
62
- parser.parse(response.body)
63
- urls = parser.links
64
-
65
- enqueue_next_urls(urls, depth: depth - 1, interval: 1)
66
- end
67
-
68
- def get(path, params = {})
69
- @connection.get(path, params)
70
- end
71
-
72
- def post(path, params = {})
73
- @connection.post(path, params)
74
- end
75
-
76
- private
77
-
78
- def schedule_to_process(url)
79
- DaimonSkycrawlers::Processor.enqueue_http_response(url)
80
- end
81
-
82
- def enqueue_next_urls(urls, depth: 3, interval: 1)
83
- return if depth <= 0
23
+ def queue_name
24
+ "#{config.queue_name_prefix}.url"
25
+ end
84
26
 
85
- urls.each do |url|
86
- self.class.enqueue_url(url, depth: depth, interval: interval)
27
+ def default_process_name
28
+ "#{config.queue_name_prefix}:url"
87
29
  end
88
30
  end
89
31
  end
@@ -0,0 +1,24 @@
1
+ require "daimon_skycrawlers/logger"
2
+ require "daimon_skycrawlers/config"
3
+
4
+ module DaimonSkycrawlers
5
+ module Filter
6
+ class Base
7
+ include DaimonSkycrawlers::LoggerMixin
8
+ include DaimonSkycrawlers::ConfigMixin
9
+
10
+ def initialize(storage: nil)
11
+ super()
12
+ @storage = storage
13
+ end
14
+
15
+ def storage
16
+ @storage ||= DaimonSkycrawlers::Storage::RDB.new
17
+ end
18
+
19
+ def call(url)
20
+ raise NotImplementedError, "Must implement this method in subclass"
21
+ end
22
+ end
23
+ end
24
+ end
@@ -0,0 +1,23 @@
1
+ require "set"
2
+ require "daimon_skycrawlers/filter/base"
3
+
4
+ module DaimonSkycrawlers
5
+ module Filter
6
+ class DuplicateChecker < Base
7
+ def initialize(base_url: nil)
8
+ @base_url = nil
9
+ @base_url = URI(base_url) if base_url
10
+ @urls = Set.new
11
+ end
12
+
13
+ def call(url)
14
+ unless URI(url).absolute?
15
+ url = (@base_url + url).to_s
16
+ end
17
+ return false if @urls.include?(url)
18
+ @urls << url
19
+ true
20
+ end
21
+ end
22
+ end
23
+ end
@@ -0,0 +1,31 @@
1
+ require "faraday"
2
+ require "daimon_skycrawlers/filter/base"
3
+
4
+ module DaimonSkycrawlers
5
+ module Filter
6
+ class UpdateChecker < Base
7
+ def initialize(storage: nil, base_url: nil)
8
+ super(storage: storage)
9
+ @base_url = nil
10
+ @base_url = URI(base_url) if base_url
11
+ end
12
+
13
+ def call(url, connection: nil)
14
+ unless URI(url).absolute?
15
+ url = (@base_url + url).to_s
16
+ end
17
+ page = storage.find(url)
18
+ return true unless page
19
+ if connection
20
+ headers = connection.head(url)
21
+ else
22
+ headers = Faraday.head(url)
23
+ end
24
+ return false if headers["etag"] && page.etag && headers["etag"] == page.etag
25
+ return false if headers["last-modified"].nil? && page.last_modified_at.nil?
26
+ return false if headers["last-modified"] <= page.last_modified_at
27
+ true
28
+ end
29
+ end
30
+ end
31
+ end
@@ -0,0 +1,4 @@
1
+ module DaimonSkycrawlers
2
+ module Filter
3
+ end
4
+ end
@@ -1,4 +1,4 @@
1
- require 'thor'
1
+ require "thor"
2
2
 
3
3
  module DaimonSkycrawlers
4
4
  module Generator
@@ -8,7 +8,7 @@ module DaimonSkycrawlers
8
8
  argument :name
9
9
 
10
10
  def self.source_root
11
- File.join(File.dirname(__FILE__), "templates", "new")
11
+ File.join(__dir__, "templates", "new")
12
12
  end
13
13
 
14
14
  def create_files
@@ -26,6 +26,7 @@ module DaimonSkycrawlers
26
26
  "Rakefile",
27
27
  "crawler.rb",
28
28
  "enqueue.rb",
29
+ "init.rb",
29
30
  "processor.rb",
30
31
  ].each do |path|
31
32
  copy_file(path, "#{name}/#{path}")
@@ -1,4 +1,4 @@
1
- source 'https://rubygems.org'
1
+ source "https://rubygems.org"
2
2
 
3
- gem 'rake'
4
- gem 'daimon_skycrawlers'
3
+ gem "rake"
4
+ gem "daimon_skycrawlers"