daimon_skycrawlers 0.1.0 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (55)
  1. checksums.yaml +4 -4
  2. data/.travis.yml +2 -2
  3. data/Gemfile +1 -1
  4. data/README.md +30 -12
  5. data/Rakefile +3 -8
  6. data/bin/daimon-skycrawlers +3 -3
  7. data/daimon_skycrawlers.gemspec +4 -3
  8. data/lib/daimon_skycrawlers/cli.rb +3 -3
  9. data/lib/daimon_skycrawlers/config.rb +8 -0
  10. data/lib/daimon_skycrawlers/consumer/base.rb +16 -0
  11. data/lib/daimon_skycrawlers/consumer/http_response.rb +47 -0
  12. data/lib/daimon_skycrawlers/consumer/url.rb +44 -0
  13. data/lib/daimon_skycrawlers/consumer.rb +4 -0
  14. data/lib/daimon_skycrawlers/crawler/base.rb +75 -0
  15. data/lib/daimon_skycrawlers/crawler/default.rb +33 -0
  16. data/lib/daimon_skycrawlers/crawler.rb +18 -76
  17. data/lib/daimon_skycrawlers/filter/base.rb +24 -0
  18. data/lib/daimon_skycrawlers/filter/duplicate_checker.rb +23 -0
  19. data/lib/daimon_skycrawlers/filter/update_checker.rb +31 -0
  20. data/lib/daimon_skycrawlers/filter.rb +4 -0
  21. data/lib/daimon_skycrawlers/generator/new.rb +3 -2
  22. data/lib/daimon_skycrawlers/generator/templates/new/Gemfile +3 -3
  23. data/lib/daimon_skycrawlers/generator/templates/new/README.md.erb +22 -6
  24. data/lib/daimon_skycrawlers/generator/templates/new/crawler.rb +5 -5
  25. data/lib/daimon_skycrawlers/generator/templates/new/db/migrate/create_pages.rb +6 -6
  26. data/lib/daimon_skycrawlers/generator/templates/new/enqueue.rb +17 -8
  27. data/lib/daimon_skycrawlers/generator/templates/new/init.rb +20 -0
  28. data/lib/daimon_skycrawlers/generator/templates/new/processor.rb +2 -0
  29. data/lib/daimon_skycrawlers/logger.rb +32 -0
  30. data/lib/daimon_skycrawlers/processor/base.rb +19 -0
  31. data/lib/daimon_skycrawlers/processor/default.rb +12 -9
  32. data/lib/daimon_skycrawlers/processor/spider.rb +77 -0
  33. data/lib/daimon_skycrawlers/processor.rb +23 -9
  34. data/lib/daimon_skycrawlers/queue.rb +24 -0
  35. data/lib/daimon_skycrawlers/storage/base.rb +6 -0
  36. data/lib/daimon_skycrawlers/timer.rb +24 -0
  37. data/lib/daimon_skycrawlers/version.rb +1 -1
  38. data/lib/daimon_skycrawlers.rb +24 -4
  39. data/sample/spider/Gemfile +4 -0
  40. data/sample/spider/README.md +50 -0
  41. data/sample/spider/Rakefile +1 -0
  42. data/sample/spider/config/database.yml +26 -0
  43. data/sample/spider/crawler.rb +14 -0
  44. data/sample/spider/db/migrate/20160830155803_create_pages.rb +13 -0
  45. data/sample/spider/db/schema.rb +28 -0
  46. data/sample/spider/enqueue.rb +24 -0
  47. data/sample/spider/init.rb +22 -0
  48. data/sample/spider/processor.rb +34 -0
  49. metadata +47 -12
  50. data/lib/daimon_skycrawlers/configure_songkick_queue.rb +0 -12
  51. data/lib/daimon_skycrawlers/http_response_consumer.rb +0 -38
  52. data/lib/daimon_skycrawlers/parser/base.rb +0 -13
  53. data/lib/daimon_skycrawlers/parser/default.rb +0 -50
  54. data/lib/daimon_skycrawlers/parser.rb +0 -7
  55. data/lib/daimon_skycrawlers/url_consumer.rb +0 -32
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: e9ebc710f18b92107a91a3bb5a4c8972051ede8f
4
- data.tar.gz: 61db84bcdc73557ae9b28cc7894c04a1055dda6d
3
+ metadata.gz: f03168cc7d465dd69da00eabc00c5bd1d4654455
4
+ data.tar.gz: e55f902ab4f78340ee5df80499f581cefb2bad96
5
5
  SHA512:
6
- metadata.gz: 76d7b629eefc04a89d5cdce8939a9e83f35c68a9b8d1be3599e3c7d59b815f506eb1c295d1e3095f18ca7ec12169fa5e365cda981561804bbb4d2d070eef3051
7
- data.tar.gz: 04b326f7a8531d364c3d41d12f99328831b2eda1acefa72869f588750e065abc0a044fd6d87b92f20a6359983dba8b00c42a4ab40d75979f92c6a553ec16f9e9
6
+ metadata.gz: 0bd1e1e832766d27779e91bba3ae70c70d44863373cd5172d2c67e40ad6ded2dd1329a68f9652aa58fd00bac7e38fd75e3c43092fbeaec4bca42fada779b2ffd
7
+ data.tar.gz: 99dc774a495bfec4e8b0b0693333340fc57be9321ef9fb5d9b2d64f61856ab6defa10c1e8292dc13d90bcca4a69e80e5f2a4aeec033c7c1a345e64d2c965eaec
data/.travis.yml CHANGED
@@ -2,8 +2,8 @@ language: ruby
2
2
  sudo: false
3
3
  cache: bundler
4
4
  rvm:
5
- - 2.2.4
6
- - 2.3.0
5
+ - 2.2.5
6
+ - 2.3.1
7
7
 
8
8
  before_install: gem install bundler -v 1.11.2
9
9
 
data/Gemfile CHANGED
@@ -1,3 +1,3 @@
1
- source 'https://rubygems.org'
1
+ source "https://rubygems.org"
2
2
 
3
3
  gemspec
data/README.md CHANGED
@@ -1,8 +1,8 @@
1
- # DaimonSkycrawlers
1
+ ## Caution!! This product is NOT production-ready.
2
2
 
3
- Welcome to your new gem! In this directory, you'll find the files you need to be able to package up your Ruby library into a gem. Put your Ruby code in the file `lib/daimon_skycrawlers`. To experiment with that code, run `bin/console` for an interactive prompt.
3
+ # DaimonSkycrawlers
4
4
 
5
- TODO: Delete this and the text above, and describe your gem
5
+ DaimonSkycrawlers is a crawler framework.
6
6
 
7
7
  ## Requirements
8
8
 
@@ -33,32 +33,50 @@ Or install it yourself as:
33
33
 
34
34
  1. Create project
35
35
 
36
- $ bundle exec daimon-skycrawlers new mycrawlers
37
- $ cd mycrawlers
36
+ ```
37
+ $ bundle exec daimon-skycrawlers new mycrawlers
38
+ $ cd mycrawlers
39
+ ```
38
40
 
39
41
  2. Install dependencies
40
42
 
41
- $ bundle install
43
+ ```
44
+ $ bundle install
45
+ ```
42
46
 
43
47
  3. Create database
44
48
 
45
- $ bundle exec rake db:create
46
- $ bundle exec rake db:migrate
49
+ ```
50
+ $ bundle exec rake db:create
51
+ $ bundle exec rake db:migrate
52
+ ```
47
53
 
48
54
  4. Open new terminal and run crawler/processor
49
55
 
50
- $ bundle exec ruby crawler.rb # on new terminal
51
- $ bundle exec ruby processor.rb # on new terminal
56
+ ```
57
+ $ bundle exec ruby crawler.rb # on new terminal
58
+ $ bundle exec ruby processor.rb # on new terminal
59
+ ```
52
60
 
53
61
  5. Enqueue task
54
62
 
55
- $ bundle exec ruby enqueue.rb http://example.com/
63
+ ```
64
+ $ bundle exec ruby enqueue.rb url http://example.com/
65
+ ```
56
66
 
57
67
  6. You'll see `It works with 'http://example.com'` on your terminal which runs your processor!
58
68
 
69
+ 7. You can re-enqueue a task for the processor
70
+
71
+ ```
72
+ $ bundle exec ruby enqueue.rb response http://example.com/
73
+ ```
74
+
75
+ You'll see `It works with 'http://example.com'` again on the terminal that runs your processor.
76
+
59
77
  ## Development
60
78
 
61
- After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake test` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
79
+ After checking out the repo, run `bundle install` to install dependencies. Then, run `bundle exec rake test` to run the tests. You can also run `bundle console` for an interactive prompt that will allow you to experiment.
62
80
 
63
81
  To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
64
82
 
data/Rakefile CHANGED
@@ -1,17 +1,12 @@
1
1
  require "bundler/setup"
2
2
  require "bundler/gem_tasks"
3
3
  require "rake/testtask"
4
+ require "daimon_skycrawlers/tasks"
4
5
 
5
6
  Rake::TestTask.new(:test) do |t|
6
7
  t.libs << "test"
7
8
  t.libs << "lib"
8
- t.test_files = FileList['test/**/*_test.rb']
9
+ t.test_files = FileList["test/**/*_test.rb"]
9
10
  end
10
11
 
11
- require "cucumber/rake/task"
12
-
13
- Cucumber::Rake::Task.new(:features) do |t|
14
- t.cucumber_opts = "features --format pretty"
15
- end
16
-
17
- task :default => [:test, :features]
12
+ task :default => [:test]
@@ -1,9 +1,9 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
- if File.exist?(File.expand_path('../.git', __dir__))
4
- $LOAD_PATH << File.expand_path('../lib', __dir__)
3
+ if File.exist?(File.expand_path("../.git", __dir__))
4
+ $LOAD_PATH << File.expand_path("../lib", __dir__)
5
5
  end
6
6
 
7
- require 'daimon_skycrawlers/cli'
7
+ require "daimon_skycrawlers/cli"
8
8
 
9
9
  DaimonSkycrawlers::CLI.start
@@ -1,7 +1,7 @@
1
1
  # coding: utf-8
2
- lib = File.expand_path('../lib', __FILE__)
2
+ lib = File.expand_path("../lib", __FILE__)
3
3
  $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
- require 'daimon_skycrawlers/version'
4
+ require "daimon_skycrawlers/version"
5
5
 
6
6
  Gem::Specification.new do |spec|
7
7
  spec.name = "daimon_skycrawlers"
@@ -25,13 +25,14 @@ Gem::Specification.new do |spec|
25
25
  spec.add_dependency "nokogiri"
26
26
  spec.add_dependency "activerecord"
27
27
  spec.add_dependency "pg"
28
+ spec.add_dependency "timers"
28
29
 
29
30
  spec.add_development_dependency "bundler", "~> 1.11"
30
31
  spec.add_development_dependency "rake", "~> 10.0"
31
32
  spec.add_development_dependency "test-unit"
32
33
  spec.add_development_dependency "test-unit-rr"
33
34
  spec.add_development_dependency "test-unit-notify"
34
- spec.add_development_dependency "cucumber"
35
35
  spec.add_development_dependency "pry"
36
36
  spec.add_development_dependency "tapp"
37
+ spec.add_development_dependency "sqlite3"
37
38
  end
@@ -1,6 +1,6 @@
1
- require 'thor'
2
- require 'daimon_skycrawlers/generator/new'
3
- require 'daimon_skycrawlers/version'
1
+ require "thor"
2
+ require "daimon_skycrawlers/generator/new"
3
+ require "daimon_skycrawlers/version"
4
4
 
5
5
  module DaimonSkycrawlers
6
6
  class CLI < Thor
@@ -0,0 +1,8 @@
1
+ module DaimonSkycrawlers
2
+ module ConfigMixin
3
+ def initialize
4
+ super
5
+ @log = DaimonSkycrawlers.configuration.logger
6
+ end
7
+ end
8
+ end
@@ -0,0 +1,16 @@
1
+ require "songkick_queue"
2
+ require "daimon_skycrawlers/logger"
3
+ require "daimon_skycrawlers/config"
4
+
5
+ module DaimonSkycrawlers
6
+ module Consumer
7
+ class Base
8
+ include DaimonSkycrawlers::LoggerMixin
9
+ include DaimonSkycrawlers::ConfigMixin
10
+
11
+ def process(message)
12
+ raise NotImplementedError, "Must implement in subclass"
13
+ end
14
+ end
15
+ end
16
+ end
@@ -0,0 +1,47 @@
1
+ require "songkick_queue"
2
+ require "daimon_skycrawlers"
3
+ require "daimon_skycrawlers/consumer/base"
4
+ require "daimon_skycrawlers/processor/default"
5
+
6
+ module DaimonSkycrawlers
7
+ module Consumer
8
+ class HTTPResponse < Base
9
+ include SongkickQueue::Consumer
10
+
11
+ class << self
12
+ def register(processor = nil, &block)
13
+ if block_given?
14
+ processors << block
15
+ else
16
+ processors << processor
17
+ end
18
+ end
19
+
20
+ def processors
21
+ @processors ||= []
22
+ end
23
+
24
+ def default_processor
25
+ DaimonSkycrawlers::Processor::Default.new
26
+ end
27
+
28
+ def queue_name
29
+ "#{DaimonSkycrawlers.configuration.queue_name_prefix}.http-response"
30
+ end
31
+ end
32
+
33
+ consume_from_queue queue_name
34
+
35
+ def process(message)
36
+ if self.class.processors.empty?
37
+ processors = [self.class.default_processor]
38
+ else
39
+ processors = self.class.processors
40
+ end
41
+ processors.each do |processor|
42
+ processor.call(message)
43
+ end
44
+ end
45
+ end
46
+ end
47
+ end
@@ -0,0 +1,44 @@
1
+ require "songkick_queue"
2
+ require "daimon_skycrawlers"
3
+ require "daimon_skycrawlers/consumer/base"
4
+
5
+ module DaimonSkycrawlers
6
+ module Consumer
7
+ class URL < Base
8
+ include SongkickQueue::Consumer
9
+
10
+ class << self
11
+ def register(crawler)
12
+ crawlers << crawler
13
+ end
14
+
15
+ def crawlers
16
+ @crawlers ||= []
17
+ end
18
+
19
+ def queue_name
20
+ "#{DaimonSkycrawlers.configuration.queue_name_prefix}.url"
21
+ end
22
+ end
23
+
24
+ consume_from_queue queue_name
25
+
26
+ def process(message)
27
+ url = message[:url]
28
+ depth = Integer(message[:depth] || 0)
29
+
30
+ crawler_interval = DaimonSkycrawlers.configuration.crawler_interval
31
+
32
+ # XXX When several crawlers are registered, how should they behave?
33
+ self.class.crawlers.each do |crawler|
34
+ crawler.fetch(url, depth: depth)
35
+ if crawler.skipped?
36
+ sleep(crawler_interval) if crawler.n_processed_urls % 50 == 0
37
+ else
38
+ sleep(crawler_interval)
39
+ end
40
+ end
41
+ end
42
+ end
43
+ end
44
+ end
@@ -0,0 +1,4 @@
1
+ module DaimonSkycrawlers
2
+ module Consumer
3
+ end
4
+ end
@@ -0,0 +1,75 @@
1
+ require "uri"
2
+ require "faraday"
3
+
4
+ require "daimon_skycrawlers/logger"
5
+ require "daimon_skycrawlers/config"
6
+ require "daimon_skycrawlers/storage"
7
+ require "daimon_skycrawlers/processor"
8
+
9
+ module DaimonSkycrawlers
10
+ module Crawler
11
+ class Base
12
+ include DaimonSkycrawlers::LoggerMixin
13
+ include DaimonSkycrawlers::ConfigMixin
14
+
15
+ attr_writer :storage
16
+
17
+ def initialize(base_url = nil, options = {})
18
+ super()
19
+ @base_url = base_url
20
+ @options = options
21
+ @prepare = ->(connection) {}
22
+ @skipped = false
23
+ @n_processed_urls = 0
24
+ end
25
+
26
+ def setup_connection(options = {})
27
+ @connection = Faraday.new(@base_url, @options.merge(options)) do |faraday|
28
+ yield faraday
29
+ end
30
+ end
31
+
32
+ #
33
+ # Call this method before DaimonSkycrawlers.register_crawler
34
+ # For example, you can log in before fetching a URL
35
+ #
36
+ def prepare(&block)
37
+ @prepare = block
38
+ end
39
+
40
+ def storage
41
+ @storage ||= Storage::RDB.new
42
+ end
43
+
44
+ def skipped?
45
+ @skipped
46
+ end
47
+
48
+ def connection
49
+ @connection ||= Faraday.new(@base_url, @options)
50
+ end
51
+
52
+ def fetch(path, params = {}, **kw)
53
+ raise NotImplementedError, "Must implement this method in subclass"
54
+ end
55
+
56
+ def get(path, params = {})
57
+ @connection.get(path, params)
58
+ end
59
+
60
+ def post(path, params = {})
61
+ @connection.post(path, params)
62
+ end
63
+
64
+ def n_processed_urls
65
+ @n_processed_urls
66
+ end
67
+
68
+ private
69
+
70
+ def schedule_to_process(url, message = {})
71
+ DaimonSkycrawlers::Processor.enqueue_http_response(url, message)
72
+ end
73
+ end
74
+ end
75
+ end
@@ -0,0 +1,33 @@
1
+ require "daimon_skycrawlers/crawler/base"
2
+ require "daimon_skycrawlers/filter/update_checker"
3
+
4
+ module DaimonSkycrawlers
5
+ module Crawler
6
+ class Default < Base
7
+ def fetch(path, depth: 3, **kw)
8
+ @n_processed_urls += 1
9
+ @skipped = false
10
+ url = connection.url_prefix + path
11
+ update_checker = DaimonSkycrawlers::Filter::UpdateChecker.new(storage: storage)
12
+ unless update_checker.call(url.to_s, connection: connection)
13
+ log.info("Skip #{url}")
14
+ @skipped = true
15
+ schedule_to_process(url.to_s, heartbeat: true)
16
+ return
17
+ end
18
+ @prepare.call(connection)
19
+ response = get(path)
20
+ data = [url.to_s, response.headers, response.body]
21
+
22
+ yield(*data) if block_given?
23
+
24
+ storage.save(*data)
25
+ message = {
26
+ depth: depth
27
+ }
28
+ message = message.merge(kw)
29
+ schedule_to_process(url.to_s, message)
30
+ end
31
+ end
32
+ end
33
+ end
@@ -1,89 +1,31 @@
1
- require 'uri'
2
-
3
- require 'daimon_skycrawlers'
4
- require 'daimon_skycrawlers/version'
5
- require 'daimon_skycrawlers/configure_songkick_queue'
6
- require 'daimon_skycrawlers/url_consumer'
7
- require 'daimon_skycrawlers/storage'
8
- require 'daimon_skycrawlers/parser'
9
-
10
- require 'faraday'
1
+ require "daimon_skycrawlers"
2
+ require "daimon_skycrawlers/queue"
3
+ require "daimon_skycrawlers/timer"
4
+ require "daimon_skycrawlers/consumer/url"
11
5
 
12
6
  module DaimonSkycrawlers
13
- class Crawler
7
+ module Crawler
14
8
  class << self
15
- def run(process_name: 'daimon-skycrawler:url')
16
- SongkickQueue::Worker.new(process_name, [URLConsumer]).run
9
+ def run(process_name: default_process_name)
10
+ DaimonSkycrawlers::Timer.setup_shutdown_timer(config.queue_name_prefix, interval: config.shutdown_interval)
11
+ SongkickQueue::Worker.new(process_name, [DaimonSkycrawlers::Consumer::URL]).run
17
12
  end
18
13
 
19
- def enqueue_url(url, depth: 3, interval: 1)
20
- SongkickQueue.publish('daimon-skycrawler.url', url: url, depth: depth, interval: interval)
14
+ def enqueue_url(url, message = {})
15
+ message[:url] = url
16
+ SongkickQueue.publish(queue_name, message)
21
17
  end
22
- end
23
-
24
- attr_writer :storage
25
- attr_writer :parser
26
18
 
27
- def initialize(base_url, options = {})
28
- @base_url = base_url
29
- @options = options
30
- end
31
-
32
- def setup_connection(options = {})
33
- @connection = Faraday.new(@base_url, options) do |faraday|
34
- yield faraday
19
+ def config
20
+ DaimonSkycrawlers.configuration
35
21
  end
36
- end
37
-
38
- def storage
39
- @storage ||= Storage::RDB.new
40
- end
41
-
42
- def parser
43
- @parser ||= Parser::Default.new
44
- end
45
-
46
- # TODO Support POST when we need
47
- # TODO `params` should be a part of `path`. such as `path == "/hoi?hi=yoyo"`.
48
- def fetch(path, params = {}, depth: 3)
49
- @connection ||= Faraday.new(@base_url)
50
- response = get(path)
51
-
52
- url = @connection.url_prefix + path
53
-
54
- data = [url.to_s, response.headers, response.body]
55
22
 
56
- yield(*data) if block_given?
57
-
58
- storage.save(*data)
59
-
60
- schedule_to_process(url.to_s)
61
-
62
- parser.parse(response.body)
63
- urls = parser.links
64
-
65
- enqueue_next_urls(urls, depth: depth - 1, interval: 1)
66
- end
67
-
68
- def get(path, params = {})
69
- @connection.get(path, params)
70
- end
71
-
72
- def post(path, params = {})
73
- @connection.post(path, params)
74
- end
75
-
76
- private
77
-
78
- def schedule_to_process(url)
79
- DaimonSkycrawlers::Processor.enqueue_http_response(url)
80
- end
81
-
82
- def enqueue_next_urls(urls, depth: 3, interval: 1)
83
- return if depth <= 0
23
+ def queue_name
24
+ "#{config.queue_name_prefix}.url"
25
+ end
84
26
 
85
- urls.each do |url|
86
- self.class.enqueue_url(url, depth: depth, interval: interval)
27
+ def default_process_name
28
+ "#{config.queue_name_prefix}:url"
87
29
  end
88
30
  end
89
31
  end
@@ -0,0 +1,24 @@
1
+ require "daimon_skycrawlers/logger"
2
+ require "daimon_skycrawlers/config"
3
+
4
+ module DaimonSkycrawlers
5
+ module Filter
6
+ class Base
7
+ include DaimonSkycrawlers::LoggerMixin
8
+ include DaimonSkycrawlers::ConfigMixin
9
+
10
+ def initialize(storage: nil)
11
+ super()
12
+ @storage = storage
13
+ end
14
+
15
+ def storage
16
+ @storage ||= DaimonSkycrawlers::Storage::RDB.new
17
+ end
18
+
19
+ def call(url)
20
+ raise NotImplementedError, "Must implement this method in subclass"
21
+ end
22
+ end
23
+ end
24
+ end
@@ -0,0 +1,23 @@
1
+ require "set"
2
+ require "daimon_skycrawlers/filter/base"
3
+
4
+ module DaimonSkycrawlers
5
+ module Filter
6
+ class DuplicateChecker < Base
7
+ def initialize(base_url: nil)
8
+ @base_url = nil
9
+ @base_url = URI(base_url) if base_url
10
+ @urls = Set.new
11
+ end
12
+
13
+ def call(url)
14
+ unless URI(url).absolute?
15
+ url = (@base_url + url).to_s
16
+ end
17
+ return false if @urls.include?(url)
18
+ @urls << url
19
+ true
20
+ end
21
+ end
22
+ end
23
+ end
@@ -0,0 +1,31 @@
1
+ require "faraday"
2
+ require "daimon_skycrawlers/filter/base"
3
+
4
+ module DaimonSkycrawlers
5
+ module Filter
6
+ class UpdateChecker < Base
7
+ def initialize(storage: nil, base_url: nil)
8
+ super(storage: storage)
9
+ @base_url = nil
10
+ @base_url = URI(base_url) if base_url
11
+ end
12
+
13
+ def call(url, connection: nil)
14
+ unless URI(url).absolute?
15
+ url = (@base_url + url).to_s
16
+ end
17
+ page = storage.find(url)
18
+ return true unless page
19
+ if connection
20
+ headers = connection.head(url)
21
+ else
22
+ headers = Faraday.head(url)
23
+ end
24
+ return false if headers["etag"] && page.etag && headers["etag"] == page.etag
25
+ return false if headers["last-modified"].nil? && page.last_modified_at.nil?
26
+ return false if headers["last-modified"] <= page.last_modified_at
27
+ true
28
+ end
29
+ end
30
+ end
31
+ end
@@ -0,0 +1,4 @@
1
+ module DaimonSkycrawlers
2
+ module Filter
3
+ end
4
+ end
@@ -1,4 +1,4 @@
1
- require 'thor'
1
+ require "thor"
2
2
 
3
3
  module DaimonSkycrawlers
4
4
  module Generator
@@ -8,7 +8,7 @@ module DaimonSkycrawlers
8
8
  argument :name
9
9
 
10
10
  def self.source_root
11
- File.join(File.dirname(__FILE__), "templates", "new")
11
+ File.join(__dir__, "templates", "new")
12
12
  end
13
13
 
14
14
  def create_files
@@ -26,6 +26,7 @@ module DaimonSkycrawlers
26
26
  "Rakefile",
27
27
  "crawler.rb",
28
28
  "enqueue.rb",
29
+ "init.rb",
29
30
  "processor.rb",
30
31
  ].each do |path|
31
32
  copy_file(path, "#{name}/#{path}")
@@ -1,4 +1,4 @@
1
- source 'https://rubygems.org'
1
+ source "https://rubygems.org"
2
2
 
3
- gem 'rake'
4
- gem 'daimon_skycrawlers'
3
+ gem "rake"
4
+ gem "daimon_skycrawlers"