daimon_skycrawlers 0.1.0 → 0.2.0
- checksums.yaml +4 -4
- data/.travis.yml +2 -2
- data/Gemfile +1 -1
- data/README.md +30 -12
- data/Rakefile +3 -8
- data/bin/daimon-skycrawlers +3 -3
- data/daimon_skycrawlers.gemspec +4 -3
- data/lib/daimon_skycrawlers/cli.rb +3 -3
- data/lib/daimon_skycrawlers/config.rb +8 -0
- data/lib/daimon_skycrawlers/consumer/base.rb +16 -0
- data/lib/daimon_skycrawlers/consumer/http_response.rb +47 -0
- data/lib/daimon_skycrawlers/consumer/url.rb +44 -0
- data/lib/daimon_skycrawlers/consumer.rb +4 -0
- data/lib/daimon_skycrawlers/crawler/base.rb +75 -0
- data/lib/daimon_skycrawlers/crawler/default.rb +33 -0
- data/lib/daimon_skycrawlers/crawler.rb +18 -76
- data/lib/daimon_skycrawlers/filter/base.rb +24 -0
- data/lib/daimon_skycrawlers/filter/duplicate_checker.rb +23 -0
- data/lib/daimon_skycrawlers/filter/update_checker.rb +31 -0
- data/lib/daimon_skycrawlers/filter.rb +4 -0
- data/lib/daimon_skycrawlers/generator/new.rb +3 -2
- data/lib/daimon_skycrawlers/generator/templates/new/Gemfile +3 -3
- data/lib/daimon_skycrawlers/generator/templates/new/README.md.erb +22 -6
- data/lib/daimon_skycrawlers/generator/templates/new/crawler.rb +5 -5
- data/lib/daimon_skycrawlers/generator/templates/new/db/migrate/create_pages.rb +6 -6
- data/lib/daimon_skycrawlers/generator/templates/new/enqueue.rb +17 -8
- data/lib/daimon_skycrawlers/generator/templates/new/init.rb +20 -0
- data/lib/daimon_skycrawlers/generator/templates/new/processor.rb +2 -0
- data/lib/daimon_skycrawlers/logger.rb +32 -0
- data/lib/daimon_skycrawlers/processor/base.rb +19 -0
- data/lib/daimon_skycrawlers/processor/default.rb +12 -9
- data/lib/daimon_skycrawlers/processor/spider.rb +77 -0
- data/lib/daimon_skycrawlers/processor.rb +23 -9
- data/lib/daimon_skycrawlers/queue.rb +24 -0
- data/lib/daimon_skycrawlers/storage/base.rb +6 -0
- data/lib/daimon_skycrawlers/timer.rb +24 -0
- data/lib/daimon_skycrawlers/version.rb +1 -1
- data/lib/daimon_skycrawlers.rb +24 -4
- data/sample/spider/Gemfile +4 -0
- data/sample/spider/README.md +50 -0
- data/sample/spider/Rakefile +1 -0
- data/sample/spider/config/database.yml +26 -0
- data/sample/spider/crawler.rb +14 -0
- data/sample/spider/db/migrate/20160830155803_create_pages.rb +13 -0
- data/sample/spider/db/schema.rb +28 -0
- data/sample/spider/enqueue.rb +24 -0
- data/sample/spider/init.rb +22 -0
- data/sample/spider/processor.rb +34 -0
- metadata +47 -12
- data/lib/daimon_skycrawlers/configure_songkick_queue.rb +0 -12
- data/lib/daimon_skycrawlers/http_response_consumer.rb +0 -38
- data/lib/daimon_skycrawlers/parser/base.rb +0 -13
- data/lib/daimon_skycrawlers/parser/default.rb +0 -50
- data/lib/daimon_skycrawlers/parser.rb +0 -7
- data/lib/daimon_skycrawlers/url_consumer.rb +0 -32
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: f03168cc7d465dd69da00eabc00c5bd1d4654455
+  data.tar.gz: e55f902ab4f78340ee5df80499f581cefb2bad96
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 0bd1e1e832766d27779e91bba3ae70c70d44863373cd5172d2c67e40ad6ded2dd1329a68f9652aa58fd00bac7e38fd75e3c43092fbeaec4bca42fada779b2ffd
+  data.tar.gz: 99dc774a495bfec4e8b0b0693333340fc57be9321ef9fb5d9b2d64f61856ab6defa10c1e8292dc13d90bcca4a69e80e5f2a4aeec033c7c1a345e64d2c965eaec
data/.travis.yml CHANGED
data/Gemfile CHANGED
data/README.md CHANGED
@@ -1,8 +1,8 @@
-
+## Caution!! This product is NOT production-ready.
 
-
+# DaimonSkycrawlers
 
-
+DaimonSkycrawlers is a crawler framework.
 
 ## Requirements
 
@@ -33,32 +33,50 @@ Or install it yourself as:
 
 1. Create project
 
-
-
+```
+$ bundle exec daimon-skycrawlers new mycrawlers
+$ cd mycrawlers
+```
 
 2. Install dependencies
 
-
+```
+$ bundle install
+```
 
 3. Create database
 
-
-
+```
+$ bundle exec rake db:create
+$ bundle exec rake db:migrate
+```
 
 4. Open a new terminal and run crawler/processor
 
-
-
+```
+$ bundle exec ruby crawler.rb   # on new terminal
+$ bundle exec ruby processor.rb # on new terminal
+```
 
 5. Enqueue task
 
-
+```
+$ bundle exec ruby enqueue.rb url http://example.com/
+```
 
 6. You'll see `It works with 'http://example.com'` on the terminal running your processor!
 
+7. You can re-enqueue a task for the processor
+
+```
+$ bundle exec ruby enqueue.rb response http://example.com/
+```
+
+You'll see `It works with 'http://example.com'` again on the terminal running your processor.
+
 ## Development
 
-After checking out the repo, run `
+After checking out the repo, run `bundle install` to install dependencies. Then, run `bundle exec rake test` to run the tests. You can also run `bundle console` for an interactive prompt that will allow you to experiment.
 
 To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
 
data/Rakefile CHANGED
@@ -1,17 +1,12 @@
 require "bundler/setup"
 require "bundler/gem_tasks"
 require "rake/testtask"
+require "daimon_skycrawlers/tasks"
 
 Rake::TestTask.new(:test) do |t|
   t.libs << "test"
   t.libs << "lib"
-  t.test_files = FileList[
+  t.test_files = FileList["test/**/*_test.rb"]
 end
 
-
-
-Cucumber::Rake::Task.new(:features) do |t|
-  t.cucumber_opts = "features --format pretty"
-end
-
-task :default => [:test, :features]
+task :default => [:test]
data/bin/daimon-skycrawlers CHANGED
@@ -1,9 +1,9 @@
 #!/usr/bin/env ruby
 
-if File.exist?(File.expand_path(
-  $LOAD_PATH << File.expand_path(
+if File.exist?(File.expand_path("../.git", __dir__))
+  $LOAD_PATH << File.expand_path("../lib", __dir__)
 end
 
-require
+require "daimon_skycrawlers/cli"
 
 DaimonSkycrawlers::CLI.start
data/daimon_skycrawlers.gemspec CHANGED
@@ -1,7 +1,7 @@
 # coding: utf-8
-lib = File.expand_path(
+lib = File.expand_path("../lib", __FILE__)
 $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
-require
+require "daimon_skycrawlers/version"
 
 Gem::Specification.new do |spec|
   spec.name = "daimon_skycrawlers"
@@ -25,13 +25,14 @@ Gem::Specification.new do |spec|
   spec.add_dependency "nokogiri"
   spec.add_dependency "activerecord"
   spec.add_dependency "pg"
+  spec.add_dependency "timers"
 
   spec.add_development_dependency "bundler", "~> 1.11"
   spec.add_development_dependency "rake", "~> 10.0"
   spec.add_development_dependency "test-unit"
   spec.add_development_dependency "test-unit-rr"
   spec.add_development_dependency "test-unit-notify"
-  spec.add_development_dependency "cucumber"
   spec.add_development_dependency "pry"
   spec.add_development_dependency "tapp"
+  spec.add_development_dependency "sqlite3"
 end
data/lib/daimon_skycrawlers/consumer/base.rb ADDED
@@ -0,0 +1,16 @@
+require "songkick_queue"
+require "daimon_skycrawlers/logger"
+require "daimon_skycrawlers/config"
+
+module DaimonSkycrawlers
+  module Consumer
+    class Base
+      include DaimonSkycrawlers::LoggerMixin
+      include DaimonSkycrawlers::ConfigMixin
+
+      def process(message)
+        raise NotImplementedError, "Must implement in subclass"
+      end
+    end
+  end
+end
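
The new `Consumer::Base` pulls logging and configuration in through the two mixins and leaves `process` abstract. A minimal subclass might look like this sketch (the class name is hypothetical; `log` comes from `LoggerMixin`, as used by `Crawler::Default` below):

```ruby
require "daimon_skycrawlers/consumer/base"

# Hypothetical consumer: just logs whatever message arrives.
class LoggingConsumer < DaimonSkycrawlers::Consumer::Base
  def process(message)
    log.info("received: #{message.inspect}") # `log` is provided by LoggerMixin
  end
end
```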
data/lib/daimon_skycrawlers/consumer/http_response.rb ADDED
@@ -0,0 +1,47 @@
+require "songkick_queue"
+require "daimon_skycrawlers"
+require "daimon_skycrawlers/consumer/base"
+require "daimon_skycrawlers/processor/default"
+
+module DaimonSkycrawlers
+  module Consumer
+    class HTTPResponse < Base
+      include SongkickQueue::Consumer
+
+      class << self
+        def register(processor = nil, &block)
+          if block_given?
+            processors << block
+          else
+            processors << processor
+          end
+        end
+
+        def processors
+          @processors ||= []
+        end
+
+        def default_processor
+          DaimonSkycrawlers::Processor::Default.new
+        end
+
+        def queue_name
+          "#{DaimonSkycrawlers.configuration.queue_name_prefix}.http-response"
+        end
+      end
+
+      consume_from_queue queue_name
+
+      def process(message)
+        if self.class.processors.empty?
+          processors = [self.class.default_processor]
+        else
+          processors = self.class.processors
+        end
+        processors.each do |processor|
+          processor.call(message)
+        end
+      end
+    end
+  end
+end
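
`HTTPResponse.register` accepts either a processor object or a block, and `process` runs every registered processor over each consumed message, falling back to `Processor::Default` when none are registered. A sketch of the block form (the exact message shape is an assumption; the crawler schedules responses via `Processor.enqueue_http_response`, whose payload is not shown in this diff):

```ruby
require "daimon_skycrawlers/consumer/http_response"

# Block form; called once per consumed HTTP response message.
DaimonSkycrawlers::Consumer::HTTPResponse.register do |message|
  # Assumption: the message hash carries the crawled URL and metadata.
  puts "got response message: #{message.inspect}"
end
```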
data/lib/daimon_skycrawlers/consumer/url.rb ADDED
@@ -0,0 +1,44 @@
+require "songkick_queue"
+require "daimon_skycrawlers"
+require "daimon_skycrawlers/consumer/base"
+
+module DaimonSkycrawlers
+  module Consumer
+    class URL < Base
+      include SongkickQueue::Consumer
+
+      class << self
+        def register(crawler)
+          crawlers << crawler
+        end
+
+        def crawlers
+          @crawlers ||= []
+        end
+
+        def queue_name
+          "#{DaimonSkycrawlers.configuration.queue_name_prefix}.url"
+        end
+      end
+
+      consume_from_queue queue_name
+
+      def process(message)
+        url = message[:url]
+        depth = Integer(message[:depth] || 0)
+
+        crawler_interval = DaimonSkycrawlers.configuration.crawler_interval
+
+        # XXX When several crawlers are registered, how should they behave?
+        self.class.crawlers.each do |crawler|
+          crawler.fetch(url, depth: depth)
+          if crawler.skipped?
+            sleep(crawler_interval) if crawler.n_processed_urls % 50 == 0
+          else
+            sleep(crawler_interval)
+          end
+        end
+      end
+    end
+  end
+end
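
`Consumer::URL` hands each dequeued URL to every registered crawler and throttles with `crawler_interval`, sleeping only on every 50th URL when the crawler skipped it. Wiring a crawler in might look like this sketch (the base URL is illustrative):

```ruby
require "daimon_skycrawlers/consumer/url"
require "daimon_skycrawlers/crawler/default"

# Illustrative wiring: one default crawler serves every queued URL.
crawler = DaimonSkycrawlers::Crawler::Default.new("http://example.com/")
DaimonSkycrawlers::Consumer::URL.register(crawler)
```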
data/lib/daimon_skycrawlers/crawler/base.rb ADDED
@@ -0,0 +1,75 @@
+require "uri"
+require "faraday"
+
+require "daimon_skycrawlers/logger"
+require "daimon_skycrawlers/config"
+require "daimon_skycrawlers/storage"
+require "daimon_skycrawlers/processor"
+
+module DaimonSkycrawlers
+  module Crawler
+    class Base
+      include DaimonSkycrawlers::LoggerMixin
+      include DaimonSkycrawlers::ConfigMixin
+
+      attr_writer :storage
+
+      def initialize(base_url = nil, options = {})
+        super()
+        @base_url = base_url
+        @options = options
+        @prepare = ->(connection) {}
+        @skipped = false
+        @n_processed_urls = 0
+      end
+
+      def setup_connection(options = {})
+        @connection = Faraday.new(@base_url, @options.merge(options)) do |faraday|
+          yield faraday
+        end
+      end
+
+      #
+      # Call this method before DaimonSkycrawlers.register_crawler
+      # For example, you can login before fetch URL
+      #
+      def prepare(&block)
+        @prepare = block
+      end
+
+      def storage
+        @storage ||= Storage::RDB.new
+      end
+
+      def skipped?
+        @skipped
+      end
+
+      def connection
+        @connection ||= Faraday.new(@base_url, @options)
+      end
+
+      def fetch(path, params = {}, **kw)
+        raise NotImplementedError, "Must implement this method in subclass"
+      end
+
+      def get(path, params = {})
+        @connection.get(path, params)
+      end
+
+      def post(path, params = {})
+        @connection.post(path, params)
+      end
+
+      def n_processed_urls
+        @n_processed_urls
+      end
+
+      private
+
+      def schedule_to_process(url, message = {})
+        DaimonSkycrawlers::Processor.enqueue_http_response(url, message)
+      end
+    end
+  end
+end
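
`prepare` stores a block that the default crawler calls with the Faraday connection just before each request; per the comment, this is the hook for things like logging in first. A sketch of that use case (the header name and token are placeholders, not part of the gem):

```ruby
require "daimon_skycrawlers/crawler/default"

crawler = DaimonSkycrawlers::Crawler::Default.new("http://example.com/")

# The block runs against the Faraday connection before every fetch;
# the header and token below are placeholders for a real login step.
crawler.prepare do |connection|
  connection.headers["Authorization"] = "Bearer PLACEHOLDER_TOKEN"
end
```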
data/lib/daimon_skycrawlers/crawler/default.rb ADDED
@@ -0,0 +1,33 @@
+require "daimon_skycrawlers/crawler/base"
+require "daimon_skycrawlers/filter/update_checker"
+
+module DaimonSkycrawlers
+  module Crawler
+    class Default < Base
+      def fetch(path, depth: 3, **kw)
+        @n_processed_urls += 1
+        @skipped = false
+        url = connection.url_prefix + path
+        update_checker = DaimonSkycrawlers::Filter::UpdateChecker.new(storage: storage)
+        unless update_checker.call(url.to_s, connection: connection)
+          log.info("Skip #{url}")
+          @skipped = true
+          schedule_to_process(url.to_s, heartbeat: true)
+          return
+        end
+        @prepare.call(connection)
+        response = get(path)
+        data = [url.to_s, response.headers, response.body]
+
+        yield(*data) if block_given?
+
+        storage.save(*data)
+        message = {
+          depth: depth
+        }
+        message = message.merge(kw)
+        schedule_to_process(url.to_s, message)
+      end
+    end
+  end
+end
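
`Default#fetch` consults `Filter::UpdateChecker` before hitting the network: unchanged pages are skipped (still scheduling a `heartbeat: true` message), while fresh responses are saved to storage and scheduled for processing; an optional block sees the same `[url, headers, body]` triple. A sketch, assuming storage and the queue are configured, since `fetch` persists and publishes as side effects:

```ruby
require "daimon_skycrawlers/crawler/default"

crawler = DaimonSkycrawlers::Crawler::Default.new("http://example.com/")

# The block receives the same [url, headers, body] triple that gets saved.
crawler.fetch("/") do |url, headers, body|
  puts "#{url}: #{body.bytesize} bytes"
end
```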
data/lib/daimon_skycrawlers/crawler.rb CHANGED
@@ -1,89 +1,31 @@
-require
-
-require
-require
-require 'daimon_skycrawlers/configure_songkick_queue'
-require 'daimon_skycrawlers/url_consumer'
-require 'daimon_skycrawlers/storage'
-require 'daimon_skycrawlers/parser'
-
-require 'faraday'
+require "daimon_skycrawlers"
+require "daimon_skycrawlers/queue"
+require "daimon_skycrawlers/timer"
+require "daimon_skycrawlers/consumer/url"
 
 module DaimonSkycrawlers
-
+  module Crawler
     class << self
-      def run(process_name:
-
+      def run(process_name: default_process_name)
+        DaimonSkycrawlers::Timer.setup_shutdown_timer(config.queue_name_prefix, interval: config.shutdown_interval)
+        SongkickQueue::Worker.new(process_name, [DaimonSkycrawlers::Consumer::URL]).run
       end
 
-      def enqueue_url(url,
-
+      def enqueue_url(url, message = {})
+        message[:url] = url
+        SongkickQueue.publish(queue_name, message)
       end
-    end
-
-    attr_writer :storage
-    attr_writer :parser
 
-
-
-      @options = options
-    end
-
-    def setup_connection(options = {})
-      @connection = Faraday.new(@base_url, options) do |faraday|
-        yield faraday
+      def config
+        DaimonSkycrawlers.configuration
       end
-    end
-
-    def storage
-      @storage ||= Storage::RDB.new
-    end
-
-    def parser
-      @parser ||= Parser::Default.new
-    end
-
-    # TODO Support POST when we need
-    # TODO `params` should be a part of `path`. such as `path == "/hoi?hi=yoyo"`.
-    def fetch(path, params = {}, depth: 3)
-      @connection ||= Faraday.new(@base_url)
-      response = get(path)
-
-      url = @connection.url_prefix + path
-
-      data = [url.to_s, response.headers, response.body]
 
-
-
-
-
-      schedule_to_process(url.to_s)
-
-      parser.parse(response.body)
-      urls = parser.links
-
-      enqueue_next_urls(urls, depth: depth - 1, interval: 1)
-    end
-
-    def get(path, params = {})
-      @connection.get(path, params)
-    end
-
-    def post(path, params = {})
-      @connection.post(path, params)
-    end
-
-    private
-
-    def schedule_to_process(url)
-      DaimonSkycrawlers::Processor.enqueue_http_response(url)
-    end
-
-    def enqueue_next_urls(urls, depth: 3, interval: 1)
-      return if depth <= 0
+      def queue_name
+        "#{config.queue_name_prefix}.url"
+      end
 
-
-
+      def default_process_name
+        "#{config.queue_name_prefix}:url"
+      end
     end
   end
 end
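
The module-level API is now a thin queue wrapper: `Crawler.enqueue_url` merges the URL into the message and publishes to `<queue_name_prefix>.url`, and `Crawler.run` arms the shutdown timer and starts a SongkickQueue worker over `Consumer::URL`. Typical driver code, as a sketch (`run` blocks until the worker stops):

```ruby
require "daimon_skycrawlers"
require "daimon_skycrawlers/crawler"

# Publish one URL message; :depth rides along in the message hash
# and is read back by Consumer::URL#process.
DaimonSkycrawlers::Crawler.enqueue_url("http://example.com/", depth: 3)

# Start the worker loop that consumes the URL queue (blocks).
DaimonSkycrawlers::Crawler.run
```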
data/lib/daimon_skycrawlers/filter/base.rb ADDED
@@ -0,0 +1,24 @@
+require "daimon_skycrawlers/logger"
+require "daimon_skycrawlers/config"
+
+module DaimonSkycrawlers
+  module Filter
+    class Base
+      include DaimonSkycrawlers::LoggerMixin
+      include DaimonSkycrawlers::ConfigMixin
+
+      def initialize(storage: nil)
+        super()
+        @storage = storage
+      end
+
+      def storage
+        @storage ||= DaimonSkycrawlers::Storage::RDB.new
+      end
+
+      def call(url)
+        raise NotImplementedError, "Must implement this method in subclass"
+      end
+    end
+  end
+end
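
Filters are small callables: subclass `Filter::Base` and implement `call(url)`, returning true to keep the URL and false to reject it. A hypothetical scheme filter as a sketch:

```ruby
require "uri"
require "daimon_skycrawlers/filter/base"

# Hypothetical filter: accept only HTTPS URLs.
class HTTPSOnlyFilter < DaimonSkycrawlers::Filter::Base
  def call(url)
    URI(url).scheme == "https"
  end
end
```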
data/lib/daimon_skycrawlers/filter/duplicate_checker.rb ADDED
@@ -0,0 +1,23 @@
+require "set"
+require "daimon_skycrawlers/filter/base"
+
+module DaimonSkycrawlers
+  module Filter
+    class DuplicateChecker < Base
+      def initialize(base_url: nil)
+        @base_url = nil
+        @base_url = URI(base_url) if base_url
+        @urls = Set.new
+      end
+
+      def call(url)
+        unless URI(url).absolute?
+          url = (@base_url + url).to_s
+        end
+        return false if @urls.include?(url)
+        @urls << url
+        true
+      end
+    end
+  end
+end
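
`DuplicateChecker` resolves relative URLs against `base_url` and tracks visits in an in-memory `Set`, so the first call for a URL returns true and any repeat returns false. For example:

```ruby
require "daimon_skycrawlers/filter/duplicate_checker"

checker = DaimonSkycrawlers::Filter::DuplicateChecker.new(base_url: "http://example.com/")
checker.call("/about")                   # => true, first visit (resolved against base_url)
checker.call("http://example.com/about") # => false, already seen
```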
data/lib/daimon_skycrawlers/filter/update_checker.rb ADDED
@@ -0,0 +1,31 @@
+require "faraday"
+require "daimon_skycrawlers/filter/base"
+
+module DaimonSkycrawlers
+  module Filter
+    class UpdateChecker < Base
+      def initialize(storage: nil, base_url: nil)
+        super(storage: storage)
+        @base_url = nil
+        @base_url = URI(base_url) if base_url
+      end
+
+      def call(url, connection: nil)
+        unless URI(url).absolute?
+          url = (@base_url + url).to_s
+        end
+        page = storage.find(url)
+        return true unless page
+        if connection
+          headers = connection.head(url)
+        else
+          headers = Faraday.head(url)
+        end
+        return false if headers["etag"] && page.etag && headers["etag"] == page.etag
+        return false if headers["last-modified"].nil? && page.last_modified_at.nil?
+        return false if headers["last-modified"] <= page.last_modified_at
+        true
+      end
+    end
+  end
+end
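
`UpdateChecker` issues a HEAD request (through the supplied connection, or via `Faraday.head`) and compares ETag and Last-Modified against the stored page, returning false when nothing appears to have changed; `Crawler::Default#fetch` uses exactly this to decide whether to skip. Standalone use is a sketch that assumes the default `Storage::RDB` backend is reachable:

```ruby
require "daimon_skycrawlers/filter/update_checker"

checker = DaimonSkycrawlers::Filter::UpdateChecker.new
# true => no stored copy yet, or ETag/Last-Modified indicate a change.
checker.call("http://example.com/")
```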
data/lib/daimon_skycrawlers/generator/new.rb CHANGED
@@ -1,4 +1,4 @@
-require
+require "thor"
 
 module DaimonSkycrawlers
   module Generator
@@ -8,7 +8,7 @@ module DaimonSkycrawlers
       argument :name
 
       def self.source_root
-        File.join(
+        File.join(__dir__, "templates", "new")
      end
 
       def create_files
@@ -26,6 +26,7 @@ module DaimonSkycrawlers
           "Rakefile",
           "crawler.rb",
           "enqueue.rb",
+          "init.rb",
           "processor.rb",
         ].each do |path|
           copy_file(path, "#{name}/#{path}")
data/lib/daimon_skycrawlers/generator/templates/new/Gemfile CHANGED
@@ -1,4 +1,4 @@
-source
+source "https://rubygems.org"
 
-gem
-gem
+gem "rake"
+gem "daimon_skycrawlers"