daimon_skycrawlers 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +2 -2
- data/Gemfile +1 -1
- data/README.md +30 -12
- data/Rakefile +3 -8
- data/bin/daimon-skycrawlers +3 -3
- data/daimon_skycrawlers.gemspec +4 -3
- data/lib/daimon_skycrawlers/cli.rb +3 -3
- data/lib/daimon_skycrawlers/config.rb +8 -0
- data/lib/daimon_skycrawlers/consumer/base.rb +16 -0
- data/lib/daimon_skycrawlers/consumer/http_response.rb +47 -0
- data/lib/daimon_skycrawlers/consumer/url.rb +44 -0
- data/lib/daimon_skycrawlers/consumer.rb +4 -0
- data/lib/daimon_skycrawlers/crawler/base.rb +75 -0
- data/lib/daimon_skycrawlers/crawler/default.rb +33 -0
- data/lib/daimon_skycrawlers/crawler.rb +18 -76
- data/lib/daimon_skycrawlers/filter/base.rb +24 -0
- data/lib/daimon_skycrawlers/filter/duplicate_checker.rb +23 -0
- data/lib/daimon_skycrawlers/filter/update_checker.rb +31 -0
- data/lib/daimon_skycrawlers/filter.rb +4 -0
- data/lib/daimon_skycrawlers/generator/new.rb +3 -2
- data/lib/daimon_skycrawlers/generator/templates/new/Gemfile +3 -3
- data/lib/daimon_skycrawlers/generator/templates/new/README.md.erb +22 -6
- data/lib/daimon_skycrawlers/generator/templates/new/crawler.rb +5 -5
- data/lib/daimon_skycrawlers/generator/templates/new/db/migrate/create_pages.rb +6 -6
- data/lib/daimon_skycrawlers/generator/templates/new/enqueue.rb +17 -8
- data/lib/daimon_skycrawlers/generator/templates/new/init.rb +20 -0
- data/lib/daimon_skycrawlers/generator/templates/new/processor.rb +2 -0
- data/lib/daimon_skycrawlers/logger.rb +32 -0
- data/lib/daimon_skycrawlers/processor/base.rb +19 -0
- data/lib/daimon_skycrawlers/processor/default.rb +12 -9
- data/lib/daimon_skycrawlers/processor/spider.rb +77 -0
- data/lib/daimon_skycrawlers/processor.rb +23 -9
- data/lib/daimon_skycrawlers/queue.rb +24 -0
- data/lib/daimon_skycrawlers/storage/base.rb +6 -0
- data/lib/daimon_skycrawlers/timer.rb +24 -0
- data/lib/daimon_skycrawlers/version.rb +1 -1
- data/lib/daimon_skycrawlers.rb +24 -4
- data/sample/spider/Gemfile +4 -0
- data/sample/spider/README.md +50 -0
- data/sample/spider/Rakefile +1 -0
- data/sample/spider/config/database.yml +26 -0
- data/sample/spider/crawler.rb +14 -0
- data/sample/spider/db/migrate/20160830155803_create_pages.rb +13 -0
- data/sample/spider/db/schema.rb +28 -0
- data/sample/spider/enqueue.rb +24 -0
- data/sample/spider/init.rb +22 -0
- data/sample/spider/processor.rb +34 -0
- metadata +47 -12
- data/lib/daimon_skycrawlers/configure_songkick_queue.rb +0 -12
- data/lib/daimon_skycrawlers/http_response_consumer.rb +0 -38
- data/lib/daimon_skycrawlers/parser/base.rb +0 -13
- data/lib/daimon_skycrawlers/parser/default.rb +0 -50
- data/lib/daimon_skycrawlers/parser.rb +0 -7
- data/lib/daimon_skycrawlers/url_consumer.rb +0 -32
checksums.yaml
CHANGED

```diff
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: f03168cc7d465dd69da00eabc00c5bd1d4654455
+  data.tar.gz: e55f902ab4f78340ee5df80499f581cefb2bad96
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 0bd1e1e832766d27779e91bba3ae70c70d44863373cd5172d2c67e40ad6ded2dd1329a68f9652aa58fd00bac7e38fd75e3c43092fbeaec4bca42fada779b2ffd
+  data.tar.gz: 99dc774a495bfec4e8b0b0693333340fc57be9321ef9fb5d9b2d64f61856ab6defa10c1e8292dc13d90bcca4a69e80e5f2a4aeec033c7c1a345e64d2c965eaec
```
data/.travis.yml
CHANGED
data/Gemfile
CHANGED
data/README.md
CHANGED

````diff
@@ -1,8 +1,8 @@
-
+## Caution!! This product is NOT production-ready.
 
-
+# DaimonSkycrawlers
 
-
+DaimonSkyCrawlers is a crawler framework.
 
 ## Requirements
 
@@ -33,32 +33,50 @@ Or install it yourself as:
 
 1. Create project
 
-
-
+```
+$ bundle exec daimon-skycrawlers new mycrawlers
+$ cd mycrawlers
+```
 
 2. Install dependencies
 
-
+```
+$ bundle install
+```
 
 3. Create database
 
-
-
+```
+$ bundle exec rake db:create
+$ bundle exec rake db:migrate
+```
 
 4. Open new terminal and run crawler/processor
 
-
-
+```
+$ bundle exec ruby crawler.rb # on new terminal
+$ bundle exec ruby processor.rb # on new terminal
+```
 
 5. Enqueue task
 
-
+```
+$ bundle exec ruby enqueue.rb url http://example.com/
+```
 
 6. You'll see `It works with 'http://example.com'` on your terminal which runs your processor!
 
+7. You can re-enqueue task for processor
+
+```
+$ bundle exec ruby enqueue.rb response http://example.com/
+```
+
+Display `It works with 'http://example.com'` again on your terminal which runs your processor.
+
 ## Development
 
-After checking out the repo, run `
+After checking out the repo, run `bundle install` to install dependencies. Then, run `bundle exec rake test` to run the tests. You can also run `bundle console` for an interactive prompt that will allow you to experiment.
 
 To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
 
````
data/Rakefile
CHANGED

```diff
@@ -1,17 +1,12 @@
 require "bundler/setup"
 require "bundler/gem_tasks"
 require "rake/testtask"
+require "daimon_skycrawlers/tasks"
 
 Rake::TestTask.new(:test) do |t|
   t.libs << "test"
   t.libs << "lib"
-  t.test_files = FileList[
+  t.test_files = FileList["test/**/*_test.rb"]
 end
 
-
-
-Cucumber::Rake::Task.new(:features) do |t|
-  t.cucumber_opts = "features --format pretty"
-end
-
-task :default => [:test, :features]
+task :default => [:test]
```
data/bin/daimon-skycrawlers
CHANGED

```diff
@@ -1,9 +1,9 @@
 #!/usr/bin/env ruby
 
-if File.exist?(File.expand_path(
-  $LOAD_PATH << File.expand_path(
+if File.exist?(File.expand_path("../.git", __dir__))
+  $LOAD_PATH << File.expand_path("../lib", __dir__)
 end
 
-require
+require "daimon_skycrawlers/cli"
 
 DaimonSkycrawlers::CLI.start
```
data/daimon_skycrawlers.gemspec
CHANGED

```diff
@@ -1,7 +1,7 @@
 # coding: utf-8
-lib = File.expand_path(
+lib = File.expand_path("../lib", __FILE__)
 $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
-require
+require "daimon_skycrawlers/version"
 
 Gem::Specification.new do |spec|
   spec.name = "daimon_skycrawlers"
@@ -25,13 +25,14 @@ Gem::Specification.new do |spec|
   spec.add_dependency "nokogiri"
   spec.add_dependency "activerecord"
   spec.add_dependency "pg"
+  spec.add_dependency "timers"
 
   spec.add_development_dependency "bundler", "~> 1.11"
   spec.add_development_dependency "rake", "~> 10.0"
   spec.add_development_dependency "test-unit"
   spec.add_development_dependency "test-unit-rr"
   spec.add_development_dependency "test-unit-notify"
-  spec.add_development_dependency "cucumber"
   spec.add_development_dependency "pry"
   spec.add_development_dependency "tapp"
+  spec.add_development_dependency "sqlite3"
 end
```
data/lib/daimon_skycrawlers/consumer/base.rb
ADDED

```ruby
require "songkick_queue"
require "daimon_skycrawlers/logger"
require "daimon_skycrawlers/config"

module DaimonSkycrawlers
  module Consumer
    class Base
      include DaimonSkycrawlers::LoggerMixin
      include DaimonSkycrawlers::ConfigMixin

      def process(message)
        raise NotImplementedError, "Must implement in subclass"
      end
    end
  end
end
```
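For illustration, a consumer built on the new `Consumer::Base` only has to override `process`, which otherwise raises `NotImplementedError`. A minimal sketch (the `EchoConsumer` name is hypothetical; `log` comes from the `LoggerMixin` that `Base` includes):

```ruby
require "daimon_skycrawlers/consumer/base"

# Hypothetical consumer that just logs whatever message it receives.
class EchoConsumer < DaimonSkycrawlers::Consumer::Base
  def process(message)
    log.info("consumed: #{message.inspect}") # `log` is provided by LoggerMixin
  end
end
```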
data/lib/daimon_skycrawlers/consumer/http_response.rb
ADDED

```ruby
require "songkick_queue"
require "daimon_skycrawlers"
require "daimon_skycrawlers/consumer/base"
require "daimon_skycrawlers/processor/default"

module DaimonSkycrawlers
  module Consumer
    class HTTPResponse < Base
      include SongkickQueue::Consumer

      class << self
        def register(processor = nil, &block)
          if block_given?
            processors << block
          else
            processors << processor
          end
        end

        def processors
          @processors ||= []
        end

        def default_processor
          DaimonSkycrawlers::Processor::Default.new
        end

        def queue_name
          "#{DaimonSkycrawlers.configuration.queue_name_prefix}.http-response"
        end
      end

      consume_from_queue queue_name

      def process(message)
        if self.class.processors.empty?
          processors = [self.class.default_processor]
        else
          processors = self.class.processors
        end
        processors.each do |processor|
          processor.call(message)
        end
      end
    end
  end
end
```
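As the `register` class method above shows, processors can be attached as objects or as blocks; `process` then calls each one with the dequeued message, falling back to `Processor::Default` when none are registered. A sketch of block registration (the message payload shape is an assumption based on what the crawler enqueues):

```ruby
require "daimon_skycrawlers/consumer/http_response"

# Each registered block is invoked as processor.call(message).
DaimonSkycrawlers::Consumer::HTTPResponse.register do |message|
  puts "processing message: #{message.inspect}"
end
```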
data/lib/daimon_skycrawlers/consumer/url.rb
ADDED

```ruby
require "songkick_queue"
require "daimon_skycrawlers"
require "daimon_skycrawlers/consumer/base"

module DaimonSkycrawlers
  module Consumer
    class URL < Base
      include SongkickQueue::Consumer

      class << self
        def register(crawler)
          crawlers << crawler
        end

        def crawlers
          @crawlers ||= []
        end

        def queue_name
          "#{DaimonSkycrawlers.configuration.queue_name_prefix}.url"
        end
      end

      consume_from_queue queue_name

      def process(message)
        url = message[:url]
        depth = Integer(message[:depth] || 0)

        crawler_interval = DaimonSkycrawlers.configuration.crawler_interval

        # XXX When several crawlers are registered, how should they behave?
        self.class.crawlers.each do |crawler|
          crawler.fetch(url, depth: depth)
          if crawler.skipped?
            sleep(crawler_interval) if crawler.n_processed_urls % 50 == 0
          else
            sleep(crawler_interval)
          end
        end
      end
    end
  end
end
```
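`Consumer::URL.register` takes a crawler instance; `process` then hands each dequeued URL to every registered crawler via `crawler.fetch(url, depth: depth)`, throttled by `crawler_interval`. A sketch, assuming `Crawler::Default` (shown later in this diff) as the crawler:

```ruby
require "daimon_skycrawlers/consumer/url"
require "daimon_skycrawlers/crawler/default"

# Every URL consumed from the "<prefix>.url" queue will be fetched by
# this crawler instance.
crawler = DaimonSkycrawlers::Crawler::Default.new("http://example.com")
DaimonSkycrawlers::Consumer::URL.register(crawler)
```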
data/lib/daimon_skycrawlers/crawler/base.rb
ADDED

```ruby
require "uri"
require "faraday"

require "daimon_skycrawlers/logger"
require "daimon_skycrawlers/config"
require "daimon_skycrawlers/storage"
require "daimon_skycrawlers/processor"

module DaimonSkycrawlers
  module Crawler
    class Base
      include DaimonSkycrawlers::LoggerMixin
      include DaimonSkycrawlers::ConfigMixin

      attr_writer :storage

      def initialize(base_url = nil, options = {})
        super()
        @base_url = base_url
        @options = options
        @prepare = ->(connection) {}
        @skipped = false
        @n_processed_urls = 0
      end

      def setup_connection(options = {})
        @connection = Faraday.new(@base_url, @options.merge(options)) do |faraday|
          yield faraday
        end
      end

      #
      # Call this method before DaimonSkycrawlers.register_crawler
      # For example, you can login before fetch URL
      #
      def prepare(&block)
        @prepare = block
      end

      def storage
        @storage ||= Storage::RDB.new
      end

      def skipped?
        @skipped
      end

      def connection
        @connection ||= Faraday.new(@base_url, @options)
      end

      def fetch(path, params = {}, **kw)
        raise NotImplementedError, "Must implement this method in subclass"
      end

      def get(path, params = {})
        @connection.get(path, params)
      end

      def post(path, params = {})
        @connection.post(path, params)
      end

      def n_processed_urls
        @n_processed_urls
      end

      private

      def schedule_to_process(url, message = {})
        DaimonSkycrawlers::Processor.enqueue_http_response(url, message)
      end
    end
  end
end
```
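The `prepare` hook above stores a block that subclasses such as `Crawler::Default` invoke with the Faraday connection before fetching, e.g. to authenticate first. A sketch (the `/login` endpoint and its parameters are hypothetical):

```ruby
require "daimon_skycrawlers/crawler/default"

crawler = DaimonSkycrawlers::Crawler::Default.new("http://example.com")

# The block runs right before each page request, receiving the
# Faraday connection that the fetch will use.
crawler.prepare do |connection|
  connection.post("/login", username: "user", password: "secret")
end
```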
data/lib/daimon_skycrawlers/crawler/default.rb
ADDED

```ruby
require "daimon_skycrawlers/crawler/base"
require "daimon_skycrawlers/filter/update_checker"

module DaimonSkycrawlers
  module Crawler
    class Default < Base
      def fetch(path, depth: 3, **kw)
        @n_processed_urls += 1
        @skipped = false
        url = connection.url_prefix + path
        update_checker = DaimonSkycrawlers::Filter::UpdateChecker.new(storage: storage)
        unless update_checker.call(url.to_s, connection: connection)
          log.info("Skip #{url}")
          @skipped = true
          schedule_to_process(url.to_s, heartbeat: true)
          return
        end
        @prepare.call(connection)
        response = get(path)
        data = [url.to_s, response.headers, response.body]

        yield(*data) if block_given?

        storage.save(*data)
        message = {
          depth: depth
        }
        message = message.merge(kw)
        schedule_to_process(url.to_s, message)
      end
    end
  end
end
```
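`Default#fetch` consults `UpdateChecker` first; on a skip it only schedules a heartbeat message, otherwise it fetches the page, yields it to an optional block, saves it to storage, and schedules it for processing. A direct-call sketch, assuming the storage database is already created and migrated:

```ruby
require "daimon_skycrawlers/crawler/default"

crawler = DaimonSkycrawlers::Crawler::Default.new("http://example.com")
crawler.fetch("/") do |url, headers, body|
  # The block receives the same [url, headers, body] tuple that is saved.
  puts "#{url}: #{body.bytesize} bytes"
end
```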
data/lib/daimon_skycrawlers/crawler.rb
CHANGED

```diff
@@ -1,89 +1,31 @@
-require
-
-require
-require
-require 'daimon_skycrawlers/configure_songkick_queue'
-require 'daimon_skycrawlers/url_consumer'
-require 'daimon_skycrawlers/storage'
-require 'daimon_skycrawlers/parser'
-
-require 'faraday'
+require "daimon_skycrawlers"
+require "daimon_skycrawlers/queue"
+require "daimon_skycrawlers/timer"
+require "daimon_skycrawlers/consumer/url"
 
 module DaimonSkycrawlers
-  class Crawler
+  module Crawler
     class << self
-      def run(process_name:
+      def run(process_name: default_process_name)
+        DaimonSkycrawlers::Timer.setup_shutdown_timer(config.queue_name_prefix, interval: config.shutdown_interval)
+        SongkickQueue::Worker.new(process_name, [DaimonSkycrawlers::Consumer::URL]).run
       end
 
-      def enqueue_url(url,
+      def enqueue_url(url, message = {})
+        message[:url] = url
+        SongkickQueue.publish(queue_name, message)
       end
-    end
-
-    attr_writer :storage
-    attr_writer :parser
 
-      @options = options
-    end
-
-    def setup_connection(options = {})
-      @connection = Faraday.new(@base_url, options) do |faraday|
-        yield faraday
+      def config
+        DaimonSkycrawlers.configuration
       end
-    end
-
-    def storage
-      @storage ||= Storage::RDB.new
-    end
-
-    def parser
-      @parser ||= Parser::Default.new
-    end
-
-    # TODO Support POST when we need
-    # TODO `params` should be a part of `path`. such as `path == "/hoi?hi=yoyo"`.
-    def fetch(path, params = {}, depth: 3)
-      @connection ||= Faraday.new(@base_url)
-      response = get(path)
-
-      url = @connection.url_prefix + path
-
-      data = [url.to_s, response.headers, response.body]
 
-      schedule_to_process(url.to_s)
-
-      parser.parse(response.body)
-      urls = parser.links
-
-      enqueue_next_urls(urls, depth: depth - 1, interval: 1)
-    end
-
-    def get(path, params = {})
-      @connection.get(path, params)
-    end
-
-    def post(path, params = {})
-      @connection.post(path, params)
-    end
-
-    private
-
-    def schedule_to_process(url)
-      DaimonSkycrawlers::Processor.enqueue_http_response(url)
-    end
-
-    def enqueue_next_urls(urls, depth: 3, interval: 1)
-      return if depth <= 0
+      def queue_name
+        "#{config.queue_name_prefix}.url"
+      end
+
+      def default_process_name
+        "#{config.queue_name_prefix}:url"
+      end
     end
   end
 end
```
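With the rewritten module above, enqueuing a URL is a single call; extra keywords travel in the message hash, from which `Consumer::URL` later reads `:url` and `:depth`:

```ruby
require "daimon_skycrawlers/crawler"

# Publishes {url: "http://example.com/", depth: 3} to the
# "<queue_name_prefix>.url" queue.
DaimonSkycrawlers::Crawler.enqueue_url("http://example.com/", depth: 3)
```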
data/lib/daimon_skycrawlers/filter/base.rb
ADDED

```ruby
require "daimon_skycrawlers/logger"
require "daimon_skycrawlers/config"

module DaimonSkycrawlers
  module Filter
    class Base
      include DaimonSkycrawlers::LoggerMixin
      include DaimonSkycrawlers::ConfigMixin

      def initialize(storage: nil)
        super()
        @storage = storage
      end

      def storage
        @storage ||= DaimonSkycrawlers::Storage::RDB.new
      end

      def call(url)
        raise NotImplementedError, "Must implement this method in subclass"
      end
    end
  end
end
```
data/lib/daimon_skycrawlers/filter/duplicate_checker.rb
ADDED

```ruby
require "set"
require "daimon_skycrawlers/filter/base"

module DaimonSkycrawlers
  module Filter
    class DuplicateChecker < Base
      def initialize(base_url: nil)
        @base_url = nil
        @base_url = URI(base_url) if base_url
        @urls = Set.new
      end

      def call(url)
        unless URI(url).absolute?
          url = (@base_url + url).to_s
        end
        return false if @urls.include?(url)
        @urls << url
        true
      end
    end
  end
end
```
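`DuplicateChecker` resolves relative URLs against `base_url` and answers true only the first time a URL is seen:

```ruby
require "daimon_skycrawlers/filter/duplicate_checker"

filter = DaimonSkycrawlers::Filter::DuplicateChecker.new(base_url: "http://example.com/")
filter.call("docs") # => true  (resolved to http://example.com/docs and recorded)
filter.call("docs") # => false (already in the Set, so filtered out)
```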
data/lib/daimon_skycrawlers/filter/update_checker.rb
ADDED

```ruby
require "faraday"
require "daimon_skycrawlers/filter/base"

module DaimonSkycrawlers
  module Filter
    class UpdateChecker < Base
      def initialize(storage: nil, base_url: nil)
        super(storage: storage)
        @base_url = nil
        @base_url = URI(base_url) if base_url
      end

      def call(url, connection: nil)
        unless URI(url).absolute?
          url = (@base_url + url).to_s
        end
        page = storage.find(url)
        return true unless page
        if connection
          headers = connection.head(url)
        else
          headers = Faraday.head(url)
        end
        return false if headers["etag"] && page.etag && headers["etag"] == page.etag
        return false if headers["last-modified"].nil? && page.last_modified_at.nil?
        return false if headers["last-modified"] <= page.last_modified_at
        true
      end
    end
  end
end
```
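`UpdateChecker` looks the URL up in storage and, when a stored copy exists, issues a HEAD request to compare ETag and Last-Modified before allowing a re-fetch. A sketch, assuming the storage database is set up and the URL is reachable:

```ruby
require "daimon_skycrawlers/filter/update_checker"

checker = DaimonSkycrawlers::Filter::UpdateChecker.new
checker.call("http://example.com/") # true when unseen or changed; false when up to date
```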
data/lib/daimon_skycrawlers/generator/new.rb
CHANGED

```diff
@@ -1,4 +1,4 @@
-require
+require "thor"
 
 module DaimonSkycrawlers
   module Generator
@@ -8,7 +8,7 @@ module DaimonSkycrawlers
       argument :name
 
       def self.source_root
-        File.join(
+        File.join(__dir__, "templates", "new")
       end
 
       def create_files
@@ -26,6 +26,7 @@ module DaimonSkycrawlers
           "Rakefile",
           "crawler.rb",
           "enqueue.rb",
+          "init.rb",
           "processor.rb",
         ].each do |path|
           copy_file(path, "#{name}/#{path}")
```
data/lib/daimon_skycrawlers/generator/templates/new/Gemfile
CHANGED

```diff
@@ -1,4 +1,4 @@
-source
+source "https://rubygems.org"
 
-gem
-gem
+gem "rake"
+gem "daimon_skycrawlers"
```