daimon_skycrawlers 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +2 -2
- data/Gemfile +1 -1
- data/README.md +30 -12
- data/Rakefile +3 -8
- data/bin/daimon-skycrawlers +3 -3
- data/daimon_skycrawlers.gemspec +4 -3
- data/lib/daimon_skycrawlers/cli.rb +3 -3
- data/lib/daimon_skycrawlers/config.rb +8 -0
- data/lib/daimon_skycrawlers/consumer/base.rb +16 -0
- data/lib/daimon_skycrawlers/consumer/http_response.rb +47 -0
- data/lib/daimon_skycrawlers/consumer/url.rb +44 -0
- data/lib/daimon_skycrawlers/consumer.rb +4 -0
- data/lib/daimon_skycrawlers/crawler/base.rb +75 -0
- data/lib/daimon_skycrawlers/crawler/default.rb +33 -0
- data/lib/daimon_skycrawlers/crawler.rb +18 -76
- data/lib/daimon_skycrawlers/filter/base.rb +24 -0
- data/lib/daimon_skycrawlers/filter/duplicate_checker.rb +23 -0
- data/lib/daimon_skycrawlers/filter/update_checker.rb +31 -0
- data/lib/daimon_skycrawlers/filter.rb +4 -0
- data/lib/daimon_skycrawlers/generator/new.rb +3 -2
- data/lib/daimon_skycrawlers/generator/templates/new/Gemfile +3 -3
- data/lib/daimon_skycrawlers/generator/templates/new/README.md.erb +22 -6
- data/lib/daimon_skycrawlers/generator/templates/new/crawler.rb +5 -5
- data/lib/daimon_skycrawlers/generator/templates/new/db/migrate/create_pages.rb +6 -6
- data/lib/daimon_skycrawlers/generator/templates/new/enqueue.rb +17 -8
- data/lib/daimon_skycrawlers/generator/templates/new/init.rb +20 -0
- data/lib/daimon_skycrawlers/generator/templates/new/processor.rb +2 -0
- data/lib/daimon_skycrawlers/logger.rb +32 -0
- data/lib/daimon_skycrawlers/processor/base.rb +19 -0
- data/lib/daimon_skycrawlers/processor/default.rb +12 -9
- data/lib/daimon_skycrawlers/processor/spider.rb +77 -0
- data/lib/daimon_skycrawlers/processor.rb +23 -9
- data/lib/daimon_skycrawlers/queue.rb +24 -0
- data/lib/daimon_skycrawlers/storage/base.rb +6 -0
- data/lib/daimon_skycrawlers/timer.rb +24 -0
- data/lib/daimon_skycrawlers/version.rb +1 -1
- data/lib/daimon_skycrawlers.rb +24 -4
- data/sample/spider/Gemfile +4 -0
- data/sample/spider/README.md +50 -0
- data/sample/spider/Rakefile +1 -0
- data/sample/spider/config/database.yml +26 -0
- data/sample/spider/crawler.rb +14 -0
- data/sample/spider/db/migrate/20160830155803_create_pages.rb +13 -0
- data/sample/spider/db/schema.rb +28 -0
- data/sample/spider/enqueue.rb +24 -0
- data/sample/spider/init.rb +22 -0
- data/sample/spider/processor.rb +34 -0
- metadata +47 -12
- data/lib/daimon_skycrawlers/configure_songkick_queue.rb +0 -12
- data/lib/daimon_skycrawlers/http_response_consumer.rb +0 -38
- data/lib/daimon_skycrawlers/parser/base.rb +0 -13
- data/lib/daimon_skycrawlers/parser/default.rb +0 -50
- data/lib/daimon_skycrawlers/parser.rb +0 -7
- data/lib/daimon_skycrawlers/url_consumer.rb +0 -32
data/lib/daimon_skycrawlers/generator/templates/new/README.md.erb CHANGED

```diff
@@ -15,20 +15,36 @@ TODO: Write description.
 
 1. Install dependencies
 
-
+```
+$ bundle install
+```
 
 2. Create database
 
-
-
+```
+$ bundle exec rake db:create
+$ bundle exec rake db:migrate
+```
 
 3. Open new terminal and run crawler/processor
 
-
-
+```
+$ bundle exec ruby crawler.rb # on new terminal
+$ bundle exec ruby processor.rb # on new terminal
+```
 
 4. Enqueue task
 
-
+```
+$ bundle exec ruby enqueue.rb http://example.com/
+```
 
 5. You'll see `It works with 'http://example.com'` on your terminal which runs your processor!
+
+6. You can re-enqueue task for processor
+
+```
+$ bundle exec ruby enqueue.rb response http://example.com/
+```
+
+Display `It works with 'http://example.com'` again on your terminal which runs your processor.
```
data/lib/daimon_skycrawlers/generator/templates/new/crawler.rb CHANGED

```diff
@@ -1,13 +1,13 @@
 #!/usr/bin/env ruby
 
 require "daimon_skycrawlers/crawler"
+require "daimon_skycrawlers/crawler/default"
 
-
+require_relative "./init"
 
-
-
-
-end
+base_url = "http://example.com"
+
+crawler = DaimonSkycrawlers::Crawler::Default.new(base_url)
 
 DaimonSkycrawlers.register_crawler(crawler)
 
```
data/lib/daimon_skycrawlers/generator/templates/new/db/migrate/create_pages.rb CHANGED

```diff
@@ -1,13 +1,13 @@
 class CreatePages < ActiveRecord::Migration
   def change
     create_table :pages do |t|
-
-
-
-
-
+      t.string :url
+      t.text :headers
+      t.binary :body, limit: 10 * 1024 ** 2 # 10MiB
+      t.datetime :last_modified_at
+      t.string :etag
 
-
+      t.timestamps null: false
     end
   end
 end
```
data/lib/daimon_skycrawlers/generator/templates/new/enqueue.rb CHANGED

```diff
@@ -1,15 +1,24 @@
 #!/usr/bin/env ruby
 
+require "thor"
+
 require "daimon_skycrawlers/crawler"
+require "daimon_skycrawlers/processor"
 
-
+require_relative "./init"
 
-
-
-
-
-
+class Enqueue < Thor
+  desc "url URL [key1:value1 key2:value2...]", "Enqueue URL for URL consumer"
+  def url(url, *rest)
+    message = rest.map {|arg| arg.split(":") }.to_h
+    DaimonSkycrawlers::Crawler.enqueue_url(url, message)
+  end
 
-
+  desc "response URL [key1:value1 key2:value2...]", "Enqueue URL for HTTP response consumer"
+  def response(url, *rest)
+    message = rest.map {|arg| arg.split(":") }.to_h
+    DaimonSkycrawlers::Processor.enqueue_http_response(url, message)
+  end
+end
 
-
+Enqueue.start(ARGV)
```
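The `message = rest.map {|arg| arg.split(":") }.to_h` line is what turns the trailing `key1:value1` arguments into the message hash that gets published. A minimal plain-Ruby sketch of that transformation (the `depth` key is the one the Spider processor below actually reads; `label` is invented for illustration):

```ruby
# What the Thor tasks above do with their trailing arguments:
rest = ["depth:3", "label:blog"]
message = rest.map { |arg| arg.split(":") }.to_h
p message  # => {"depth"=>"3", "label"=>"blog"}
```

Note that `split(":")` yields more than two elements for a value containing colons (for example a `http://` URL), in which case `to_h` raises; values are assumed colon-free here.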
data/lib/daimon_skycrawlers/generator/templates/new/init.rb ADDED

```diff
@@ -0,0 +1,20 @@
+require "daimon_skycrawlers"
+require "daimon_skycrawlers/logger"
+require "daimon_skycrawlers/queue"
+
+DaimonSkycrawlers.configure do |config|
+  config.logger = DaimonSkycrawlers::Logger.default
+  config.crawler_interval = 1
+end
+
+DaimonSkycrawlers::Queue.configure do |config|
+  # queue configuration
+  config.logger = DaimonSkycrawlers.configuration.logger
+  config.host = "127.0.0.1"
+  config.port = 5672
+  # config.username = 'guest'
+  # config.password = 'guest'
+  config.vhost = "/"
+  config.max_reconnect_attempts = 10
+  config.network_recovery_interval = 1.0
+end
```
data/lib/daimon_skycrawlers/logger.rb ADDED

```diff
@@ -0,0 +1,32 @@
+require "delegate"
+require "logger"
+
+module DaimonSkycrawlers
+  class Logger < SimpleDelegator
+    class << self
+      def default
+        @default ||= DaimonSkycrawlers::Logger.new(STDOUT)
+      end
+    end
+
+    def initialize(logdev, shift_age = 0, shift_size = 1048576)
+      @log = ::Logger.new(logdev, shift_age, shift_size)
+      super(@log)
+    end
+  end
+
+  module LoggerMixin
+    def initialize
+      super
+      @log = DaimonSkycrawlers::Logger.default
+    end
+
+    class << self
+      def included(base)
+        base.module_eval do
+          attr_accessor :log
+        end
+      end
+    end
+  end
+end
```
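`Logger` wraps Ruby's standard `::Logger` behind a `SimpleDelegator`, and `LoggerMixin` gives any including class a `log` accessor preset to the shared default logger. A minimal sketch of how a class picks the mixin up (`MyWorker` is an invented name):

```ruby
require "daimon_skycrawlers/logger"

class MyWorker
  include DaimonSkycrawlers::LoggerMixin

  def call(message)
    # `log` is the attr_accessor that LoggerMixin.included defines;
    # initialize has already set it to DaimonSkycrawlers::Logger.default.
    log.info("processing #{message[:url]}")
  end
end

MyWorker.new.call(url: "http://example.com/")
```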
data/lib/daimon_skycrawlers/processor/base.rb ADDED

```diff
@@ -0,0 +1,19 @@
+require "daimon_skycrawlers/logger"
+require "daimon_skycrawlers/config"
+
+module DaimonSkycrawlers
+  module Processor
+    class Base
+      include DaimonSkycrawlers::LoggerMixin
+      include DaimonSkycrawlers::ConfigMixin
+
+      def call(message)
+        raise "Implement this method in subclass"
+      end
+
+      def storage
+        @storage ||= DaimonSkycrawlers::Storage::RDB.new
+      end
+    end
+  end
+end
```
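`Processor::Base` is the new extension point that pairs with `register_processor`: a subclass only implements `call` and inherits `storage` (a lazily built `Storage::RDB`) plus `log` from the mixins. A sketch of a custom processor under those assumptions (`TitleLogger` is an invented name):

```ruby
require "daimon_skycrawlers"
require "daimon_skycrawlers/processor/base"

class TitleLogger < DaimonSkycrawlers::Processor::Base
  def call(message)
    return if message[:heartbeat]        # same guard the Default processor uses
    page = storage.find(message[:url])   # `storage` comes from Base
    log.info("stored page #{page.url} is #{page.body.bytesize} bytes")
  end
end

DaimonSkycrawlers.register_processor(TitleLogger.new)
```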
data/lib/daimon_skycrawlers/processor/default.rb CHANGED

```diff
@@ -1,19 +1,22 @@
 require "daimon_skycrawlers/storage/rdb"
+require "daimon_skycrawlers/processor/base"
 
 module DaimonSkycrawlers
-
-  class Default
+  module Processor
+    class Default < Base
       def call(message)
+        return if message[:heartbeat]
        url = message[:url]
-        storage = DaimonSkycrawlers::Storage::RDB.new
        page = storage.find(url)
        headers = JSON.parse(page.headers)
-
-
-
-
-
-
+        headers_string = headers.map {|key, value| " #{key}: #{value}" }.join("\n")
+        dumped_message = <<LOG
+URL: #{page.url}
+Body: #{page.body.bytesize} bytes
+Headers:
+#{headers_string}
+LOG
+        log.info(dumped_message)
      end
    end
 end
```
data/lib/daimon_skycrawlers/processor/spider.rb ADDED

```diff
@@ -0,0 +1,77 @@
+require "nokogiri"
+require "daimon_skycrawlers/crawler"
+
+module DaimonSkycrawlers
+  module Processor
+    class Spider < Base
+      attr_accessor :enqueue
+
+      def initialize
+        super
+        @filters = []
+        @doc = nil
+        @links = nil
+        @enqueue = true
+      end
+
+      def append_filter(filter = nil, &block)
+        if block_given?
+          @filters << block
+        else
+          @filters << filter
+        end
+      end
+
+      #
+      # @param [Hash] message Must have key :url, :depth
+      #
+      def call(message)
+        key_url = message[:url]
+        depth = Integer(message[:depth] || 2)
+        return if message[:heartbeat]
+        return if depth <= 1
+        page = storage.find(key_url)
+        @doc = Nokogiri::HTML(page.body)
+        new_message = {
+          depth: depth - 1,
+        }
+        links.each do |url|
+          enqueue_url(url, new_message)
+        end
+      end
+
+      private
+
+      def links
+        return @links if @links
+        @links = retrieve_links
+        @links
+      end
+
+      def retrieve_links
+        urls = @doc.search("a").map do |element|
+          element["href"]
+        end
+        urls.uniq!
+        apply_filters(urls) || []
+      end
+
+      def apply_filters(urls)
+        return if urls.nil?
+        return if urls.empty?
+        log.debug("Candidate URLs: #{urls.size}")
+        urls = urls.select do |url|
+          @filters.inject(true) {|memo, filter| memo & filter.call(url) }
+        end
+        log.debug("Filtered URLs: #{urls.size}")
+        urls
+      end
+
+      def enqueue_url(url, new_message)
+        return unless @enqueue
+        log.debug("Enqueue: URL:#{url}, message: #{new_message}")
+        DaimonSkycrawlers::Crawler.enqueue_url(url, new_message)
+      end
+    end
+  end
+end
```
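`Spider` is the processor behind the new `sample/spider` project: it parses a stored page with Nokogiri, extracts every `a[href]`, keeps only URLs that pass all registered filters (the `inject(true) {|memo, filter| memo & filter.call(url) }` fold ANDs the filter results), and re-enqueues the survivors with `depth` decremented, stopping once `depth <= 1`. A sketch of wiring it up with a block filter (the same-host rule is illustrative):

```ruby
require "uri"
require "daimon_skycrawlers"
require "daimon_skycrawlers/processor/spider"

spider = DaimonSkycrawlers::Processor::Spider.new

# append_filter accepts an object responding to #call or a block;
# returning false drops the candidate URL.
spider.append_filter do |url|
  URI(url).host == "www.clear-code.com" rescue false
end

DaimonSkycrawlers.register_processor(spider)
```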
data/lib/daimon_skycrawlers/processor.rb CHANGED

```diff
@@ -1,17 +1,31 @@
-require
-require
-require
-require
+require "daimon_skycrawlers"
+require "daimon_skycrawlers/queue"
+require "daimon_skycrawlers/timer"
+require "daimon_skycrawlers/consumer/http_response"
 
 module DaimonSkycrawlers
-
+  module Processor
  class << self
-    def run(process_name:
-
+    def run(process_name: default_process_name)
+      DaimonSkycrawlers::Timer.setup_shutdown_timer(config.queue_name_prefix, interval: config.shutdown_interval)
+      SongkickQueue::Worker.new(process_name, [DaimonSkycrawlers::Consumer::HTTPResponse]).run
    end
 
-    def enqueue_http_response(url)
-
+    def enqueue_http_response(url, message = {})
+      message[:url] = url
+      SongkickQueue.publish(queue_name, message)
+    end
+
+    def config
+      DaimonSkycrawlers.configuration
+    end
+
+    def queue_name
+      "#{config.queue_name_prefix}.http-response"
+    end
+
+    def default_process_name
+      "#{config.queue_name_prefix}:http-response"
    end
  end
 end
```
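The reworked module splits enqueueing from consuming: `enqueue_http_response` merges `url` into the message and publishes it to the `<queue_name_prefix>.http-response` queue, while `run` arms the shutdown timer and starts a `SongkickQueue::Worker` over the HTTP-response consumer. A minimal sketch, assuming RabbitMQ is reachable with the defaults shown in `init.rb`:

```ruby
require "daimon_skycrawlers/processor"

# Re-enqueue a stored page for processing, with an extra message key.
DaimonSkycrawlers::Processor.enqueue_http_response("http://example.com/", depth: 2)

# Consume until the shutdown timer fires (no message for shutdown_interval seconds).
DaimonSkycrawlers::Processor.run
```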
data/lib/daimon_skycrawlers/queue.rb ADDED

```diff
@@ -0,0 +1,24 @@
+require "songkick_queue"
+
+module DaimonSkycrawlers
+  class Queue
+    class << self
+      def configuration
+        @configuration ||= SongkickQueue.configure do |config|
+          config.logger = Logger.new(STDOUT)
+          config.host = "127.0.0.1"
+          config.port = 5672
+          # config.username = 'guest'
+          # config.password = 'guest'
+          config.vhost = "/"
+          config.max_reconnect_attempts = 10
+          config.network_recovery_interval = 1.0
+        end
+      end
+
+      def configure
+        yield configuration
+      end
+    end
+  end
+end
```
data/lib/daimon_skycrawlers/storage/base.rb CHANGED

```diff
@@ -1,6 +1,12 @@
+require "daimon_skycrawlers/logger"
+require "daimon_skycrawlers/config"
+
 module DaimonSkycrawlers
   module Storage
     class Base
+      include DaimonSkycrawlers::LoggerMixin
+      include DaimonSkycrawlers::ConfigMixin
+
       def save(url, headers, body)
         raise "Implement this in subclass"
       end
```
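`Storage::Base` now carries the shared mixins, so concrete stores get `log` and configuration for free and only implement the persistence interface: `save` here, plus the `find(url)` the processors call. A sketch of an in-memory store for tests (the `InMemory` class and its `Page` struct are invented, and the `super` chain assumes the Config mixin initializes like the Logger one):

```ruby
require "daimon_skycrawlers/storage/base"

module DaimonSkycrawlers
  module Storage
    class InMemory < Base
      Page = Struct.new(:url, :headers, :body)

      def initialize
        super          # lets the Logger/Config mixins set themselves up
        @pages = {}
      end

      def save(url, headers, body)
        @pages[url] = Page.new(url, headers, body)
      end

      def find(url)
        @pages.fetch(url)
      end
    end
  end
end
```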
data/lib/daimon_skycrawlers/timer.rb ADDED

```diff
@@ -0,0 +1,24 @@
+require "timers"
+require "daimon_skycrawlers"
+
+module DaimonSkycrawlers
+  module Timer
+    module_function
+
+    def setup_shutdown_timer(queue_name_prefix, interval: 10)
+      timers = Timers::Group.new
+      timer = timers.after(interval) do
+        Process.kill(:INT, 0)
+      end
+      Thread.new(timers) do |t|
+        loop { t.wait }
+      end
+      ActiveSupport::Notifications.subscribe("consume_message.songkick_queue") do |*args|
+        event = ActiveSupport::Notifications::Event.new(*args)
+        queue_name = event.payload[:queue_name]
+        DaimonSkycrawlers.configuration.logger.debug("Reset timer: consume message #{queue_name}")
+        timer.reset
+      end
+    end
+  end
+end
```
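`Timer.setup_shutdown_timer` is what lets a drained crawl stop itself: once `interval` seconds pass without a message, the timer thread sends SIGINT to the current process group (`Process.kill(:INT, 0)`), and every `consume_message.songkick_queue` notification resets the countdown. `Processor.run` above calls it automatically; a sketch of calling it directly:

```ruby
require "daimon_skycrawlers/timer"

# Exit this worker if no queue message arrives for 30 seconds.
DaimonSkycrawlers::Timer.setup_shutdown_timer("daimon-skycrawlers", interval: 30)
```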
data/lib/daimon_skycrawlers.rb CHANGED

```diff
@@ -1,15 +1,35 @@
-require
+require "bundler/setup"
 
-require
+require "daimon_skycrawlers/version"
+require "daimon_skycrawlers/logger"
 
 module DaimonSkycrawlers
+  Configuration = Struct.new(
+    :logger,
+    :queue_name_prefix,
+    :crawler_interval,
+    :shutdown_interval
+  )
  class << self
    def register_processor(processor = nil, &block)
-
+      DaimonSkycrawlers::Consumer::HTTPResponse.register(processor, &block)
    end
 
    def register_crawler(crawler)
-
+      DaimonSkycrawlers::Consumer::URL.register(crawler)
+    end
+
+    def configuration
+      @configuration ||= DaimonSkycrawlers::Configuration.new.tap do |config|
+        config.logger = DaimonSkycrawlers::Logger.default
+        config.queue_name_prefix = "daimon-skycrawlers"
+        config.crawler_interval = 1
+        config.shutdown_interval = 10
+      end
+    end
+
+    def configure
+      yield configuration
    end
  end
 end
```
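Taken together, the top level now owns a memoized `Configuration` struct and two registration hooks that delegate to the new consumer classes. A sketch of the whole wiring in one script, mirroring the generated templates (the interval comments mark assumed semantics, and the block form of `register_processor` follows its `(processor = nil, &block)` signature):

```ruby
require "daimon_skycrawlers"
require "daimon_skycrawlers/crawler"
require "daimon_skycrawlers/crawler/default"

DaimonSkycrawlers.configure do |config|
  config.queue_name_prefix = "my-crawler"  # prefixes the RabbitMQ queue names
  config.crawler_interval  = 2             # assumed: pause between fetches
  config.shutdown_interval = 60            # idle seconds before workers exit
end

crawler = DaimonSkycrawlers::Crawler::Default.new("http://example.com")
DaimonSkycrawlers.register_crawler(crawler)

DaimonSkycrawlers.register_processor do |message|
  DaimonSkycrawlers.configuration.logger.info("processed #{message[:url]}")
end
```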
data/sample/spider/README.md ADDED

```diff
@@ -0,0 +1,50 @@
+# spider
+
+TODO: Write description.
+
+## Requirements
+
+- Ruby
+- RabbitMQ
+- RDB
+  - PostgreSQL (default)
+  - MySQL
+  - SQLite3
+
+## Usage
+
+1. Install dependencies
+
+```
+$ bundle install
+```
+
+2. Create database
+
+```
+$ bundle exec rake db:create
+$ bundle exec rake db:migrate
+```
+
+3. Open new terminal and run crawler/processor
+
+```
+$ bundle exec ruby crawler.rb # on new terminal
+$ bundle exec ruby processor.rb # on new terminal
+```
+
+4. Enqueue task
+
+```
+$ bundle exec ruby enqueue.rb http://example.com/
+```
+
+5. You'll see `It works with 'http://example.com'` on your terminal which runs your processor!
+
+6. You can re-enqueue task for processor
+
+```
+$ bundle exec ruby enqueue.rb response http://example.com/
+```
+
+Display `It works with 'http://example.com'` again on your terminal which runs your processor.
```
data/sample/spider/Rakefile ADDED

```diff
@@ -0,0 +1 @@
+require "daimon_skycrawlers/tasks"
```
data/sample/spider/config/database.yml ADDED

```diff
@@ -0,0 +1,26 @@
+# PostgreSQL. Versions 8.2 and up are supported.
+#
+default: &default
+  adapter: postgresql
+  encoding: unicode
+  pool: 5
+
+development:
+  <<: *default
+  database: spider_development
+  #username: spider
+  #password:
+  #host: localhost
+  #port: 5432
+  #schema_search_path: myapp,sharedapp,public
+  #min_messages: notice
+
+test:
+  <<: *default
+  database: spider_test
+
+production:
+  <<: *default
+  database: spider_production
+  username: spider
+  password: <%= ENV['SPIDER_PASSWORD'] %>
```
data/sample/spider/crawler.rb ADDED

```diff
@@ -0,0 +1,14 @@
+#!/usr/bin/env ruby
+
+require "daimon_skycrawlers/crawler"
+require "daimon_skycrawlers/crawler/default"
+
+require_relative "./init"
+
+base_url = "http://www.clear-code.com/blog/"
+
+crawler = DaimonSkycrawlers::Crawler::Default.new(base_url)
+
+DaimonSkycrawlers.register_crawler(crawler)
+
+DaimonSkycrawlers::Crawler.run
```
data/sample/spider/db/migrate/20160830155803_create_pages.rb ADDED

```diff
@@ -0,0 +1,13 @@
+class CreatePages < ActiveRecord::Migration
+  def change
+    create_table :pages do |t|
+      t.string :url
+      t.text :headers
+      t.binary :body, limit: 10 * 1024 ** 2 # 10MiB
+      t.datetime :last_modified_at
+      t.string :etag
+
+      t.timestamps null: false
+    end
+  end
+end
```
data/sample/spider/db/schema.rb ADDED

```diff
@@ -0,0 +1,28 @@
+# This file is auto-generated from the current state of the database. Instead
+# of editing this file, please use the migrations feature of Active Record to
+# incrementally modify your database, and then regenerate this schema definition.
+#
+# Note that this schema.rb definition is the authoritative source for your
+# database schema. If you need to create the application database on another
+# system, you should be using db:schema:load, not running all the migrations
+# from scratch. The latter is a flawed and unsustainable approach (the more migrations
+# you'll amass, the slower it'll run and the greater likelihood for issues).
+#
+# It's strongly recommended that you check this file into your version control system.
+
+ActiveRecord::Schema.define(version: 20160830155803) do
+
+  # These are extensions that must be enabled in order to support this database
+  enable_extension "plpgsql"
+
+  create_table "pages", force: :cascade do |t|
+    t.string   "url"
+    t.text     "headers"
+    t.binary   "body"
+    t.datetime "last_modified_at"
+    t.string   "etag"
+    t.datetime "created_at",       null: false
+    t.datetime "updated_at",       null: false
+  end
+
+end
```
data/sample/spider/enqueue.rb ADDED

```diff
@@ -0,0 +1,24 @@
+#!/usr/bin/env ruby
+
+require "thor"
+
+require "daimon_skycrawlers/crawler"
+require "daimon_skycrawlers/processor"
+
+require_relative "./init"
+
+class Enqueue < Thor
+  desc "url URL [key1:value1 key2:value2...]", "Enqueue URL for URL consumer"
+  def url(url, *rest)
+    message = rest.map {|arg| arg.split(":") }.to_h
+    DaimonSkycrawlers::Crawler.enqueue_url(url, message)
+  end
+
+  desc "response URL [key1:value1 key2:value2...]", "Enqueue URL for HTTP response consumer"
+  def response(url, *rest)
+    message = rest.map {|arg| arg.split(":") }.to_h
+    DaimonSkycrawlers::Processor.enqueue_http_response(url, message)
+  end
+end
+
+Enqueue.start(ARGV)
```
data/sample/spider/init.rb ADDED

```diff
@@ -0,0 +1,22 @@
+require "daimon_skycrawlers"
+require "daimon_skycrawlers/logger"
+require "daimon_skycrawlers/queue"
+
+DaimonSkycrawlers.configure do |config|
+  config.logger = DaimonSkycrawlers::Logger.default
+  config.logger.level = :debug
+  config.crawler_interval = 1
+  config.shutdown_interval = 30
+end
+
+DaimonSkycrawlers::Queue.configure do |config|
+  # queue configuration
+  config.logger = DaimonSkycrawlers.configuration.logger
+  config.host = "127.0.0.1"
+  config.port = 5672
+  # config.username = 'guest'
+  # config.password = 'guest'
+  config.vhost = "/"
+  config.max_reconnect_attempts = 10
+  config.network_recovery_interval = 1.0
+end
```