daimon_skycrawlers 0.1.0 → 0.2.0
- checksums.yaml +4 -4
- data/.travis.yml +2 -2
- data/Gemfile +1 -1
- data/README.md +30 -12
- data/Rakefile +3 -8
- data/bin/daimon-skycrawlers +3 -3
- data/daimon_skycrawlers.gemspec +4 -3
- data/lib/daimon_skycrawlers/cli.rb +3 -3
- data/lib/daimon_skycrawlers/config.rb +8 -0
- data/lib/daimon_skycrawlers/consumer/base.rb +16 -0
- data/lib/daimon_skycrawlers/consumer/http_response.rb +47 -0
- data/lib/daimon_skycrawlers/consumer/url.rb +44 -0
- data/lib/daimon_skycrawlers/consumer.rb +4 -0
- data/lib/daimon_skycrawlers/crawler/base.rb +75 -0
- data/lib/daimon_skycrawlers/crawler/default.rb +33 -0
- data/lib/daimon_skycrawlers/crawler.rb +18 -76
- data/lib/daimon_skycrawlers/filter/base.rb +24 -0
- data/lib/daimon_skycrawlers/filter/duplicate_checker.rb +23 -0
- data/lib/daimon_skycrawlers/filter/update_checker.rb +31 -0
- data/lib/daimon_skycrawlers/filter.rb +4 -0
- data/lib/daimon_skycrawlers/generator/new.rb +3 -2
- data/lib/daimon_skycrawlers/generator/templates/new/Gemfile +3 -3
- data/lib/daimon_skycrawlers/generator/templates/new/README.md.erb +22 -6
- data/lib/daimon_skycrawlers/generator/templates/new/crawler.rb +5 -5
- data/lib/daimon_skycrawlers/generator/templates/new/db/migrate/create_pages.rb +6 -6
- data/lib/daimon_skycrawlers/generator/templates/new/enqueue.rb +17 -8
- data/lib/daimon_skycrawlers/generator/templates/new/init.rb +20 -0
- data/lib/daimon_skycrawlers/generator/templates/new/processor.rb +2 -0
- data/lib/daimon_skycrawlers/logger.rb +32 -0
- data/lib/daimon_skycrawlers/processor/base.rb +19 -0
- data/lib/daimon_skycrawlers/processor/default.rb +12 -9
- data/lib/daimon_skycrawlers/processor/spider.rb +77 -0
- data/lib/daimon_skycrawlers/processor.rb +23 -9
- data/lib/daimon_skycrawlers/queue.rb +24 -0
- data/lib/daimon_skycrawlers/storage/base.rb +6 -0
- data/lib/daimon_skycrawlers/timer.rb +24 -0
- data/lib/daimon_skycrawlers/version.rb +1 -1
- data/lib/daimon_skycrawlers.rb +24 -4
- data/sample/spider/Gemfile +4 -0
- data/sample/spider/README.md +50 -0
- data/sample/spider/Rakefile +1 -0
- data/sample/spider/config/database.yml +26 -0
- data/sample/spider/crawler.rb +14 -0
- data/sample/spider/db/migrate/20160830155803_create_pages.rb +13 -0
- data/sample/spider/db/schema.rb +28 -0
- data/sample/spider/enqueue.rb +24 -0
- data/sample/spider/init.rb +22 -0
- data/sample/spider/processor.rb +34 -0
- metadata +47 -12
- data/lib/daimon_skycrawlers/configure_songkick_queue.rb +0 -12
- data/lib/daimon_skycrawlers/http_response_consumer.rb +0 -38
- data/lib/daimon_skycrawlers/parser/base.rb +0 -13
- data/lib/daimon_skycrawlers/parser/default.rb +0 -50
- data/lib/daimon_skycrawlers/parser.rb +0 -7
- data/lib/daimon_skycrawlers/url_consumer.rb +0 -32
data/lib/daimon_skycrawlers/generator/templates/new/README.md.erb
CHANGED

@@ -15,20 +15,36 @@ TODO: Write description.
 
 1. Install dependencies
 
-
+```
+$ bundle install
+```
 
 2. Create database
 
-
-
+```
+$ bundle exec rake db:create
+$ bundle exec rake db:migrate
+```
 
 3. Open new terminal and run crawler/processor
 
-
-
+```
+$ bundle exec ruby crawler.rb # on new terminal
+$ bundle exec ruby processor.rb # on new terminal
+```
 
 4. Enqueue task
 
-
+```
+$ bundle exec ruby enqueue.rb http://example.com/
+```
 
 5. You'll see `It works with 'http://example.com'` on your terminal which runs your processor!
+
+6. You can re-enqueue task for processor
+
+```
+$ bundle exec ruby enqueue.rb response http://example.com/
+```
+
+Display `It works with 'http://example.com'` again on your terminal which runs your processor.
data/lib/daimon_skycrawlers/generator/templates/new/crawler.rb
CHANGED

@@ -1,13 +1,13 @@
 #!/usr/bin/env ruby
 
 require "daimon_skycrawlers/crawler"
+require "daimon_skycrawlers/crawler/default"
 
-
+require_relative "./init"
 
-
-
-
-end
+base_url = "http://example.com"
+
+crawler = DaimonSkycrawlers::Crawler::Default.new(base_url)
 
 DaimonSkycrawlers.register_crawler(crawler)
 
data/lib/daimon_skycrawlers/generator/templates/new/db/migrate/create_pages.rb
CHANGED

@@ -1,13 +1,13 @@
 class CreatePages < ActiveRecord::Migration
   def change
     create_table :pages do |t|
-
-
-
-
-
+      t.string :url
+      t.text :headers
+      t.binary :body, limit: 10 * 1024 ** 2 # 10MiB
+      t.datetime :last_modified_at
+      t.string :etag
 
-
+      t.timestamps null: false
     end
   end
 end
data/lib/daimon_skycrawlers/generator/templates/new/enqueue.rb
CHANGED

@@ -1,15 +1,24 @@
 #!/usr/bin/env ruby
 
+require "thor"
+
 require "daimon_skycrawlers/crawler"
+require "daimon_skycrawlers/processor"
 
-
+require_relative "./init"
 
-
-
-
-
-
+class Enqueue < Thor
+  desc "url URL [key1:value1 key2:value2...]", "Enqueue URL for URL consumer"
+  def url(url, *rest)
+    message = rest.map {|arg| arg.split(":") }.to_h
+    DaimonSkycrawlers::Crawler.enqueue_url(url, message)
+  end
 
-
+  desc "response URL [key1:value1 key2:value2...]", "Enqueue URL for HTTP response consumer"
+  def response(url, *rest)
+    message = rest.map {|arg| arg.split(":") }.to_h
+    DaimonSkycrawlers::Processor.enqueue_http_response(url, message)
+  end
+end
 
-
+Enqueue.start(ARGV)
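The generated `enqueue.rb` turns any trailing `key:value` arguments into the message hash that rides along with the URL. A minimal sketch of what that parsing yields (the `depth` and `tag` keys are just illustrations):

```ruby
# What `bundle exec ruby enqueue.rb url http://example.com/ depth:3 tag:news` parses:
rest = ["depth:3", "tag:news"]
message = rest.map {|arg| arg.split(":") }.to_h
message # => {"depth" => "3", "tag" => "news"}
```

Note that a value containing `:` splits into more than two parts and makes `to_h` raise, so this parser only suits simple scalar values.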
data/lib/daimon_skycrawlers/generator/templates/new/init.rb
ADDED

@@ -0,0 +1,20 @@
+require "daimon_skycrawlers"
+require "daimon_skycrawlers/logger"
+require "daimon_skycrawlers/queue"
+
+DaimonSkycrawlers.configure do |config|
+  config.logger = DaimonSkycrawlers::Logger.default
+  config.crawler_interval = 1
+end
+
+DaimonSkycrawlers::Queue.configure do |config|
+  # queue configuration
+  config.logger = DaimonSkycrawlers.configuration.logger
+  config.host = "127.0.0.1"
+  config.port = 5672
+  # config.username = 'guest'
+  # config.password = 'guest'
+  config.vhost = "/"
+  config.max_reconnect_attempts = 10
+  config.network_recovery_interval = 1.0
+end
data/lib/daimon_skycrawlers/logger.rb
ADDED

@@ -0,0 +1,32 @@
+require "delegate"
+require "logger"
+
+module DaimonSkycrawlers
+  class Logger < SimpleDelegator
+    class << self
+      def default
+        @default ||= DaimonSkycrawlers::Logger.new(STDOUT)
+      end
+    end
+
+    def initialize(logdev, shift_age = 0, shift_size = 1048576)
+      @log = ::Logger.new(logdev, shift_age, shift_size)
+      super(@log)
+    end
+  end
+
+  module LoggerMixin
+    def initialize
+      super
+      @log = DaimonSkycrawlers::Logger.default
+    end
+
+    class << self
+      def included(base)
+        base.module_eval do
+          attr_accessor :log
+        end
+      end
+    end
+  end
+end
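`LoggerMixin` wires an including class up to the shared, memoized `Logger.default` and exposes it through the `log` accessor added by the `included` hook. A minimal sketch (the `Fetcher` class is hypothetical):

```ruby
require "daimon_skycrawlers/logger"

class Fetcher
  include DaimonSkycrawlers::LoggerMixin
end

# Every includer shares the one memoized STDOUT logger.
Fetcher.new.log.info("fetching...")
```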
data/lib/daimon_skycrawlers/processor/base.rb
ADDED

@@ -0,0 +1,19 @@
+require "daimon_skycrawlers/logger"
+require "daimon_skycrawlers/config"
+
+module DaimonSkycrawlers
+  module Processor
+    class Base
+      include DaimonSkycrawlers::LoggerMixin
+      include DaimonSkycrawlers::ConfigMixin
+
+      def call(message)
+        raise "Implement this method in subclass"
+      end
+
+      def storage
+        @storage ||= DaimonSkycrawlers::Storage::RDB.new
+      end
+    end
+  end
+end
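A processor only has to implement `call(message)`; `storage` lazily hands it the RDB-backed page store, and `log` comes from `LoggerMixin`. A hedged sketch of a custom processor (the `TitlePrinter` class and its use of Nokogiri are illustrative, not part of the gem):

```ruby
require "nokogiri"
require "daimon_skycrawlers"
require "daimon_skycrawlers/processor/base"

class TitlePrinter < DaimonSkycrawlers::Processor::Base
  def call(message)
    return if message[:heartbeat]      # same guard the bundled processors use
    page = storage.find(message[:url]) # look the crawled page up in RDB storage
    log.info("Title: #{Nokogiri::HTML(page.body).title}")
  end
end

DaimonSkycrawlers.register_processor(TitlePrinter.new)
```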
data/lib/daimon_skycrawlers/processor/default.rb
CHANGED

@@ -1,19 +1,22 @@
 require "daimon_skycrawlers/storage/rdb"
+require "daimon_skycrawlers/processor/base"
 
 module DaimonSkycrawlers
-
-  class Default
+  module Processor
+    class Default < Base
       def call(message)
+        return if message[:heartbeat]
         url = message[:url]
-        storage = DaimonSkycrawlers::Storage::RDB.new
         page = storage.find(url)
         headers = JSON.parse(page.headers)
-
-
-
-
-
-
+        headers_string = headers.map {|key, value| "  #{key}: #{value}" }.join("\n")
+        dumped_message = <<LOG
+URL: #{page.url}
+Body: #{page.body.bytesize} bytes
+Headers:
+#{headers_string}
+LOG
+        log.info(dumped_message)
       end
     end
   end
data/lib/daimon_skycrawlers/processor/spider.rb
ADDED

@@ -0,0 +1,77 @@
+require "nokogiri"
+require "daimon_skycrawlers/crawler"
+
+module DaimonSkycrawlers
+  module Processor
+    class Spider < Base
+      attr_accessor :enqueue
+
+      def initialize
+        super
+        @filters = []
+        @doc = nil
+        @links = nil
+        @enqueue = true
+      end
+
+      def append_filter(filter = nil, &block)
+        if block_given?
+          @filters << block
+        else
+          @filters << filter
+        end
+      end
+
+      #
+      # @param [Hash] message Must have key :url, :depth
+      #
+      def call(message)
+        key_url = message[:url]
+        depth = Integer(message[:depth] || 2)
+        return if message[:heartbeat]
+        return if depth <= 1
+        page = storage.find(key_url)
+        @doc = Nokogiri::HTML(page.body)
+        new_message = {
+          depth: depth - 1,
+        }
+        links.each do |url|
+          enqueue_url(url, new_message)
+        end
+      end
+
+      private
+
+      def links
+        return @links if @links
+        @links = retrieve_links
+        @links
+      end
+
+      def retrieve_links
+        urls = @doc.search("a").map do |element|
+          element["href"]
+        end
+        urls.uniq!
+        apply_filters(urls) || []
+      end
+
+      def apply_filters(urls)
+        return if urls.nil?
+        return if urls.empty?
+        log.debug("Candidate URLs: #{urls.size}")
+        urls = urls.select do |url|
+          @filters.inject(true) {|memo, filter| memo & filter.call(url) }
+        end
+        log.debug("Filtered URLs: #{urls.size}")
+        urls
+      end
+
+      def enqueue_url(url, new_message)
+        return unless @enqueue
+        log.debug("Enqueue: URL:#{url}, message: #{new_message}")
+        DaimonSkycrawlers::Crawler.enqueue_url(url, new_message)
+      end
+    end
+  end
+end
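`append_filter` registers either an object responding to `call` or a block; `apply_filters` ANDs them all together, so a URL is re-enqueued only if every filter returns true. A minimal sketch of restricting the spider to one site (the domain check is illustrative):

```ruby
require "daimon_skycrawlers"
require "daimon_skycrawlers/processor/spider"

spider = DaimonSkycrawlers::Processor::Spider.new
spider.append_filter do |url|
  url.to_s.start_with?("http://example.com/") # drop off-site links
end

DaimonSkycrawlers.register_processor(spider)
```

Setting `spider.enqueue = false` keeps the link extraction and filter logging but suppresses re-publishing, which suits dry runs.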
data/lib/daimon_skycrawlers/processor.rb
CHANGED

@@ -1,17 +1,31 @@
-require
-require
-require
-require
+require "daimon_skycrawlers"
+require "daimon_skycrawlers/queue"
+require "daimon_skycrawlers/timer"
+require "daimon_skycrawlers/consumer/http_response"
 
 module DaimonSkycrawlers
-
+  module Processor
     class << self
-      def run(process_name:
-
+      def run(process_name: default_process_name)
+        DaimonSkycrawlers::Timer.setup_shutdown_timer(config.queue_name_prefix, interval: config.shutdown_interval)
+        SongkickQueue::Worker.new(process_name, [DaimonSkycrawlers::Consumer::HTTPResponse]).run
       end
 
-      def enqueue_http_response(url)
-
+      def enqueue_http_response(url, message = {})
+        message[:url] = url
+        SongkickQueue.publish(queue_name, message)
+      end
+
+      def config
+        DaimonSkycrawlers.configuration
+      end
+
+      def queue_name
+        "#{config.queue_name_prefix}.http-response"
+      end
+
+      def default_process_name
+        "#{config.queue_name_prefix}:http-response"
       end
     end
   end
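`run` now consumes `Consumer::HTTPResponse` messages from the `<queue_name_prefix>.http-response` queue under a process name defaulting to `<queue_name_prefix>:http-response`, and `enqueue_http_response` accepts an arbitrary message hash alongside the URL. A short sketch, assuming RabbitMQ is up and a processor is registered:

```ruby
require "daimon_skycrawlers/processor"

# Push an already-crawled URL back through the processors, with extra data.
DaimonSkycrawlers::Processor.enqueue_http_response("http://example.com/", depth: 3)

# Start the worker loop (blocks; the shutdown timer stops it when idle).
DaimonSkycrawlers::Processor.run
```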
data/lib/daimon_skycrawlers/queue.rb
ADDED

@@ -0,0 +1,24 @@
+require "songkick_queue"
+
+module DaimonSkycrawlers
+  class Queue
+    class << self
+      def configuration
+        @configuration ||= SongkickQueue.configure do |config|
+          config.logger = Logger.new(STDOUT)
+          config.host = "127.0.0.1"
+          config.port = 5672
+          # config.username = 'guest'
+          # config.password = 'guest'
+          config.vhost = "/"
+          config.max_reconnect_attempts = 10
+          config.network_recovery_interval = 1.0
+        end
+      end
+
+      def configure
+        yield configuration
+      end
+    end
+  end
+end
data/lib/daimon_skycrawlers/storage/base.rb
CHANGED

@@ -1,6 +1,12 @@
+require "daimon_skycrawlers/logger"
+require "daimon_skycrawlers/config"
+
 module DaimonSkycrawlers
   module Storage
     class Base
+      include DaimonSkycrawlers::LoggerMixin
+      include DaimonSkycrawlers::ConfigMixin
+
       def save(url, headers, body)
         raise "Implement this in subclass"
       end
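Storage backends subclass `Storage::Base` and implement `save`; the bundled processors also call `find(url)` on the RDB backend, so a usable backend wants both. A hedged in-memory sketch (the `InMemory` class and its `Page` struct are illustrative, not shipped with the gem):

```ruby
require "daimon_skycrawlers/storage/base"

module DaimonSkycrawlers
  module Storage
    class InMemory < Base
      Page = Struct.new(:url, :headers, :body)

      def initialize
        super
        @pages = {}
      end

      def save(url, headers, body)
        @pages[url] = Page.new(url, headers, body)
      end

      def find(url)
        @pages[url]
      end
    end
  end
end
```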
data/lib/daimon_skycrawlers/timer.rb
ADDED

@@ -0,0 +1,24 @@
+require "timers"
+require "daimon_skycrawlers"
+
+module DaimonSkycrawlers
+  module Timer
+    module_function
+
+    def setup_shutdown_timer(queue_name_prefix, interval: 10)
+      timers = Timers::Group.new
+      timer = timers.after(interval) do
+        Process.kill(:INT, 0)
+      end
+      Thread.new(timers) do |t|
+        loop { t.wait }
+      end
+      ActiveSupport::Notifications.subscribe("consume_message.songkick_queue") do |*args|
+        event = ActiveSupport::Notifications::Event.new(*args)
+        queue_name = event.payload[:queue_name]
+        DaimonSkycrawlers.configuration.logger.debug("Reset timer: consume message #{queue_name}")
+        timer.reset
+      end
+    end
+  end
+end
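The timer fires `Process.kill(:INT, 0)`, a SIGINT to the whole process group, once `interval` seconds pass without a `consume_message.songkick_queue` notification; every consumed message resets it, so an idle worker shuts itself down. `Processor.run` arms it from `config.shutdown_interval`, and it can also be armed directly:

```ruby
require "daimon_skycrawlers/timer"

# Stop this worker after 60 seconds with no messages consumed.
DaimonSkycrawlers::Timer.setup_shutdown_timer("daimon-skycrawlers", interval: 60)
```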
data/lib/daimon_skycrawlers.rb
CHANGED

@@ -1,15 +1,35 @@
-require
+require "bundler/setup"
 
-require
+require "daimon_skycrawlers/version"
+require "daimon_skycrawlers/logger"
 
 module DaimonSkycrawlers
+  Configuration = Struct.new(
+    :logger,
+    :queue_name_prefix,
+    :crawler_interval,
+    :shutdown_interval
+  )
   class << self
     def register_processor(processor = nil, &block)
-
+      DaimonSkycrawlers::Consumer::HTTPResponse.register(processor, &block)
     end
 
     def register_crawler(crawler)
-
+      DaimonSkycrawlers::Consumer::URL.register(crawler)
+    end
+
+    def configuration
+      @configuration ||= DaimonSkycrawlers::Configuration.new.tap do |config|
+        config.logger = DaimonSkycrawlers::Logger.default
+        config.queue_name_prefix = "daimon-skycrawlers"
+        config.crawler_interval = 1
+        config.shutdown_interval = 10
+      end
+    end
+
+    def configure
+      yield configuration
     end
   end
 end
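Configuration is now a memoized `Struct`, and `register_processor`/`register_crawler` delegate to the new consumer classes (`register_processor` also forwards a block to `Consumer::HTTPResponse.register`, presumably as a call-able processor body). A short sketch of the configure entry point:

```ruby
require "daimon_skycrawlers"

DaimonSkycrawlers.configure do |config|
  config.queue_name_prefix = "my-crawlers" # also changes queue and process names
  config.shutdown_interval = 60            # idle seconds before workers stop
end

# Block form, forwarded to Consumer::HTTPResponse.register.
DaimonSkycrawlers.register_processor do |message|
  puts "processed #{message[:url]}"
end
```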
data/sample/spider/README.md
ADDED

@@ -0,0 +1,50 @@
+# spider
+
+TODO: Write description.
+
+## Requirements
+
+- Ruby
+- RabbitMQ
+- RDB
+  - PostgreSQL (default)
+  - MySQL
+  - SQLite3
+
+## Usage
+
+1. Install dependencies
+
+```
+$ bundle install
+```
+
+2. Create database
+
+```
+$ bundle exec rake db:create
+$ bundle exec rake db:migrate
+```
+
+3. Open new terminal and run crawler/processor
+
+```
+$ bundle exec ruby crawler.rb # on new terminal
+$ bundle exec ruby processor.rb # on new terminal
+```
+
+4. Enqueue task
+
+```
+$ bundle exec ruby enqueue.rb http://example.com/
+```
+
+5. You'll see `It works with 'http://example.com'` on your terminal which runs your processor!
+
+6. You can re-enqueue task for processor
+
+```
+$ bundle exec ruby enqueue.rb response http://example.com/
+```
+
+Display `It works with 'http://example.com'` again on your terminal which runs your processor.
data/sample/spider/Rakefile
ADDED

@@ -0,0 +1 @@
+require "daimon_skycrawlers/tasks"
data/sample/spider/config/database.yml
ADDED

@@ -0,0 +1,26 @@
+# PostgreSQL. Versions 8.2 and up are supported.
+#
+default: &default
+  adapter: postgresql
+  encoding: unicode
+  pool: 5
+
+development:
+  <<: *default
+  database: spider_development
+  #username: spider
+  #password:
+  #host: localhost
+  #port: 5432
+  #schema_search_path: myapp,sharedapp,public
+  #min_messages: notice
+
+test:
+  <<: *default
+  database: spider_test
+
+production:
+  <<: *default
+  database: spider_production
+  username: spider
+  password: <%= ENV['SPIDER_PASSWORD'] %>
data/sample/spider/crawler.rb
ADDED

@@ -0,0 +1,14 @@
+#!/usr/bin/env ruby
+
+require "daimon_skycrawlers/crawler"
+require "daimon_skycrawlers/crawler/default"
+
+require_relative "./init"
+
+base_url = "http://www.clear-code.com/blog/"
+
+crawler = DaimonSkycrawlers::Crawler::Default.new(base_url)
+
+DaimonSkycrawlers.register_crawler(crawler)
+
+DaimonSkycrawlers::Crawler.run
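Paired with the `Spider` processor, this sample walks clear-code.com breadth-first: each processed page republishes its links with `depth - 1` until the depth runs out. Seeding it from Ruby could look like this (the CLI equivalent is `bundle exec ruby enqueue.rb url http://www.clear-code.com/blog/ depth:3`; `Spider#call` reads the `depth` key):

```ruby
require "daimon_skycrawlers/crawler"

# Crawl the seed URL and follow links two more hops deep.
DaimonSkycrawlers::Crawler.enqueue_url("http://www.clear-code.com/blog/", depth: 3)
```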
data/sample/spider/db/migrate/20160830155803_create_pages.rb
ADDED

@@ -0,0 +1,13 @@
+class CreatePages < ActiveRecord::Migration
+  def change
+    create_table :pages do |t|
+      t.string :url
+      t.text :headers
+      t.binary :body, limit: 10 * 1024 ** 2 # 10MiB
+      t.datetime :last_modified_at
+      t.string :etag
+
+      t.timestamps null: false
+    end
+  end
+end
data/sample/spider/db/schema.rb
ADDED

@@ -0,0 +1,28 @@
+# This file is auto-generated from the current state of the database. Instead
+# of editing this file, please use the migrations feature of Active Record to
+# incrementally modify your database, and then regenerate this schema definition.
+#
+# Note that this schema.rb definition is the authoritative source for your
+# database schema. If you need to create the application database on another
+# system, you should be using db:schema:load, not running all the migrations
+# from scratch. The latter is a flawed and unsustainable approach (the more migrations
+# you'll amass, the slower it'll run and the greater likelihood for issues).
+#
+# It's strongly recommended that you check this file into your version control system.
+
+ActiveRecord::Schema.define(version: 20160830155803) do
+
+  # These are extensions that must be enabled in order to support this database
+  enable_extension "plpgsql"
+
+  create_table "pages", force: :cascade do |t|
+    t.string   "url"
+    t.text     "headers"
+    t.binary   "body"
+    t.datetime "last_modified_at"
+    t.string   "etag"
+    t.datetime "created_at",       null: false
+    t.datetime "updated_at",       null: false
+  end
+
+end
data/sample/spider/enqueue.rb
ADDED

@@ -0,0 +1,24 @@
+#!/usr/bin/env ruby
+
+require "thor"
+
+require "daimon_skycrawlers/crawler"
+require "daimon_skycrawlers/processor"
+
+require_relative "./init"
+
+class Enqueue < Thor
+  desc "url URL [key1:value1 key2:value2...]", "Enqueue URL for URL consumer"
+  def url(url, *rest)
+    message = rest.map {|arg| arg.split(":") }.to_h
+    DaimonSkycrawlers::Crawler.enqueue_url(url, message)
+  end
+
+  desc "response URL [key1:value1 key2:value2...]", "Enqueue URL for HTTP response consumer"
+  def response(url, *rest)
+    message = rest.map {|arg| arg.split(":") }.to_h
+    DaimonSkycrawlers::Processor.enqueue_http_response(url, message)
+  end
+end
+
+Enqueue.start(ARGV)
data/sample/spider/init.rb
ADDED

@@ -0,0 +1,22 @@
+require "daimon_skycrawlers"
+require "daimon_skycrawlers/logger"
+require "daimon_skycrawlers/queue"
+
+DaimonSkycrawlers.configure do |config|
+  config.logger = DaimonSkycrawlers::Logger.default
+  config.logger.level = :debug
+  config.crawler_interval = 1
+  config.shutdown_interval = 30
+end
+
+DaimonSkycrawlers::Queue.configure do |config|
+  # queue configuration
+  config.logger = DaimonSkycrawlers.configuration.logger
+  config.host = "127.0.0.1"
+  config.port = 5672
+  # config.username = 'guest'
+  # config.password = 'guest'
+  config.vhost = "/"
+  config.max_reconnect_attempts = 10
+  config.network_recovery_interval = 1.0
+end