daimon_skycrawlers 0.2.0 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +30 -22
- data/bin/daimon-skycrawlers +1 -1
- data/daimon_skycrawlers.gemspec +3 -1
- data/lib/daimon_skycrawlers/consumer/http_response.rb +28 -0
- data/lib/daimon_skycrawlers/consumer/url.rb +20 -0
- data/lib/daimon_skycrawlers/crawler/base.rb +20 -0
- data/lib/daimon_skycrawlers/crawler/default.rb +5 -0
- data/lib/daimon_skycrawlers/crawler.rb +26 -0
- data/lib/daimon_skycrawlers/filter/duplicate_checker.rb +8 -0
- data/lib/daimon_skycrawlers/filter/update_checker.rb +8 -0
- data/lib/daimon_skycrawlers/generator/new.rb +34 -11
- data/lib/daimon_skycrawlers/generator/templates/new/README.md.erb +4 -4
- data/lib/daimon_skycrawlers/generator/templates/new/{crawler.rb → app/crawlers/sample_crawler.rb} +0 -6
- data/lib/daimon_skycrawlers/generator/templates/new/{processor.rb → app/processors/sample_processor.rb} +0 -6
- data/lib/daimon_skycrawlers/generator/templates/new/bin/crawler +10 -0
- data/{sample/spider/enqueue.rb → lib/daimon_skycrawlers/generator/templates/new/bin/enqueue} +1 -2
- data/lib/daimon_skycrawlers/generator/templates/new/bin/processor +10 -0
- data/lib/daimon_skycrawlers/generator/templates/new/{init.rb → config/init.rb} +1 -0
- data/lib/daimon_skycrawlers/storage/file.rb +46 -0
- data/lib/daimon_skycrawlers/storage/null.rb +12 -0
- data/lib/daimon_skycrawlers/storage/rdb.rb +16 -0
- data/lib/daimon_skycrawlers/storage.rb +1 -0
- data/lib/daimon_skycrawlers/version.rb +1 -1
- data/lib/daimon_skycrawlers.rb +31 -0
- data/sample/spider/README.md +4 -4
- data/sample/spider/{crawler.rb → app/crawlers/blog_crawler.rb} +0 -7
- data/sample/spider/{processor.rb → app/processors/blog_spider.rb} +0 -7
- data/sample/spider/bin/crawler +10 -0
- data/{lib/daimon_skycrawlers/generator/templates/new/enqueue.rb → sample/spider/bin/enqueue} +1 -2
- data/sample/spider/bin/processor +11 -0
- data/sample/spider/{init.rb → config/init.rb} +0 -0
- metadata +51 -19
- data/lib/daimon_skycrawlers/generator/templates/new/db/migrate/create_pages.rb +0 -13
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 411d6b35b2c909712f9f79cd5e45440226ebba28
|
4
|
+
data.tar.gz: afa7bdc3cc98a28742c64ba46d7237fcce981b92
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f8844e9ebd88e7be7344a607866c15ea74be64a8a405f05f196d54f0125bf66be6a81c4ff27ea3ee4480a82989abbedd79cd934c122c7ac1bee5d83bbfa1b6d3
|
7
|
+
data.tar.gz: 37fe8376a4165aef7f29aa29af51bb637402c48d9d5327c44ea05c37dac2418c39b1f5e55303ec8c66e9cfe8eb3e77f0dc264648caa262abc551c72f06944b55
|
data/README.md
CHANGED
@@ -33,44 +33,52 @@ Or install it yourself as:
|
|
33
33
|
|
34
34
|
1. Create project
|
35
35
|
|
36
|
-
```
|
37
|
-
$ bundle exec daimon-skycrawlers new mycrawlers
|
38
|
-
$ cd mycrawlers
|
39
|
-
```
|
36
|
+
```
|
37
|
+
$ bundle exec daimon-skycrawlers new mycrawlers
|
38
|
+
$ cd mycrawlers
|
39
|
+
```
|
40
|
+
or
|
41
|
+
```
|
42
|
+
$ daimon-skycrawlers new mycrawlers
|
43
|
+
$ cd mycrawlers
|
44
|
+
```
|
40
45
|
|
41
46
|
2. Install dependencies
|
42
47
|
|
43
|
-
```
|
44
|
-
$ bundle install
|
45
|
-
```
|
48
|
+
```
|
49
|
+
$ bundle install
|
50
|
+
```
|
46
51
|
|
47
52
|
3. Create database
|
48
53
|
|
49
|
-
```
|
50
|
-
$ bundle exec rake db:create
|
51
|
-
$ bundle exec rake db:migrate
|
52
|
-
```
|
54
|
+
```
|
55
|
+
$ bundle exec rake db:create
|
56
|
+
$ bundle exec rake db:migrate
|
57
|
+
```
|
53
58
|
|
54
59
|
4. Open new terminal and run crawler/processor
|
55
60
|
|
56
|
-
```
|
57
|
-
$
|
58
|
-
$
|
59
|
-
```
|
61
|
+
```
|
62
|
+
$ bin/crawler # on new terminal
|
63
|
+
$ bin/processor # on new terminal
|
64
|
+
```
|
65
|
+
|
66
|
+
NOTE: Execute step 5 as soon as possible, because bin/crawler and
|
67
|
+
bin/processor will stop after 10 seconds by default if their
|
68
|
+
queues are empty.
|
60
69
|
|
61
70
|
5. Enqueue task
|
62
71
|
|
63
|
-
```
|
64
|
-
$
|
65
|
-
```
|
72
|
+
```
|
73
|
+
$ bin/enqueue url http://example.com/
|
74
|
+
```
|
66
75
|
|
67
76
|
6. You'll see `It works with 'http://example.com'` on your terminal which runs your processor!
|
68
|
-
|
69
77
|
7. You can re-enqueue task for processor
|
70
78
|
|
71
|
-
```
|
72
|
-
$
|
73
|
-
```
|
79
|
+
```
|
80
|
+
$ bin/enqueue response http://example.com/
|
81
|
+
```
|
74
82
|
|
75
83
|
Display `It works with 'http://example.com'` again on your terminal which runs your processor.
|
76
84
|
|
data/bin/daimon-skycrawlers
CHANGED
data/daimon_skycrawlers.gemspec
CHANGED
@@ -18,16 +18,17 @@ Gem::Specification.new do |spec|
|
|
18
18
|
spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
|
19
19
|
spec.require_paths = ["lib"]
|
20
20
|
|
21
|
+
spec.add_dependency "bundler", "~> 1.11"
|
21
22
|
spec.add_dependency "thor"
|
22
23
|
spec.add_dependency "songkick_queue"
|
23
24
|
spec.add_dependency "faraday"
|
24
25
|
spec.add_dependency "faraday_middleware"
|
25
26
|
spec.add_dependency "nokogiri"
|
26
27
|
spec.add_dependency "activerecord"
|
28
|
+
spec.add_dependency "railties"
|
27
29
|
spec.add_dependency "pg"
|
28
30
|
spec.add_dependency "timers"
|
29
31
|
|
30
|
-
spec.add_development_dependency "bundler", "~> 1.11"
|
31
32
|
spec.add_development_dependency "rake", "~> 10.0"
|
32
33
|
spec.add_development_dependency "test-unit"
|
33
34
|
spec.add_development_dependency "test-unit-rr"
|
@@ -35,4 +36,5 @@ Gem::Specification.new do |spec|
|
|
35
36
|
spec.add_development_dependency "pry"
|
36
37
|
spec.add_development_dependency "tapp"
|
37
38
|
spec.add_development_dependency "sqlite3"
|
39
|
+
spec.add_development_dependency "yard"
|
38
40
|
end
|
@@ -5,10 +5,26 @@ require "daimon_skycrawlers/processor/default"
|
|
5
5
|
|
6
6
|
module DaimonSkycrawlers
|
7
7
|
module Consumer
|
8
|
+
#
|
9
|
+
# HTTP response consumer class
|
10
|
+
#
|
8
11
|
class HTTPResponse < Base
|
9
12
|
include SongkickQueue::Consumer
|
10
13
|
|
11
14
|
class << self
|
15
|
+
#
|
16
|
+
# Register a processor
|
17
|
+
#
|
18
|
+
# @overload register(processor)
|
19
|
+
# @param [Processor] processor instance which implements `call` method
|
20
|
+
# @return [void]
|
21
|
+
#
|
22
|
+
# @overload register
|
23
|
+
# @return [void]
|
24
|
+
# @yield [message] register given block as a processor
|
25
|
+
# @yieldparam message [Hash] A message from queue
|
26
|
+
# @yieldreturn [void]
|
27
|
+
#
|
12
28
|
def register(processor = nil, &block)
|
13
29
|
if block_given?
|
14
30
|
processors << block
|
@@ -17,14 +33,23 @@ module DaimonSkycrawlers
|
|
17
33
|
end
|
18
34
|
end
|
19
35
|
|
36
|
+
#
|
37
|
+
# @private
|
38
|
+
#
|
20
39
|
def processors
|
21
40
|
@processors ||= []
|
22
41
|
end
|
23
42
|
|
43
|
+
#
|
44
|
+
# @private
|
45
|
+
#
|
24
46
|
def default_processor
|
25
47
|
DaimonSkycrawlers::Processor::Default.new
|
26
48
|
end
|
27
49
|
|
50
|
+
#
|
51
|
+
# @private
|
52
|
+
#
|
28
53
|
def queue_name
|
29
54
|
"#{DaimonSkycrawlers.configuration.queue_name_prefix}.http-response"
|
30
55
|
end
|
@@ -32,6 +57,9 @@ module DaimonSkycrawlers
|
|
32
57
|
|
33
58
|
consume_from_queue queue_name
|
34
59
|
|
60
|
+
#
|
61
|
+
# @private
|
62
|
+
#
|
35
63
|
def process(message)
|
36
64
|
if self.class.processors.empty?
|
37
65
|
processors = [self.class.default_processor]
|
@@ -4,18 +4,35 @@ require "daimon_skycrawlers/consumer/base"
|
|
4
4
|
|
5
5
|
module DaimonSkycrawlers
|
6
6
|
module Consumer
|
7
|
+
#
|
8
|
+
# URL consumer class
|
9
|
+
#
|
7
10
|
class URL < Base
|
8
11
|
include SongkickQueue::Consumer
|
9
12
|
|
10
13
|
class << self
|
14
|
+
#
|
15
|
+
# Register a given crawler
|
16
|
+
#
|
17
|
+
# @param [Crawler] crawler instance which implements `fetch` method
|
18
|
+
# @return [void]
|
19
|
+
#
|
11
20
|
def register(crawler)
|
12
21
|
crawlers << crawler
|
13
22
|
end
|
14
23
|
|
24
|
+
#
|
25
|
+
# Returns registered crawlers
|
26
|
+
#
|
27
|
+
# @return [Array<Crawler>]
|
28
|
+
#
|
15
29
|
def crawlers
|
16
30
|
@crawlers ||= []
|
17
31
|
end
|
18
32
|
|
33
|
+
#
|
34
|
+
# @private
|
35
|
+
#
|
19
36
|
def queue_name
|
20
37
|
"#{DaimonSkycrawlers.configuration.queue_name_prefix}.url"
|
21
38
|
end
|
@@ -23,6 +40,9 @@ module DaimonSkycrawlers
|
|
23
40
|
|
24
41
|
consume_from_queue queue_name
|
25
42
|
|
43
|
+
#
|
44
|
+
# @private
|
45
|
+
#
|
26
46
|
def process(message)
|
27
47
|
url = message[:url]
|
28
48
|
depth = Integer(message[:depth] || 0)
|
@@ -8,12 +8,22 @@ require "daimon_skycrawlers/processor"
|
|
8
8
|
|
9
9
|
module DaimonSkycrawlers
|
10
10
|
module Crawler
|
11
|
+
#
|
12
|
+
# The base class of crawler
|
13
|
+
#
|
11
14
|
class Base
|
12
15
|
include DaimonSkycrawlers::LoggerMixin
|
13
16
|
include DaimonSkycrawlers::ConfigMixin
|
14
17
|
|
18
|
+
# @!attribute [w] storage
|
19
|
+
# Set storage to crawler instance.
|
20
|
+
# @return [void]
|
15
21
|
attr_writer :storage
|
16
22
|
|
23
|
+
#
|
24
|
+
# @param [String] base_url Base URL for crawler
|
25
|
+
# @param [Hash] options for Faraday
|
26
|
+
#
|
17
27
|
def initialize(base_url = nil, options = {})
|
18
28
|
super()
|
19
29
|
@base_url = base_url
|
@@ -23,6 +33,13 @@ module DaimonSkycrawlers
|
|
23
33
|
@n_processed_urls = 0
|
24
34
|
end
|
25
35
|
|
36
|
+
#
|
37
|
+
# Set up connection
|
38
|
+
#
|
39
|
+
# @param [Hash] options for Faraday
|
40
|
+
# @yield [faraday]
|
41
|
+
# @yieldparam faraday [Faraday]
|
42
|
+
#
|
26
43
|
def setup_connection(options = {})
|
27
44
|
@connection = Faraday.new(@base_url, @options.merge(options)) do |faraday|
|
28
45
|
yield faraday
|
@@ -37,6 +54,9 @@ module DaimonSkycrawlers
|
|
37
54
|
@prepare = block
|
38
55
|
end
|
39
56
|
|
57
|
+
#
|
58
|
+
# Retrieve storage instance
|
59
|
+
#
|
40
60
|
def storage
|
41
61
|
@storage ||= Storage::RDB.new
|
42
62
|
end
|
@@ -3,6 +3,11 @@ require "daimon_skycrawlers/filter/update_checker"
|
|
3
3
|
|
4
4
|
module DaimonSkycrawlers
|
5
5
|
module Crawler
|
6
|
+
#
|
7
|
+
# The default crawler
|
8
|
+
#
|
9
|
+
# This crawler can GET given URL and store response to storage
|
10
|
+
#
|
6
11
|
class Default < Base
|
7
12
|
def fetch(path, depth: 3, **kw)
|
8
13
|
@n_processed_urls += 1
|
@@ -6,24 +6,50 @@ require "daimon_skycrawlers/consumer/url"
|
|
6
6
|
module DaimonSkycrawlers
|
7
7
|
module Crawler
|
8
8
|
class << self
|
9
|
+
#
|
10
|
+
# Run registered crawlers
|
11
|
+
#
|
12
|
+
# @param process_name [String] Process name
|
13
|
+
#
|
9
14
|
def run(process_name: default_process_name)
|
10
15
|
DaimonSkycrawlers::Timer.setup_shutdown_timer(config.queue_name_prefix, interval: config.shutdown_interval)
|
11
16
|
SongkickQueue::Worker.new(process_name, [DaimonSkycrawlers::Consumer::URL]).run
|
12
17
|
end
|
13
18
|
|
19
|
+
#
|
20
|
+
# Enqueue a URL to crawler queue
|
21
|
+
#
|
22
|
+
# @param [String] url Absolute URL to enqueue
|
23
|
+
# @param [Hash] message Extra parameters for crawler
|
24
|
+
# @return [void]
|
14
25
|
def enqueue_url(url, message = {})
|
15
26
|
message[:url] = url
|
16
27
|
SongkickQueue.publish(queue_name, message)
|
17
28
|
end
|
18
29
|
|
30
|
+
#
|
31
|
+
# Shortcut of DaimonSkycrawlers.configuration
|
32
|
+
#
|
33
|
+
# @return [DaimonSkycrawlers::Configuration]
|
34
|
+
#
|
19
35
|
def config
|
20
36
|
DaimonSkycrawlers.configuration
|
21
37
|
end
|
22
38
|
|
39
|
+
#
|
40
|
+
# Queue name for crawler
|
41
|
+
#
|
42
|
+
# @return [String] Queue name
|
43
|
+
#
|
23
44
|
def queue_name
|
24
45
|
"#{config.queue_name_prefix}.url"
|
25
46
|
end
|
26
47
|
|
48
|
+
#
|
49
|
+
# Default process name
|
50
|
+
#
|
51
|
+
# @return [String] Default process name
|
52
|
+
#
|
27
53
|
def default_process_name
|
28
54
|
"#{config.queue_name_prefix}:url"
|
29
55
|
end
|
@@ -3,6 +3,9 @@ require "daimon_skycrawlers/filter/base"
|
|
3
3
|
|
4
4
|
module DaimonSkycrawlers
|
5
5
|
module Filter
|
6
|
+
#
|
7
|
+
# This filter provides duplication checker for given URL.
|
8
|
+
#
|
6
9
|
class DuplicateChecker < Base
|
7
10
|
def initialize(base_url: nil)
|
8
11
|
@base_url = nil
|
@@ -10,6 +13,11 @@ module DaimonSkycrawlers
|
|
10
13
|
@urls = Set.new
|
11
14
|
end
|
12
15
|
|
16
|
+
#
|
17
|
+
# @param [String] url to check duplication. If given URL is
|
18
|
+
# relative URL, use `@base_url + url` as absolute URL.
|
19
|
+
# @return [true|false] Return false when duplicated, otherwise return true.
|
20
|
+
#
|
13
21
|
def call(url)
|
14
22
|
unless URI(url).absolute?
|
15
23
|
url = (@base_url + url).to_s
|
@@ -3,6 +3,9 @@ require "daimon_skycrawlers/filter/base"
|
|
3
3
|
|
4
4
|
module DaimonSkycrawlers
|
5
5
|
module Filter
|
6
|
+
#
|
7
|
+
# This filter provides update checker for given URL.
|
8
|
+
#
|
6
9
|
class UpdateChecker < Base
|
7
10
|
def initialize(storage: nil, base_url: nil)
|
8
11
|
super(storage: storage)
|
@@ -10,6 +13,11 @@ module DaimonSkycrawlers
|
|
10
13
|
@base_url = URI(base_url) if base_url
|
11
14
|
end
|
12
15
|
|
16
|
+
#
|
17
|
+
# @param [String] url
|
18
|
+
# @param connection [Faraday]
|
19
|
+
# @return [true|false] Return true when need update, otherwise return false
|
20
|
+
#
|
13
21
|
def call(url, connection: nil)
|
14
22
|
unless URI(url).absolute?
|
15
23
|
url = (@base_url + url).to_s
|
@@ -1,4 +1,8 @@
|
|
1
1
|
require "thor"
|
2
|
+
require "rails/generators"
|
3
|
+
require "rails/generators/actions"
|
4
|
+
require "rails/generators/active_record"
|
5
|
+
require "rails/generators/active_record/migration/migration_generator"
|
2
6
|
|
3
7
|
module DaimonSkycrawlers
|
4
8
|
module Generator
|
@@ -18,26 +22,45 @@ module DaimonSkycrawlers
|
|
18
22
|
].each do |path|
|
19
23
|
template("#{path}.erb", "#{name}/#{path}")
|
20
24
|
end
|
25
|
+
invoke(MigrationGenerator, [
|
26
|
+
"CreatePage",
|
27
|
+
"url:string",
|
28
|
+
"headers:text",
|
29
|
+
"body:binary",
|
30
|
+
"last_modified_at:datetime",
|
31
|
+
"etag:string",
|
32
|
+
"timestamps"
|
33
|
+
],
|
34
|
+
{ destination_root: File.join(destination_root, name) })
|
21
35
|
end
|
22
36
|
|
23
37
|
def copy_files
|
24
38
|
[
|
25
39
|
"Gemfile",
|
26
40
|
"Rakefile",
|
27
|
-
"
|
28
|
-
"
|
29
|
-
"
|
30
|
-
"
|
41
|
+
"app/crawlers/sample_crawler.rb",
|
42
|
+
"app/processors/sample_processor.rb",
|
43
|
+
"bin/crawler",
|
44
|
+
"bin/enqueue",
|
45
|
+
"bin/processor",
|
46
|
+
"config/init.rb",
|
31
47
|
].each do |path|
|
32
|
-
copy_file(path, "#{name}/#{path}")
|
33
|
-
end
|
34
|
-
[
|
35
|
-
"db/migrate/create_pages.rb",
|
36
|
-
].each do |path|
|
37
|
-
migration = "#{Time.now.strftime("%Y%m%d%H%M%S")}_#{File.basename(path)}"
|
38
|
-
copy_file(path, "#{name}/db/migrate/#{migration}")
|
48
|
+
copy_file(path, "#{name}/#{path}", mode: :preserve)
|
39
49
|
end
|
40
50
|
end
|
41
51
|
end
|
52
|
+
|
53
|
+
class MigrationGenerator < ActiveRecord::Generators::MigrationGenerator
|
54
|
+
def self.source_root
|
55
|
+
ActiveRecord::Generators::MigrationGenerator.source_root
|
56
|
+
end
|
57
|
+
|
58
|
+
def create_migration_file
|
59
|
+
set_local_assigns!
|
60
|
+
validate_file_name!
|
61
|
+
dest = options[:destination_root]
|
62
|
+
migration_template @migration_template, "#{dest}/db/migrate/#{file_name}.rb"
|
63
|
+
end
|
64
|
+
end
|
42
65
|
end
|
43
66
|
end
|
@@ -29,14 +29,14 @@ $ bundle exec rake db:migrate
|
|
29
29
|
3. Open new terminal and run crawler/processor
|
30
30
|
|
31
31
|
```
|
32
|
-
$
|
33
|
-
$
|
32
|
+
$ bin/crawler # on new terminal
|
33
|
+
$ bin/processor # on new terminal
|
34
34
|
```
|
35
35
|
|
36
36
|
4. Enqueue task
|
37
37
|
|
38
38
|
```
|
39
|
-
$
|
39
|
+
$ bin/enqueue url http://example.com/
|
40
40
|
```
|
41
41
|
|
42
42
|
5. You'll see `It works with 'http://example.com'` on your terminal which runs your processor!
|
@@ -44,7 +44,7 @@ $ bundle exec ruby enqueue.rb http://example.com/
|
|
44
44
|
6. You can re-enqueue task for processor
|
45
45
|
|
46
46
|
```
|
47
|
-
$
|
47
|
+
$ bin/enqueue response http://example.com/
|
48
48
|
```
|
49
49
|
|
50
50
|
Display `It works with 'http://example.com'` again on your terminal which runs your processor.
|
data/lib/daimon_skycrawlers/generator/templates/new/{crawler.rb → app/crawlers/sample_crawler.rb}
RENAMED
@@ -1,14 +1,8 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
|
3
1
|
require "daimon_skycrawlers/crawler"
|
4
2
|
require "daimon_skycrawlers/crawler/default"
|
5
3
|
|
6
|
-
require_relative "./init"
|
7
|
-
|
8
4
|
base_url = "http://example.com"
|
9
5
|
|
10
6
|
crawler = DaimonSkycrawlers::Crawler::Default.new(base_url)
|
11
7
|
|
12
8
|
DaimonSkycrawlers.register_crawler(crawler)
|
13
|
-
|
14
|
-
DaimonSkycrawlers::Crawler.run
|
data/{sample/spider/enqueue.rb → lib/daimon_skycrawlers/generator/templates/new/bin/enqueue}
RENAMED
@@ -2,11 +2,10 @@
|
|
2
2
|
|
3
3
|
require "thor"
|
4
4
|
|
5
|
+
require_relative "../config/init"
|
5
6
|
require "daimon_skycrawlers/crawler"
|
6
7
|
require "daimon_skycrawlers/processor"
|
7
8
|
|
8
|
-
require_relative "./init"
|
9
|
-
|
10
9
|
class Enqueue < Thor
|
11
10
|
desc "url URL [key1:value1 key2:value2...]", "Enqueue URL for URL consumer"
|
12
11
|
def url(url, *rest)
|
@@ -0,0 +1,46 @@
|
|
1
|
+
require "daimon_skycrawlers/storage/base"
|
2
|
+
|
3
|
+
module DaimonSkycrawlers
|
4
|
+
module Storage
|
5
|
+
#
|
6
|
+
# Storage for files
|
7
|
+
#
|
8
|
+
class File < Base
|
9
|
+
def initialize(base_dir)
|
10
|
+
super()
|
11
|
+
@base_dir = Pathname(base_dir)
|
12
|
+
end
|
13
|
+
|
14
|
+
def save(url, headers, body)
|
15
|
+
@base_dir.mkpath
|
16
|
+
body_path(url).dirname.mkpath
|
17
|
+
body_path(url).open("wb+") do |file|
|
18
|
+
file.write(body)
|
19
|
+
end
|
20
|
+
headers_path(url).open("wb+") do |file|
|
21
|
+
file.write(JSON.generate(headers))
|
22
|
+
end
|
23
|
+
end
|
24
|
+
|
25
|
+
def read(url)
|
26
|
+
headers = JSON.parse(headers_path(url).read)
|
27
|
+
body = body_path(url).read
|
28
|
+
Page.new(url, headers, body, headers["last-modified"], headers["etag"])
|
29
|
+
end
|
30
|
+
|
31
|
+
Page = Struct.new(:url, :headers, :body, :last_modified, :etag)
|
32
|
+
|
33
|
+
private
|
34
|
+
|
35
|
+
def body_path(url)
|
36
|
+
url = URI(url)
|
37
|
+
@base_dir + ".#{url.path}"
|
38
|
+
end
|
39
|
+
|
40
|
+
def headers_path(url)
|
41
|
+
url = URI(url)
|
42
|
+
Pathname("#{body_path(url)}-headers.json")
|
43
|
+
end
|
44
|
+
end
|
45
|
+
end
|
46
|
+
end
|
@@ -1,9 +1,21 @@
|
|
1
1
|
module DaimonSkycrawlers
|
2
2
|
module Storage
|
3
|
+
#
|
4
|
+
# The null storage.
|
5
|
+
#
|
6
|
+
# This storage is useful for test.
|
7
|
+
#
|
3
8
|
class Null < Base
|
9
|
+
|
10
|
+
#
|
11
|
+
# Save nothing
|
12
|
+
#
|
4
13
|
def save(url, headers, body)
|
5
14
|
end
|
6
15
|
|
16
|
+
#
|
17
|
+
# Find nothing
|
18
|
+
#
|
7
19
|
def find(url)
|
8
20
|
end
|
9
21
|
end
|
@@ -3,13 +3,24 @@ require "active_record"
|
|
3
3
|
|
4
4
|
module DaimonSkycrawlers
|
5
5
|
module Storage
|
6
|
+
#
|
7
|
+
# Storage for RDBMS
|
8
|
+
#
|
6
9
|
class RDB < Base
|
7
10
|
def initialize(config_path = "config/database.yml")
|
11
|
+
super()
|
8
12
|
config = YAML.load_file(config_path)
|
9
13
|
environment = ENV["SKYCRAWLERS_ENV"] || "development"
|
10
14
|
ActiveRecord::Base.establish_connection(config[environment])
|
11
15
|
end
|
12
16
|
|
17
|
+
#
|
18
|
+
# Save
|
19
|
+
#
|
20
|
+
# @param [String] url identity of the page
|
21
|
+
# @param [Hash] headers headers of URL
|
22
|
+
# @param [String] body
|
23
|
+
#
|
13
24
|
def save(url, headers, body)
|
14
25
|
Page.create(url: url,
|
15
26
|
headers: JSON.generate(headers),
|
@@ -18,6 +29,11 @@ module DaimonSkycrawlers
|
|
18
29
|
etag: headers["etag"])
|
19
30
|
end
|
20
31
|
|
32
|
+
#
|
33
|
+
# Fetch page identified by url
|
34
|
+
#
|
35
|
+
# @param [String] url identity of the page
|
36
|
+
#
|
21
37
|
def find(url)
|
22
38
|
Page.where(url: url).order(last_modified_at: :desc).limit(1).first
|
23
39
|
end
|
data/lib/daimon_skycrawlers.rb
CHANGED
@@ -11,14 +11,38 @@ module DaimonSkycrawlers
|
|
11
11
|
:shutdown_interval
|
12
12
|
)
|
13
13
|
class << self
|
14
|
+
#
|
15
|
+
# Register a processor
|
16
|
+
#
|
17
|
+
# @overload register_processor(processor)
|
18
|
+
# @param [Processor] processor instance which implements `call` method
|
19
|
+
# @return [void]
|
20
|
+
#
|
21
|
+
# @overload register_processor
|
22
|
+
# @return [void]
|
23
|
+
# @yield [message] Register given block as a processor.
|
24
|
+
# @yieldparam message [Hash] A message from queue
|
25
|
+
# @yieldreturn [void]
|
26
|
+
#
|
14
27
|
def register_processor(processor = nil, &block)
|
15
28
|
DaimonSkycrawlers::Consumer::HTTPResponse.register(processor, &block)
|
16
29
|
end
|
17
30
|
|
31
|
+
#
|
32
|
+
# Register a crawler
|
33
|
+
#
|
34
|
+
# @param [Crawler] crawler instance which implements `fetch` method
|
35
|
+
# @return [void]
|
36
|
+
#
|
18
37
|
def register_crawler(crawler)
|
19
38
|
DaimonSkycrawlers::Consumer::URL.register(crawler)
|
20
39
|
end
|
21
40
|
|
41
|
+
#
|
42
|
+
# Retrieve configuration object
|
43
|
+
#
|
44
|
+
# @return [DaimonSkycrawlers::Configuration]
|
45
|
+
#
|
22
46
|
def configuration
|
23
47
|
@configuration ||= DaimonSkycrawlers::Configuration.new.tap do |config|
|
24
48
|
config.logger = DaimonSkycrawlers::Logger.default
|
@@ -28,6 +52,13 @@ module DaimonSkycrawlers
|
|
28
52
|
end
|
29
53
|
end
|
30
54
|
|
55
|
+
#
|
56
|
+
# Configure DaimonSkycrawlers
|
57
|
+
#
|
58
|
+
# @return [void]
|
59
|
+
# @yield [configuration] configure DaimonSkycrawlers
|
60
|
+
# @yieldparam configuration [DaimonSkycrawlers::Configuration] configuration object
|
61
|
+
# @yieldreturn [void]
|
31
62
|
def configure
|
32
63
|
yield configuration
|
33
64
|
end
|
data/sample/spider/README.md
CHANGED
@@ -29,14 +29,14 @@ $ bundle exec rake db:migrate
|
|
29
29
|
3. Open new terminal and run crawler/processor
|
30
30
|
|
31
31
|
```
|
32
|
-
$
|
33
|
-
$
|
32
|
+
$ bin/crawler # on new terminal
|
33
|
+
$ bin/processor # on new terminal
|
34
34
|
```
|
35
35
|
|
36
36
|
4. Enqueue task
|
37
37
|
|
38
38
|
```
|
39
|
-
$
|
39
|
+
$ bin/enqueue url http://example.com/
|
40
40
|
```
|
41
41
|
|
42
42
|
5. You'll see `It works with 'http://example.com'` on your terminal which runs your processor!
|
@@ -44,7 +44,7 @@ $ bundle exec ruby enqueue.rb http://example.com/
|
|
44
44
|
6. You can re-enqueue task for processor
|
45
45
|
|
46
46
|
```
|
47
|
-
$
|
47
|
+
$ bin/enqueue response http://example.com/
|
48
48
|
```
|
49
49
|
|
50
50
|
Display `It works with 'http://example.com'` again on your terminal which runs your processor.
|
@@ -1,14 +1,7 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
|
3
|
-
require "daimon_skycrawlers/crawler"
|
4
1
|
require "daimon_skycrawlers/crawler/default"
|
5
2
|
|
6
|
-
require_relative "./init"
|
7
|
-
|
8
3
|
base_url = "http://www.clear-code.com/blog/"
|
9
4
|
|
10
5
|
crawler = DaimonSkycrawlers::Crawler::Default.new(base_url)
|
11
6
|
|
12
7
|
DaimonSkycrawlers.register_crawler(crawler)
|
13
|
-
|
14
|
-
DaimonSkycrawlers::Crawler.run
|
@@ -1,13 +1,8 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
|
3
|
-
require "daimon_skycrawlers/processor"
|
4
1
|
require "daimon_skycrawlers/processor/spider"
|
5
2
|
require "daimon_skycrawlers/filter"
|
6
3
|
require "daimon_skycrawlers/filter/duplicate_checker"
|
7
4
|
require "daimon_skycrawlers/filter/update_checker"
|
8
5
|
|
9
|
-
require_relative "./init"
|
10
|
-
|
11
6
|
default_processor = DaimonSkycrawlers::Processor::Default.new
|
12
7
|
spider = DaimonSkycrawlers::Processor::Spider.new
|
13
8
|
#spider.enqueue = false
|
@@ -30,5 +25,3 @@ spider.append_filter(update_checker)
|
|
30
25
|
|
31
26
|
DaimonSkycrawlers.register_processor(default_processor)
|
32
27
|
DaimonSkycrawlers.register_processor(spider)
|
33
|
-
|
34
|
-
DaimonSkycrawlers::Processor.run
|
data/{lib/daimon_skycrawlers/generator/templates/new/enqueue.rb → sample/spider/bin/enqueue}
RENAMED
@@ -2,11 +2,10 @@
|
|
2
2
|
|
3
3
|
require "thor"
|
4
4
|
|
5
|
+
require_relative "../config/init"
|
5
6
|
require "daimon_skycrawlers/crawler"
|
6
7
|
require "daimon_skycrawlers/processor"
|
7
8
|
|
8
|
-
require_relative "./init"
|
9
|
-
|
10
9
|
class Enqueue < Thor
|
11
10
|
desc "url URL [key1:value1 key2:value2...]", "Enqueue URL for URL consumer"
|
12
11
|
def url(url, *rest)
|
File without changes
|
metadata
CHANGED
@@ -1,15 +1,29 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: daimon_skycrawlers
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ryunosuke SATO
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-09-
|
11
|
+
date: 2016-09-29 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: bundler
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '1.11'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '1.11'
|
13
27
|
- !ruby/object:Gem::Dependency
|
14
28
|
name: thor
|
15
29
|
requirement: !ruby/object:Gem::Requirement
|
@@ -95,7 +109,7 @@ dependencies:
|
|
95
109
|
- !ruby/object:Gem::Version
|
96
110
|
version: '0'
|
97
111
|
- !ruby/object:Gem::Dependency
|
98
|
-
name:
|
112
|
+
name: railties
|
99
113
|
requirement: !ruby/object:Gem::Requirement
|
100
114
|
requirements:
|
101
115
|
- - ">="
|
@@ -109,7 +123,7 @@ dependencies:
|
|
109
123
|
- !ruby/object:Gem::Version
|
110
124
|
version: '0'
|
111
125
|
- !ruby/object:Gem::Dependency
|
112
|
-
name:
|
126
|
+
name: pg
|
113
127
|
requirement: !ruby/object:Gem::Requirement
|
114
128
|
requirements:
|
115
129
|
- - ">="
|
@@ -123,19 +137,19 @@ dependencies:
|
|
123
137
|
- !ruby/object:Gem::Version
|
124
138
|
version: '0'
|
125
139
|
- !ruby/object:Gem::Dependency
|
126
|
-
name:
|
140
|
+
name: timers
|
127
141
|
requirement: !ruby/object:Gem::Requirement
|
128
142
|
requirements:
|
129
|
-
- - "
|
143
|
+
- - ">="
|
130
144
|
- !ruby/object:Gem::Version
|
131
|
-
version: '
|
132
|
-
type: :
|
145
|
+
version: '0'
|
146
|
+
type: :runtime
|
133
147
|
prerelease: false
|
134
148
|
version_requirements: !ruby/object:Gem::Requirement
|
135
149
|
requirements:
|
136
|
-
- - "
|
150
|
+
- - ">="
|
137
151
|
- !ruby/object:Gem::Version
|
138
|
-
version: '
|
152
|
+
version: '0'
|
139
153
|
- !ruby/object:Gem::Dependency
|
140
154
|
name: rake
|
141
155
|
requirement: !ruby/object:Gem::Requirement
|
@@ -234,6 +248,20 @@ dependencies:
|
|
234
248
|
- - ">="
|
235
249
|
- !ruby/object:Gem::Version
|
236
250
|
version: '0'
|
251
|
+
- !ruby/object:Gem::Dependency
|
252
|
+
name: yard
|
253
|
+
requirement: !ruby/object:Gem::Requirement
|
254
|
+
requirements:
|
255
|
+
- - ">="
|
256
|
+
- !ruby/object:Gem::Version
|
257
|
+
version: '0'
|
258
|
+
type: :development
|
259
|
+
prerelease: false
|
260
|
+
version_requirements: !ruby/object:Gem::Requirement
|
261
|
+
requirements:
|
262
|
+
- - ">="
|
263
|
+
- !ruby/object:Gem::Version
|
264
|
+
version: '0'
|
237
265
|
description: This is a crawler framework.
|
238
266
|
email:
|
239
267
|
- tricknotes.rs@gmail.com
|
@@ -269,12 +297,13 @@ files:
|
|
269
297
|
- lib/daimon_skycrawlers/generator/templates/new/Gemfile
|
270
298
|
- lib/daimon_skycrawlers/generator/templates/new/README.md.erb
|
271
299
|
- lib/daimon_skycrawlers/generator/templates/new/Rakefile
|
300
|
+
- lib/daimon_skycrawlers/generator/templates/new/app/crawlers/sample_crawler.rb
|
301
|
+
- lib/daimon_skycrawlers/generator/templates/new/app/processors/sample_processor.rb
|
302
|
+
- lib/daimon_skycrawlers/generator/templates/new/bin/crawler
|
303
|
+
- lib/daimon_skycrawlers/generator/templates/new/bin/enqueue
|
304
|
+
- lib/daimon_skycrawlers/generator/templates/new/bin/processor
|
272
305
|
- lib/daimon_skycrawlers/generator/templates/new/config/database.yml.erb
|
273
|
-
- lib/daimon_skycrawlers/generator/templates/new/
|
274
|
-
- lib/daimon_skycrawlers/generator/templates/new/db/migrate/create_pages.rb
|
275
|
-
- lib/daimon_skycrawlers/generator/templates/new/enqueue.rb
|
276
|
-
- lib/daimon_skycrawlers/generator/templates/new/init.rb
|
277
|
-
- lib/daimon_skycrawlers/generator/templates/new/processor.rb
|
306
|
+
- lib/daimon_skycrawlers/generator/templates/new/config/init.rb
|
278
307
|
- lib/daimon_skycrawlers/logger.rb
|
279
308
|
- lib/daimon_skycrawlers/processor.rb
|
280
309
|
- lib/daimon_skycrawlers/processor/base.rb
|
@@ -283,6 +312,7 @@ files:
|
|
283
312
|
- lib/daimon_skycrawlers/queue.rb
|
284
313
|
- lib/daimon_skycrawlers/storage.rb
|
285
314
|
- lib/daimon_skycrawlers/storage/base.rb
|
315
|
+
- lib/daimon_skycrawlers/storage/file.rb
|
286
316
|
- lib/daimon_skycrawlers/storage/null.rb
|
287
317
|
- lib/daimon_skycrawlers/storage/rdb.rb
|
288
318
|
- lib/daimon_skycrawlers/tasks.rb
|
@@ -292,13 +322,15 @@ files:
|
|
292
322
|
- sample/spider/Gemfile
|
293
323
|
- sample/spider/README.md
|
294
324
|
- sample/spider/Rakefile
|
325
|
+
- sample/spider/app/crawlers/blog_crawler.rb
|
326
|
+
- sample/spider/app/processors/blog_spider.rb
|
327
|
+
- sample/spider/bin/crawler
|
328
|
+
- sample/spider/bin/enqueue
|
329
|
+
- sample/spider/bin/processor
|
295
330
|
- sample/spider/config/database.yml
|
296
|
-
- sample/spider/
|
331
|
+
- sample/spider/config/init.rb
|
297
332
|
- sample/spider/db/migrate/20160830155803_create_pages.rb
|
298
333
|
- sample/spider/db/schema.rb
|
299
|
-
- sample/spider/enqueue.rb
|
300
|
-
- sample/spider/init.rb
|
301
|
-
- sample/spider/processor.rb
|
302
334
|
homepage: https://github.com/bm-sms/daimon-skycrawlers
|
303
335
|
licenses:
|
304
336
|
- MIT
|
@@ -1,13 +0,0 @@
|
|
1
|
-
class CreatePages < ActiveRecord::Migration
|
2
|
-
def change
|
3
|
-
create_table :pages do |t|
|
4
|
-
t.string :url
|
5
|
-
t.text :headers
|
6
|
-
t.binary :body, limit: 10 * 1024 ** 2 # 10MiB
|
7
|
-
t.datetime :last_modified_at
|
8
|
-
t.string :etag
|
9
|
-
|
10
|
-
t.timestamps null: false
|
11
|
-
end
|
12
|
-
end
|
13
|
-
end
|