daimon_skycrawlers 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +30 -22
- data/bin/daimon-skycrawlers +1 -1
- data/daimon_skycrawlers.gemspec +3 -1
- data/lib/daimon_skycrawlers/consumer/http_response.rb +28 -0
- data/lib/daimon_skycrawlers/consumer/url.rb +20 -0
- data/lib/daimon_skycrawlers/crawler/base.rb +20 -0
- data/lib/daimon_skycrawlers/crawler/default.rb +5 -0
- data/lib/daimon_skycrawlers/crawler.rb +26 -0
- data/lib/daimon_skycrawlers/filter/duplicate_checker.rb +8 -0
- data/lib/daimon_skycrawlers/filter/update_checker.rb +8 -0
- data/lib/daimon_skycrawlers/generator/new.rb +34 -11
- data/lib/daimon_skycrawlers/generator/templates/new/README.md.erb +4 -4
- data/lib/daimon_skycrawlers/generator/templates/new/{crawler.rb → app/crawlers/sample_crawler.rb} +0 -6
- data/lib/daimon_skycrawlers/generator/templates/new/{processor.rb → app/processors/sample_processor.rb} +0 -6
- data/lib/daimon_skycrawlers/generator/templates/new/bin/crawler +10 -0
- data/{sample/spider/enqueue.rb → lib/daimon_skycrawlers/generator/templates/new/bin/enqueue} +1 -2
- data/lib/daimon_skycrawlers/generator/templates/new/bin/processor +10 -0
- data/lib/daimon_skycrawlers/generator/templates/new/{init.rb → config/init.rb} +1 -0
- data/lib/daimon_skycrawlers/storage/file.rb +46 -0
- data/lib/daimon_skycrawlers/storage/null.rb +12 -0
- data/lib/daimon_skycrawlers/storage/rdb.rb +16 -0
- data/lib/daimon_skycrawlers/storage.rb +1 -0
- data/lib/daimon_skycrawlers/version.rb +1 -1
- data/lib/daimon_skycrawlers.rb +31 -0
- data/sample/spider/README.md +4 -4
- data/sample/spider/{crawler.rb → app/crawlers/blog_crawler.rb} +0 -7
- data/sample/spider/{processor.rb → app/processors/blog_spider.rb} +0 -7
- data/sample/spider/bin/crawler +10 -0
- data/{lib/daimon_skycrawlers/generator/templates/new/enqueue.rb → sample/spider/bin/enqueue} +1 -2
- data/sample/spider/bin/processor +11 -0
- data/sample/spider/{init.rb → config/init.rb} +0 -0
- metadata +51 -19
- data/lib/daimon_skycrawlers/generator/templates/new/db/migrate/create_pages.rb +0 -13
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA1:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 411d6b35b2c909712f9f79cd5e45440226ebba28
|
|
4
|
+
data.tar.gz: afa7bdc3cc98a28742c64ba46d7237fcce981b92
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: f8844e9ebd88e7be7344a607866c15ea74be64a8a405f05f196d54f0125bf66be6a81c4ff27ea3ee4480a82989abbedd79cd934c122c7ac1bee5d83bbfa1b6d3
|
|
7
|
+
data.tar.gz: 37fe8376a4165aef7f29aa29af51bb637402c48d9d5327c44ea05c37dac2418c39b1f5e55303ec8c66e9cfe8eb3e77f0dc264648caa262abc551c72f06944b55
|
data/README.md
CHANGED
|
@@ -33,44 +33,52 @@ Or install it yourself as:
|
|
|
33
33
|
|
|
34
34
|
1. Create project
|
|
35
35
|
|
|
36
|
-
```
|
|
37
|
-
$ bundle exec daimon-skycrawlers new mycrawlers
|
|
38
|
-
$ cd mycrawlers
|
|
39
|
-
```
|
|
36
|
+
```
|
|
37
|
+
$ bundle exec daimon-skycrawlers new mycrawlers
|
|
38
|
+
$ cd mycrawlers
|
|
39
|
+
```
|
|
40
|
+
or
|
|
41
|
+
```
|
|
42
|
+
$ daimon-skycrawlers new mycrawlers
|
|
43
|
+
$ cd mycrawlers
|
|
44
|
+
```
|
|
40
45
|
|
|
41
46
|
2. Install dependencies
|
|
42
47
|
|
|
43
|
-
```
|
|
44
|
-
$ bundle install
|
|
45
|
-
```
|
|
48
|
+
```
|
|
49
|
+
$ bundle install
|
|
50
|
+
```
|
|
46
51
|
|
|
47
52
|
3. Create database
|
|
48
53
|
|
|
49
|
-
```
|
|
50
|
-
$ bundle exec rake db:create
|
|
51
|
-
$ bundle exec rake db:migrate
|
|
52
|
-
```
|
|
54
|
+
```
|
|
55
|
+
$ bundle exec rake db:create
|
|
56
|
+
$ bundle exec rake db:migrate
|
|
57
|
+
```
|
|
53
58
|
|
|
54
59
|
4. Open new terminal and run crawler/processor
|
|
55
60
|
|
|
56
|
-
```
|
|
57
|
-
$
|
|
58
|
-
$
|
|
59
|
-
```
|
|
61
|
+
```
|
|
62
|
+
$ bin/crawler # on new terminal
|
|
63
|
+
$ bin/processor # on new terminal
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
NOTE: Execute step 5 as soon as possible. Because bin/crawler and
|
|
67
|
+
bin/processor will stop after 10 seconds by default if their
|
|
68
|
+
queues are empty.
|
|
60
69
|
|
|
61
70
|
5. Enqueue task
|
|
62
71
|
|
|
63
|
-
```
|
|
64
|
-
$
|
|
65
|
-
```
|
|
72
|
+
```
|
|
73
|
+
$ bin/enqueue url http://example.com/
|
|
74
|
+
```
|
|
66
75
|
|
|
67
76
|
6. You'll see `It works with 'http://example.com'` on your terminal which runs your processor!
|
|
68
|
-
|
|
69
77
|
7. You can re-enqueue task for processor
|
|
70
78
|
|
|
71
|
-
```
|
|
72
|
-
$
|
|
73
|
-
```
|
|
79
|
+
```
|
|
80
|
+
$ bin/enqueue response http://example.com/
|
|
81
|
+
```
|
|
74
82
|
|
|
75
83
|
Display `It works with 'http://example.com'` again on your terminal which runs your processor.
|
|
76
84
|
|
data/bin/daimon-skycrawlers
CHANGED
data/daimon_skycrawlers.gemspec
CHANGED
|
@@ -18,16 +18,17 @@ Gem::Specification.new do |spec|
|
|
|
18
18
|
spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
|
|
19
19
|
spec.require_paths = ["lib"]
|
|
20
20
|
|
|
21
|
+
spec.add_dependency "bundler", "~> 1.11"
|
|
21
22
|
spec.add_dependency "thor"
|
|
22
23
|
spec.add_dependency "songkick_queue"
|
|
23
24
|
spec.add_dependency "faraday"
|
|
24
25
|
spec.add_dependency "faraday_middleware"
|
|
25
26
|
spec.add_dependency "nokogiri"
|
|
26
27
|
spec.add_dependency "activerecord"
|
|
28
|
+
spec.add_dependency "railties"
|
|
27
29
|
spec.add_dependency "pg"
|
|
28
30
|
spec.add_dependency "timers"
|
|
29
31
|
|
|
30
|
-
spec.add_development_dependency "bundler", "~> 1.11"
|
|
31
32
|
spec.add_development_dependency "rake", "~> 10.0"
|
|
32
33
|
spec.add_development_dependency "test-unit"
|
|
33
34
|
spec.add_development_dependency "test-unit-rr"
|
|
@@ -35,4 +36,5 @@ Gem::Specification.new do |spec|
|
|
|
35
36
|
spec.add_development_dependency "pry"
|
|
36
37
|
spec.add_development_dependency "tapp"
|
|
37
38
|
spec.add_development_dependency "sqlite3"
|
|
39
|
+
spec.add_development_dependency "yard"
|
|
38
40
|
end
|
|
@@ -5,10 +5,26 @@ require "daimon_skycrawlers/processor/default"
|
|
|
5
5
|
|
|
6
6
|
module DaimonSkycrawlers
|
|
7
7
|
module Consumer
|
|
8
|
+
#
|
|
9
|
+
# HTTP response consumer class
|
|
10
|
+
#
|
|
8
11
|
class HTTPResponse < Base
|
|
9
12
|
include SongkickQueue::Consumer
|
|
10
13
|
|
|
11
14
|
class << self
|
|
15
|
+
#
|
|
16
|
+
# Register a processor
|
|
17
|
+
#
|
|
18
|
+
# @overload register(processor)
|
|
19
|
+
# @param [Processor] processor instance which implements `call` method
|
|
20
|
+
# @return [void]
|
|
21
|
+
#
|
|
22
|
+
# @overload register
|
|
23
|
+
# @return [void]
|
|
24
|
+
# @yield [message] register given block as a processor
|
|
25
|
+
# @yieldparam message [Hash] A message from queue
|
|
26
|
+
# @yieldreturn [void]
|
|
27
|
+
#
|
|
12
28
|
def register(processor = nil, &block)
|
|
13
29
|
if block_given?
|
|
14
30
|
processors << block
|
|
@@ -17,14 +33,23 @@ module DaimonSkycrawlers
|
|
|
17
33
|
end
|
|
18
34
|
end
|
|
19
35
|
|
|
36
|
+
#
|
|
37
|
+
# @private
|
|
38
|
+
#
|
|
20
39
|
def processors
|
|
21
40
|
@processors ||= []
|
|
22
41
|
end
|
|
23
42
|
|
|
43
|
+
#
|
|
44
|
+
# @private
|
|
45
|
+
#
|
|
24
46
|
def default_processor
|
|
25
47
|
DaimonSkycrawlers::Processor::Default.new
|
|
26
48
|
end
|
|
27
49
|
|
|
50
|
+
#
|
|
51
|
+
# @private
|
|
52
|
+
#
|
|
28
53
|
def queue_name
|
|
29
54
|
"#{DaimonSkycrawlers.configuration.queue_name_prefix}.http-response"
|
|
30
55
|
end
|
|
@@ -32,6 +57,9 @@ module DaimonSkycrawlers
|
|
|
32
57
|
|
|
33
58
|
consume_from_queue queue_name
|
|
34
59
|
|
|
60
|
+
#
|
|
61
|
+
# @private
|
|
62
|
+
#
|
|
35
63
|
def process(message)
|
|
36
64
|
if self.class.processors.empty?
|
|
37
65
|
processors = [self.class.default_processor]
|
|
@@ -4,18 +4,35 @@ require "daimon_skycrawlers/consumer/base"
|
|
|
4
4
|
|
|
5
5
|
module DaimonSkycrawlers
|
|
6
6
|
module Consumer
|
|
7
|
+
#
|
|
8
|
+
# URL consumer class
|
|
9
|
+
#
|
|
7
10
|
class URL < Base
|
|
8
11
|
include SongkickQueue::Consumer
|
|
9
12
|
|
|
10
13
|
class << self
|
|
14
|
+
#
|
|
15
|
+
# Register a given crawler
|
|
16
|
+
#
|
|
17
|
+
# @param [Crawler] crawler instance which implements `fetch` method
|
|
18
|
+
# @return [void]
|
|
19
|
+
#
|
|
11
20
|
def register(crawler)
|
|
12
21
|
crawlers << crawler
|
|
13
22
|
end
|
|
14
23
|
|
|
24
|
+
#
|
|
25
|
+
# Returns registered crawlers
|
|
26
|
+
#
|
|
27
|
+
# @return [Array<Crawler>]
|
|
28
|
+
#
|
|
15
29
|
def crawlers
|
|
16
30
|
@crawlers ||= []
|
|
17
31
|
end
|
|
18
32
|
|
|
33
|
+
#
|
|
34
|
+
# @private
|
|
35
|
+
#
|
|
19
36
|
def queue_name
|
|
20
37
|
"#{DaimonSkycrawlers.configuration.queue_name_prefix}.url"
|
|
21
38
|
end
|
|
@@ -23,6 +40,9 @@ module DaimonSkycrawlers
|
|
|
23
40
|
|
|
24
41
|
consume_from_queue queue_name
|
|
25
42
|
|
|
43
|
+
#
|
|
44
|
+
# @private
|
|
45
|
+
#
|
|
26
46
|
def process(message)
|
|
27
47
|
url = message[:url]
|
|
28
48
|
depth = Integer(message[:depth] || 0)
|
|
@@ -8,12 +8,22 @@ require "daimon_skycrawlers/processor"
|
|
|
8
8
|
|
|
9
9
|
module DaimonSkycrawlers
|
|
10
10
|
module Crawler
|
|
11
|
+
#
|
|
12
|
+
# The base class of crawler
|
|
13
|
+
#
|
|
11
14
|
class Base
|
|
12
15
|
include DaimonSkycrawlers::LoggerMixin
|
|
13
16
|
include DaimonSkycrawlers::ConfigMixin
|
|
14
17
|
|
|
18
|
+
# @!attribute [w] storage
|
|
19
|
+
# Set storage to crawler instance.
|
|
20
|
+
# @return [void]
|
|
15
21
|
attr_writer :storage
|
|
16
22
|
|
|
23
|
+
#
|
|
24
|
+
# @param [String] Base URL for crawler
|
|
25
|
+
# @param [Hash] options for Faraday
|
|
26
|
+
#
|
|
17
27
|
def initialize(base_url = nil, options = {})
|
|
18
28
|
super()
|
|
19
29
|
@base_url = base_url
|
|
@@ -23,6 +33,13 @@ module DaimonSkycrawlers
|
|
|
23
33
|
@n_processed_urls = 0
|
|
24
34
|
end
|
|
25
35
|
|
|
36
|
+
#
|
|
37
|
+
# Set up connection
|
|
38
|
+
#
|
|
39
|
+
# @param [Hash] options for Faraday
|
|
40
|
+
# @yield [faraday]
|
|
41
|
+
# @yieldparam faraday [Faraday]
|
|
42
|
+
#
|
|
26
43
|
def setup_connection(options = {})
|
|
27
44
|
@connection = Faraday.new(@base_url, @options.merge(options)) do |faraday|
|
|
28
45
|
yield faraday
|
|
@@ -37,6 +54,9 @@ module DaimonSkycrawlers
|
|
|
37
54
|
@prepare = block
|
|
38
55
|
end
|
|
39
56
|
|
|
57
|
+
#
|
|
58
|
+
# Retrieve storage instance
|
|
59
|
+
#
|
|
40
60
|
def storage
|
|
41
61
|
@storage ||= Storage::RDB.new
|
|
42
62
|
end
|
|
@@ -3,6 +3,11 @@ require "daimon_skycrawlers/filter/update_checker"
|
|
|
3
3
|
|
|
4
4
|
module DaimonSkycrawlers
|
|
5
5
|
module Crawler
|
|
6
|
+
#
|
|
7
|
+
# The default crawler
|
|
8
|
+
#
|
|
9
|
+
# This crawler can GET given URL and store response to storage
|
|
10
|
+
#
|
|
6
11
|
class Default < Base
|
|
7
12
|
def fetch(path, depth: 3, **kw)
|
|
8
13
|
@n_processed_urls += 1
|
|
@@ -6,24 +6,50 @@ require "daimon_skycrawlers/consumer/url"
|
|
|
6
6
|
module DaimonSkycrawlers
|
|
7
7
|
module Crawler
|
|
8
8
|
class << self
|
|
9
|
+
#
|
|
10
|
+
# Run registered crawlers
|
|
11
|
+
#
|
|
12
|
+
# @param process_name [String] Process name
|
|
13
|
+
#
|
|
9
14
|
def run(process_name: default_process_name)
|
|
10
15
|
DaimonSkycrawlers::Timer.setup_shutdown_timer(config.queue_name_prefix, interval: config.shutdown_interval)
|
|
11
16
|
SongkickQueue::Worker.new(process_name, [DaimonSkycrawlers::Consumer::URL]).run
|
|
12
17
|
end
|
|
13
18
|
|
|
19
|
+
#
|
|
20
|
+
# Enqueue a URL to crawler queue
|
|
21
|
+
#
|
|
22
|
+
# @param [String] Specify absolute URL
|
|
23
|
+
# @param [Hash] Extra parameters for crawler
|
|
24
|
+
# @return [void]
|
|
14
25
|
def enqueue_url(url, message = {})
|
|
15
26
|
message[:url] = url
|
|
16
27
|
SongkickQueue.publish(queue_name, message)
|
|
17
28
|
end
|
|
18
29
|
|
|
30
|
+
#
|
|
31
|
+
# Shortcut of DaimonSkycrawlers.configuration
|
|
32
|
+
#
|
|
33
|
+
# @return [DaimonSkycrawlers::Configuration]
|
|
34
|
+
#
|
|
19
35
|
def config
|
|
20
36
|
DaimonSkycrawlers.configuration
|
|
21
37
|
end
|
|
22
38
|
|
|
39
|
+
#
|
|
40
|
+
# Queue name for crawler
|
|
41
|
+
#
|
|
42
|
+
# @return [String] Queue name
|
|
43
|
+
#
|
|
23
44
|
def queue_name
|
|
24
45
|
"#{config.queue_name_prefix}.url"
|
|
25
46
|
end
|
|
26
47
|
|
|
48
|
+
#
|
|
49
|
+
# Default process name
|
|
50
|
+
#
|
|
51
|
+
# @return [String] Default process name
|
|
52
|
+
#
|
|
27
53
|
def default_process_name
|
|
28
54
|
"#{config.queue_name_prefix}:url"
|
|
29
55
|
end
|
|
@@ -3,6 +3,9 @@ require "daimon_skycrawlers/filter/base"
|
|
|
3
3
|
|
|
4
4
|
module DaimonSkycrawlers
|
|
5
5
|
module Filter
|
|
6
|
+
#
|
|
7
|
+
# This filter provides duplication checker for given URL.
|
|
8
|
+
#
|
|
6
9
|
class DuplicateChecker < Base
|
|
7
10
|
def initialize(base_url: nil)
|
|
8
11
|
@base_url = nil
|
|
@@ -10,6 +13,11 @@ module DaimonSkycrawlers
|
|
|
10
13
|
@urls = Set.new
|
|
11
14
|
end
|
|
12
15
|
|
|
16
|
+
#
|
|
17
|
+
# @param [String] url to check duplication. If given URL is
|
|
18
|
+
# relative URL, use `@base_url + url` as absolute URL.
|
|
19
|
+
# @return [true|false] Return false when duplicated, otherwise return true.
|
|
20
|
+
#
|
|
13
21
|
def call(url)
|
|
14
22
|
unless URI(url).absolute?
|
|
15
23
|
url = (@base_url + url).to_s
|
|
@@ -3,6 +3,9 @@ require "daimon_skycrawlers/filter/base"
|
|
|
3
3
|
|
|
4
4
|
module DaimonSkycrawlers
|
|
5
5
|
module Filter
|
|
6
|
+
#
|
|
7
|
+
# This filter provides update checker for given URL.
|
|
8
|
+
#
|
|
6
9
|
class UpdateChecker < Base
|
|
7
10
|
def initialize(storage: nil, base_url: nil)
|
|
8
11
|
super(storage: storage)
|
|
@@ -10,6 +13,11 @@ module DaimonSkycrawlers
|
|
|
10
13
|
@base_url = URI(base_url) if base_url
|
|
11
14
|
end
|
|
12
15
|
|
|
16
|
+
#
|
|
17
|
+
# @param [String] url
|
|
18
|
+
# @param connection [Faraday]
|
|
19
|
+
# @return [true|false] Return true when need update, otherwise return false
|
|
20
|
+
#
|
|
13
21
|
def call(url, connection: nil)
|
|
14
22
|
unless URI(url).absolute?
|
|
15
23
|
url = (@base_url + url).to_s
|
|
@@ -1,4 +1,8 @@
|
|
|
1
1
|
require "thor"
|
|
2
|
+
require "rails/generators"
|
|
3
|
+
require "rails/generators/actions"
|
|
4
|
+
require "rails/generators/active_record"
|
|
5
|
+
require "rails/generators/active_record/migration/migration_generator"
|
|
2
6
|
|
|
3
7
|
module DaimonSkycrawlers
|
|
4
8
|
module Generator
|
|
@@ -18,26 +22,45 @@ module DaimonSkycrawlers
|
|
|
18
22
|
].each do |path|
|
|
19
23
|
template("#{path}.erb", "#{name}/#{path}")
|
|
20
24
|
end
|
|
25
|
+
invoke(MigrationGenerator, [
|
|
26
|
+
"CreatePage",
|
|
27
|
+
"url:string",
|
|
28
|
+
"headers:text",
|
|
29
|
+
"body:binary",
|
|
30
|
+
"last_modified_at:datetime",
|
|
31
|
+
"etag:string",
|
|
32
|
+
"timestamps"
|
|
33
|
+
],
|
|
34
|
+
{ destination_root: File.join(destination_root, name) })
|
|
21
35
|
end
|
|
22
36
|
|
|
23
37
|
def copy_files
|
|
24
38
|
[
|
|
25
39
|
"Gemfile",
|
|
26
40
|
"Rakefile",
|
|
27
|
-
"
|
|
28
|
-
"
|
|
29
|
-
"
|
|
30
|
-
"
|
|
41
|
+
"app/crawlers/sample_crawler.rb",
|
|
42
|
+
"app/processors/sample_processor.rb",
|
|
43
|
+
"bin/crawler",
|
|
44
|
+
"bin/enqueue",
|
|
45
|
+
"bin/processor",
|
|
46
|
+
"config/init.rb",
|
|
31
47
|
].each do |path|
|
|
32
|
-
copy_file(path, "#{name}/#{path}")
|
|
33
|
-
end
|
|
34
|
-
[
|
|
35
|
-
"db/migrate/create_pages.rb",
|
|
36
|
-
].each do |path|
|
|
37
|
-
migration = "#{Time.now.strftime("%Y%m%d%H%M%S")}_#{File.basename(path)}"
|
|
38
|
-
copy_file(path, "#{name}/db/migrate/#{migration}")
|
|
48
|
+
copy_file(path, "#{name}/#{path}", mode: :preserve)
|
|
39
49
|
end
|
|
40
50
|
end
|
|
41
51
|
end
|
|
52
|
+
|
|
53
|
+
class MigrationGenerator < ActiveRecord::Generators::MigrationGenerator
|
|
54
|
+
def self.source_root
|
|
55
|
+
ActiveRecord::Generators::MigrationGenerator.source_root
|
|
56
|
+
end
|
|
57
|
+
|
|
58
|
+
def create_migration_file
|
|
59
|
+
set_local_assigns!
|
|
60
|
+
validate_file_name!
|
|
61
|
+
dest = options[:destination_root]
|
|
62
|
+
migration_template @migration_template, "#{dest}/db/migrate/#{file_name}.rb"
|
|
63
|
+
end
|
|
64
|
+
end
|
|
42
65
|
end
|
|
43
66
|
end
|
|
@@ -29,14 +29,14 @@ $ bundle exec rake db:migrate
|
|
|
29
29
|
3. Open new terminal and run crawler/processor
|
|
30
30
|
|
|
31
31
|
```
|
|
32
|
-
$
|
|
33
|
-
$
|
|
32
|
+
$ bin/crawler # on new terminal
|
|
33
|
+
$ bin/processor # on new terminal
|
|
34
34
|
```
|
|
35
35
|
|
|
36
36
|
4. Enqueue task
|
|
37
37
|
|
|
38
38
|
```
|
|
39
|
-
$
|
|
39
|
+
$ bin/enqueue url http://example.com/
|
|
40
40
|
```
|
|
41
41
|
|
|
42
42
|
5. You'll see `It works with 'http://example.com'` on your terminal which runs your processor!
|
|
@@ -44,7 +44,7 @@ $ bundle exec ruby enqueue.rb http://example.com/
|
|
|
44
44
|
6. You can re-enqueue task for processor
|
|
45
45
|
|
|
46
46
|
```
|
|
47
|
-
$
|
|
47
|
+
$ bin/enqueue response http://example.com/
|
|
48
48
|
```
|
|
49
49
|
|
|
50
50
|
Display `It works with 'http://example.com'` again on your terminal which runs your processor.
|
data/lib/daimon_skycrawlers/generator/templates/new/{crawler.rb → app/crawlers/sample_crawler.rb}
RENAMED
|
@@ -1,14 +1,8 @@
|
|
|
1
|
-
#!/usr/bin/env ruby
|
|
2
|
-
|
|
3
1
|
require "daimon_skycrawlers/crawler"
|
|
4
2
|
require "daimon_skycrawlers/crawler/default"
|
|
5
3
|
|
|
6
|
-
require_relative "./init"
|
|
7
|
-
|
|
8
4
|
base_url = "http://example.com"
|
|
9
5
|
|
|
10
6
|
crawler = DaimonSkycrawlers::Crawler::Default.new(base_url)
|
|
11
7
|
|
|
12
8
|
DaimonSkycrawlers.register_crawler(crawler)
|
|
13
|
-
|
|
14
|
-
DaimonSkycrawlers::Crawler.run
|
data/{sample/spider/enqueue.rb → lib/daimon_skycrawlers/generator/templates/new/bin/enqueue}
RENAMED
|
@@ -2,11 +2,10 @@
|
|
|
2
2
|
|
|
3
3
|
require "thor"
|
|
4
4
|
|
|
5
|
+
require_relative "../config/init"
|
|
5
6
|
require "daimon_skycrawlers/crawler"
|
|
6
7
|
require "daimon_skycrawlers/processor"
|
|
7
8
|
|
|
8
|
-
require_relative "./init"
|
|
9
|
-
|
|
10
9
|
class Enqueue < Thor
|
|
11
10
|
desc "url URL [key1:value1 key2:value2...]", "Enqueue URL for URL consumer"
|
|
12
11
|
def url(url, *rest)
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
require "daimon_skycrawlers/storage/base"
|
|
2
|
+
|
|
3
|
+
module DaimonSkycrawlers
|
|
4
|
+
module Storage
|
|
5
|
+
#
|
|
6
|
+
# Storage for files
|
|
7
|
+
#
|
|
8
|
+
class File < Base
|
|
9
|
+
def initialize(base_dir)
|
|
10
|
+
super()
|
|
11
|
+
@base_dir = Pathname(base_dir)
|
|
12
|
+
end
|
|
13
|
+
|
|
14
|
+
def save(url, headers, body)
|
|
15
|
+
@base_dir.mkpath
|
|
16
|
+
body_path(url).dirname.mkpath
|
|
17
|
+
body_path(url).open("wb+") do |file|
|
|
18
|
+
file.write(body)
|
|
19
|
+
end
|
|
20
|
+
headers_path(url).open("wb+") do |file|
|
|
21
|
+
file.write(JSON.generate(headers))
|
|
22
|
+
end
|
|
23
|
+
end
|
|
24
|
+
|
|
25
|
+
def read(url)
|
|
26
|
+
headers = JSON.parse(headers_path(url).read)
|
|
27
|
+
body = body_path(url).read
|
|
28
|
+
Page.new(url, headers, body, headers["last-modified"], headers["etag"])
|
|
29
|
+
end
|
|
30
|
+
|
|
31
|
+
Page = Struct.new(:url, :headers, :body, :last_modified, :etag)
|
|
32
|
+
|
|
33
|
+
private
|
|
34
|
+
|
|
35
|
+
def body_path(url)
|
|
36
|
+
url = URI(url)
|
|
37
|
+
@base_dir + ".#{url.path}"
|
|
38
|
+
end
|
|
39
|
+
|
|
40
|
+
def headers_path(url)
|
|
41
|
+
url = URI(url)
|
|
42
|
+
Pathname("#{body_path(url)}-headers.json")
|
|
43
|
+
end
|
|
44
|
+
end
|
|
45
|
+
end
|
|
46
|
+
end
|
|
@@ -1,9 +1,21 @@
|
|
|
1
1
|
module DaimonSkycrawlers
|
|
2
2
|
module Storage
|
|
3
|
+
#
|
|
4
|
+
# The null storage.
|
|
5
|
+
#
|
|
6
|
+
# This storage is useful for test.
|
|
7
|
+
#
|
|
3
8
|
class Null < Base
|
|
9
|
+
|
|
10
|
+
#
|
|
11
|
+
# Save nothing
|
|
12
|
+
#
|
|
4
13
|
def save(url, headers, body)
|
|
5
14
|
end
|
|
6
15
|
|
|
16
|
+
#
|
|
17
|
+
# Find nothing
|
|
18
|
+
#
|
|
7
19
|
def find(url)
|
|
8
20
|
end
|
|
9
21
|
end
|
|
@@ -3,13 +3,24 @@ require "active_record"
|
|
|
3
3
|
|
|
4
4
|
module DaimonSkycrawlers
|
|
5
5
|
module Storage
|
|
6
|
+
#
|
|
7
|
+
# Storage for RDBMS
|
|
8
|
+
#
|
|
6
9
|
class RDB < Base
|
|
7
10
|
def initialize(config_path = "config/database.yml")
|
|
11
|
+
super()
|
|
8
12
|
config = YAML.load_file(config_path)
|
|
9
13
|
environment = ENV["SKYCRAWLERS_ENV"] || "development"
|
|
10
14
|
ActiveRecord::Base.establish_connection(config[environment])
|
|
11
15
|
end
|
|
12
16
|
|
|
17
|
+
#
|
|
18
|
+
# Save
|
|
19
|
+
#
|
|
20
|
+
# @param [String] url identity of the page
|
|
21
|
+
# @param [Hash] header of URL
|
|
22
|
+
# @param [String] body
|
|
23
|
+
#
|
|
13
24
|
def save(url, headers, body)
|
|
14
25
|
Page.create(url: url,
|
|
15
26
|
headers: JSON.generate(headers),
|
|
@@ -18,6 +29,11 @@ module DaimonSkycrawlers
|
|
|
18
29
|
etag: headers["etag"])
|
|
19
30
|
end
|
|
20
31
|
|
|
32
|
+
#
|
|
33
|
+
# Fetch page identified by url
|
|
34
|
+
#
|
|
35
|
+
# @param [String] url identity of the page
|
|
36
|
+
#
|
|
21
37
|
def find(url)
|
|
22
38
|
Page.where(url: url).order(last_modified_at: :desc).limit(1).first
|
|
23
39
|
end
|
data/lib/daimon_skycrawlers.rb
CHANGED
|
@@ -11,14 +11,38 @@ module DaimonSkycrawlers
|
|
|
11
11
|
:shutdown_interval
|
|
12
12
|
)
|
|
13
13
|
class << self
|
|
14
|
+
#
|
|
15
|
+
# Register a processor
|
|
16
|
+
#
|
|
17
|
+
# @overload register_processor(processor)
|
|
18
|
+
# @param [Processor] processor instance which implements `call` method
|
|
19
|
+
# @return [void]
|
|
20
|
+
#
|
|
21
|
+
# @overload register_processor
|
|
22
|
+
# @return [void]
|
|
23
|
+
# @yield [message] Register given block as a processor.
|
|
24
|
+
# @yieldparam message [Hash] A message from queue
|
|
25
|
+
# @yieldreturn [void]
|
|
26
|
+
#
|
|
14
27
|
def register_processor(processor = nil, &block)
|
|
15
28
|
DaimonSkycrawlers::Consumer::HTTPResponse.register(processor, &block)
|
|
16
29
|
end
|
|
17
30
|
|
|
31
|
+
#
|
|
32
|
+
# Register a crawler
|
|
33
|
+
#
|
|
34
|
+
# @param [Crawler] crawler instance which implements `fetch` method
|
|
35
|
+
# @return [void]
|
|
36
|
+
#
|
|
18
37
|
def register_crawler(crawler)
|
|
19
38
|
DaimonSkycrawlers::Consumer::URL.register(crawler)
|
|
20
39
|
end
|
|
21
40
|
|
|
41
|
+
#
|
|
42
|
+
# Retrieve configuration object
|
|
43
|
+
#
|
|
44
|
+
# @return [DaimonSkycrawlers::Configuration]
|
|
45
|
+
#
|
|
22
46
|
def configuration
|
|
23
47
|
@configuration ||= DaimonSkycrawlers::Configuration.new.tap do |config|
|
|
24
48
|
config.logger = DaimonSkycrawlers::Logger.default
|
|
@@ -28,6 +52,13 @@ module DaimonSkycrawlers
|
|
|
28
52
|
end
|
|
29
53
|
end
|
|
30
54
|
|
|
55
|
+
#
|
|
56
|
+
# Configure DaimonSkycrawlers
|
|
57
|
+
#
|
|
58
|
+
# @return [void]
|
|
59
|
+
# @yield [configuration] configure DaimonSkycrawlers
|
|
60
|
+
# @yieldparam configuration [DaimonSkycrawlers::Configuration] configuration object
|
|
61
|
+
# @yieldreturn [void]
|
|
31
62
|
def configure
|
|
32
63
|
yield configuration
|
|
33
64
|
end
|
data/sample/spider/README.md
CHANGED
|
@@ -29,14 +29,14 @@ $ bundle exec rake db:migrate
|
|
|
29
29
|
3. Open new terminal and run crawler/processor
|
|
30
30
|
|
|
31
31
|
```
|
|
32
|
-
$
|
|
33
|
-
$
|
|
32
|
+
$ bin/crawler # on new terminal
|
|
33
|
+
$ bin/processor # on new terminal
|
|
34
34
|
```
|
|
35
35
|
|
|
36
36
|
4. Enqueue task
|
|
37
37
|
|
|
38
38
|
```
|
|
39
|
-
$
|
|
39
|
+
$ bin/enqueue url http://example.com/
|
|
40
40
|
```
|
|
41
41
|
|
|
42
42
|
5. You'll see `It works with 'http://example.com'` on your terminal which runs your processor!
|
|
@@ -44,7 +44,7 @@ $ bundle exec ruby enqueue.rb http://example.com/
|
|
|
44
44
|
6. You can re-enqueue task for processor
|
|
45
45
|
|
|
46
46
|
```
|
|
47
|
-
$
|
|
47
|
+
$ bin/enqueue response http://example.com/
|
|
48
48
|
```
|
|
49
49
|
|
|
50
50
|
Display `It works with 'http://example.com'` again on your terminal which runs your processor.
|
|
@@ -1,14 +1,7 @@
|
|
|
1
|
-
#!/usr/bin/env ruby
|
|
2
|
-
|
|
3
|
-
require "daimon_skycrawlers/crawler"
|
|
4
1
|
require "daimon_skycrawlers/crawler/default"
|
|
5
2
|
|
|
6
|
-
require_relative "./init"
|
|
7
|
-
|
|
8
3
|
base_url = "http://www.clear-code.com/blog/"
|
|
9
4
|
|
|
10
5
|
crawler = DaimonSkycrawlers::Crawler::Default.new(base_url)
|
|
11
6
|
|
|
12
7
|
DaimonSkycrawlers.register_crawler(crawler)
|
|
13
|
-
|
|
14
|
-
DaimonSkycrawlers::Crawler.run
|
|
@@ -1,13 +1,8 @@
|
|
|
1
|
-
#!/usr/bin/env ruby
|
|
2
|
-
|
|
3
|
-
require "daimon_skycrawlers/processor"
|
|
4
1
|
require "daimon_skycrawlers/processor/spider"
|
|
5
2
|
require "daimon_skycrawlers/filter"
|
|
6
3
|
require "daimon_skycrawlers/filter/duplicate_checker"
|
|
7
4
|
require "daimon_skycrawlers/filter/update_checker"
|
|
8
5
|
|
|
9
|
-
require_relative "./init"
|
|
10
|
-
|
|
11
6
|
default_processor = DaimonSkycrawlers::Processor::Default.new
|
|
12
7
|
spider = DaimonSkycrawlers::Processor::Spider.new
|
|
13
8
|
#spider.enqueue = false
|
|
@@ -30,5 +25,3 @@ spider.append_filter(update_checker)
|
|
|
30
25
|
|
|
31
26
|
DaimonSkycrawlers.register_processor(default_processor)
|
|
32
27
|
DaimonSkycrawlers.register_processor(spider)
|
|
33
|
-
|
|
34
|
-
DaimonSkycrawlers::Processor.run
|
data/{lib/daimon_skycrawlers/generator/templates/new/enqueue.rb → sample/spider/bin/enqueue}
RENAMED
|
@@ -2,11 +2,10 @@
|
|
|
2
2
|
|
|
3
3
|
require "thor"
|
|
4
4
|
|
|
5
|
+
require_relative "../config/init"
|
|
5
6
|
require "daimon_skycrawlers/crawler"
|
|
6
7
|
require "daimon_skycrawlers/processor"
|
|
7
8
|
|
|
8
|
-
require_relative "./init"
|
|
9
|
-
|
|
10
9
|
class Enqueue < Thor
|
|
11
10
|
desc "url URL [key1:value1 key2:value2...]", "Enqueue URL for URL consumer"
|
|
12
11
|
def url(url, *rest)
|
|
File without changes
|
metadata
CHANGED
|
@@ -1,15 +1,29 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: daimon_skycrawlers
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.
|
|
4
|
+
version: 0.3.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Ryunosuke SATO
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2016-09-
|
|
11
|
+
date: 2016-09-29 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
|
+
- !ruby/object:Gem::Dependency
|
|
14
|
+
name: bundler
|
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
|
16
|
+
requirements:
|
|
17
|
+
- - "~>"
|
|
18
|
+
- !ruby/object:Gem::Version
|
|
19
|
+
version: '1.11'
|
|
20
|
+
type: :runtime
|
|
21
|
+
prerelease: false
|
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
23
|
+
requirements:
|
|
24
|
+
- - "~>"
|
|
25
|
+
- !ruby/object:Gem::Version
|
|
26
|
+
version: '1.11'
|
|
13
27
|
- !ruby/object:Gem::Dependency
|
|
14
28
|
name: thor
|
|
15
29
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -95,7 +109,7 @@ dependencies:
|
|
|
95
109
|
- !ruby/object:Gem::Version
|
|
96
110
|
version: '0'
|
|
97
111
|
- !ruby/object:Gem::Dependency
|
|
98
|
-
name:
|
|
112
|
+
name: railties
|
|
99
113
|
requirement: !ruby/object:Gem::Requirement
|
|
100
114
|
requirements:
|
|
101
115
|
- - ">="
|
|
@@ -109,7 +123,7 @@ dependencies:
|
|
|
109
123
|
- !ruby/object:Gem::Version
|
|
110
124
|
version: '0'
|
|
111
125
|
- !ruby/object:Gem::Dependency
|
|
112
|
-
name:
|
|
126
|
+
name: pg
|
|
113
127
|
requirement: !ruby/object:Gem::Requirement
|
|
114
128
|
requirements:
|
|
115
129
|
- - ">="
|
|
@@ -123,19 +137,19 @@ dependencies:
|
|
|
123
137
|
- !ruby/object:Gem::Version
|
|
124
138
|
version: '0'
|
|
125
139
|
- !ruby/object:Gem::Dependency
|
|
126
|
-
name:
|
|
140
|
+
name: timers
|
|
127
141
|
requirement: !ruby/object:Gem::Requirement
|
|
128
142
|
requirements:
|
|
129
|
-
- - "
|
|
143
|
+
- - ">="
|
|
130
144
|
- !ruby/object:Gem::Version
|
|
131
|
-
version: '
|
|
132
|
-
type: :
|
|
145
|
+
version: '0'
|
|
146
|
+
type: :runtime
|
|
133
147
|
prerelease: false
|
|
134
148
|
version_requirements: !ruby/object:Gem::Requirement
|
|
135
149
|
requirements:
|
|
136
|
-
- - "
|
|
150
|
+
- - ">="
|
|
137
151
|
- !ruby/object:Gem::Version
|
|
138
|
-
version: '
|
|
152
|
+
version: '0'
|
|
139
153
|
- !ruby/object:Gem::Dependency
|
|
140
154
|
name: rake
|
|
141
155
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -234,6 +248,20 @@ dependencies:
|
|
|
234
248
|
- - ">="
|
|
235
249
|
- !ruby/object:Gem::Version
|
|
236
250
|
version: '0'
|
|
251
|
+
- !ruby/object:Gem::Dependency
|
|
252
|
+
name: yard
|
|
253
|
+
requirement: !ruby/object:Gem::Requirement
|
|
254
|
+
requirements:
|
|
255
|
+
- - ">="
|
|
256
|
+
- !ruby/object:Gem::Version
|
|
257
|
+
version: '0'
|
|
258
|
+
type: :development
|
|
259
|
+
prerelease: false
|
|
260
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
261
|
+
requirements:
|
|
262
|
+
- - ">="
|
|
263
|
+
- !ruby/object:Gem::Version
|
|
264
|
+
version: '0'
|
|
237
265
|
description: This is a crawler framework.
|
|
238
266
|
email:
|
|
239
267
|
- tricknotes.rs@gmail.com
|
|
@@ -269,12 +297,13 @@ files:
|
|
|
269
297
|
- lib/daimon_skycrawlers/generator/templates/new/Gemfile
|
|
270
298
|
- lib/daimon_skycrawlers/generator/templates/new/README.md.erb
|
|
271
299
|
- lib/daimon_skycrawlers/generator/templates/new/Rakefile
|
|
300
|
+
- lib/daimon_skycrawlers/generator/templates/new/app/crawlers/sample_crawler.rb
|
|
301
|
+
- lib/daimon_skycrawlers/generator/templates/new/app/processors/sample_processor.rb
|
|
302
|
+
- lib/daimon_skycrawlers/generator/templates/new/bin/crawler
|
|
303
|
+
- lib/daimon_skycrawlers/generator/templates/new/bin/enqueue
|
|
304
|
+
- lib/daimon_skycrawlers/generator/templates/new/bin/processor
|
|
272
305
|
- lib/daimon_skycrawlers/generator/templates/new/config/database.yml.erb
|
|
273
|
-
- lib/daimon_skycrawlers/generator/templates/new/
|
|
274
|
-
- lib/daimon_skycrawlers/generator/templates/new/db/migrate/create_pages.rb
|
|
275
|
-
- lib/daimon_skycrawlers/generator/templates/new/enqueue.rb
|
|
276
|
-
- lib/daimon_skycrawlers/generator/templates/new/init.rb
|
|
277
|
-
- lib/daimon_skycrawlers/generator/templates/new/processor.rb
|
|
306
|
+
- lib/daimon_skycrawlers/generator/templates/new/config/init.rb
|
|
278
307
|
- lib/daimon_skycrawlers/logger.rb
|
|
279
308
|
- lib/daimon_skycrawlers/processor.rb
|
|
280
309
|
- lib/daimon_skycrawlers/processor/base.rb
|
|
@@ -283,6 +312,7 @@ files:
|
|
|
283
312
|
- lib/daimon_skycrawlers/queue.rb
|
|
284
313
|
- lib/daimon_skycrawlers/storage.rb
|
|
285
314
|
- lib/daimon_skycrawlers/storage/base.rb
|
|
315
|
+
- lib/daimon_skycrawlers/storage/file.rb
|
|
286
316
|
- lib/daimon_skycrawlers/storage/null.rb
|
|
287
317
|
- lib/daimon_skycrawlers/storage/rdb.rb
|
|
288
318
|
- lib/daimon_skycrawlers/tasks.rb
|
|
@@ -292,13 +322,15 @@ files:
|
|
|
292
322
|
- sample/spider/Gemfile
|
|
293
323
|
- sample/spider/README.md
|
|
294
324
|
- sample/spider/Rakefile
|
|
325
|
+
- sample/spider/app/crawlers/blog_crawler.rb
|
|
326
|
+
- sample/spider/app/processors/blog_spider.rb
|
|
327
|
+
- sample/spider/bin/crawler
|
|
328
|
+
- sample/spider/bin/enqueue
|
|
329
|
+
- sample/spider/bin/processor
|
|
295
330
|
- sample/spider/config/database.yml
|
|
296
|
-
- sample/spider/
|
|
331
|
+
- sample/spider/config/init.rb
|
|
297
332
|
- sample/spider/db/migrate/20160830155803_create_pages.rb
|
|
298
333
|
- sample/spider/db/schema.rb
|
|
299
|
-
- sample/spider/enqueue.rb
|
|
300
|
-
- sample/spider/init.rb
|
|
301
|
-
- sample/spider/processor.rb
|
|
302
334
|
homepage: https://github.com/bm-sms/daimon-skycrawlers
|
|
303
335
|
licenses:
|
|
304
336
|
- MIT
|
|
@@ -1,13 +0,0 @@
|
|
|
1
|
-
class CreatePages < ActiveRecord::Migration
|
|
2
|
-
def change
|
|
3
|
-
create_table :pages do |t|
|
|
4
|
-
t.string :url
|
|
5
|
-
t.text :headers
|
|
6
|
-
t.binary :body, limit: 10 * 1024 ** 2 # 10MiB
|
|
7
|
-
t.datetime :last_modified_at
|
|
8
|
-
t.string :etag
|
|
9
|
-
|
|
10
|
-
t.timestamps null: false
|
|
11
|
-
end
|
|
12
|
-
end
|
|
13
|
-
end
|