daimon_skycrawlers 1.0.0.pre.rc1 → 1.0.0.pre.rc2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.yardopts +11 -0
- data/daimon_skycrawlers.gemspec +1 -0
- data/lib/daimon_skycrawlers.rb +32 -2
- data/lib/daimon_skycrawlers/callbacks.rb +32 -2
- data/lib/daimon_skycrawlers/cli.rb +4 -0
- data/lib/daimon_skycrawlers/commands/enqueue.rb +4 -1
- data/lib/daimon_skycrawlers/commands/runner.rb +2 -0
- data/lib/daimon_skycrawlers/config.rb +1 -0
- data/lib/daimon_skycrawlers/configurable.rb +6 -1
- data/lib/daimon_skycrawlers/consumer.rb +3 -0
- data/lib/daimon_skycrawlers/consumer/base.rb +5 -0
- data/lib/daimon_skycrawlers/consumer/http_response.rb +1 -1
- data/lib/daimon_skycrawlers/consumer/url.rb +1 -1
- data/lib/daimon_skycrawlers/crawler.rb +5 -2
- data/lib/daimon_skycrawlers/crawler/base.rb +56 -8
- data/lib/daimon_skycrawlers/crawler/default.rb +9 -1
- data/lib/daimon_skycrawlers/filter.rb +3 -0
- data/lib/daimon_skycrawlers/filter/base.rb +12 -0
- data/lib/daimon_skycrawlers/filter/duplicate_checker.rb +2 -2
- data/lib/daimon_skycrawlers/filter/robots_txt_checker.rb +1 -1
- data/lib/daimon_skycrawlers/filter/update_checker.rb +2 -2
- data/lib/daimon_skycrawlers/generator/crawler.rb +4 -1
- data/lib/daimon_skycrawlers/generator/filter.rb +4 -1
- data/lib/daimon_skycrawlers/generator/generate.rb +3 -0
- data/lib/daimon_skycrawlers/generator/new.rb +5 -1
- data/lib/daimon_skycrawlers/generator/processor.rb +4 -1
- data/lib/daimon_skycrawlers/logger.rb +8 -0
- data/lib/daimon_skycrawlers/processor.rb +5 -2
- data/lib/daimon_skycrawlers/processor/base.rb +28 -2
- data/lib/daimon_skycrawlers/processor/default.rb +7 -1
- data/lib/daimon_skycrawlers/processor/proc.rb +6 -0
- data/lib/daimon_skycrawlers/processor/spider.rb +2 -2
- data/lib/daimon_skycrawlers/queue.rb +31 -0
- data/lib/daimon_skycrawlers/sitemap_parser.rb +23 -1
- data/lib/daimon_skycrawlers/storage.rb +3 -0
- data/lib/daimon_skycrawlers/storage/base.rb +21 -1
- data/lib/daimon_skycrawlers/storage/file.rb +16 -0
- data/lib/daimon_skycrawlers/storage/null.rb +2 -2
- data/lib/daimon_skycrawlers/storage/rdb.rb +25 -7
- data/lib/daimon_skycrawlers/timer.rb +9 -0
- data/lib/daimon_skycrawlers/version.rb +4 -1
- data/sample/amazon-ranking/app/processors/amazon_ranking.rb +1 -1
- data/sample/itp-crawler/app/processors/itp_processor.rb +1 -1
- data/{lib/daimon_skycrawlers/generator/templates → templates}/crawler.rb.erb +0 -0
- data/{lib/daimon_skycrawlers/generator/templates → templates}/filter.rb.erb +0 -0
- data/{lib/daimon_skycrawlers/generator/templates → templates}/new/Dockerfile +0 -0
- data/{lib/daimon_skycrawlers/generator/templates → templates}/new/Dockerfile.db +0 -0
- data/{lib/daimon_skycrawlers/generator/templates → templates}/new/Gemfile +0 -0
- data/{lib/daimon_skycrawlers/generator/templates → templates}/new/README.md.erb +0 -0
- data/{lib/daimon_skycrawlers/generator/templates → templates}/new/Rakefile +0 -0
- data/{lib/daimon_skycrawlers/generator/templates → templates}/new/app/crawler.rb +0 -0
- data/{lib/daimon_skycrawlers/generator/templates → templates}/new/app/processor.rb +0 -0
- data/{lib/daimon_skycrawlers/generator/templates → templates}/new/config/database.yml.erb +0 -0
- data/{lib/daimon_skycrawlers/generator/templates → templates}/new/config/init.rb +0 -0
- data/{lib/daimon_skycrawlers/generator/templates → templates}/new/docker-compose.yml.erb +0 -0
- data/{lib/daimon_skycrawlers/generator/templates → templates}/new/env.db.erb +0 -0
- data/{lib/daimon_skycrawlers/generator/templates → templates}/new/env.erb +0 -0
- data/{lib/daimon_skycrawlers/generator/templates → templates}/new/services/common/docker-entrypoint.sh +0 -0
- data/{lib/daimon_skycrawlers/generator/templates → templates}/new/services/db/init-user-db.sh +0 -0
- data/{lib/daimon_skycrawlers/generator/templates → templates}/processor.rb.erb +0 -0
- metadata +34 -19
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 648bff4fe2019f82836bcf21c75dc6e29d45abc8
|
4
|
+
data.tar.gz: f7691acca87a0686190171806a6a07aaa8f4015f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 378fe0060bacead511a87f637702cd5d24ce79c5acf3cba0bfac123d9dfc2cfcde41261a813dce69f9e11126d988df609a92df126f82235610db7c5a7332539b
|
7
|
+
data.tar.gz: '0790c05562fc71542a1cf33ca0971bbf356659d20012f7968c6c4ffd9ec9df521f0e6d009aa8256ec05a7d1b3e494baaec187d1faa8bbd34102fb73875edcfd0'
|
data/.yardopts
ADDED
data/daimon_skycrawlers.gemspec
CHANGED
@@ -36,6 +36,7 @@ Gem::Specification.new do |spec|
|
|
36
36
|
spec.add_development_dependency "test-unit-rr"
|
37
37
|
spec.add_development_dependency "test-unit-notify"
|
38
38
|
spec.add_development_dependency "pry"
|
39
|
+
spec.add_development_dependency "redcarpet"
|
39
40
|
spec.add_development_dependency "tapp"
|
40
41
|
spec.add_development_dependency "simplecov"
|
41
42
|
spec.add_development_dependency "sqlite3"
|
data/lib/daimon_skycrawlers.rb
CHANGED
@@ -3,7 +3,13 @@ require "bundler/setup"
|
|
3
3
|
require "daimon_skycrawlers/version"
|
4
4
|
require "daimon_skycrawlers/logger"
|
5
5
|
|
6
|
+
#
|
7
|
+
# Name space for this library
|
8
|
+
#
|
6
9
|
module DaimonSkycrawlers
|
10
|
+
#
|
11
|
+
# Configuration class
|
12
|
+
#
|
7
13
|
Configuration = Struct.new(
|
8
14
|
:logger,
|
9
15
|
:queue_name_prefix,
|
@@ -15,7 +21,7 @@ module DaimonSkycrawlers
|
|
15
21
|
# Register a processor
|
16
22
|
#
|
17
23
|
# @overload register_processor(processor)
|
18
|
-
# @param [Processor]
|
24
|
+
# @param processor [Processor] instance which implements `call` method
|
19
25
|
# @return [void]
|
20
26
|
#
|
21
27
|
# @overload register_processor
|
@@ -31,7 +37,7 @@ module DaimonSkycrawlers
|
|
31
37
|
#
|
32
38
|
# Register a crawler
|
33
39
|
#
|
34
|
-
# @param [Crawler]
|
40
|
+
# @param crawler [Crawler] instance which implements `fetch` method
|
35
41
|
# @return [void]
|
36
42
|
#
|
37
43
|
def register_crawler(crawler)
|
@@ -55,6 +61,20 @@ module DaimonSkycrawlers
|
|
55
61
|
#
|
56
62
|
# Configure DaimonSkycrawlers
|
57
63
|
#
|
64
|
+
# ```ruby
|
65
|
+
# DaimonSkycrawlers.configure do |config|
|
66
|
+
# config.logger = DaimonSkycrawlers::Logger.default
|
67
|
+
# config.queue_name_prefix = "daimon-skycrawlers"
|
68
|
+
# config.crawler_interval = 1
|
69
|
+
# config.shutdown_interval = 10
|
70
|
+
# end
|
71
|
+
# ```
|
72
|
+
#
|
73
|
+
# * logger: logger instance
|
74
|
+
# * queue_name_prefix: prefix of queue name.
|
75
|
+
# * crawler_interval: crawling interval
|
76
|
+
# * shutdown_interval: interval to wait after the queue becomes empty before shutting down
|
77
|
+
#
|
58
78
|
# @return [void]
|
59
79
|
# @yield [configuration] configure DaimonSkycrawlers
|
60
80
|
# @yieldparam configuration [DaimonSkycrawlers::Configuration] configuration object
|
@@ -75,6 +95,11 @@ module DaimonSkycrawlers
|
|
75
95
|
exit(false)
|
76
96
|
end
|
77
97
|
|
98
|
+
#
|
99
|
+
# Load "app/crawlers/**/*.rb"
|
100
|
+
#
|
101
|
+
# @return [void]
|
102
|
+
#
|
78
103
|
def load_crawlers
|
79
104
|
Dir.glob("app/crawlers/**/*.rb") do |path|
|
80
105
|
require(File.expand_path(path, Dir.pwd)) &&
|
@@ -82,6 +107,11 @@ module DaimonSkycrawlers
|
|
82
107
|
end
|
83
108
|
end
|
84
109
|
|
110
|
+
#
|
111
|
+
# Load "app/processors/**/*.rb"
|
112
|
+
#
|
113
|
+
# @return [void]
|
114
|
+
#
|
85
115
|
def load_processors
|
86
116
|
Dir.glob("app/processors/**/*.rb") do |path|
|
87
117
|
require(File.expand_path(path, Dir.pwd)) &&
|
@@ -1,11 +1,22 @@
|
|
1
1
|
module DaimonSkycrawlers
|
2
|
+
#
|
3
|
+
# This module provides simple callback system
|
4
|
+
#
|
2
5
|
module Callbacks
|
6
|
+
# @private
|
3
7
|
def initialize
|
4
8
|
super
|
5
9
|
@before_process_callbacks = []
|
6
10
|
@after_process_callbacks = []
|
7
11
|
end
|
8
12
|
|
13
|
+
#
|
14
|
+
# Register before process callback
|
15
|
+
#
|
16
|
+
# @param callback [Object] This object must respond to call
|
17
|
+
# @yield [message]
|
18
|
+
# @yieldparam message [Hash]
|
19
|
+
#
|
9
20
|
def before_process(callback = nil, &block)
|
10
21
|
if block_given?
|
11
22
|
@before_process_callbacks << block
|
@@ -14,16 +25,29 @@ module DaimonSkycrawlers
|
|
14
25
|
end
|
15
26
|
end
|
16
27
|
|
17
|
-
|
28
|
+
#
|
29
|
+
# Run registered before process callbacks
|
30
|
+
#
|
31
|
+
def run_before_process_callbacks(message)
|
18
32
|
@before_process_callbacks.all? do |callback|
|
19
33
|
callback.call(message)
|
20
34
|
end
|
21
35
|
end
|
22
36
|
|
37
|
+
#
|
38
|
+
# Clear all before process callbacks
|
39
|
+
#
|
23
40
|
def clear_before_process_callbacks
|
24
41
|
@before_process_callbacks = []
|
25
42
|
end
|
26
43
|
|
44
|
+
#
|
45
|
+
# Register after process callback
|
46
|
+
#
|
47
|
+
# @param callback [Object] This object must respond to call
|
48
|
+
# @yield [message]
|
49
|
+
# @yieldparam message [Hash]
|
50
|
+
#
|
27
51
|
def after_process(callback = nil, &block)
|
28
52
|
if block_given?
|
29
53
|
@after_process_callbacks << block
|
@@ -32,12 +56,18 @@ module DaimonSkycrawlers
|
|
32
56
|
end
|
33
57
|
end
|
34
58
|
|
35
|
-
|
59
|
+
#
|
60
|
+
# Run registered after process callbacks
|
61
|
+
#
|
62
|
+
def run_after_process_callbacks(message)
|
36
63
|
@after_process_callbacks.each do |callback|
|
37
64
|
callback.call(message)
|
38
65
|
end
|
39
66
|
end
|
40
67
|
|
68
|
+
#
|
69
|
+
# Clear all after process callbacks
|
70
|
+
#
|
41
71
|
def clear_after_process_callbacks
|
42
72
|
@after_process_callbacks = []
|
43
73
|
end
|
@@ -6,6 +6,10 @@ require "daimon_skycrawlers/commands/runner"
|
|
6
6
|
require "daimon_skycrawlers/version"
|
7
7
|
|
8
8
|
module DaimonSkycrawlers
|
9
|
+
#
|
10
|
+
# This class provides CLI (Command Line Interface)
|
11
|
+
#
|
12
|
+
# @private
|
9
13
|
class CLI < Thor
|
10
14
|
register(Generator::New, "new", "new NAME", "Create new project")
|
11
15
|
register(Generator::Generate, "generate", "generate COMMAND", "Generate new code")
|
@@ -7,7 +7,9 @@ require "thor"
|
|
7
7
|
require "webrobots"
|
8
8
|
|
9
9
|
module DaimonSkycrawlers
|
10
|
+
# @private
|
10
11
|
module Commands
|
12
|
+
# @private
|
11
13
|
class Enqueue < Thor
|
12
14
|
desc "url URL [key1:value1 key2:value2...]", "Enqueue URL for URL consumer"
|
13
15
|
def url(url, *rest)
|
@@ -72,7 +74,8 @@ module DaimonSkycrawlers
|
|
72
74
|
method_option("type", aliases: ["-t"], type: :string, default: "url", desc: "Specify type for URLs")
|
73
75
|
def yaml(path)
|
74
76
|
load_init
|
75
|
-
|
77
|
+
yaml_text = ERB.new(File.read(path), nil, "-").result(binding)
|
78
|
+
YAML.load(yaml_text).each do |hash|
|
76
79
|
url = hash["url"]
|
77
80
|
message = hash["message"] || {}
|
78
81
|
raise "Could not find URL: #{hash}" unless url
|
@@ -1,8 +1,13 @@
|
|
1
1
|
module DaimonSkycrawlers
|
2
|
+
#
|
3
|
+
# This module provides `#configure` to construct instance
|
4
|
+
#
|
2
5
|
module Configurable
|
3
6
|
#
|
4
|
-
# Configure
|
7
|
+
# Configure instance
|
5
8
|
#
|
9
|
+
# @yield [instance] give instance to the block
|
10
|
+
# @yieldparam instance [DaimonSkycrawlers::Crawler::Base|DaimonSkycrawlers::Processor::Base] self
|
6
11
|
# @return [DaimonSkycrawlers::Crawler::Base|DaimonSkycrawlers::Processor::Base] self
|
7
12
|
#
|
8
13
|
def configure
|
@@ -4,10 +4,15 @@ require "daimon_skycrawlers/config"
|
|
4
4
|
|
5
5
|
module DaimonSkycrawlers
|
6
6
|
module Consumer
|
7
|
+
#
|
8
|
+
# Base class for consumer
|
9
|
+
#
|
10
|
+
# @private
|
7
11
|
class Base
|
8
12
|
include DaimonSkycrawlers::LoggerMixin
|
9
13
|
include DaimonSkycrawlers::ConfigMixin
|
10
14
|
|
15
|
+
# @private
|
11
16
|
def process(message)
|
12
17
|
raise NotImplementedError, "Must implement in subclass"
|
13
18
|
end
|
@@ -17,7 +17,7 @@ module DaimonSkycrawlers
|
|
17
17
|
# Register a processor
|
18
18
|
#
|
19
19
|
# @overload register(processor)
|
20
|
-
# @param [Processor] processor instance which implements `call` method
|
20
|
+
# @param processor [Processor] processor instance which implements `call` method
|
21
21
|
# @return [void]
|
22
22
|
#
|
23
23
|
# @overload register
|
@@ -14,7 +14,7 @@ module DaimonSkycrawlers
|
|
14
14
|
#
|
15
15
|
# Register a given crawler
|
16
16
|
#
|
17
|
-
# @param [Crawler] crawler instance which implements `fetch` method
|
17
|
+
# @param crawler [Crawler] crawler instance which implements `fetch` method
|
18
18
|
# @return [void]
|
19
19
|
#
|
20
20
|
def register(crawler)
|
@@ -4,6 +4,9 @@ require "daimon_skycrawlers/timer"
|
|
4
4
|
require "daimon_skycrawlers/consumer/url"
|
5
5
|
|
6
6
|
module DaimonSkycrawlers
|
7
|
+
#
|
8
|
+
# Name space for crawler related classes
|
9
|
+
#
|
7
10
|
module Crawler
|
8
11
|
class << self
|
9
12
|
#
|
@@ -21,8 +24,8 @@ module DaimonSkycrawlers
|
|
21
24
|
#
|
22
25
|
# Enqueue a URL to crawler queue
|
23
26
|
#
|
24
|
-
# @param [String] Specify absolute URL
|
25
|
-
# @param [Hash] Extra parameters for crawler
|
27
|
+
# @param url [String] Specify absolute URL
|
28
|
+
# @param message [Hash] Extra parameters for crawler
|
26
29
|
# @return [void]
|
27
30
|
def enqueue_url(url, message = {})
|
28
31
|
message[:url] = url
|
@@ -18,6 +18,9 @@ module DaimonSkycrawlers
|
|
18
18
|
#
|
19
19
|
# The base class of crawler
|
20
20
|
#
|
21
|
+
# A crawler implementation can inherit this class and override
|
22
|
+
# `#fetch` in the class.
|
23
|
+
#
|
21
24
|
class Base
|
22
25
|
include DaimonSkycrawlers::LoggerMixin
|
23
26
|
include DaimonSkycrawlers::ConfigMixin
|
@@ -35,8 +38,9 @@ module DaimonSkycrawlers
|
|
35
38
|
attr_reader :n_processed_urls
|
36
39
|
|
37
40
|
#
|
38
|
-
# @param [String] Base URL for crawler
|
39
|
-
# @param [Hash] options for Faraday
|
41
|
+
# @param base_url [String] Base URL for crawler
|
42
|
+
# @param faraday_options [Hash] options for Faraday
|
43
|
+
# @param options [Hash] options for crawler
|
40
44
|
#
|
41
45
|
def initialize(base_url = nil, faraday_options: {}, options: {})
|
42
46
|
super()
|
@@ -46,12 +50,15 @@ module DaimonSkycrawlers
|
|
46
50
|
@prepare = ->(connection) {}
|
47
51
|
@skipped = false
|
48
52
|
@n_processed_urls = 0
|
53
|
+
|
54
|
+
setup_default_filters
|
55
|
+
setup_default_post_processes
|
49
56
|
end
|
50
57
|
|
51
58
|
#
|
52
59
|
# Set up connection
|
53
60
|
#
|
54
|
-
# @param [Hash] options for Faraday
|
61
|
+
# @param options [Hash] options for Faraday
|
55
62
|
# @yield [faraday]
|
56
63
|
# @yieldparam faraday [Faraday]
|
57
64
|
#
|
@@ -67,6 +74,8 @@ module DaimonSkycrawlers
|
|
67
74
|
# Call this method before DaimonSkycrawlers.register_crawler
|
68
75
|
# For example, you can login before fetch URL
|
69
76
|
#
|
77
|
+
# @yield [connection]
|
78
|
+
#
|
70
79
|
def prepare(&block)
|
71
80
|
@prepare = block
|
72
81
|
end
|
@@ -74,26 +83,39 @@ module DaimonSkycrawlers
|
|
74
83
|
#
|
75
84
|
# Retrieve storage instance
|
76
85
|
#
|
86
|
+
# @return [DaimonSkycrawlers::Storage::Base]
|
87
|
+
#
|
77
88
|
def storage
|
78
89
|
@storage ||= Storage::RDB.new
|
79
90
|
end
|
80
91
|
|
92
|
+
#
|
93
|
+
# @return [true|false]
|
94
|
+
#
|
81
95
|
def skipped?
|
82
96
|
@skipped
|
83
97
|
end
|
84
98
|
|
99
|
+
#
|
100
|
+
# @return [Faraday]
|
101
|
+
#
|
85
102
|
def connection
|
86
103
|
@connection ||= Faraday.new(@base_url, @faraday_options)
|
87
104
|
end
|
88
105
|
|
106
|
+
#
|
107
|
+
# Process crawler sequence
|
108
|
+
#
|
109
|
+
# 1. Run registered filters
|
110
|
+
# 1. Prepare connection
|
111
|
+
# 1. Download(fetch) data from given URL
|
112
|
+
# 1. Run post processes (store downloaded data to storage)
|
113
|
+
#
|
89
114
|
def process(message, &block)
|
90
115
|
@skipped = false
|
91
116
|
@n_processed_urls += 1
|
92
117
|
|
93
|
-
|
94
|
-
setup_default_post_processes
|
95
|
-
|
96
|
-
proceeding = run_before_callbacks(message)
|
118
|
+
proceeding = run_before_process_callbacks(message)
|
97
119
|
unless proceeding
|
98
120
|
skip(message[:url])
|
99
121
|
return
|
@@ -106,18 +128,44 @@ module DaimonSkycrawlers
|
|
106
128
|
@prepare.call(connection)
|
107
129
|
response = fetch(url, message, &block)
|
108
130
|
data = { url: url, message: message, response: response }
|
109
|
-
|
131
|
+
run_after_process_callbacks(data)
|
110
132
|
data
|
111
133
|
end
|
112
134
|
|
135
|
+
#
|
136
|
+
# Fetch URL
|
137
|
+
#
|
138
|
+
# Override this method in subclass.
|
139
|
+
#
|
140
|
+
# @param path [String] URI or path
|
141
|
+
# @param message [Hash] message can include anything
|
142
|
+
#
|
143
|
+
# @return [Faraday::Response] HTTP response
|
144
|
+
#
|
113
145
|
def fetch(path, message = {})
|
114
146
|
raise NotImplementedError, "Must implement this method in subclass"
|
115
147
|
end
|
116
148
|
|
149
|
+
#
|
150
|
+
# GET URL with params
|
151
|
+
#
|
152
|
+
# @param path [String] URI or path
|
153
|
+
# @param params [Hash] query parameters
|
154
|
+
#
|
155
|
+
# @return [Faraday::Response] HTTP response
|
156
|
+
#
|
117
157
|
def get(path, params = {})
|
118
158
|
@connection.get(path, params)
|
119
159
|
end
|
120
160
|
|
161
|
+
#
|
162
|
+
# POST URL with params
|
163
|
+
#
|
164
|
+
# @param path [String] URI or path
|
165
|
+
# @param params [Hash] request body parameters
|
166
|
+
#
|
167
|
+
# @return [Faraday::Response] HTTP response
|
168
|
+
#
|
121
169
|
def post(path, params = {})
|
122
170
|
@connection.post(path, params)
|
123
171
|
end
|