dh_easy-core 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +12 -0
- data/.travis.yml +7 -0
- data/.yardopts +1 -0
- data/CODE_OF_CONDUCT.md +74 -0
- data/Gemfile +6 -0
- data/LICENSE +21 -0
- data/README.md +20 -0
- data/Rakefile +22 -0
- data/dh_easy-core.gemspec +50 -0
- data/doc/DhEasy.html +117 -0
- data/doc/DhEasy/Core.html +1590 -0
- data/doc/DhEasy/Core/Config.html +311 -0
- data/doc/DhEasy/Core/Exception.html +117 -0
- data/doc/DhEasy/Core/Exception/OutdatedError.html +135 -0
- data/doc/DhEasy/Core/Helper.html +117 -0
- data/doc/DhEasy/Core/Helper/Cookie.html +1070 -0
- data/doc/DhEasy/Core/Mock.html +282 -0
- data/doc/DhEasy/Core/Mock/FakeDb.html +3779 -0
- data/doc/DhEasy/Core/Mock/FakeExecutor.html +3289 -0
- data/doc/DhEasy/Core/Mock/FakeFinisher.html +160 -0
- data/doc/DhEasy/Core/Mock/FakeParser.html +160 -0
- data/doc/DhEasy/Core/Mock/FakeSeeder.html +160 -0
- data/doc/DhEasy/Core/Plugin.html +117 -0
- data/doc/DhEasy/Core/Plugin/CollectionVault.html +299 -0
- data/doc/DhEasy/Core/Plugin/ConfigBehavior.html +541 -0
- data/doc/DhEasy/Core/Plugin/ContextIntegrator.html +445 -0
- data/doc/DhEasy/Core/Plugin/Executor.html +259 -0
- data/doc/DhEasy/Core/Plugin/ExecutorBehavior.html +344 -0
- data/doc/DhEasy/Core/Plugin/Finisher.html +265 -0
- data/doc/DhEasy/Core/Plugin/FinisherBehavior.html +142 -0
- data/doc/DhEasy/Core/Plugin/InitializeHook.html +220 -0
- data/doc/DhEasy/Core/Plugin/Parser.html +270 -0
- data/doc/DhEasy/Core/Plugin/ParserBehavior.html +235 -0
- data/doc/DhEasy/Core/Plugin/Seeder.html +674 -0
- data/doc/DhEasy/Core/Plugin/SeederBehavior.html +142 -0
- data/doc/DhEasy/Core/SmartCollection.html +1087 -0
- data/doc/_index.html +364 -0
- data/doc/class_list.html +51 -0
- data/doc/css/common.css +1 -0
- data/doc/css/full_list.css +58 -0
- data/doc/css/style.css +496 -0
- data/doc/file.README.html +91 -0
- data/doc/file_list.html +56 -0
- data/doc/frames.html +17 -0
- data/doc/index.html +91 -0
- data/doc/js/app.js +303 -0
- data/doc/js/full_list.js +216 -0
- data/doc/js/jquery.js +4 -0
- data/doc/method_list.html +939 -0
- data/doc/top-level-namespace.html +110 -0
- data/lib/dh_easy/core.rb +257 -0
- data/lib/dh_easy/core/config.rb +27 -0
- data/lib/dh_easy/core/exception.rb +8 -0
- data/lib/dh_easy/core/exception/outdated_error.rb +9 -0
- data/lib/dh_easy/core/helper.rb +8 -0
- data/lib/dh_easy/core/helper/cookie.rb +209 -0
- data/lib/dh_easy/core/mock.rb +45 -0
- data/lib/dh_easy/core/mock/fake_db.rb +561 -0
- data/lib/dh_easy/core/mock/fake_executor.rb +373 -0
- data/lib/dh_easy/core/mock/fake_finisher.rb +28 -0
- data/lib/dh_easy/core/mock/fake_parser.rb +33 -0
- data/lib/dh_easy/core/mock/fake_seeder.rb +28 -0
- data/lib/dh_easy/core/plugin.rb +19 -0
- data/lib/dh_easy/core/plugin/collection_vault.rb +23 -0
- data/lib/dh_easy/core/plugin/config_behavior.rb +43 -0
- data/lib/dh_easy/core/plugin/context_integrator.rb +60 -0
- data/lib/dh_easy/core/plugin/executor.rb +19 -0
- data/lib/dh_easy/core/plugin/executor_behavior.rb +32 -0
- data/lib/dh_easy/core/plugin/finisher.rb +19 -0
- data/lib/dh_easy/core/plugin/finisher_behavior.rb +9 -0
- data/lib/dh_easy/core/plugin/initialize_hook.rb +17 -0
- data/lib/dh_easy/core/plugin/parser.rb +19 -0
- data/lib/dh_easy/core/plugin/parser_behavior.rb +17 -0
- data/lib/dh_easy/core/plugin/seeder.rb +44 -0
- data/lib/dh_easy/core/plugin/seeder_behavior.rb +9 -0
- data/lib/dh_easy/core/smart_collection.rb +236 -0
- data/lib/dh_easy/core/version.rb +6 -0
- metadata +249 -0
@@ -0,0 +1,45 @@
|
|
1
|
+
require 'dh_easy/core/mock/fake_db'
|
2
|
+
require 'dh_easy/core/mock/fake_executor'
|
3
|
+
require 'dh_easy/core/mock/fake_parser'
|
4
|
+
require 'dh_easy/core/mock/fake_seeder'
|
5
|
+
require 'dh_easy/core/mock/fake_finisher'
|
6
|
+
|
7
|
+
module DhEasy
|
8
|
+
module Core
|
9
|
+
module Mock
|
10
|
+
# Generate a context and message queue from a list of exposed methods.
|
11
|
+
#
|
12
|
+
# @param [Array] exposed_methods List of exposed methods.
|
13
|
+
#
|
14
|
+
# @example
|
15
|
+
# exposed_methods = [:boo, :bar]
|
16
|
+
# context, message_queue = DhEasy::Core::Mock.context_vars exposed_methods
|
17
|
+
# context.boo 1, 2
|
18
|
+
# context.bar 'A', 'B'
|
19
|
+
# context.bar '111', '222'
|
20
|
+
# message_queue
|
21
|
+
# # => [
|
22
|
+
# # [:boo, [1, 2]],
|
23
|
+
# # [:bar, ['A', 'B']],
|
24
|
+
# # [:bar, ['111', '222']]
|
25
|
+
# # ]
|
26
|
+
#
|
27
|
+
# @return [Array] `[context, message_queue]` being:
|
28
|
+
# * `context`: Object implementing exposed methods.
|
29
|
+
# * `[Array] message_queue`: Array to store messages.
|
30
|
+
# Build a recorder object together with the queue that captures its calls.
#
# @param [Array] exposed_methods Names of the methods the recorder responds to.
#
# @return [Array] `[context, message_queue]`; every call made on `context`
#   is appended to `message_queue` as `[method_name, args]`.
def self.context_vars exposed_methods
  recorder = Object.new
  # Local array captured by the closure of every generated method.
  calls = []
  exposed_methods.each do |method_name|
    recorder.define_singleton_method(method_name) do |*call_args|
      calls << [method_name, call_args]
    end
  end
  [recorder, calls]
end
|
43
|
+
end
|
44
|
+
end
|
45
|
+
end
|
@@ -0,0 +1,561 @@
|
|
1
|
+
module DhEasy
|
2
|
+
module Core
|
3
|
+
module Mock
|
4
|
+
# Fake in memory database that emulates `DataHen` database objects' black box behavior.
|
5
|
+
class FakeDb
|
6
|
+
# Page id keys, analog to primary keys.
|
7
|
+
PAGE_KEYS = ['gid'].freeze
# Output id keys, analog to primary keys.
OUTPUT_KEYS = ['_id', '_collection'].freeze
# Job id keys, analog to primary keys.
JOB_KEYS = ['job_id'].freeze
# Available job statuses. Frozen like the key lists so the shared table
# cannot be mutated by accident.
JOB_STATUSES = {
  active: 'active',
  done: 'done',
  cancelled: 'cancelled',
  paused: 'paused'
}.freeze
# Default collection for saved outputs (frozen: it is handed out as a
# default value, so in-place mutation would leak between items).
DEFAULT_COLLECTION = 'default'.freeze
|
21
|
+
|
22
|
+
# Generate a smart collection with keys and initial values.
|
23
|
+
#
|
24
|
+
# @param [Array] keys Analog to primary keys, combination will be uniq.
|
25
|
+
# @param [Hash] opts Configuration options (see DhEasy::Core::SmartCollection#initialize).
|
26
|
+
#
|
27
|
+
# @return [DhEasy::Core::SmartCollection]
|
28
|
+
# Factory for the smart collections used by this fake database.
#
# @param [Array] keys Key fields handed to the collection.
# @param [Hash] opts Options forwarded untouched to the collection.
#
# @return [DhEasy::Core::SmartCollection]
def self.new_collection keys, opts = {}
  DhEasy::Core::SmartCollection.new(keys, opts)
end
|
31
|
+
|
32
|
+
# Generate a fake UUID.
|
33
|
+
#
|
34
|
+
# @param seed (nil) Object to use as seed for uuid.
|
35
|
+
#
|
36
|
+
# @return [String]
|
37
|
+
# Produce a SHA1 hex digest that stands in for a UUID.
#
# @param seed (nil) Value whose `to_s` is digested; when missing, the
#   current time plus a random number is used.
#
# @return [String]
def self.fake_uuid seed = nil
  seed = Time.new.to_f + rand unless seed
  Digest::SHA1.hexdigest(seed.to_s)
end
|
41
|
+
|
42
|
+
# Generate a fake UUID based on output fields without `_` prefix.
|
43
|
+
#
|
44
|
+
# @param [Hash] data Output data.
|
45
|
+
#
|
46
|
+
# @return [String]
|
47
|
+
# Derive a fake UUID from the output fields whose names do not start
# with `_`.
#
# @param [Hash] data Output data.
#
# @return [String]
def self.output_uuid data
  public_fields = data.select { |field, _value| field.to_s =~ /^[^_]/ }
  fake_uuid public_fields.hash
end
|
51
|
+
|
52
|
+
# Build a page with defaults by using FakeDb engine.
|
53
|
+
#
|
54
|
+
# @param [Hash] page Page initial values.
|
55
|
+
# @param [Hash] opts ({}) Configuration options (see #initialize).
|
56
|
+
#
|
57
|
+
# @return [Hash]
|
58
|
+
# Run a page through a throwaway FakeDb so it comes back with defaults.
#
# @param [Hash] page Page initial values.
# @param [Hash] opts ({}) Configuration options; both override flags
#   default to `true` here unless the caller supplies them.
#
# @return [Hash]
def self.build_page page, opts = {}
  override_defaults = { allow_page_gid_override: true, allow_job_id_override: true }
  scratch_db = DhEasy::Core::Mock::FakeDb.new(override_defaults.merge(opts))
  scratch_db.pages << page
  scratch_db.pages.first
end
|
67
|
+
|
68
|
+
# Build a fake page by using FakeDb engine.
|
69
|
+
#
|
70
|
+
# @param [Hash] opts ({}) Configuration options (see #initialize).
|
71
|
+
# @option opts [String] :url ('https://example.com') Page url.
|
72
|
+
#
|
73
|
+
# @return [Hash]
|
74
|
+
# Build a page that only carries a url, filled in by the FakeDb engine.
#
# @param [Hash] opts ({}) Configuration options, forwarded to `build_page`.
# @option opts [String] :url ('https://example.com') Page url.
#
# @return [Hash]
def self.build_fake_page opts = {}
  target_url = opts[:url] || 'https://example.com'
  build_page({ 'url' => target_url }, opts)
end
|
80
|
+
|
81
|
+
# Clean a URL to remove fragment, lowercase scheme and host, and sort
|
82
|
+
# query string.
|
83
|
+
#
|
84
|
+
# @param [String] raw_url URL to clean.
|
85
|
+
#
|
86
|
+
# @return [String]
|
87
|
+
def self.clean_uri raw_url
|
88
|
+
url = URI.parse(raw_url)
|
89
|
+
url.hostname = url.hostname.downcase
|
90
|
+
url.fragment = nil
|
91
|
+
|
92
|
+
# Sort query string keys
|
93
|
+
unless url.query.nil?
|
94
|
+
query_string = CGI.parse(url.query)
|
95
|
+
keys = query_string.keys.sort
|
96
|
+
data = []
|
97
|
+
keys.each do |key|
|
98
|
+
query_string[key].each do |value|
|
99
|
+
data << "#{URI.encode key}=#{URI.encode value}"
|
100
|
+
end
|
101
|
+
end
|
102
|
+
url.query = data.join('&')
|
103
|
+
end
|
104
|
+
url.to_s
|
105
|
+
end
|
106
|
+
|
107
|
+
# Format headers for gid generation.
|
108
|
+
# @private
|
109
|
+
#
|
110
|
+
# @param [Hash,nil] headers Headers hash.
|
111
|
+
#
|
112
|
+
# @return [Hash]
|
113
|
+
# Copy headers into a new hash, sorting any Array value.
# @private
#
# @param [Hash,nil] headers Headers hash.
#
# @return [Hash] Empty hash when `headers` is `nil`.
def self.format_headers headers
  return {} if headers.nil?

  headers.each_with_object({}) do |(name, content), formatted|
    formatted[name] = content.is_a?(Array) ? content.sort : content
  end
end
|
125
|
+
|
126
|
+
# Build a job with defaults by using FakeDb engine.
|
127
|
+
#
|
128
|
+
# @param [Hash] job Job initial values.
|
129
|
+
# @param [Hash] opts ({}) Configuration options (see #initialize).
|
130
|
+
#
|
131
|
+
# @return [Hash]
|
132
|
+
# Run a job through a throwaway FakeDb so it comes back with defaults.
#
# @param [Hash] job Job initial values.
# @param [Hash] opts ({}) Configuration options passed to `FakeDb.new`.
#
# @return [Hash] The last job stored in the temporary database.
def self.build_job job, opts = {}
  scratch_db = DhEasy::Core::Mock::FakeDb.new(opts)
  scratch_db.jobs << job
  scratch_db.jobs.last
end
|
137
|
+
|
138
|
+
# Build a fake job by using FakeDb engine.
|
139
|
+
#
|
140
|
+
# @param [Hash] opts ({}) Configuration options (see #initialize).
|
141
|
+
# @option opts [String] :scraper_name (nil) Scraper name.
|
142
|
+
# @option opts [Integer] :job_id (nil) Job id.
|
143
|
+
# @option opts [String] :status ('done').
|
144
|
+
#
|
145
|
+
# @return [Hash]
|
146
|
+
# Build a job from a few options, filled in by the FakeDb engine.
#
# @param [Hash] opts ({}) Configuration options, forwarded to `build_job`.
# @option opts [String] :scraper_name (nil) Scraper name.
# @option opts [Integer] :job_id (nil) Job id.
# @option opts [String] :status ('done').
#
# @return [Hash]
def self.build_fake_job opts = {}
  status = opts[:status] || 'done'
  seed_job = {
    'job_id' => opts[:job_id],
    'scraper_name' => opts[:scraper_name],
    'status' => status
  }
  build_job seed_job, opts
end
|
154
|
+
|
155
|
+
# Return a timestamp
|
156
|
+
#
|
157
|
+
# @param [Time] time (nil) Time from which to get time stamp.
|
158
|
+
#
|
159
|
+
# @return [String]
|
160
|
+
# Format a time as a UTC timestamp (`YYYY-MM-DDTHH:MM:SSZ`).
#
# @param [Time] time (nil) Time to format; defaults to the current time.
#
# @return [String]
def self.time_stamp time = nil
  time = Time.new if time.nil?
  # `getutc` returns a converted copy; `utc` would switch the caller's
  # Time object to UTC in place.
  time.getutc.strftime('%Y-%m-%dT%H:%M:%SZ')
end
|
164
|
+
|
165
|
+
# Get current job or create new one from values.
|
166
|
+
#
|
167
|
+
# @param [Integer] target_job_id (nil) Job id to ensure existence.
|
168
|
+
#
|
169
|
+
# @return [Hash]
|
170
|
+
# Return the job with the given id, creating it when it is missing.
#
# @param [Integer] target_job_id (nil) Job id to look up; defaults to
#   the current `job_id`.
#
# @return [Hash]
def ensure_job target_job_id = nil
  target_job_id = job_id if target_job_id.nil?
  existing = jobs.find { |candidate| candidate['job_id'] == target_job_id }
  return existing unless existing.nil?

  new_job = {
    'job_id' => target_job_id,
    'scraper_name' => scraper_name
  }
  # Only the current job is explicitly marked active; other jobs get
  # whatever status the collection defaults supply.
  new_job['status'] = 'active' if target_job_id == job_id
  jobs << new_job
  jobs.last
end
|
182
|
+
|
183
|
+
# Fake scraper_name.
|
184
|
+
# @return [String,nil]
|
185
|
+
# Fake scraper name, falling back to `'my_scraper'` when unset.
# @return [String]
def scraper_name
  @scraper_name = 'my_scraper' unless @scraper_name
  @scraper_name
end
|
188
|
+
|
189
|
+
# Set fake scraper_name value.
|
190
|
+
# Set the fake scraper name and copy it onto the current job record.
def scraper_name= value
  # The job is looked up (or created) before the name changes.
  current_job = ensure_job
  @scraper_name = value
  # Read back through the getter so a nil value becomes the default name.
  current_job['scraper_name'] = scraper_name
end
|
195
|
+
|
196
|
+
# Fake job id.
|
197
|
+
# @return [Integer,nil]
|
198
|
+
# Fake job id, generated on first use when none was assigned.
# @return [Integer]
def job_id
  @job_id = generate_job_id unless @job_id
  @job_id
end
|
201
|
+
|
202
|
+
# Set fake job id value.
|
203
|
+
# Set the fake job id and make sure a matching job record exists.
def job_id= value
  @job_id = value
  # With a nil value the getter generates an id, and the job is stored under it.
  ensure_job
  job_id
end
|
208
|
+
|
209
|
+
# Current fake page gid.
|
210
|
+
# @return [String,nil]
|
211
|
+
# Current fake page gid, generated on first use when none was assigned.
# @return [String]
def page_gid
  @page_gid = self.class.fake_uuid unless @page_gid
  @page_gid
end
|
214
|
+
|
215
|
+
# Set current fake page gid value.
|
216
|
+
# Set the current fake page gid. A nil value makes the getter generate
# a new gid on its next call.
def page_gid= value
  @page_gid = value
end
|
219
|
+
|
220
|
+
# Enable page gid override on page or output insert.
|
221
|
+
# Allow caller-supplied page gids on page or output insert.
# @return [Boolean] `true`
def enable_page_gid_override
  @allow_page_gid_override = true
end
|
224
|
+
|
225
|
+
# Disable page gid override on page or output insert.
|
226
|
+
# Forbid caller-supplied page gids on page or output insert.
# @return [Boolean] `false`
def disable_page_gid_override
  @allow_page_gid_override = false
end
|
229
|
+
|
230
|
+
# Specify whether page gid overriding by user is allowed on page or
|
231
|
+
# output insert.
|
232
|
+
#
|
233
|
+
# @return [Boolean] `true` when allowed, else `false`.
|
234
|
+
# Tell whether page gid overriding is currently allowed.
# @return [Boolean] `false` until the flag has been enabled.
def allow_page_gid_override?
  @allow_page_gid_override = false unless @allow_page_gid_override
  @allow_page_gid_override
end
|
237
|
+
|
238
|
+
# Enable job id override on page or output insert.
|
239
|
+
# Allow caller-supplied job ids on page or output insert.
# @return [Boolean] `true`
def enable_job_id_override
  @allow_job_id_override = true
end
|
242
|
+
|
243
|
+
# Disable job id override on page or output insert.
|
244
|
+
# Forbid caller-supplied job ids on page or output insert.
# @return [Boolean] `false`
def disable_job_id_override
  @allow_job_id_override = false
end
|
247
|
+
|
248
|
+
# Specify whether job id overriding by user is allowed on page or
|
249
|
+
# output insert.
|
250
|
+
#
|
251
|
+
# @return [Boolean] `true` when allowed, else `false`.
|
252
|
+
# Tell whether job id overriding is currently allowed.
# @return [Boolean] `false` until the flag has been enabled.
def allow_job_id_override?
  @allow_job_id_override = false unless @allow_job_id_override
  @allow_job_id_override
end
|
255
|
+
|
256
|
+
# Initialize fake database.
|
257
|
+
#
|
258
|
+
# @param [Hash] opts ({}) Configuration options.
|
259
|
+
# @option opts [Integer,nil] :job_id Job id default value.
|
260
|
+
# @option opts [String,nil] :scraper_name Scraper name default value.
|
261
|
+
# @option opts [String,nil] :page_gid Page gid default value.
|
262
|
+
# @option opts [Boolean, nil] :allow_page_gid_override (false) Specify
|
263
|
+
# whether page gid can be overridden on page or output insert.
|
264
|
+
# @option opts [Boolean, nil] :allow_job_id_override (false) Specify
|
265
|
+
# whether job id can be overridden on page or output insert.
|
266
|
+
# Initialize the fake database.
#
# @param [Hash] opts ({}) Configuration options.
# @option opts [Integer,nil] :job_id Job id default value.
# @option opts [String,nil] :scraper_name Scraper name default value.
# @option opts [String,nil] :page_gid Page gid default value.
# @option opts [Boolean,nil] :allow_page_gid_override (false)
# @option opts [Boolean,nil] :allow_job_id_override (false)
def initialize opts = {}
  # The setters run first: assigning the job id also stores the current
  # job record, and the scraper name setter then updates that record.
  self.job_id = opts[:job_id]
  self.scraper_name = opts[:scraper_name]
  self.page_gid = opts[:page_gid]
  # Missing (nil) flags coerce to false.
  @allow_page_gid_override = !!opts[:allow_page_gid_override]
  @allow_job_id_override = !!opts[:allow_job_id_override]
end
|
273
|
+
|
274
|
+
# Generate a fake scraper name.
|
275
|
+
#
|
276
|
+
# @return [String]
|
277
|
+
# Generate a fake scraper name from Faker's unique slug generator.
#
# @return [String]
def generate_scraper_name
  slug_source = Faker::Internet.unique
  slug_source.slug
end
|
280
|
+
|
281
|
+
# Generate a fake job_id.
|
282
|
+
#
|
283
|
+
# @return [Integer]
|
284
|
+
# Generate the next fake job id: 1 when no jobs are stored, otherwise
# one more than the highest stored `job_id`.
#
# @return [Integer]
def generate_job_id
  return 1 if jobs.count < 1

  newest = jobs.max { |left, right| left['job_id'] <=> right['job_id'] }
  newest['job_id'] + 1
end
|
287
|
+
|
288
|
+
# Get job keys with key generators to emulate saving on db.
|
289
|
+
# @private
|
290
|
+
#
|
291
|
+
# @return [Hash]
|
292
|
+
# Default values (plain values or one-argument generators) applied to
# stored jobs. Built once and reused.
# @private
#
# @return [Hash]
def job_defaults
  return @job_defaults if @job_defaults

  @job_defaults = {
    'job_id' => ->(_job) { generate_job_id },
    'scraper_name' => ->(_job) { generate_scraper_name },
    'status' => 'done',
    'created_at' => ->(_job) { Time.now }
  }
end
|
300
|
+
|
301
|
+
# Stored job collection
|
302
|
+
#
|
303
|
+
# @return [DhEasy::Core::SmartCollection]
|
304
|
+
# Stored job collection, built on first access.
#
# @return [DhEasy::Core::SmartCollection]
def jobs
  return @jobs unless @jobs.nil?

  store = self.class.new_collection(JOB_KEYS, defaults: job_defaults)
  # Stringify the keys of incoming items before defaults are applied.
  store.bind_event(:before_defaults) do |_store, raw_item|
    DhEasy::Core.deep_stringify_keys raw_item
  end
  # Items that still lack a job id get a generated one before insert.
  store.bind_event(:before_insert) do |_store, item, _match|
    item['job_id'] ||= generate_job_id
    item
  end
  @jobs ||= store
end
|
317
|
+
|
318
|
+
# Generate a fake UUID based on page data:
|
319
|
+
# * url
|
320
|
+
# * method
|
321
|
+
# * headers
|
322
|
+
# * fetch_type
|
323
|
+
# * cookie
|
324
|
+
# * no_redirect
|
325
|
+
# * body
|
326
|
+
# * ua_type
|
327
|
+
#
|
328
|
+
# @param [Hash] page_data Page data.
|
329
|
+
#
|
330
|
+
# @return [String]
|
331
|
+
# Generate a fake page gid as `<hostname>-<checksum>`, where the checksum
# is derived from url, method, headers, fetch_type, cookie, no_redirect,
# body and ua_type.
#
# @param [Hash] page_data Page data (string keys).
#
# @return [String]
def generate_page_gid page_data
  gid_fields = %w[url method headers fetch_type cookie no_redirect body ua_type]
  data = page_data.select { |name, _value| gid_fields.include? name }
  # Normalise each part before hashing.
  data['url'] = self.class.clean_uri data['url']
  data['headers'] = self.class.format_headers data['headers']
  unless data['cookie'].nil?
    data['cookie'] = DhEasy::Core::Helper::Cookie.parse_from_request data['cookie']
  end
  seed = data.select { |name, _value| gid_fields.include? name }.hash
  checksum = self.class.fake_uuid seed
  host = URI.parse(data['url']).hostname
  "#{host}-#{checksum}"
end
|
350
|
+
|
351
|
+
# Get page keys with key generators to emulate saving on db.
|
352
|
+
# @private
|
353
|
+
#
|
354
|
+
# @return [Hash]
|
355
|
+
# Default values (plain values or one-argument generators) applied to
# stored pages. Built once and reused.
# @private
#
# @return [Hash]
def page_defaults
  return @page_defaults if @page_defaults

  @page_defaults = {
    'url' => nil,
    'status' => 'to_fetch',
    # Resolved when invoked, so it follows the current job id.
    'job_id' => ->(_page) { job_id },
    'method' => 'GET',
    'headers' => {},
    'fetch_type' => 'standard',
    'cookie' => nil,
    'no_redirect' => false,
    'body' => nil,
    'ua_type' => 'desktop',
    'no_url_encode' => false,
    'http2' => false,
    'vars' => {}
  }
end
|
372
|
+
|
373
|
+
# Stored page collection.
|
374
|
+
#
|
375
|
+
# @return [DhEasy::Core::SmartCollection]
|
376
|
+
#
|
377
|
+
# @note Page gid will be replaced on insert by an auto generated uuid
|
378
|
+
# unless page gid overriding is enabled
|
379
|
+
# (see #allow_page_gid_override?)
|
380
|
+
# Stored page collection, built on first access.
#
# @return [DhEasy::Core::SmartCollection]
#
# @note Page gid is replaced on insert by a generated one unless page gid
#   overriding is enabled (see #allow_page_gid_override?).
def pages
  # Guard on @pages (the original tested a never-assigned @page and so
  # rebuilt a throwaway collection on every call).
  return @pages unless @pages.nil?

  collection = self.class.new_collection PAGE_KEYS,
    defaults: page_defaults
  # Stringify keys; drop a caller-supplied job id unless overriding is allowed.
  collection.bind_event(:before_defaults) do |_collection, raw_item|
    item = DhEasy::Core.deep_stringify_keys raw_item
    item.delete 'job_id' unless allow_job_id_override?
    item
  end
  # Generate the gid when it is missing or overriding is not allowed.
  collection.bind_event(:before_insert) do |_collection, item, _match|
    if item['gid'].nil? || !allow_page_gid_override?
      item['gid'] = generate_page_gid item
    end
    item
  end
  # Make sure the job referenced by the page exists.
  collection.bind_event(:after_insert) do |_collection, item|
    ensure_job item['job_id']
  end
  @pages ||= collection
end
|
401
|
+
|
402
|
+
# Generate a fake UUID for outputs.
|
403
|
+
#
|
404
|
+
# @param [Hash] data Output data.
|
405
|
+
#
|
406
|
+
# @return [String]
|
407
|
+
# Generate a fake id for an output.
#
# @param [Hash] data Output data (not used: the id is random).
#
# @return [String]
def generate_output_id data
  # No seed is passed, so the id does not depend on the output contents.
  random_id = self.class.fake_uuid
  random_id
end
|
411
|
+
|
412
|
+
# Get output keys with key generators to emulate saving on db.
|
413
|
+
# @private
|
414
|
+
#
|
415
|
+
# @return [Hash]
|
416
|
+
# Default values (plain values or one-argument generators) applied to
# stored outputs. Built once and reused.
# @private
#
# @return [Hash]
def output_defaults
  return @output_defaults if @output_defaults

  @output_defaults = {
    '_collection' => DEFAULT_COLLECTION,
    '_job_id' => ->(_output) { job_id },
    '_created_at' => ->(_output) { self.class.time_stamp },
    '_gid' => ->(_output) { page_gid }
  }
end
|
424
|
+
|
425
|
+
# Stored output collection
|
426
|
+
#
|
427
|
+
# @return [DhEasy::Core::SmartCollection]
|
428
|
+
# Stored output collection, built on first access.
#
# @return [DhEasy::Core::SmartCollection]
#
# @note Caller-supplied `_job_id` and `_gid` are dropped before defaults
#   are applied unless the matching override flag is enabled.
def outputs
  return @outputs unless @outputs.nil?

  collection = self.class.new_collection OUTPUT_KEYS,
    defaults: output_defaults
  collection.bind_event(:before_defaults) do |_collection, raw_item|
    item = DhEasy::Core.deep_stringify_keys raw_item
    item.delete '_job_id' unless allow_job_id_override?
    # The field is `_gid` (see output_defaults); the original deleted the
    # non-existent `_gid_id`, so the override flag was never enforced.
    item.delete '_gid' unless allow_page_gid_override?
    item
  end
  # Outputs without an id get a generated one before insert.
  collection.bind_event(:before_insert) do |_collection, item, _match|
    item['_id'] ||= generate_output_id item
    item
  end
  # Make sure the job referenced by the output exists.
  collection.bind_event(:after_insert) do |_collection, item|
    ensure_job item['_job_id']
  end
  @outputs ||= collection
end
|
447
|
+
|
448
|
+
# Match data to filters.
|
449
|
+
# @private
|
450
|
+
#
|
451
|
+
# @param data Hash containing data.
|
452
|
+
# @param filters Filters to apply on match.
|
453
|
+
#
|
454
|
+
# @return [Boolean]
|
455
|
+
#
|
456
|
+
# @note Missing and `nil` values on `data` will match when `filters`'
|
457
|
+
# field is `nil`.
|
458
|
+
# Tell whether every filter entry equals the matching entry in `data`.
# @private
#
# @param data Hash containing data.
# @param filters Filters to apply on match.
#
# @return [Boolean]
#
# @note A key missing from `data` reads as `nil`, so it matches a `nil` filter.
def match? data, filters
  filters.all? { |field, expected| data[field] == expected }
end
|
464
|
+
|
465
|
+
# Search items from a collection.
|
466
|
+
#
|
467
|
+
# @param [Symbol] collection Allowed values: `:outputs`, `:pages`, `:jobs`.
|
468
|
+
# @param [Hash] filter Filters to query.
|
469
|
+
# @param [Integer] offset (0) Search results offset.
|
470
|
+
# @param [Integer,nil] limit (nil) Limit search results count. Set to `nil` for unlimited.
|
471
|
+
#
|
472
|
+
# @raise ArgumentError On unknown collection.
|
473
|
+
#
|
474
|
+
# @note _Warning:_ It uses table scan to filter and should be used on test suites only.
|
475
|
+
# Search items from a collection with a full scan.
#
# @param [Symbol] collection One of `:outputs`, `:pages`, `:jobs`.
# @param [Hash] filter Filters to query.
# @param [Integer] offset (0) Number of matching items to skip.
# @param [Integer,nil] limit (nil) Maximum number of results; `nil` for unlimited.
#
# @return [Array] Matching items.
#
# @raise ArgumentError On unknown collection.
def query collection, filter, offset = 0, limit = nil
  # A non-positive limit can never yield results.
  return [] if !limit.nil? && limit <= 0

  source = case collection
           when :outputs then outputs
           when :pages then pages
           when :jobs then jobs
           else
             raise ArgumentError.new "Unknown collection #{collection}."
           end

  found = []
  seen = 0
  source.each do |candidate|
    next unless match? candidate, filter
    seen += 1

    # Skip matches until the offset has been consumed.
    next if seen <= offset
    # Stop once the limit has been reached.
    break if !limit.nil? && found.count >= limit
    found << candidate
  end
  found
end
|
505
|
+
|
506
|
+
# Refetch a page.
|
507
|
+
#
|
508
|
+
# @param [Integer] job_id Page's job_id to refetch.
|
509
|
+
# @param [String] gid Page's gid to refetch.
|
510
|
+
# Reset a stored page so it looks queued for fetching again.
#
# @param [Integer] job_id Page's job_id to refetch.
# @param [String] gid Page's gid to refetch.
#
# @raise [Exception] When no page matches `job_id` and `gid`.
def refetch job_id, gid
  page = pages.find_match('gid' => gid, 'job_id' => job_id)
  # NOTE(review): raises bare Exception, which a plain `rescue` does not catch.
  raise Exception.new("Page not found with job_id \"#{job_id}\" gid \"#{gid}\"") if page.nil?

  reset_values = {
    'status' => 'to_fetch',
    'freshness' => self.class.time_stamp,
    'to_fetch' => self.class.time_stamp,
    'fetched_from' => nil,
    'fetching_at' => '2001-01-01T00:00:00Z',
    'fetched_at' => nil,
    'fetching_try_count' => 0,
    'effective_url' => nil,
    'parsing_at' => nil,
    'parsing_failed_at' => nil,
    'parsed_at' => nil,
    'parsing_try_count' => 0,
    'parsing_fail_count' => 0,
    'parsing_updated_at' => '2001-01-01T00:00:00Z',
    'response_checksum' => nil,
    'response_status' => nil,
    'response_status_code' => nil,
    'response_headers' => nil,
    'response_cookie' => nil,
    'response_proto' => nil,
    'content_type' => nil,
    'content_size' => 0,
    'failed_response_status_code' => nil,
    'failed_response_headers' => nil,
    'failed_response_cookie' => nil,
    'failed_effective_url' => nil,
    'failed_at' => nil
  }
  reset_values.each { |field, value| page[field] = value }
  page['failed_content_type'] = nil
end
|
542
|
+
|
543
|
+
# Reparse a page.
|
544
|
+
#
|
545
|
+
# @param [Integer] job_id Page's job_id to reparse.
|
546
|
+
# @param [String] gid Page's gid to reparse.
|
547
|
+
# Reset a stored page so it looks queued for parsing again.
#
# @param [Integer] job_id Page's job_id to reparse.
# @param [String] gid Page's gid to reparse.
#
# @raise [Exception] When no page matches `job_id` and `gid`.
def reparse job_id, gid
  page = pages.find_match('gid' => gid, 'job_id' => job_id)
  # NOTE(review): raises bare Exception, which a plain `rescue` does not catch.
  raise Exception.new("Page not found with job_id \"#{job_id}\" gid \"#{gid}\"") if page.nil?

  reset_values = {
    'status' => 'to_parse',
    'parsing_at' => nil,
    'parsing_failed_at' => nil,
    'parsing_updated_at' => '2001-01-01T00:00:00Z',
    'parsed_at' => nil,
    'parsing_try_count' => 0
  }
  reset_values.each { |field, value| page[field] = value }
  page['parsing_fail_count'] = 0
end
|
558
|
+
end
|
559
|
+
end
|
560
|
+
end
|
561
|
+
end
|