dh_easy-core 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +12 -0
  3. data/.travis.yml +7 -0
  4. data/.yardopts +1 -0
  5. data/CODE_OF_CONDUCT.md +74 -0
  6. data/Gemfile +6 -0
  7. data/LICENSE +21 -0
  8. data/README.md +20 -0
  9. data/Rakefile +22 -0
  10. data/dh_easy-core.gemspec +50 -0
  11. data/doc/DhEasy.html +117 -0
  12. data/doc/DhEasy/Core.html +1590 -0
  13. data/doc/DhEasy/Core/Config.html +311 -0
  14. data/doc/DhEasy/Core/Exception.html +117 -0
  15. data/doc/DhEasy/Core/Exception/OutdatedError.html +135 -0
  16. data/doc/DhEasy/Core/Helper.html +117 -0
  17. data/doc/DhEasy/Core/Helper/Cookie.html +1070 -0
  18. data/doc/DhEasy/Core/Mock.html +282 -0
  19. data/doc/DhEasy/Core/Mock/FakeDb.html +3779 -0
  20. data/doc/DhEasy/Core/Mock/FakeExecutor.html +3289 -0
  21. data/doc/DhEasy/Core/Mock/FakeFinisher.html +160 -0
  22. data/doc/DhEasy/Core/Mock/FakeParser.html +160 -0
  23. data/doc/DhEasy/Core/Mock/FakeSeeder.html +160 -0
  24. data/doc/DhEasy/Core/Plugin.html +117 -0
  25. data/doc/DhEasy/Core/Plugin/CollectionVault.html +299 -0
  26. data/doc/DhEasy/Core/Plugin/ConfigBehavior.html +541 -0
  27. data/doc/DhEasy/Core/Plugin/ContextIntegrator.html +445 -0
  28. data/doc/DhEasy/Core/Plugin/Executor.html +259 -0
  29. data/doc/DhEasy/Core/Plugin/ExecutorBehavior.html +344 -0
  30. data/doc/DhEasy/Core/Plugin/Finisher.html +265 -0
  31. data/doc/DhEasy/Core/Plugin/FinisherBehavior.html +142 -0
  32. data/doc/DhEasy/Core/Plugin/InitializeHook.html +220 -0
  33. data/doc/DhEasy/Core/Plugin/Parser.html +270 -0
  34. data/doc/DhEasy/Core/Plugin/ParserBehavior.html +235 -0
  35. data/doc/DhEasy/Core/Plugin/Seeder.html +674 -0
  36. data/doc/DhEasy/Core/Plugin/SeederBehavior.html +142 -0
  37. data/doc/DhEasy/Core/SmartCollection.html +1087 -0
  38. data/doc/_index.html +364 -0
  39. data/doc/class_list.html +51 -0
  40. data/doc/css/common.css +1 -0
  41. data/doc/css/full_list.css +58 -0
  42. data/doc/css/style.css +496 -0
  43. data/doc/file.README.html +91 -0
  44. data/doc/file_list.html +56 -0
  45. data/doc/frames.html +17 -0
  46. data/doc/index.html +91 -0
  47. data/doc/js/app.js +303 -0
  48. data/doc/js/full_list.js +216 -0
  49. data/doc/js/jquery.js +4 -0
  50. data/doc/method_list.html +939 -0
  51. data/doc/top-level-namespace.html +110 -0
  52. data/lib/dh_easy/core.rb +257 -0
  53. data/lib/dh_easy/core/config.rb +27 -0
  54. data/lib/dh_easy/core/exception.rb +8 -0
  55. data/lib/dh_easy/core/exception/outdated_error.rb +9 -0
  56. data/lib/dh_easy/core/helper.rb +8 -0
  57. data/lib/dh_easy/core/helper/cookie.rb +209 -0
  58. data/lib/dh_easy/core/mock.rb +45 -0
  59. data/lib/dh_easy/core/mock/fake_db.rb +561 -0
  60. data/lib/dh_easy/core/mock/fake_executor.rb +373 -0
  61. data/lib/dh_easy/core/mock/fake_finisher.rb +28 -0
  62. data/lib/dh_easy/core/mock/fake_parser.rb +33 -0
  63. data/lib/dh_easy/core/mock/fake_seeder.rb +28 -0
  64. data/lib/dh_easy/core/plugin.rb +19 -0
  65. data/lib/dh_easy/core/plugin/collection_vault.rb +23 -0
  66. data/lib/dh_easy/core/plugin/config_behavior.rb +43 -0
  67. data/lib/dh_easy/core/plugin/context_integrator.rb +60 -0
  68. data/lib/dh_easy/core/plugin/executor.rb +19 -0
  69. data/lib/dh_easy/core/plugin/executor_behavior.rb +32 -0
  70. data/lib/dh_easy/core/plugin/finisher.rb +19 -0
  71. data/lib/dh_easy/core/plugin/finisher_behavior.rb +9 -0
  72. data/lib/dh_easy/core/plugin/initialize_hook.rb +17 -0
  73. data/lib/dh_easy/core/plugin/parser.rb +19 -0
  74. data/lib/dh_easy/core/plugin/parser_behavior.rb +17 -0
  75. data/lib/dh_easy/core/plugin/seeder.rb +44 -0
  76. data/lib/dh_easy/core/plugin/seeder_behavior.rb +9 -0
  77. data/lib/dh_easy/core/smart_collection.rb +236 -0
  78. data/lib/dh_easy/core/version.rb +6 -0
  79. metadata +249 -0
@@ -0,0 +1,45 @@
1
+ require 'dh_easy/core/mock/fake_db'
2
+ require 'dh_easy/core/mock/fake_executor'
3
+ require 'dh_easy/core/mock/fake_parser'
4
+ require 'dh_easy/core/mock/fake_seeder'
5
+ require 'dh_easy/core/mock/fake_finisher'
6
+
7
+ module DhEasy
8
+ module Core
9
+ module Mock
10
+ # Generate a context and message queue from a list of exposed methods.
11
+ #
12
+ # @param [Array] exposed_methods List of exposed methods.
13
+ #
14
+ # @example
15
+ # exposed_methods = [:boo, :bar]
16
+ # context, message_queue = DhEasy::Core::Mock.context_vars exposed_methods
17
+ # context.boo 1, 2
18
+ # context.bar 'A', 'B'
19
+ # context.bar '111', '222'
20
+ # message_queue
21
+ # # => [
22
+ # # [:boo, [1, 2]],
23
+ # # [:bar, ['A', 'B']],
24
+ # # [:bar, ['111', '222']]
25
+ # # ]
26
+ #
27
+ # @return [Array] `[context, message_queue]` being:
28
+ # * `context`: Object implementing exposed methods.
29
+ # * `[Array] message_queue`: Array to store messages.
30
def self.context_vars exposed_methods
  context = Object.new
  # Keep the queue in a plain local so that every generated method closes
  # over the very same array that is handed back to the caller.
  message_queue = []
  exposed_methods.each do |key|
    # Define the method on this one object only (its singleton class).
    context.define_singleton_method(key) do |*args|
      # Record all method calls into message queue for easy access
      message_queue << [key, args]
    end
  end
  [context, message_queue]
end
43
+ end
44
+ end
45
+ end
@@ -0,0 +1,561 @@
1
+ module DhEasy
2
+ module Core
3
+ module Mock
4
+ # Fake in memory database that emulates `DataHen` database objects' black box behavior.
5
+ class FakeDb
6
+ # Page id keys, analog to primary keys.
7
+ PAGE_KEYS = ['gid'].freeze
8
+ # Output id keys, analog to primary keys.
9
+ OUTPUT_KEYS = ['_id', '_collection'].freeze
10
+ # Job id keys, analog to primary keys.
11
+ JOB_KEYS = ['job_id'].freeze
12
+ # Job available status.
13
+ JOB_STATUSES = {
14
+ active: 'active',
15
+ done: 'done',
16
+ cancelled: 'cancelled',
17
+ paused: 'paused'
18
+ }
19
+ # Default collection for saved outputs
20
+ DEFAULT_COLLECTION = 'default'
21
+
22
+ # Generate a smart collection with keys and initial values.
23
+ #
24
+ # @param [Array] keys Analog to primary keys, combination will be uniq.
25
+ # @param [Hash] opts Configuration options (see DhEasy::Core::SmartCollection#initialize).
26
+ #
27
+ # @return [DhEasy::Core::SmartCollection]
28
def self.new_collection(keys, opts = {})
  # Thin factory: hands both arguments straight to SmartCollection.
  DhEasy::Core::SmartCollection.new(keys, opts)
end
31
+
32
+ # Generate a fake UUID.
33
+ #
34
+ # @param seed (nil) Object to use as seed for uuid.
35
+ #
36
+ # @return [String]
37
def self.fake_uuid(seed = nil)
  # Without a usable seed, fall back to the current clock plus a random
  # fraction so that successive calls produce different digests.
  seed = seed || (Time.new.to_f + rand)
  Digest::SHA1.hexdigest(seed.to_s)
end
41
+
42
+ # Generate a fake UUID based on output fields without `_` prefix.
43
+ #
44
+ # @param [Hash] data Output data.
45
+ #
46
+ # @return [String]
47
def self.output_uuid(data)
  # Only fields whose name does not start with `_` take part in the seed.
  public_fields = data.reject { |name, _value| name.to_s !~ /^[^_]/ }
  fake_uuid(public_fields.hash)
end
51
+
52
+ # Build a page with defaults by using FakeDb engine.
53
+ #
54
+ # @param [Hash] page Page initial values.
55
+ # @param [Hash] opts ({}) Configuration options (see #initialize).
56
+ #
57
+ # @return [Hash]
58
def self.build_page(page, opts = {})
  # Both overrides are on by default here; caller supplied options win.
  override_defaults = {
    allow_page_gid_override: true,
    allow_job_id_override: true
  }
  scratch_db = DhEasy::Core::Mock::FakeDb.new(override_defaults.merge(opts))
  # Insert into a throwaway db and read the stored page back.
  scratch_db.pages << page
  scratch_db.pages.first
end
67
+
68
+ # Build a fake page by using FakeDb engine.
69
+ #
70
+ # @param [Hash] opts ({}) Configuration options (see #initialize).
71
+ # @option opts [String] :url ('https://example.com') Page url.
72
+ #
73
+ # @return [Hash]
74
def self.build_fake_page(opts = {})
  # Use the placeholder url unless the caller provided one.
  url = opts[:url] || 'https://example.com'
  build_page({ 'url' => url }, opts)
end
80
+
81
+ # Clean a URL to remove fragment, lowercase scheme and host, and sort
82
+ # query string.
83
+ #
84
+ # @param [String] raw_url URL to clean.
85
+ #
86
+ # @return [String]
87
+ def self.clean_uri raw_url
88
+ url = URI.parse(raw_url)
89
+ url.hostname = url.hostname.downcase
90
+ url.fragment = nil
91
+
92
+ # Sort query string keys
93
+ unless url.query.nil?
94
+ query_string = CGI.parse(url.query)
95
+ keys = query_string.keys.sort
96
+ data = []
97
+ keys.each do |key|
98
+ query_string[key].each do |value|
99
+ data << "#{URI.encode key}=#{URI.encode value}"
100
+ end
101
+ end
102
+ url.query = data.join('&')
103
+ end
104
+ url.to_s
105
+ end
106
+
107
+ # Format headers for gid generation.
108
+ # @private
109
+ #
110
+ # @param [Hash,nil] headers Headers hash.
111
+ #
112
+ # @return [Hash]
113
def self.format_headers(headers)
  return {} if headers.nil?

  # Array values are sorted; every other value is copied over untouched.
  headers.each_with_object({}) do |(name, value), formatted|
    formatted[name] = value.is_a?(Array) ? value.sort : value
  end
end
125
+
126
+ # Build a job with defaults by using FakeDb engine.
127
+ #
128
+ # @param [Hash] job Job initial values.
129
+ # @param [Hash] opts ({}) Configuration options (see #initialize).
130
+ #
131
+ # @return [Hash]
132
def self.build_job(job, opts = {})
  # Insert into a throwaway db and read back the most recently stored job.
  scratch_db = DhEasy::Core::Mock::FakeDb.new(opts)
  scratch_db.jobs << job
  scratch_db.jobs.last
end
137
+
138
+ # Build a fake job by using FakeDb engine.
139
+ #
140
+ # @param [Hash] opts ({}) Configuration options (see #initialize).
141
+ # @option opts [String] :scraper_name (nil) Scraper name.
142
+ # @option opts [Integer] :job_id (nil) Job id.
143
+ # @option opts [String] :status ('done').
144
+ #
145
+ # @return [Hash]
146
def self.build_fake_job(opts = {})
  # Status falls back to 'done' when the caller does not provide one.
  status = opts[:status] || 'done'
  fake_job = {
    'job_id' => opts[:job_id],
    'scraper_name' => opts[:scraper_name],
    'status' => status
  }
  build_job(fake_job, opts)
end
154
+
155
+ # Return a timestamp
156
+ #
157
+ # @param [Time] time (nil) Time from which to get time stamp.
158
+ #
159
+ # @return [String]
160
def self.time_stamp time = nil
  time = Time.new if time.nil?
  # `Time#utc` converts the receiver in place, which would silently switch
  # the caller's object to UTC; `getutc` returns a converted copy instead.
  time.getutc.strftime('%Y-%m-%dT%H:%M:%SZ')
end
164
+
165
+ # Get current job or create new one from values.
166
+ #
167
+ # @param [Integer] target_job_id (nil) Job id to ensure existence.
168
+ #
169
+ # @return [Hash]
170
def ensure_job(target_job_id = nil)
  # No id given: operate on the current job.
  target_job_id = job_id if target_job_id.nil?

  existing = jobs.find { |entry| entry['job_id'] == target_job_id }
  return existing unless existing.nil?

  record = { 'job_id' => target_job_id, 'scraper_name' => scraper_name }
  # Only the current job is explicitly marked as active.
  record['status'] = 'active' if target_job_id == job_id
  jobs << record
  jobs.last
end
182
+
183
+ # Fake scraper_name.
184
+ # @return [String,nil]
185
def scraper_name
  # Falls back to the placeholder name whenever none is set.
  @scraper_name = 'my_scraper' unless @scraper_name
  @scraper_name
end
188
+
189
+ # Set fake scraper_name value.
190
def scraper_name=(value)
  # Look the current job up before changing the name, then mirror the
  # effective name (read back through the getter) onto that job record.
  current_job = ensure_job
  @scraper_name = value
  current_job['scraper_name'] = scraper_name
end
195
+
196
+ # Fake job id.
197
+ # @return [Integer,nil]
198
def job_id
  # Generate an id the first time it is needed, then keep it.
  @job_id = generate_job_id unless @job_id
  @job_id
end
201
+
202
+ # Set fake job id value.
203
def job_id=(value)
  @job_id = value
  # Make sure a job record exists for the effective job id.
  ensure_job
  job_id
end
208
+
209
+ # Current fake page gid.
210
+ # @return [String,nil]
211
def page_gid
  # Generate a fake uuid on first access when no gid was assigned.
  @page_gid = self.class.fake_uuid unless @page_gid
  @page_gid
end
214
+
215
+ # Set current fake page gid value.
216
def page_gid=(value)
  # Plain assignment; a nil value makes the reader generate a new gid.
  @page_gid = value
end
219
+
220
+ # Enable page gid override on page or output insert.
221
def enable_page_gid_override
  # Flag read back by #allow_page_gid_override?.
  @allow_page_gid_override = true
end
224
+
225
+ # Disable page gid override on page or output insert.
226
def disable_page_gid_override
  # Flag read back by #allow_page_gid_override?.
  @allow_page_gid_override = false
end
229
+
230
+ # Specify whether page gid overriding by user is allowed on page or
231
+ # output insert.
232
+ #
233
+ # @return [Boolean] `true` when allowed, else `false`.
234
def allow_page_gid_override?
  # An unset flag counts as (and is stored as) `false`.
  @allow_page_gid_override = false unless @allow_page_gid_override
  @allow_page_gid_override
end
237
+
238
+ # Enable job id override on page or output insert.
239
def enable_job_id_override
  # Flag read back by #allow_job_id_override?.
  @allow_job_id_override = true
end
242
+
243
+ # Disable job id override on page or output insert.
244
def disable_job_id_override
  # Flag read back by #allow_job_id_override?.
  @allow_job_id_override = false
end
247
+
248
+ # Specify whether job id overriding by user is allowed on page or
249
+ # output insert.
250
+ #
251
+ # @return [Boolean] `true` when allowed, else `false`.
252
def allow_job_id_override?
  # An unset flag counts as (and is stored as) `false`.
  @allow_job_id_override = false unless @allow_job_id_override
  @allow_job_id_override
end
255
+
256
+ # Initialize fake database.
257
+ #
258
+ # @param [Hash] opts ({}) Configuration options.
259
+ # @option opts [Integer,nil] :job_id Job id default value.
260
+ # @option opts [String,nil] :scraper_name Scraper name default value.
261
+ # @option opts [String,nil] :page_gid Page gid default value.
262
+ # @option opts [Boolean, nil] :allow_page_gid_override (false) Specify
263
+ # whether page gid can be overridden on page or output insert.
264
+ # @option opts [Boolean, nil] :allow_job_id_override (false) Specify
265
+ # whether job id can be overridden on page or output insert.
266
def initialize(opts = {})
  # The writers run in this order: job id first, then scraper name, then
  # page gid.
  self.job_id = opts[:job_id]
  self.scraper_name = opts[:scraper_name]
  self.page_gid = opts[:page_gid]
  # Coerce the override options to strict booleans; missing means false.
  @allow_page_gid_override = opts[:allow_page_gid_override] ? true : false
  @allow_job_id_override = opts[:allow_job_id_override] ? true : false
end
273
+
274
+ # Generate a fake scraper name.
275
+ #
276
+ # @return [String]
277
def generate_scraper_name
  # Delegates to Faker's unique slug generator.
  Faker::Internet.unique.slug
end
280
+
281
+ # Generate a fake job_id.
282
+ #
283
+ # @return [Integer]
284
def generate_job_id
  # First job gets id 1; afterwards use the highest stored id plus one.
  return 1 if jobs.count < 1

  newest = jobs.max { |left, right| left['job_id'] <=> right['job_id'] }
  newest['job_id'] + 1
end
287
+
288
+ # Get job keys with key generators to emulate saving on db.
289
+ # @private
290
+ #
291
+ # @return [Hash]
292
def job_defaults
  return @job_defaults if @job_defaults

  # Lambda entries take the job being built as their single argument;
  # plain values are used as they are.
  @job_defaults = {
    'job_id' => ->(job) { generate_job_id },
    'scraper_name' => ->(job) { generate_scraper_name },
    'status' => 'done',
    'created_at' => ->(job) { Time.now }
  }
end
300
+
301
+ # Stored job collection
302
+ #
303
+ # @return [DhEasy::Core::SmartCollection]
304
def jobs
  return @jobs if @jobs

  store = self.class.new_collection(JOB_KEYS, defaults: job_defaults)
  # Stringify incoming keys before defaults are applied.
  store.bind_event(:before_defaults) do |_collection, raw_item|
    DhEasy::Core.deep_stringify_keys raw_item
  end
  # Give the job an id if it still has none at insert time.
  store.bind_event(:before_insert) do |_collection, item, _match|
    item['job_id'] ||= generate_job_id
    item
  end
  @jobs ||= store
end
317
+
318
+ # Generate a fake UUID based on page data:
319
+ # * url
320
+ # * method
321
+ # * headers
322
+ # * fetch_type
323
+ # * cookie
324
+ # * no_redirect
325
+ # * body
326
+ # * ua_type
327
+ #
328
+ # @param [Hash] page_data Page data.
329
+ #
330
+ # @return [String]
331
def generate_page_gid(page_data)
  # Only these fields take part in the gid.
  fields = %w[url method headers fetch_type cookie no_redirect body ua_type]
  data = page_data.select { |name, _value| fields.include? name }

  # Normalize url, headers and cookie before hashing.
  data['url'] = self.class.clean_uri data['url']
  data['headers'] = self.class.format_headers data['headers']
  unless data['cookie'].nil?
    data['cookie'] = DhEasy::Core::Helper::Cookie.parse_from_request data['cookie']
  end

  # `data` only ever holds keys listed in `fields`, so it is hashed directly.
  checksum = self.class.fake_uuid data.hash
  host = URI.parse(data['url']).hostname
  "#{host}-#{checksum}"
end
350
+
351
+ # Get page keys with key generators to emulate saving on db.
352
+ # @private
353
+ #
354
+ # @return [Hash]
355
def page_defaults
  return @page_defaults if @page_defaults

  # 'job_id' is a lambda that reads the db's current job id when called;
  # every other entry is a literal default.
  @page_defaults = {
    'url' => nil,
    'status' => 'to_fetch',
    'job_id' => ->(page) { job_id },
    'method' => 'GET',
    'headers' => {},
    'fetch_type' => 'standard',
    'cookie' => nil,
    'no_redirect' => false,
    'body' => nil,
    'ua_type' => 'desktop',
    'no_url_encode' => false,
    'http2' => false,
    'vars' => {}
  }
end
372
+
373
+ # Stored page collection.
374
+ #
375
+ # @return [DhEasy::Core::SmartCollection]
376
+ #
377
+ # @note Page gid will be replaced on insert by an auto generated uuid
378
+ # unless page gid overriding is enabled
379
+ # (see #allow_page_gid_override?)
380
def pages
  # Reuse the memoized collection. The guard must test `@pages`: testing
  # `@page` (never assigned) made every call build and bind a throwaway
  # collection before returning the memoized one.
  return @pages unless @pages.nil?

  collection = self.class.new_collection PAGE_KEYS,
    defaults: page_defaults
  collection.bind_event(:before_defaults) do |_collection, raw_item|
    item = DhEasy::Core.deep_stringify_keys raw_item
    # A user supplied job id is dropped unless overriding is allowed.
    item.delete 'job_id' unless allow_job_id_override?
    item
  end
  collection.bind_event(:before_insert) do |_collection, item, _match|
    # A user supplied gid only survives when overriding is allowed.
    if item['gid'].nil? || !allow_page_gid_override?
      item['gid'] = generate_page_gid item
    end
    item
  end
  collection.bind_event(:after_insert) do |_collection, item|
    # Keep the job list in sync with the job ids pages refer to.
    ensure_job item['job_id']
  end
  @pages ||= collection
end
401
+
402
+ # Generate a fake UUID for outputs.
403
+ #
404
+ # @param [Hash] data Output data.
405
+ #
406
+ # @return [String]
407
def generate_output_id(data)
  # `data` is ignored: ids are random to match Datahen behavior.
  self.class.fake_uuid
end
411
+
412
+ # Get output keys with key generators to emulate saving on db.
413
+ # @private
414
+ #
415
+ # @return [Hash]
416
def output_defaults
  return @output_defaults if @output_defaults

  # Lambda entries read the db's current job id, time stamp and page gid
  # at the moment they are called.
  @output_defaults = {
    '_collection' => DEFAULT_COLLECTION,
    '_job_id' => ->(output) { job_id },
    '_created_at' => ->(output) { self.class.time_stamp },
    '_gid' => ->(output) { page_gid }
  }
end
424
+
425
+ # Stored output collection
426
+ #
427
+ # @return [DhEasy::Core::SmartCollection]
428
def outputs
  return @outputs unless @outputs.nil?

  collection = self.class.new_collection OUTPUT_KEYS,
    defaults: output_defaults
  collection.bind_event(:before_defaults) do |_collection, raw_item|
    item = DhEasy::Core.deep_stringify_keys raw_item
    # User supplied ids are dropped unless overriding is allowed, so the
    # defaults (current job id / page gid) are used instead.
    item.delete '_job_id' unless allow_job_id_override?
    # The output field is `_gid` (see #output_defaults); the previous key
    # `_gid_id` matched nothing, so the gid guard never took effect.
    item.delete '_gid' unless allow_page_gid_override?
    item
  end
  collection.bind_event(:before_insert) do |_collection, item, _match|
    item['_id'] ||= generate_output_id item
    item
  end
  collection.bind_event(:after_insert) do |_collection, item|
    # Keep the job list in sync with the job ids outputs refer to.
    ensure_job item['_job_id']
  end
  @outputs ||= collection
end
447
+
448
+ # Match data to filters.
449
+ # @private
450
+ #
451
+ # @param data Hash containing data.
452
+ # @param filters Filters to apply on match.
453
+ #
454
+ # @return [Boolean]
455
+ #
456
+ # @note Missing and `nil` values on `data` will match when `filters`'
457
+ # field is `nil`.
458
def match?(data, filters)
  # Every filter entry must equal the value stored under the same key; a
  # missing key reads as nil, so it matches a nil filter value.
  filters.all? { |key, expected| !(data[key] != expected) }
end
464
+
465
+ # Search items from a collection.
466
+ #
467
+ # @param [Symbol] collection Allowed values: `:outputs`, `:pages`, `:jobs`.
468
+ # @param [Hash] filter Filters to query.
469
+ # @param [Integer] offset (0) Search results offset.
470
+ # @param [Integer,nil] limit (nil) Limit search results count. Set to `nil` for unlimited.
471
+ #
472
+ # @raise ArgumentError On unknown collection.
473
+ #
474
+ # @note _Warning:_ It uses table scan to filter and should be used on test suites only.
475
def query(collection, filter, offset = 0, limit = nil)
  # A non-positive limit can never produce results.
  return [] unless limit.nil? || limit > 0

  # Get collection items
  items = case collection
          when :outputs then outputs
          when :pages then pages
          when :jobs then jobs
          else raise ArgumentError.new "Unknown collection #{collection}."
          end

  # Table scan: skip the first `offset` matches, then collect up to `limit`.
  found = []
  skipped = 0
  items.each do |item|
    next unless match? item, filter

    if offset > skipped
      skipped += 1
      next
    end
    break if !limit.nil? && found.count >= limit
    found << item
  end
  found
end
505
+
506
+ # Refetch a page.
507
+ #
508
+ # @param [Integer] job_id Page's job_id to refetch.
509
+ # @param [String] gid Page's gid to refetch.
510
def refetch job_id, gid
  page = pages.find_match('gid' => gid, 'job_id' => job_id)
  if page.nil?
    # Name the error class explicitly: inside the DhEasy::Core namespace a
    # bare `Exception` can resolve to DhEasy::Core::Exception (presumably a
    # namespace module -- confirm), which cannot be instantiated.
    # StandardError is still caught by `rescue Exception` callers.
    raise StandardError, "Page not found with job_id \"#{job_id}\" gid \"#{gid}\""
  end

  # Put the page back into its "waiting to be fetched" state and clear
  # everything recorded by earlier fetch/parse attempts.
  {
    'status' => 'to_fetch',
    'freshness' => self.class.time_stamp,
    'to_fetch' => self.class.time_stamp,
    'fetched_from' => nil,
    'fetching_at' => '2001-01-01T00:00:00Z',
    'fetched_at' => nil,
    'fetching_try_count' => 0,
    'effective_url' => nil,
    'parsing_at' => nil,
    'parsing_failed_at' => nil,
    'parsed_at' => nil,
    'parsing_try_count' => 0,
    'parsing_fail_count' => 0,
    'parsing_updated_at' => '2001-01-01T00:00:00Z',
    'response_checksum' => nil,
    'response_status' => nil,
    'response_status_code' => nil,
    'response_headers' => nil,
    'response_cookie' => nil,
    'response_proto' => nil,
    'content_type' => nil,
    'content_size' => 0,
    'failed_response_status_code' => nil,
    'failed_response_headers' => nil,
    'failed_response_cookie' => nil,
    'failed_effective_url' => nil,
    'failed_at' => nil,
    'failed_content_type' => nil
  }.each { |field, value| page[field] = value }
  # Same return value as before (result of the last `= nil` assignment).
  nil
end
542
+
543
+ # Reparse a page.
544
+ #
545
+ # @param [Integer] job_id Page's job_id to reparse.
546
+ # @param [String] gid Page's gid to reparse.
547
def reparse job_id, gid
  page = pages.find_match('gid' => gid, 'job_id' => job_id)
  if page.nil?
    # Name the error class explicitly: inside the DhEasy::Core namespace a
    # bare `Exception` can resolve to DhEasy::Core::Exception (presumably a
    # namespace module -- confirm), which cannot be instantiated.
    # StandardError is still caught by `rescue Exception` callers.
    raise StandardError, "Page not found with job_id \"#{job_id}\" gid \"#{gid}\""
  end

  # Put the page back into its "waiting to be parsed" state and clear the
  # parsing bookkeeping; fetch related fields are left alone.
  page['status'] = 'to_parse'
  page['parsing_at'] = nil
  page['parsing_failed_at'] = nil
  page['parsing_updated_at'] = '2001-01-01T00:00:00Z'
  page['parsed_at'] = nil
  page['parsing_try_count'] = 0
  page['parsing_fail_count'] = 0
end
558
+ end
559
+ end
560
+ end
561
+ end