dh_easy-core 0.2.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (79) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +12 -0
  3. data/.travis.yml +7 -0
  4. data/.yardopts +1 -0
  5. data/CODE_OF_CONDUCT.md +74 -0
  6. data/Gemfile +6 -0
  7. data/LICENSE +21 -0
  8. data/README.md +20 -0
  9. data/Rakefile +22 -0
  10. data/dh_easy-core.gemspec +50 -0
  11. data/doc/DhEasy.html +117 -0
  12. data/doc/DhEasy/Core.html +1590 -0
  13. data/doc/DhEasy/Core/Config.html +311 -0
  14. data/doc/DhEasy/Core/Exception.html +117 -0
  15. data/doc/DhEasy/Core/Exception/OutdatedError.html +135 -0
  16. data/doc/DhEasy/Core/Helper.html +117 -0
  17. data/doc/DhEasy/Core/Helper/Cookie.html +1070 -0
  18. data/doc/DhEasy/Core/Mock.html +282 -0
  19. data/doc/DhEasy/Core/Mock/FakeDb.html +3779 -0
  20. data/doc/DhEasy/Core/Mock/FakeExecutor.html +3289 -0
  21. data/doc/DhEasy/Core/Mock/FakeFinisher.html +160 -0
  22. data/doc/DhEasy/Core/Mock/FakeParser.html +160 -0
  23. data/doc/DhEasy/Core/Mock/FakeSeeder.html +160 -0
  24. data/doc/DhEasy/Core/Plugin.html +117 -0
  25. data/doc/DhEasy/Core/Plugin/CollectionVault.html +299 -0
  26. data/doc/DhEasy/Core/Plugin/ConfigBehavior.html +541 -0
  27. data/doc/DhEasy/Core/Plugin/ContextIntegrator.html +445 -0
  28. data/doc/DhEasy/Core/Plugin/Executor.html +259 -0
  29. data/doc/DhEasy/Core/Plugin/ExecutorBehavior.html +344 -0
  30. data/doc/DhEasy/Core/Plugin/Finisher.html +265 -0
  31. data/doc/DhEasy/Core/Plugin/FinisherBehavior.html +142 -0
  32. data/doc/DhEasy/Core/Plugin/InitializeHook.html +220 -0
  33. data/doc/DhEasy/Core/Plugin/Parser.html +270 -0
  34. data/doc/DhEasy/Core/Plugin/ParserBehavior.html +235 -0
  35. data/doc/DhEasy/Core/Plugin/Seeder.html +674 -0
  36. data/doc/DhEasy/Core/Plugin/SeederBehavior.html +142 -0
  37. data/doc/DhEasy/Core/SmartCollection.html +1087 -0
  38. data/doc/_index.html +364 -0
  39. data/doc/class_list.html +51 -0
  40. data/doc/css/common.css +1 -0
  41. data/doc/css/full_list.css +58 -0
  42. data/doc/css/style.css +496 -0
  43. data/doc/file.README.html +91 -0
  44. data/doc/file_list.html +56 -0
  45. data/doc/frames.html +17 -0
  46. data/doc/index.html +91 -0
  47. data/doc/js/app.js +303 -0
  48. data/doc/js/full_list.js +216 -0
  49. data/doc/js/jquery.js +4 -0
  50. data/doc/method_list.html +939 -0
  51. data/doc/top-level-namespace.html +110 -0
  52. data/lib/dh_easy/core.rb +257 -0
  53. data/lib/dh_easy/core/config.rb +27 -0
  54. data/lib/dh_easy/core/exception.rb +8 -0
  55. data/lib/dh_easy/core/exception/outdated_error.rb +9 -0
  56. data/lib/dh_easy/core/helper.rb +8 -0
  57. data/lib/dh_easy/core/helper/cookie.rb +209 -0
  58. data/lib/dh_easy/core/mock.rb +45 -0
  59. data/lib/dh_easy/core/mock/fake_db.rb +561 -0
  60. data/lib/dh_easy/core/mock/fake_executor.rb +373 -0
  61. data/lib/dh_easy/core/mock/fake_finisher.rb +28 -0
  62. data/lib/dh_easy/core/mock/fake_parser.rb +33 -0
  63. data/lib/dh_easy/core/mock/fake_seeder.rb +28 -0
  64. data/lib/dh_easy/core/plugin.rb +19 -0
  65. data/lib/dh_easy/core/plugin/collection_vault.rb +23 -0
  66. data/lib/dh_easy/core/plugin/config_behavior.rb +43 -0
  67. data/lib/dh_easy/core/plugin/context_integrator.rb +60 -0
  68. data/lib/dh_easy/core/plugin/executor.rb +19 -0
  69. data/lib/dh_easy/core/plugin/executor_behavior.rb +32 -0
  70. data/lib/dh_easy/core/plugin/finisher.rb +19 -0
  71. data/lib/dh_easy/core/plugin/finisher_behavior.rb +9 -0
  72. data/lib/dh_easy/core/plugin/initialize_hook.rb +17 -0
  73. data/lib/dh_easy/core/plugin/parser.rb +19 -0
  74. data/lib/dh_easy/core/plugin/parser_behavior.rb +17 -0
  75. data/lib/dh_easy/core/plugin/seeder.rb +44 -0
  76. data/lib/dh_easy/core/plugin/seeder_behavior.rb +9 -0
  77. data/lib/dh_easy/core/smart_collection.rb +236 -0
  78. data/lib/dh_easy/core/version.rb +6 -0
  79. metadata +249 -0
@@ -0,0 +1,45 @@
1
+ require 'dh_easy/core/mock/fake_db'
2
+ require 'dh_easy/core/mock/fake_executor'
3
+ require 'dh_easy/core/mock/fake_parser'
4
+ require 'dh_easy/core/mock/fake_seeder'
5
+ require 'dh_easy/core/mock/fake_finisher'
6
+
7
module DhEasy
  module Core
    module Mock
      # Build a recording context object together with the queue that
      # collects every call made on it.
      #
      # Each name in `exposed_methods` becomes a singleton method on the
      # context; calling it appends `[method_name, args]` to the queue.
      #
      # @param [Array] exposed_methods List of exposed methods.
      #
      # @example
      #   exposed_methods = [:boo, :bar]
      #   context, message_queue = DhEasy::Core::Mock.context_vars exposed_methods
      #   context.boo 1, 2
      #   context.bar 'A', 'B'
      #   context.bar '111', '222'
      #   message_queue
      #   # => [
      #   #   [:boo, [1, 2]],
      #   #   [:bar, ['A', 'B']],
      #   #   [:bar, ['111', '222']]
      #   # ]
      #
      # @return [Array] `[context, message_queue]` being:
      #   * `context`: Object implementing exposed methods.
      #   * `[Array] message_queue`: Array to store messages.
      def self.context_vars exposed_methods
        recorder = Object.new
        # Captured by every generated method, so all calls land in one array.
        message_queue = []
        exposed_methods.each do |method_name|
          recorder.define_singleton_method(method_name) do |*args|
            message_queue << [method_name, args]
          end
        end
        [recorder, message_queue]
      end
    end
  end
end
@@ -0,0 +1,561 @@
1
+ module DhEasy
2
+ module Core
3
+ module Mock
4
+ # Fake in memory database that emulates `DataHen` database objects' black box behavior.
5
+ class FakeDb
6
+ # Page id keys, analog to primary keys.
7
+ PAGE_KEYS = ['gid'].freeze
8
+ # Output id keys, analog to primary keys.
9
+ OUTPUT_KEYS = ['_id', '_collection'].freeze
10
+ # Job id keys, analog to primary keys.
11
+ JOB_KEYS = ['job_id'].freeze
12
+ # Job available status.
13
+ JOB_STATUSES = {
14
+ active: 'active',
15
+ done: 'done',
16
+ cancelled: 'cancelled',
17
+ paused: 'paused'
18
+ }
19
+ # Default collection for saved outputs
20
+ DEFAULT_COLLECTION = 'default'
21
+
22
# Create a smart collection for the given key set.
#
# @param [Array] keys Analog to primary keys, combination will be uniq.
# @param [Hash] opts Configuration options, forwarded untouched
#   (see DhEasy::Core::SmartCollection#initialize).
#
# @return [DhEasy::Core::SmartCollection]
def self.new_collection keys, opts = {}
  DhEasy::Core::SmartCollection.new(keys, opts)
end
31
+
32
# Produce a fake UUID: the SHA1 hex digest of the seed's string form.
#
# @param seed (nil) Object to use as seed for uuid. When omitted (or
#   falsy) the current time plus a random number is used instead.
#
# @return [String]
def self.fake_uuid seed = nil
  seed = Time.new.to_f + rand unless seed
  Digest::SHA1.hexdigest(seed.to_s)
end
41
+
42
# Produce a fake UUID seeded by the output's fields whose key does not
# start with `_`.
#
# @param [Hash] data Output data.
#
# @return [String]
def self.output_uuid data
  public_fields = data.reject { |field, _value| field.to_s !~ /^[^_]/ }
  fake_uuid public_fields.hash
end
51
+
52
# Build a page with defaults by inserting it into a throwaway FakeDb and
# reading it back.
#
# Page gid and job id overriding are enabled unless `opts` says otherwise.
#
# @param [Hash] page Page initial values.
# @param [Hash] opts ({}) Configuration options (see #initialize).
#
# @return [Hash]
def self.build_page page, opts = {}
  override_defaults = {
    allow_page_gid_override: true,
    allow_job_id_override: true
  }
  scratch_db = DhEasy::Core::Mock::FakeDb.new(override_defaults.merge(opts))
  scratch_db.pages << page
  scratch_db.pages.first
end
67
+
68
# Build a fake page (only a url is provided) by using FakeDb engine.
#
# @param [Hash] opts ({}) Configuration options (see #initialize).
# @option opts [String] :url ('https://example.com') Page url.
#
# @return [Hash]
def self.build_fake_page opts = {}
  url = opts[:url] || 'https://example.com'
  build_page({ 'url' => url }, opts)
end
80
+
81
+ # Clean an URL to remove fragment, lowercase schema and host, and sort
82
+ # query string.
83
+ #
84
+ # @param [String] raw_url URL to clean.
85
+ #
86
+ # @return [String]
87
+ def self.clean_uri raw_url
88
+ url = URI.parse(raw_url)
89
+ url.hostname = url.hostname.downcase
90
+ url.fragment = nil
91
+
92
+ # Sort query string keys
93
+ unless url.query.nil?
94
+ query_string = CGI.parse(url.query)
95
+ keys = query_string.keys.sort
96
+ data = []
97
+ keys.each do |key|
98
+ query_string[key].each do |value|
99
+ data << "#{URI.encode key}=#{URI.encode value}"
100
+ end
101
+ end
102
+ url.query = data.join('&')
103
+ end
104
+ url.to_s
105
+ end
106
+
107
# Normalize headers for gid generation: array values are sorted, every
# other value is copied as is.
# @private
#
# @param [Hash,nil] headers Headers hash.
#
# @return [Hash] New hash; `{}` when `headers` is `nil`.
def self.format_headers headers
  return {} if headers.nil?
  headers.each_with_object({}) do |(name, value), formatted|
    formatted[name] = value.is_a?(Array) ? value.sort : value
  end
end
125
+
126
# Build a job with defaults by inserting it into a throwaway FakeDb and
# reading back the last stored job.
#
# @param [Hash] job Job initial values.
# @param [Hash] opts ({}) Configuration options (see #initialize).
#
# @return [Hash]
def self.build_job job, opts = {}
  scratch_db = DhEasy::Core::Mock::FakeDb.new(opts)
  scratch_db.jobs << job
  scratch_db.jobs.last
end
137
+
138
# Build a fake job by using FakeDb engine.
#
# @param [Hash] opts ({}) Configuration options (see #initialize).
# @option opts [String] :scraper_name (nil) Scraper name.
# @option opts [Integer] :job_id (nil) Job id.
# @option opts [String] :status ('done').
#
# @return [Hash]
def self.build_fake_job opts = {}
  status = opts[:status] || 'done'
  build_job(
    { 'job_id' => opts[:job_id], 'scraper_name' => opts[:scraper_name], 'status' => status },
    opts
  )
end
154
+
155
# Format a time as a UTC `YYYY-MM-DDTHH:MM:SSZ` timestamp.
#
# @param [Time] time (nil) Time from which to get time stamp; current
#   time when `nil`.
#
# @return [String]
def self.time_stamp time = nil
  moment = time.nil? ? Time.new : time
  moment.utc.strftime('%Y-%m-%dT%H:%M:%SZ')
end
164
+
165
# Find the job with the given id, creating and storing it when missing.
#
# A newly created job gets the current scraper name, and an `'active'`
# status only when it is the current job.
#
# @param [Integer] target_job_id (nil) Job id to ensure existance;
#   defaults to the current job id.
#
# @return [Hash]
def ensure_job target_job_id = nil
  target_job_id = job_id if target_job_id.nil?
  existing = jobs.find { |candidate| candidate['job_id'] == target_job_id }
  return existing unless existing.nil?

  new_job = {
    'job_id' => target_job_id,
    'scraper_name' => scraper_name
  }
  new_job['status'] = 'active' if target_job_id == job_id
  jobs << new_job
  jobs.last
end
182
+
183
# Fake scraper_name; falls back to `'my_scraper'` when none is set.
# @return [String,nil]
def scraper_name
  @scraper_name = 'my_scraper' unless @scraper_name
  @scraper_name
end
188
+
189
# Set fake scraper_name value and mirror it onto the current job.
#
# The current job is resolved before the value changes; the job then
# receives the effective name (the default one when `value` is `nil`).
def scraper_name= value
  current_job = ensure_job
  @scraper_name = value
  current_job['scraper_name'] = scraper_name
end
195
+
196
# Fake job id; generated on first use when none is set.
# @return [Integer,nil]
def job_id
  return @job_id if @job_id
  @job_id = generate_job_id
end
201
+
202
# Set fake job id value and make sure a matching job exists.
def job_id= value
  @job_id = value
  # Creates the job for the new id when it is not stored yet.
  ensure_job
  job_id
end
208
+
209
# Current fake page gid; a fake uuid is generated on first use when none
# is set.
# @return [Integer,nil]
def page_gid
  return @page_gid if @page_gid
  @page_gid = self.class.fake_uuid
end
214
+
215
# Replace the current fake page gid.
#
# @param value New page gid; `nil` makes #page_gid generate one on next read.
def page_gid= value
  @page_gid = value
end
219
+
220
# Allow user provided page gid values on page or output insert.
#
# @return [Boolean] `true`.
def enable_page_gid_override
  @allow_page_gid_override = true
end
224
+
225
# Forbid user provided page gid values on page or output insert.
#
# @return [Boolean] `false`.
def disable_page_gid_override
  @allow_page_gid_override = false
end
229
+
230
# Tell whether page gid overriding by user is allowed on page or output
# insert.
#
# @return [Boolean] `true` when allowed, else `false`.
def allow_page_gid_override?
  @allow_page_gid_override = false unless @allow_page_gid_override
  @allow_page_gid_override
end
237
+
238
# Allow user provided job id values on page or output insert.
#
# @return [Boolean] `true`.
def enable_job_id_override
  @allow_job_id_override = true
end
242
+
243
# Forbid user provided job id values on page or output insert.
#
# @return [Boolean] `false`.
def disable_job_id_override
  @allow_job_id_override = false
end
247
+
248
# Tell whether job id overriding by user is allowed on page or output
# insert.
#
# @return [Boolean] `true` when allowed, else `false`.
def allow_job_id_override?
  @allow_job_id_override = false unless @allow_job_id_override
  @allow_job_id_override
end
255
+
256
# Initialize fake database.
#
# Values go through the attribute writers, so the current job is created
# as a side effect of setting the job id and scraper name.
#
# @param [Hash] opts ({}) Configuration options.
# @option opts [Integer,nil] :job_id Job id default value.
# @option opts [String,nil] :scraper_name Scraper name default value.
# @option opts [String,nil] :page_gid Page gid default value.
# @option opts [Boolean, nil] :allow_page_gid_override (false) Specify
#   whenever page gid can be overrided on page or output insert.
# @option opts [Boolean, nil] :allow_job_id_override (false) Specify
#   whenever job id can be overrided on page or output insert.
def initialize opts = {}
  self.job_id = opts[:job_id]
  self.scraper_name = opts[:scraper_name]
  self.page_gid = opts[:page_gid]
  # Coerce to strict booleans; a missing option counts as disabled.
  @allow_page_gid_override = opts[:allow_page_gid_override] ? true : false
  @allow_job_id_override = opts[:allow_job_id_override] ? true : false
end
273
+
274
# Generate a fake scraper name from Faker's unique internet slug generator.
#
# @return [String]
def generate_scraper_name
  slug_source = Faker::Internet.unique
  slug_source.slug
end
280
+
281
# Generate a fake job_id: `1` when no job is stored, otherwise one more
# than the highest stored job id.
#
# @return [Integer]
def generate_job_id
  return 1 if jobs.count < 1
  newest = jobs.max_by { |job| job['job_id'] }
  newest['job_id'] + 1
end
287
+
288
# Get job keys with their default values or generators to emulate saving
# on db. Built once and memoized.
# @private
#
# @return [Hash]
def job_defaults
  return @job_defaults if @job_defaults
  @job_defaults = {
    'job_id' => lambda { |_job| generate_job_id },
    'scraper_name' => lambda { |_job| generate_scraper_name },
    'status' => 'done',
    'created_at' => lambda { |_job| Time.now }
  }
end
300
+
301
# Stored job collection, built lazily on first access.
#
# Inserted items get their keys stringified before defaults are applied,
# and a generated job id when they have none.
#
# @return [DhEasy::Core::SmartCollection]
def jobs
  return @jobs unless @jobs.nil?

  store = self.class.new_collection JOB_KEYS, defaults: job_defaults
  store.bind_event(:before_defaults) do |_store, raw_item|
    DhEasy::Core.deep_stringify_keys raw_item
  end
  store.bind_event(:before_insert) do |_store, item, _match|
    item['job_id'] ||= generate_job_id
    item
  end
  @jobs ||= store
end
317
+
318
# Generate a fake page gid, `"<hostname>-<checksum>"`, where the checksum
# is a fake uuid seeded by these page fields:
# * url (cleaned)
# * method
# * headers (formatted)
# * fetch_type
# * cookie (parsed, when present)
# * no_redirect
# * body
# * ua_type
#
# @param [Hash] page_data Page data.
#
# @return [String]
def generate_page_gid page_data
  gid_fields = %w[url method headers fetch_type cookie no_redirect body ua_type]
  gid_data = page_data.select { |field, _value| gid_fields.include? field }
  gid_data['url'] = self.class.clean_uri gid_data['url']
  gid_data['headers'] = self.class.format_headers gid_data['headers']
  unless gid_data['cookie'].nil?
    gid_data['cookie'] = DhEasy::Core::Helper::Cookie.parse_from_request gid_data['cookie']
  end
  seed = gid_data.select { |field, _value| gid_fields.include? field }.hash
  checksum = self.class.fake_uuid seed
  host = URI.parse(gid_data['url']).hostname
  "#{host}-#{checksum}"
end
350
+
351
# Get page keys with their default values or generators to emulate saving
# on db. Built once and memoized.
# @private
#
# @return [Hash]
def page_defaults
  return @page_defaults if @page_defaults
  @page_defaults = {
    'url' => nil,
    'status' => 'to_fetch',
    # Resolved on use so pages follow the current job id.
    'job_id' => lambda { |_page| job_id },
    'method' => 'GET',
    'headers' => {},
    'fetch_type' => 'standard',
    'cookie' => nil,
    'no_redirect' => false,
    'body' => nil,
    'ua_type' => 'desktop',
    'no_url_encode' => false,
    'http2' => false,
    'vars' => {}
  }
end
372
+
373
# Stored page collection, built lazily on first access and memoized.
#
# Inserted items get their keys stringified, lose any user provided job id
# unless job id overriding is enabled, and make sure their job exists once
# stored.
#
# @return [DhEasy::Core::SmartCollection]
#
# @note Page gid will be replaced on insert by an auto generated uuid
#   unless page gid overriding is enabled
#   (see #allow_page_gid_override?)
def pages
  # Guard on `@pages` (the original checked a never-assigned `@page`, so a
  # new collection was built and discarded on every call).
  return @pages unless @pages.nil?

  collection = self.class.new_collection PAGE_KEYS,
    defaults: page_defaults
  collection.bind_event(:before_defaults) do |_collection, raw_item|
    item = DhEasy::Core.deep_stringify_keys raw_item
    item.delete 'job_id' unless allow_job_id_override?
    item
  end
  collection.bind_event(:before_insert) do |_collection, item, _match|
    if item['gid'].nil? || !allow_page_gid_override?
      item['gid'] = generate_page_gid item
    end
    item
  end
  collection.bind_event(:after_insert) do |_collection, item|
    ensure_job item['job_id']
  end
  @pages ||= collection
end
401
+
402
# Generate a fake UUID for outputs.
#
# @param [Hash] data Output data (not used: the id is random).
#
# @return [String]
def generate_output_id data
  # Random UUID, to match Datahen behavior
  random_id = self.class.fake_uuid
  random_id
end
411
+
412
# Get output keys with their default values or generators to emulate
# saving on db. Built once and memoized.
# @private
#
# @return [Hash]
def output_defaults
  return @output_defaults if @output_defaults
  @output_defaults = {
    '_collection' => DEFAULT_COLLECTION,
    '_job_id' => lambda { |_output| job_id },
    '_created_at' => lambda { |_output| self.class.time_stamp },
    '_gid' => lambda { |_output| page_gid }
  }
end
424
+
425
# Stored output collection, built lazily on first access and memoized.
#
# Inserted items get their keys stringified, lose any user provided
# `_job_id` unless job id overriding is enabled and any user provided
# `_gid` unless page gid overriding is enabled, receive a generated `_id`
# when they have none, and make sure their job exists once stored.
#
# @return [DhEasy::Core::SmartCollection]
def outputs
  return @outputs unless @outputs.nil?

  collection = self.class.new_collection OUTPUT_KEYS,
    defaults: output_defaults
  collection.bind_event(:before_defaults) do |_collection, raw_item|
    item = DhEasy::Core.deep_stringify_keys raw_item
    item.delete '_job_id' unless allow_job_id_override?
    # The page gid field on outputs is `_gid` (the original deleted a
    # non-existent `_gid_id` key, so the restriction never applied).
    item.delete '_gid' unless allow_page_gid_override?
    item
  end
  collection.bind_event(:before_insert) do |_collection, item, _match|
    item['_id'] ||= generate_output_id item
    item
  end
  collection.bind_event(:after_insert) do |_collection, item|
    ensure_job item['_job_id']
  end
  @outputs ||= collection
end
447
+
448
# Tell whether every filter field equals the same field on `data`.
# @private
#
# @param data Hash containing data.
# @param filters Filters to apply on match.
#
# @return [Boolean] `true` when all filters match (or there are none).
#
# @note Missing and `nil` values on `data` will match when `filters`'
#   field is `nil`.
def match? data, filters
  filters.all? { |field, expected| data[field] == expected }
end
464
+
465
# Search items from a collection.
#
# @param [Symbol] collection Allowed values: `:outputs`, `:pages`, `:jobs`.
# @param [Hash] filter Filters to query.
# @param [Integer] offset (0) Search results offset (matches to skip).
# @param [Integer,nil] limit (nil) Limit search results count. Set to `nil`
#   for unlimited; zero or less returns no results.
#
# @return [Array] Matching items in collection order.
#
# @raise ArgumentError On unknown collection.
#
# @note _Warning:_ It uses table scan to filter and should be used on test suites only.
def query collection, filter, offset = 0, limit = nil
  return [] if !limit.nil? && limit <= 0

  # Get collection items
  source = case collection
    when :outputs then outputs
    when :pages then pages
    when :jobs then jobs
    else
      raise ArgumentError.new "Unknown collection #{collection}."
    end

  # Search items
  seen = 0
  results = []
  source.each do |item|
    next unless match?(item, filter)
    seen += 1

    # Skip until offset
    next if seen <= offset
    # Break on limit reach
    break if !limit.nil? && results.count >= limit
    results << item
  end
  results
end
505
+
506
# Refetch a page: reset its status to `'to_fetch'` and clear its fetch,
# parse and response tracking fields in place.
#
# @param [Integer] job_id Page's job_id to refetch.
# @param [String] gid Page's gid to refetch.
#
# @return [nil]
#
# @raise [::Exception] When no page matches `job_id` and `gid`.
def refetch job_id, gid
  page = pages.find_match('gid' => gid, 'job_id' => job_id)
  # `::Exception` is spelled out because a bare `Exception` is looked up
  # lexically first and this gem ships its own `DhEasy::Core::Exception`
  # constant (presumably a namespace module - TODO confirm).
  raise ::Exception, "Page not found with job_id \"#{job_id}\" gid \"#{gid}\"" if page.nil?

  {
    'status' => 'to_fetch',
    'freshness' => self.class.time_stamp,
    'to_fetch' => self.class.time_stamp,
    'fetched_from' => nil,
    'fetching_at' => '2001-01-01T00:00:00Z',
    'fetched_at' => nil,
    'fetching_try_count' => 0,
    'effective_url' => nil,
    'parsing_at' => nil,
    'parsing_failed_at' => nil,
    'parsed_at' => nil,
    'parsing_try_count' => 0,
    'parsing_fail_count' => 0,
    'parsing_updated_at' => '2001-01-01T00:00:00Z',
    'response_checksum' => nil,
    'response_status' => nil,
    'response_status_code' => nil,
    'response_headers' => nil,
    'response_cookie' => nil,
    'response_proto' => nil,
    'content_type' => nil,
    'content_size' => 0,
    'failed_response_status_code' => nil,
    'failed_response_headers' => nil,
    'failed_response_cookie' => nil,
    'failed_effective_url' => nil,
    'failed_at' => nil,
    'failed_content_type' => nil
  }.each { |field, value| page[field] = value }
  nil
end
542
+
543
# Reparse a page: reset its status to `'to_parse'` and clear its parsing
# tracking fields in place.
#
# @param [Integer] job_id Page's job_id to reparse.
# @param [String] gid Page's gid to reparse.
#
# @raise [::Exception] When no page matches `job_id` and `gid`.
def reparse job_id, gid
  page = pages.find_match('gid' => gid, 'job_id' => job_id)
  # `::Exception` is spelled out because a bare `Exception` is looked up
  # lexically first and this gem ships its own `DhEasy::Core::Exception`
  # constant (presumably a namespace module - TODO confirm).
  raise ::Exception, "Page not found with job_id \"#{job_id}\" gid \"#{gid}\"" if page.nil?

  page['status'] = 'to_parse'
  page['parsing_at'] = nil
  page['parsing_failed_at'] = nil
  page['parsing_updated_at'] = '2001-01-01T00:00:00Z'
  page['parsed_at'] = nil
  page['parsing_try_count'] = 0
  page['parsing_fail_count'] = 0
end
558
+ end
559
+ end
560
+ end
561
+ end