dh_easy-core 0.2.2 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. checksums.yaml +4 -4
  2. data/doc/DhEasy.html +6 -6
  3. data/doc/DhEasy/Core.html +39 -40
  4. data/doc/DhEasy/Core/Config.html +6 -6
  5. data/doc/DhEasy/Core/Exception.html +6 -6
  6. data/doc/DhEasy/Core/Exception/OutdatedError.html +6 -6
  7. data/doc/DhEasy/Core/Helper.html +6 -6
  8. data/doc/DhEasy/Core/Helper/Cookie.html +6 -6
  9. data/doc/DhEasy/Core/Mock.html +6 -6
  10. data/doc/DhEasy/Core/Mock/FakeDb.html +963 -400
  11. data/doc/DhEasy/Core/Mock/FakeExecutor.html +26 -37
  12. data/doc/DhEasy/Core/Mock/FakeFinisher.html +6 -6
  13. data/doc/DhEasy/Core/Mock/FakeParser.html +6 -6
  14. data/doc/DhEasy/Core/Mock/FakeSeeder.html +6 -6
  15. data/doc/DhEasy/Core/Plugin.html +6 -6
  16. data/doc/DhEasy/Core/Plugin/CollectionVault.html +6 -6
  17. data/doc/DhEasy/Core/Plugin/ConfigBehavior.html +7 -7
  18. data/doc/DhEasy/Core/Plugin/ContextIntegrator.html +6 -6
  19. data/doc/DhEasy/Core/Plugin/Executor.html +6 -6
  20. data/doc/DhEasy/Core/Plugin/ExecutorBehavior.html +6 -6
  21. data/doc/DhEasy/Core/Plugin/Finisher.html +6 -6
  22. data/doc/DhEasy/Core/Plugin/FinisherBehavior.html +6 -6
  23. data/doc/DhEasy/Core/Plugin/InitializeHook.html +6 -6
  24. data/doc/DhEasy/Core/Plugin/Parser.html +6 -6
  25. data/doc/DhEasy/Core/Plugin/ParserBehavior.html +6 -6
  26. data/doc/DhEasy/Core/Plugin/Seeder.html +6 -6
  27. data/doc/DhEasy/Core/Plugin/SeederBehavior.html +6 -6
  28. data/doc/DhEasy/Core/SmartCollection.html +6 -6
  29. data/doc/_index.html +7 -7
  30. data/doc/class_list.html +2 -2
  31. data/doc/css/style.css +2 -2
  32. data/doc/file.README.html +9 -16
  33. data/doc/file_list.html +2 -2
  34. data/doc/frames.html +2 -2
  35. data/doc/index.html +9 -16
  36. data/doc/js/app.js +14 -3
  37. data/doc/method_list.html +80 -48
  38. data/doc/top-level-namespace.html +6 -6
  39. data/lib/dh_easy/core.rb +2 -1
  40. data/lib/dh_easy/core/mock/fake_db.rb +228 -35
  41. data/lib/dh_easy/core/mock/fake_executor.rb +0 -1
  42. data/lib/dh_easy/core/version.rb +1 -1
  43. metadata +3 -4
@@ -18,6 +18,12 @@ module DhEasy
18
18
  }
19
19
  # Default collection for saved outputs
20
20
  DEFAULT_COLLECTION = 'default'
21
+ # Default page's fetch type
22
+ DEFAULT_FETCH_TYPE = 'standard'
23
+ # Default uuid algorithm
24
+ DEFAULT_UUID_ALGORITHM = :md5
25
+ # Valid uuid algorithms
26
+ VALID_UUID_ALGORITHMS = [:md5, :sha1, :sha256]
21
27
 
22
28
  # Generate a smart collection with keys and initial values.
23
29
  #
@@ -32,21 +38,31 @@ module DhEasy
32
38
  # Generate a fake UUID.
33
39
  #
34
40
  # @param seed (nil) Object to use as seed for uuid.
41
+ # @param [Symbol] algorithm (nil) Algorithm to use: md5 (default), sha1, sha256.
35
42
  #
36
43
  # @return [String]
37
- def self.fake_uuid seed = nil
44
+ def self.fake_uuid seed = nil, algorithm = nil
38
45
  seed ||= (Time.new.to_f + rand)
39
- Digest::SHA1.hexdigest seed.to_s
46
+ algorithm ||= DEFAULT_UUID_ALGORITHM
47
+ case algorithm
48
+ when :sha256
49
+ Digest::SHA256.hexdigest seed.to_s
50
+ when :sha1
51
+ Digest::SHA1.hexdigest seed.to_s
52
+ else
53
+ Digest::MD5.hexdigest seed.to_s
54
+ end
40
55
  end
41
56
 
42
57
  # Generate a fake UUID based on output fields without `_` prefix.
43
58
  #
44
59
  # @param [Hash] data Output data.
60
+ # @param [Symbol] uuid_algorithm (nil) Algorithm to use: md5 (default), sha1, sha256.
45
61
  #
46
62
  # @return [String]
47
- def self.output_uuid data
63
+ def self.output_uuid data, uuid_algorithm = nil
48
64
  seed = data.select{|k,v|k.to_s =~ /^[^_]/}.hash
49
- fake_uuid seed
65
+ fake_uuid seed, uuid_algorithm
50
66
  end
51
67
 
52
68
  # Build a page with defaults by using FakeDb engine.
@@ -83,8 +99,8 @@ module DhEasy
83
99
  #
84
100
  # @param [String] raw_url URL to clean.
85
101
  #
86
- # @return [String]
87
- def self.clean_uri raw_url
102
+ # @return [URI::Generic]
103
+ def self.clean_uri_obj raw_url
88
104
  url = URI.parse(raw_url)
89
105
  url.hostname = url.hostname.downcase
90
106
  url.fragment = nil
@@ -101,7 +117,17 @@ module DhEasy
101
117
  end
102
118
  url.query = data.join('&')
103
119
  end
104
- url.to_s
120
+ url
121
+ end
122
+
123
+ # Clean a URL to remove fragment, lowercase scheme and host, and sort
124
+ # query string.
125
+ #
126
+ # @param [String] raw_url URL to clean.
127
+ #
128
+ # @return [String]
129
+ def self.clean_uri raw_url
130
+ clean_uri_obj(raw_url).to_s
105
131
  end
106
132
 
107
133
  # Format headers for gid generation.
@@ -111,16 +137,87 @@ module DhEasy
111
137
  #
112
138
  # @return [String]
113
139
  def self.format_headers headers
114
- return {} if headers.nil?
115
- data = {}
140
+ return '' if headers.nil?
141
+ data = []
116
142
  headers.each do |key, value|
117
143
  unless value.is_a? Array
118
- data[key] = value
144
+ data << "#{key.downcase}:#{value.to_s}"
119
145
  next
120
146
  end
121
- data[key] = value.sort
147
+ data << "#{key.downcase}:#{value.sort.join ','}"
122
148
  end
123
- data
149
+ data.sort.join ';'
150
+ end
151
+
152
+ # Identify whether a fetch type is the default fetch type.
153
+ # @private
154
+ #
155
+ # @param [String,nil] fetch_type Fetch type.
156
+ #
157
+ # @return [Boolean] `true` when default value, else `false`.
158
+ def self.is_default_fetch_type? fetch_type
159
+ return true if fetch_type.nil?
160
+ return true if fetch_type === DEFAULT_FETCH_TYPE
161
+ false
162
+ end
163
+
164
+ # Identify whether a driver hash is empty.
165
+ # @private
166
+ #
167
+ # @param [Hash,nil] driver Driver hash.
168
+ #
169
+ # @return [Boolean] `true` when empty, else `false`.
170
+ def self.is_driver_empty? driver
171
+ return true if driver.nil?
172
+ return true unless driver.is_a? Hash
173
+ return false if driver['name'].to_s.strip != ''
174
+ return false if driver['code'].to_s.strip != ''
175
+ return false if driver['pre_code'].to_s.strip != ''
176
+ return false if !driver['stealth'].nil? && !!driver['stealth']
177
+ return false if !driver['enable_images'].nil? && !!driver['enable_images']
178
+ return false if !driver['goto_options'].nil? && driver['goto_options'].is_a?(Hash) && driver['goto_options'].keys.length > 0
179
+ true
180
+ end
181
+
182
+ # Identify whether a display hash is empty.
183
+ # @private
184
+ #
185
+ # @param [Hash,nil] display Display hash.
186
+ #
187
+ # @return [Boolean] `true` when empty, else `false`.
188
+ def self.is_display_empty? display
189
+ return true if display.nil?
190
+ return true unless display.is_a? Hash
191
+ return false if !display['width'].nil? && display['width'].to_f.ceil > 0
192
+ return false if !display['height'].nil? && display['height'].to_f.ceil > 0
193
+ true
194
+ end
195
+
196
+ # Identify whether a screenshot hash is empty.
197
+ # @private
198
+ #
199
+ # @param [Hash,nil] screenshot Screenshot hash.
200
+ #
201
+ # @return [Boolean] `true` when empty, else `false`.
202
+ def self.is_screenshot_empty? screenshot
203
+ return true if screenshot.nil?
204
+ return true unless screenshot.is_a? Hash
205
+ return true if screenshot['take_screenshot'].nil? || !screenshot['take_screenshot']
206
+ return true if !screenshot['options'].nil? && !screenshot['options'].is_a?(Hash)
207
+ return false
208
+ end
209
+
210
+ # Identify whether a hash is empty.
211
+ # @private
212
+ #
213
+ # @param [Hash,nil] hash Hash to validate.
214
+ #
215
+ # @return [Boolean] `true` when empty, else `false`.
216
+ def self.is_hash_empty? hash
217
+ return true if hash.nil?
218
+ return true unless hash.is_a? Hash
219
+ return false if hash.keys.length > 0
220
+ true
124
221
  end
125
222
 
126
223
  # Build a job with defaults by using FakeDb engine.
@@ -159,7 +256,7 @@ module DhEasy
159
256
  # @return [String]
160
257
  def self.time_stamp time = nil
161
258
  time = Time.new if time.nil?
162
- time.utc.strftime('%Y-%m-%dT%H:%M:%SZ')
259
+ time.utc.strftime('%FT%T.%6N').gsub(/[0.]+\Z/,'') << "Z"
163
260
  end
164
261
 
165
262
  # Get current job or create new one from values.
@@ -209,7 +306,7 @@ module DhEasy
209
306
  # Current fake page gid.
210
307
  # @return [String]
211
308
  def page_gid
212
- @page_gid ||= self.class.fake_uuid
309
+ @page_gid ||= self.fake_uuid
213
310
  end
214
311
 
215
312
  # Set current fake page gid value.
@@ -217,6 +314,21 @@ module DhEasy
217
314
  @page_gid = value
218
315
  end
219
316
 
317
+ # Current UUID algorithm.
318
+ # @return [Symbol]
319
+ def uuid_algorithm
320
+ @uuid_algorithm ||= DEFAULT_UUID_ALGORITHM
321
+ end
322
+
323
+ # Set current UUID algorithm value.
324
+ # @raise [ArgumentError] Whenever an invalid algorithm is provided
325
+ def uuid_algorithm= value
326
+ unless value.nil? || VALID_UUID_ALGORITHMS.include?(value)
327
+ raise ArgumentError.new("Invalid UUID algorithm, valid values are :md5, :sha1, :sha256")
328
+ end
329
+ @uuid_algorithm = value
330
+ end
331
+
220
332
  # Enable page gid override on page or output insert.
221
333
  def enable_page_gid_override
222
334
  @allow_page_gid_override = true
@@ -263,14 +375,26 @@ module DhEasy
263
375
  # whether page gid can be overridden on page or output insert.
264
376
  # @option opts [Boolean, nil] :allow_job_id_override (false) Specify
265
377
  # whether job id can be overridden on page or output insert.
378
+ # @option opts [Symbol, nil] :uuid_algorithm (:md5) Specify the
379
+ # algorithm to be used to generate UUID values.
266
380
  def initialize opts = {}
267
381
  self.job_id = opts[:job_id]
268
382
  self.scraper_name = opts[:scraper_name]
269
383
  self.page_gid = opts[:page_gid]
384
+ self.uuid_algorithm = opts[:uuid_algorithm]
270
385
  @allow_page_gid_override = opts[:allow_page_gid_override].nil? ? false : !!opts[:allow_page_gid_override]
271
386
  @allow_job_id_override = opts[:allow_job_id_override].nil? ? false : !!opts[:allow_job_id_override]
272
387
  end
273
388
 
389
+ # Generate a fake UUID using the configured uuid algorithm.
390
+ #
391
+ # @param seed (nil) Object to use as seed for uuid.
392
+ #
393
+ # @return [String]
394
+ def fake_uuid seed = nil
395
+ self.class.fake_uuid seed, self.uuid_algorithm
396
+ end
397
+
274
398
  # Generate a fake scraper name.
275
399
  #
276
400
  # @return [String]
@@ -329,23 +453,42 @@ module DhEasy
329
453
  #
330
454
  # @return [String]
331
455
  def generate_page_gid page_data
332
- fields = [
333
- 'url',
334
- 'method',
335
- 'headers',
336
- 'fetch_type',
337
- 'cookie',
338
- 'no_redirect',
339
- 'body',
340
- 'ua_type'
341
- ]
342
- data = page_data.select{|k,v|fields.include? k}
343
- data['url'] = self.class.clean_uri data['url']
344
- data['headers'] = self.class.format_headers data['headers']
345
- data['cookie'] = DhEasy::Core::Helper::Cookie.parse_from_request data['cookie'] unless data['cookie'].nil?
346
- seed = data.select{|k,v|fields.include? k}.hash
347
- checksum = self.class.fake_uuid seed
348
- "#{URI.parse(data['url']).hostname}-#{checksum}"
456
+ # ensure page url
457
+ return "" if page_data['url'].nil? || page_data['url'].to_s.strip === ''
458
+
459
+ # calculate extra fields, keep field order to match datahen
460
+ data = []
461
+ data << "method:#{page_data['method'].to_s.downcase}"
462
+ no_url_encode = (!page_data['no_url_encode'].nil? && !!page_data['no_url_encode'])
463
+ uri = self.class.clean_uri_obj(page_data['url'])
464
+ url = (no_url_encode ? page_data['url'].to_s.lstrip : uri.to_s)
465
+ data << "url:#{url}"
466
+ headers = self.class.format_headers page_data['headers']
467
+ data << "headers:#{headers}"
468
+ data << "body:#{page_data['body'].to_s}"
469
+ no_redirect = (!page_data['no_redirect'].nil? && !!page_data['no_redirect'])
470
+ data << "no_redirect:#{no_redirect.to_s}"
471
+ ua_type = (page_data['ua_type'].to_s === '') ? 'desktop' : page_data['ua_type']
472
+ data << "ua_type:#{ua_type}"
473
+
474
+ # complex fields
475
+ data << "fetch_type:#{page_data['fetch_type']}" unless self.class.is_default_fetch_type? page_data['fetch_type']
476
+ # keep this cookie logic to match datahen
477
+ data << "cookie:#{page_data['cookie'].split(/;\s*/).sort.join(';')}" if page_data['cookie'].to_s.strip != ''
478
+ data << "http2:true" if page_data.has_key?('http2') && !page_data['http2'].nil? && !!page_data['http2']
479
+ data << "driverName:#{page_data['driver']['name']}" unless self.class.is_driver_empty? page_data['driver']
480
+ unless self.class.is_display_empty? page_data['display']
481
+ data << "display:#{page_data['display']['width']}x#{page_data['display']['height']}"
482
+ end
483
+ unless self.class.is_screenshot_empty? page_data['screenshot']
484
+ checksum = self.fake_uuid JSON.generate(page_data['screenshot'])
485
+ data << "screenshot:#{checksum}"
486
+ end
487
+
488
+ # generate GID
489
+ seed = data.join('|')
490
+ checksum = self.fake_uuid seed
491
+ "#{uri.hostname}-#{checksum}"
349
492
  end
350
493
 
351
494
  # Get page keys with key generators to emulate saving on db.
@@ -354,18 +497,45 @@ module DhEasy
354
497
  # @return [Hash]
355
498
  def page_defaults
356
499
  @page_defaults ||= {
500
+ 'job_id' => lambda{|page| job_id},
357
501
  'url' => nil,
358
502
  'status' => 'to_fetch',
359
- 'job_id' => lambda{|page| job_id},
503
+ 'page_type' => 'default',
360
504
  'method' => 'GET',
361
505
  'headers' => {},
362
- 'fetch_type' => 'standard',
506
+ 'fetch_type' => DEFAULT_FETCH_TYPE,
363
507
  'cookie' => nil,
364
508
  'no_redirect' => false,
365
509
  'body' => nil,
366
510
  'ua_type' => 'desktop',
367
511
  'no_url_encode' => false,
368
512
  'http2' => false,
513
+ 'priority' => 0,
514
+ 'parsing_try_count' => 0,
515
+ 'parsing_fail_count' => 0,
516
+ 'fetching_at' => '0001-01-01T00:00:00Z',
517
+ 'fetching_try_count' => 0,
518
+ 'refetch_count' => 0,
519
+ 'fetched_from' => '',
520
+ 'content_size' => 0,
521
+ 'force_fetch' => false,
522
+ 'driver' => {
523
+ 'name' => '',
524
+ 'pre_code' => '',
525
+ 'code' => '',
526
+ 'goto_options' => nil,
527
+ 'stealth' => false,
528
+ 'enable_images' => false
529
+ },
530
+ 'display' => {
531
+ 'width' => 0,
532
+ 'height' => 0
533
+ },
534
+ 'screenshot' => {
535
+ 'take_screenshot' => false,
536
+ 'options' => nil
537
+ },
538
+ 'driver_log' => nil,
369
539
  'vars' => {}
370
540
  }
371
541
  end
@@ -380,17 +550,40 @@ module DhEasy
380
550
  def pages
381
551
  return @pages unless @page.nil?
382
552
 
553
+ defaults = self.page_defaults
383
554
  collection = self.class.new_collection PAGE_KEYS,
384
- defaults: page_defaults
555
+ defaults: defaults
385
556
  collection.bind_event(:before_defaults) do |collection, raw_item|
386
557
  item = DhEasy::Core.deep_stringify_keys raw_item
558
+ if !item['driver'].nil? && item['driver'].is_a?(Hash)
559
+ item['driver'] = defaults['driver'].merge item['driver']
560
+ end
561
+ if !item['display'].nil? && item['display'].is_a?(Hash)
562
+ item['display'] = defaults['display'].merge item['display']
563
+ end
564
+ if !item['screenshot'].nil? && item['screenshot'].is_a?(Hash)
565
+ item['screenshot'] = defaults['screenshot'].merge item['screenshot']
566
+ end
387
567
  item.delete 'job_id' unless allow_job_id_override?
388
568
  item
389
569
  end
390
570
  collection.bind_event(:before_insert) do |collection, item, match|
571
+ item['driver'] = nil if self.class.is_driver_empty? item['driver']
572
+ item['display'] = nil if self.class.is_display_empty? item['display']
573
+ item['screenshot'] = nil if self.class.is_screenshot_empty? item['screenshot']
574
+ item['headers'] = nil if self.class.is_hash_empty? item['headers']
575
+ item['vars'] = nil if self.class.is_hash_empty? item['vars']
576
+ uri = self.class.clean_uri_obj(item['url'])
577
+ item['hostname'] = uri.hostname
578
+ uri = nil
391
579
  if item['gid'].nil? || !allow_page_gid_override?
392
580
  item['gid'] = generate_page_gid item
393
581
  end
582
+
583
+ # 30 days = 60 * 60 * 24 * 30 = 2592000
584
+ item['freshness'] ||= self.class.time_stamp (Time.now - 2592000)
585
+ item['to_fetch'] ||= self.class.time_stamp
586
+ item['created_at'] ||= self.class.time_stamp
394
587
  item
395
588
  end
396
589
  collection.bind_event(:after_insert) do |collection, item|
@@ -406,7 +599,7 @@ module DhEasy
406
599
  # @return [String]
407
600
  def generate_output_id data
408
601
  # Generate random UUID to match Datahen behavior
409
- self.class.fake_uuid
602
+ self.fake_uuid
410
603
  end
411
604
 
412
605
  # Get output keys with key generators to emulate saving on db.
@@ -293,7 +293,6 @@ module DhEasy
293
293
  raise ArgumentError.new("per_page needs to be an Integer between 1 and #{MAX_FIND_OUTPUTS_PER_PAGE}.")
294
294
  end
295
295
 
296
- count = 0
297
296
  offset = (page - 1) * per_page
298
297
  job = latest_job_by(opts[:scraper_name])
299
298
  fixed_query = query.merge(
@@ -1,6 +1,6 @@
1
1
  module DhEasy
2
2
  module Core
3
3
  # Gem version
4
- VERSION = "0.2.2"
4
+ VERSION = "0.3.1"
5
5
  end
6
6
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: dh_easy-core
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.2
4
+ version: 0.3.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Eduardo Rosales
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2019-12-04 00:00:00.000000000 Z
11
+ date: 2021-05-23 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: datahen
@@ -241,8 +241,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
241
241
  - !ruby/object:Gem::Version
242
242
  version: '0'
243
243
  requirements: []
244
- rubyforge_project:
245
- rubygems_version: 2.7.6
244
+ rubygems_version: 3.0.3
246
245
  signing_key:
247
246
  specification_version: 4
248
247
  summary: DataHen Easy toolkit core module