dh_easy-core 0.2.2 → 0.3.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (43) hide show
  1. checksums.yaml +4 -4
  2. data/doc/DhEasy.html +6 -6
  3. data/doc/DhEasy/Core.html +39 -40
  4. data/doc/DhEasy/Core/Config.html +6 -6
  5. data/doc/DhEasy/Core/Exception.html +6 -6
  6. data/doc/DhEasy/Core/Exception/OutdatedError.html +6 -6
  7. data/doc/DhEasy/Core/Helper.html +6 -6
  8. data/doc/DhEasy/Core/Helper/Cookie.html +6 -6
  9. data/doc/DhEasy/Core/Mock.html +6 -6
  10. data/doc/DhEasy/Core/Mock/FakeDb.html +963 -400
  11. data/doc/DhEasy/Core/Mock/FakeExecutor.html +26 -37
  12. data/doc/DhEasy/Core/Mock/FakeFinisher.html +6 -6
  13. data/doc/DhEasy/Core/Mock/FakeParser.html +6 -6
  14. data/doc/DhEasy/Core/Mock/FakeSeeder.html +6 -6
  15. data/doc/DhEasy/Core/Plugin.html +6 -6
  16. data/doc/DhEasy/Core/Plugin/CollectionVault.html +6 -6
  17. data/doc/DhEasy/Core/Plugin/ConfigBehavior.html +7 -7
  18. data/doc/DhEasy/Core/Plugin/ContextIntegrator.html +6 -6
  19. data/doc/DhEasy/Core/Plugin/Executor.html +6 -6
  20. data/doc/DhEasy/Core/Plugin/ExecutorBehavior.html +6 -6
  21. data/doc/DhEasy/Core/Plugin/Finisher.html +6 -6
  22. data/doc/DhEasy/Core/Plugin/FinisherBehavior.html +6 -6
  23. data/doc/DhEasy/Core/Plugin/InitializeHook.html +6 -6
  24. data/doc/DhEasy/Core/Plugin/Parser.html +6 -6
  25. data/doc/DhEasy/Core/Plugin/ParserBehavior.html +6 -6
  26. data/doc/DhEasy/Core/Plugin/Seeder.html +6 -6
  27. data/doc/DhEasy/Core/Plugin/SeederBehavior.html +6 -6
  28. data/doc/DhEasy/Core/SmartCollection.html +6 -6
  29. data/doc/_index.html +7 -7
  30. data/doc/class_list.html +2 -2
  31. data/doc/css/style.css +2 -2
  32. data/doc/file.README.html +9 -16
  33. data/doc/file_list.html +2 -2
  34. data/doc/frames.html +2 -2
  35. data/doc/index.html +9 -16
  36. data/doc/js/app.js +14 -3
  37. data/doc/method_list.html +80 -48
  38. data/doc/top-level-namespace.html +6 -6
  39. data/lib/dh_easy/core.rb +2 -1
  40. data/lib/dh_easy/core/mock/fake_db.rb +228 -35
  41. data/lib/dh_easy/core/mock/fake_executor.rb +0 -1
  42. data/lib/dh_easy/core/version.rb +1 -1
  43. metadata +3 -4
@@ -18,6 +18,12 @@ module DhEasy
18
18
  }
19
19
  # Default collection for saved outputs
20
20
  DEFAULT_COLLECTION = 'default'
21
+ # Default page's fetch type
22
+ DEFAULT_FETCH_TYPE = 'standard'
23
+ # Default uuid algorithm
24
+ DEFAULT_UUID_ALGORITHM = :md5
25
+ # Valid uuid algorithms
26
+ VALID_UUID_ALGORITHMS = [:md5, :sha1, :sha256]
21
27
 
22
28
  # Generate a smart collection with keys and initial values.
23
29
  #
@@ -32,21 +38,31 @@ module DhEasy
32
38
  # Generate a fake UUID.
33
39
  #
34
40
  # @param seed (nil) Object to use as seed for uuid.
41
+ # @param [Symbol, nil] algorithm (nil) Algorithm to use: md5 (default), sha1, sha256.
35
42
  #
36
43
  # @return [String]
37
- def self.fake_uuid seed = nil
44
+ def self.fake_uuid seed = nil, algorithm = nil
38
45
  seed ||= (Time.new.to_f + rand)
39
- Digest::SHA1.hexdigest seed.to_s
46
+ algorithm ||= DEFAULT_UUID_ALGORITHM
47
+ case algorithm
48
+ when :sha256
49
+ Digest::SHA256.hexdigest seed.to_s
50
+ when :sha1
51
+ Digest::SHA1.hexdigest seed.to_s
52
+ else
53
+ Digest::MD5.hexdigest seed.to_s
54
+ end
40
55
  end
41
56
 
42
57
  # Generate a fake UUID based on output fields without `_` prefix.
43
58
  #
44
59
  # @param [Hash] data Output data.
60
+ # @param [Symbol, nil] uuid_algorithm (nil) Algorithm to use: md5 (default), sha1, sha256.
45
61
  #
46
62
  # @return [String]
47
- def self.output_uuid data
63
+ def self.output_uuid data, uuid_algorithm = nil
48
64
  seed = data.select{|k,v|k.to_s =~ /^[^_]/}.hash
49
- fake_uuid seed
65
+ fake_uuid seed, uuid_algorithm
50
66
  end
51
67
 
52
68
  # Build a page with defaults by using FakeDb engine.
@@ -83,8 +99,8 @@ module DhEasy
83
99
  #
84
100
  # @param [String] raw_url URL to clean.
85
101
  #
86
- # @return [String]
87
- def self.clean_uri raw_url
102
+ # @return [URI::HTTPS]
103
+ def self.clean_uri_obj raw_url
88
104
  url = URI.parse(raw_url)
89
105
  url.hostname = url.hostname.downcase
90
106
  url.fragment = nil
@@ -101,7 +117,17 @@ module DhEasy
101
117
  end
102
118
  url.query = data.join('&')
103
119
  end
104
- url.to_s
120
+ url
121
+ end
122
+
123
+ # Clean a URL to remove fragment, lowercase scheme and host, and sort
124
+ # query string.
125
+ #
126
+ # @param [String] raw_url URL to clean.
127
+ #
128
+ # @return [String]
129
+ def self.clean_uri raw_url
130
+ clean_uri_obj(raw_url).to_s
105
131
  end
106
132
 
107
133
  # Format headers for gid generation.
@@ -111,16 +137,87 @@ module DhEasy
111
137
  #
112
138
  # @return [String]
113
139
  def self.format_headers headers
114
- return {} if headers.nil?
115
- data = {}
140
+ return '' if headers.nil?
141
+ data = []
116
142
  headers.each do |key, value|
117
143
  unless value.is_a? Array
118
- data[key] = value
144
+ data << "#{key.downcase}:#{value.to_s}"
119
145
  next
120
146
  end
121
- data[key] = value.sort
147
+ data << "#{key.downcase}:#{value.sort.join ','}"
122
148
  end
123
- data
149
+ data.sort.join ';'
150
+ end
151
+
152
+ # Identify whether a fetch type is the default fetch type.
153
+ # @private
154
+ #
155
+ # @param [String,nil] fetch_type Fetch type.
156
+ #
157
+ # @return [Boolean] `true` when default value, else `false`.
158
+ def self.is_default_fetch_type? fetch_type
159
+ return true if fetch_type.nil?
160
+ return true if fetch_type === DEFAULT_FETCH_TYPE
161
+ false
162
+ end
163
+
164
+ # Identify whether a driver hash is empty.
165
+ # @private
166
+ #
167
+ # @param [Hash,nil] driver Driver hash.
168
+ #
169
+ # @return [Boolean] `true` when empty, else `false`.
170
+ def self.is_driver_empty? driver
171
+ return true if driver.nil?
172
+ return true unless driver.is_a? Hash
173
+ return false if driver['name'].to_s.strip != ''
174
+ return false if driver['code'].to_s.strip != ''
175
+ return false if driver['pre_code'].to_s.strip != ''
176
+ return false if !driver['stealth'].nil? && !!driver['stealth']
177
+ return false if !driver['enable_images'].nil? && !!driver['enable_images']
178
+ return false if !driver['goto_options'].nil? && driver['goto_options'].is_a?(Hash) && driver['goto_options'].keys.length > 0
179
+ true
180
+ end
181
+
182
+ # Identify whether a display hash is empty.
183
+ # @private
184
+ #
185
+ # @param [Hash,nil] display Display hash.
186
+ #
187
+ # @return [Boolean] `true` when empty, else `false`.
188
+ def self.is_display_empty? display
189
+ return true if display.nil?
190
+ return true unless display.is_a? Hash
191
+ return false if !display['width'].nil? && display['width'].to_f.ceil > 0
192
+ return false if !display['height'].nil? && display['height'].to_f.ceil > 0
193
+ true
194
+ end
195
+
196
+ # Identify whether a screenshot hash is empty.
197
+ # @private
198
+ #
199
+ # @param [Hash,nil] screenshot Screenshot hash.
200
+ #
201
+ # @return [Boolean] `true` when empty, else `false`.
202
+ def self.is_screenshot_empty? screenshot
203
+ return true if screenshot.nil?
204
+ return true unless screenshot.is_a? Hash
205
+ return true if screenshot['take_screenshot'].nil? || !screenshot['take_screenshot']
206
+ return true if !screenshot['options'].nil? && !screenshot['options'].is_a?(Hash)
207
+ return false
208
+ end
209
+
210
+ # Identify whether a hash is empty.
211
+ # @private
212
+ #
213
+ # @param [Hash,nil] hash Hash to validate.
214
+ #
215
+ # @return [Boolean] `true` when empty, else `false`.
216
+ def self.is_hash_empty? hash
217
+ return true if hash.nil?
218
+ return true unless hash.is_a? Hash
219
+ return false if hash.keys.length > 0
220
+ true
124
221
  end
125
222
 
126
223
  # Build a job with defaults by using FakeDb engine.
@@ -159,7 +256,7 @@ module DhEasy
159
256
  # @return [String]
160
257
  def self.time_stamp time = nil
161
258
  time = Time.new if time.nil?
162
- time.utc.strftime('%Y-%m-%dT%H:%M:%SZ')
259
+ time.utc.strftime('%FT%T.%6N').gsub(/[0.]+\Z/,'') << "Z"
163
260
  end
164
261
 
165
262
  # Get current job or create new one from values.
@@ -209,7 +306,7 @@ module DhEasy
209
306
  # Current fake page gid.
210
307
  # @return [Integer,nil]
211
308
  def page_gid
212
- @page_gid ||= self.class.fake_uuid
309
+ @page_gid ||= self.fake_uuid
213
310
  end
214
311
 
215
312
  # Set current fake page gid value.
@@ -217,6 +314,21 @@ module DhEasy
217
314
  @page_gid = value
218
315
  end
219
316
 
317
+ # Current UUID algorithm.
318
+ # @return [Symbol]
319
+ def uuid_algorithm
320
+ @uuid_algorithm ||= DEFAULT_UUID_ALGORITHM
321
+ end
322
+
323
+ # Set current UUID algorithm value.
324
+ # @raise [ArgumentError] Whenever an invalid algorithm is provided
325
+ def uuid_algorithm= value
326
+ unless value.nil? || VALID_UUID_ALGORITHMS.include?(value)
327
+ raise ArgumentError.new("Invalid UUID algorithm, valid values are :md5, :sha1, :sha256")
328
+ end
329
+ @uuid_algorithm = value
330
+ end
331
+
220
332
  # Enable page gid override on page or output insert.
221
333
  def enable_page_gid_override
222
334
  @allow_page_gid_override = true
@@ -263,14 +375,26 @@ module DhEasy
263
375
  # whether page gid can be overridden on page or output insert.
264
376
  # @option opts [Boolean, nil] :allow_job_id_override (false) Specify
265
377
  # whether job id can be overridden on page or output insert.
378
+ # @option opts [Symbol, nil] :uuid_algorithm (:md5) Specify the
379
+ # algorithm to be used to generate UUID values.
266
380
  def initialize opts = {}
267
381
  self.job_id = opts[:job_id]
268
382
  self.scraper_name = opts[:scraper_name]
269
383
  self.page_gid = opts[:page_gid]
384
+ self.uuid_algorithm = opts[:uuid_algorithm]
270
385
  @allow_page_gid_override = opts[:allow_page_gid_override].nil? ? false : !!opts[:allow_page_gid_override]
271
386
  @allow_job_id_override = opts[:allow_job_id_override].nil? ? false : !!opts[:allow_job_id_override]
272
387
  end
273
388
 
389
+ # Generate a fake UUID using the configured uuid algorithm.
390
+ #
391
+ # @param seed (nil) Object to use as seed for uuid.
392
+ #
393
+ # @return [String]
394
+ def fake_uuid seed = nil
395
+ self.class.fake_uuid seed, self.uuid_algorithm
396
+ end
397
+
274
398
  # Generate a fake scraper name.
275
399
  #
276
400
  # @return [String]
@@ -329,23 +453,42 @@ module DhEasy
329
453
  #
330
454
  # @return [String]
331
455
  def generate_page_gid page_data
332
- fields = [
333
- 'url',
334
- 'method',
335
- 'headers',
336
- 'fetch_type',
337
- 'cookie',
338
- 'no_redirect',
339
- 'body',
340
- 'ua_type'
341
- ]
342
- data = page_data.select{|k,v|fields.include? k}
343
- data['url'] = self.class.clean_uri data['url']
344
- data['headers'] = self.class.format_headers data['headers']
345
- data['cookie'] = DhEasy::Core::Helper::Cookie.parse_from_request data['cookie'] unless data['cookie'].nil?
346
- seed = data.select{|k,v|fields.include? k}.hash
347
- checksum = self.class.fake_uuid seed
348
- "#{URI.parse(data['url']).hostname}-#{checksum}"
456
+ # ensure page url
457
+ return "" if page_data['url'].nil? || page_data['url'].to_s.strip === ''
458
+
459
+ # calculate extra fields, keep field order to match datahen
460
+ data = []
461
+ data << "method:#{page_data['method'].to_s.downcase}"
462
+ no_url_encode = (!page_data['no_url_encode'].nil? && !!page_data['no_url_encode'])
463
+ uri = self.class.clean_uri_obj(page_data['url'])
464
+ url = (no_url_encode ? page_data['url'].to_s.lstrip : uri.to_s)
465
+ data << "url:#{url}"
466
+ headers = self.class.format_headers page_data['headers']
467
+ data << "headers:#{headers}"
468
+ data << "body:#{page_data['body'].to_s}"
469
+ no_redirect = (!page_data['no_redirect'].nil? && !!page_data['no_redirect'])
470
+ data << "no_redirect:#{no_redirect.to_s}"
471
+ ua_type = (page_data['ua_type'].to_s === '') ? 'desktop' : page_data['ua_type']
472
+ data << "ua_type:#{ua_type}"
473
+
474
+ # complex fields
475
+ data << "fetch_type:#{page_data['fetch_type']}" unless self.class.is_default_fetch_type? page_data['fetch_type']
476
+ # keep this cookie logic to match datahen
477
+ data << "cookie:#{page_data['cookie'].split(/;\s*/).sort.join(';')}" if page_data['cookie'].to_s.strip != ''
478
+ data << "http2:true" if page_data.has_key?('http2') && !page_data['http2'].nil? && !!page_data['http2']
479
+ data << "driverName:#{page_data['driver']['name']}" unless self.class.is_driver_empty? page_data['driver']
480
+ unless self.class.is_display_empty? page_data['display']
481
+ data << "display:#{page_data['display']['width']}x#{page_data['display']['height']}"
482
+ end
483
+ unless self.class.is_screenshot_empty? page_data['screenshot']
484
+ checksum = self.fake_uuid JSON.generate(page_data['screenshot'])
485
+ data << "screenshot:#{checksum}"
486
+ end
487
+
488
+ # generate GID
489
+ seed = data.join('|')
490
+ checksum = self.fake_uuid seed
491
+ "#{uri.hostname}-#{checksum}"
349
492
  end
350
493
 
351
494
  # Get page keys with key generators to emulate saving on db.
@@ -354,18 +497,45 @@ module DhEasy
354
497
  # @return [Hash]
355
498
  def page_defaults
356
499
  @page_defaults ||= {
500
+ 'job_id' => lambda{|page| job_id},
357
501
  'url' => nil,
358
502
  'status' => 'to_fetch',
359
- 'job_id' => lambda{|page| job_id},
503
+ 'page_type' => 'default',
360
504
  'method' => 'GET',
361
505
  'headers' => {},
362
- 'fetch_type' => 'standard',
506
+ 'fetch_type' => DEFAULT_FETCH_TYPE,
363
507
  'cookie' => nil,
364
508
  'no_redirect' => false,
365
509
  'body' => nil,
366
510
  'ua_type' => 'desktop',
367
511
  'no_url_encode' => false,
368
512
  'http2' => false,
513
+ 'priority' => 0,
514
+ 'parsing_try_count' => 0,
515
+ 'parsing_fail_count' => 0,
516
+ 'fetching_at' => '0001-01-01T00:00:00Z',
517
+ 'fetching_try_count' => 0,
518
+ 'refetch_count' => 0,
519
+ 'fetched_from' => '',
520
+ 'content_size' => 0,
521
+ 'force_fetch' => false,
522
+ 'driver' => {
523
+ 'name' => '',
524
+ 'pre_code' => '',
525
+ 'code' => '',
526
+ 'goto_options' => nil,
527
+ 'stealth' => false,
528
+ 'enable_images' => false
529
+ },
530
+ 'display' => {
531
+ 'width' => 0,
532
+ 'height' => 0
533
+ },
534
+ 'screenshot' => {
535
+ 'take_screenshot' => false,
536
+ 'options' => nil
537
+ },
538
+ 'driver_log' => nil,
369
539
  'vars' => {}
370
540
  }
371
541
  end
@@ -380,17 +550,40 @@ module DhEasy
380
550
  def pages
381
551
  return @pages unless @page.nil?
382
552
 
553
+ defaults = self.page_defaults
383
554
  collection = self.class.new_collection PAGE_KEYS,
384
- defaults: page_defaults
555
+ defaults: defaults
385
556
  collection.bind_event(:before_defaults) do |collection, raw_item|
386
557
  item = DhEasy::Core.deep_stringify_keys raw_item
558
+ if !item['driver'].nil? && item['driver'].is_a?(Hash)
559
+ item['driver'] = defaults['driver'].merge item['driver']
560
+ end
561
+ if !item['display'].nil? && item['display'].is_a?(Hash)
562
+ item['display'] = defaults['display'].merge item['display']
563
+ end
564
+ if !item['screenshot'].nil? && item['screenshot'].is_a?(Hash)
565
+ item['screenshot'] = defaults['screenshot'].merge item['screenshot']
566
+ end
387
567
  item.delete 'job_id' unless allow_job_id_override?
388
568
  item
389
569
  end
390
570
  collection.bind_event(:before_insert) do |collection, item, match|
571
+ item['driver'] = nil if self.class.is_driver_empty? item['driver']
572
+ item['display'] = nil if self.class.is_display_empty? item['display']
573
+ item['screenshot'] = nil if self.class.is_screenshot_empty? item['screenshot']
574
+ item['headers'] = nil if self.class.is_hash_empty? item['headers']
575
+ item['vars'] = nil if self.class.is_hash_empty? item['vars']
576
+ uri = self.class.clean_uri_obj(item['url'])
577
+ item['hostname'] = uri.hostname
578
+ uri = nil
391
579
  if item['gid'].nil? || !allow_page_gid_override?
392
580
  item['gid'] = generate_page_gid item
393
581
  end
582
+
583
+ # 30 days = 60 * 60 * 24 * 30 = 2592000
584
+ item['freshness'] ||= self.class.time_stamp (Time.now - 2592000)
585
+ item['to_fetch'] ||= self.class.time_stamp
586
+ item['created_at'] ||= self.class.time_stamp
394
587
  item
395
588
  end
396
589
  collection.bind_event(:after_insert) do |collection, item|
@@ -406,7 +599,7 @@ module DhEasy
406
599
  # @return [String]
407
600
  def generate_output_id data
408
601
  # Generate random UUID to match Datahen behavior
409
- self.class.fake_uuid
602
+ self.fake_uuid
410
603
  end
411
604
 
412
605
  # Get output keys with key generators to emulate saving on db.
@@ -293,7 +293,6 @@ module DhEasy
293
293
  raise ArgumentError.new("per_page needs to be an Integer between 1 and #{MAX_FIND_OUTPUTS_PER_PAGE}.")
294
294
  end
295
295
 
296
- count = 0
297
296
  offset = (page - 1) * per_page
298
297
  job = latest_job_by(opts[:scraper_name])
299
298
  fixed_query = query.merge(
@@ -1,6 +1,6 @@
1
1
  module DhEasy
2
2
  module Core
3
3
  # Gem version
4
- VERSION = "0.2.2"
4
+ VERSION = "0.3.1"
5
5
  end
6
6
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: dh_easy-core
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.2
4
+ version: 0.3.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Eduardo Rosales
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2019-12-04 00:00:00.000000000 Z
11
+ date: 2021-05-23 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: datahen
@@ -241,8 +241,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
241
241
  - !ruby/object:Gem::Version
242
242
  version: '0'
243
243
  requirements: []
244
- rubyforge_project:
245
- rubygems_version: 2.7.6
244
+ rubygems_version: 3.0.3
246
245
  signing_key:
247
246
  specification_version: 4
248
247
  summary: DataHen Easy toolkit core module