dh_easy-core 0.2.2 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/doc/DhEasy.html +6 -6
- data/doc/DhEasy/Core.html +39 -40
- data/doc/DhEasy/Core/Config.html +6 -6
- data/doc/DhEasy/Core/Exception.html +6 -6
- data/doc/DhEasy/Core/Exception/OutdatedError.html +6 -6
- data/doc/DhEasy/Core/Helper.html +6 -6
- data/doc/DhEasy/Core/Helper/Cookie.html +6 -6
- data/doc/DhEasy/Core/Mock.html +6 -6
- data/doc/DhEasy/Core/Mock/FakeDb.html +963 -400
- data/doc/DhEasy/Core/Mock/FakeExecutor.html +26 -37
- data/doc/DhEasy/Core/Mock/FakeFinisher.html +6 -6
- data/doc/DhEasy/Core/Mock/FakeParser.html +6 -6
- data/doc/DhEasy/Core/Mock/FakeSeeder.html +6 -6
- data/doc/DhEasy/Core/Plugin.html +6 -6
- data/doc/DhEasy/Core/Plugin/CollectionVault.html +6 -6
- data/doc/DhEasy/Core/Plugin/ConfigBehavior.html +7 -7
- data/doc/DhEasy/Core/Plugin/ContextIntegrator.html +6 -6
- data/doc/DhEasy/Core/Plugin/Executor.html +6 -6
- data/doc/DhEasy/Core/Plugin/ExecutorBehavior.html +6 -6
- data/doc/DhEasy/Core/Plugin/Finisher.html +6 -6
- data/doc/DhEasy/Core/Plugin/FinisherBehavior.html +6 -6
- data/doc/DhEasy/Core/Plugin/InitializeHook.html +6 -6
- data/doc/DhEasy/Core/Plugin/Parser.html +6 -6
- data/doc/DhEasy/Core/Plugin/ParserBehavior.html +6 -6
- data/doc/DhEasy/Core/Plugin/Seeder.html +6 -6
- data/doc/DhEasy/Core/Plugin/SeederBehavior.html +6 -6
- data/doc/DhEasy/Core/SmartCollection.html +6 -6
- data/doc/_index.html +7 -7
- data/doc/class_list.html +2 -2
- data/doc/css/style.css +2 -2
- data/doc/file.README.html +9 -16
- data/doc/file_list.html +2 -2
- data/doc/frames.html +2 -2
- data/doc/index.html +9 -16
- data/doc/js/app.js +14 -3
- data/doc/method_list.html +80 -48
- data/doc/top-level-namespace.html +6 -6
- data/lib/dh_easy/core.rb +2 -1
- data/lib/dh_easy/core/mock/fake_db.rb +228 -35
- data/lib/dh_easy/core/mock/fake_executor.rb +0 -1
- data/lib/dh_easy/core/version.rb +1 -1
- metadata +3 -4
@@ -18,6 +18,12 @@ module DhEasy
|
|
18
18
|
}
|
19
19
|
# Default collection for saved outputs
|
20
20
|
DEFAULT_COLLECTION = 'default'
|
21
|
+
# Default page's fetch type
|
22
|
+
DEFAULT_FETCH_TYPE = 'standard'
|
23
|
+
# Default uuid algorithm
|
24
|
+
DEFAULT_UUID_ALGORITHM = :md5
|
25
|
+
# Valid uuid algorithms
|
26
|
+
VALID_UUID_ALGORITHMS = [:md5, :sha1, :sha256]
|
21
27
|
|
22
28
|
# Generate a smart collection with keys and initial values.
|
23
29
|
#
|
@@ -32,21 +38,31 @@ module DhEasy
|
|
32
38
|
# Generate a fake UUID.
|
33
39
|
#
|
34
40
|
# @param seed (nil) Object to use as seed for uuid.
|
41
|
+
# @param [Symbol, nil] algorithm (nil) Algorithm to use: :md5 (default), :sha1, :sha256.
|
35
42
|
#
|
36
43
|
# @return [String]
|
37
|
-
def self.fake_uuid seed = nil
|
44
|
+
def self.fake_uuid seed = nil, algorithm = nil
|
38
45
|
seed ||= (Time.new.to_f + rand)
|
39
|
-
|
46
|
+
algorithm ||= DEFAULT_UUID_ALGORITHM
|
47
|
+
case algorithm
|
48
|
+
when :sha256
|
49
|
+
Digest::SHA256.hexdigest seed.to_s
|
50
|
+
when :sha1
|
51
|
+
Digest::SHA1.hexdigest seed.to_s
|
52
|
+
else
|
53
|
+
Digest::MD5.hexdigest seed.to_s
|
54
|
+
end
|
40
55
|
end
|
41
56
|
|
42
57
|
# Generate a fake UUID based on output fields without `_` prefix.
|
43
58
|
#
|
44
59
|
# @param [Hash] data Output data.
|
60
|
+
# @param [Symbol, nil] uuid_algorithm (nil) Algorithm to use: :md5 (default), :sha1, :sha256.
|
45
61
|
#
|
46
62
|
# @return [String]
|
47
|
-
def self.output_uuid data
|
63
|
+
def self.output_uuid data, uuid_algorithm = nil
|
48
64
|
seed = data.select{|k,v|k.to_s =~ /^[^_]/}.hash
|
49
|
-
fake_uuid seed
|
65
|
+
fake_uuid seed, uuid_algorithm
|
50
66
|
end
|
51
67
|
|
52
68
|
# Build a page with defaults by using FakeDb engine.
|
@@ -83,8 +99,8 @@ module DhEasy
|
|
83
99
|
#
|
84
100
|
# @param [String] raw_url URL to clean.
|
85
101
|
#
|
86
|
-
# @return [
|
87
|
-
def self.
|
102
|
+
# @return [URI::HTTPS]
|
103
|
+
def self.clean_uri_obj raw_url
|
88
104
|
url = URI.parse(raw_url)
|
89
105
|
url.hostname = url.hostname.downcase
|
90
106
|
url.fragment = nil
|
@@ -101,7 +117,17 @@ module DhEasy
|
|
101
117
|
end
|
102
118
|
url.query = data.join('&')
|
103
119
|
end
|
104
|
-
url
|
120
|
+
url
|
121
|
+
end
|
122
|
+
|
123
|
+
# Clean a URL to remove the fragment, lowercase the scheme and host, and sort the
|
124
|
+
# query string.
|
125
|
+
#
|
126
|
+
# @param [String] raw_url URL to clean.
|
127
|
+
#
|
128
|
+
# @return [String]
|
129
|
+
def self.clean_uri raw_url
|
130
|
+
clean_uri_obj(raw_url).to_s
|
105
131
|
end
|
106
132
|
|
107
133
|
# Format headers for gid generation.
|
@@ -111,16 +137,87 @@ module DhEasy
|
|
111
137
|
#
|
112
138
|
# @return [String]
|
113
139
|
def self.format_headers headers
|
114
|
-
return
|
115
|
-
data =
|
140
|
+
return '' if headers.nil?
|
141
|
+
data = []
|
116
142
|
headers.each do |key, value|
|
117
143
|
unless value.is_a? Array
|
118
|
-
data
|
144
|
+
data << "#{key.downcase}:#{value.to_s}"
|
119
145
|
next
|
120
146
|
end
|
121
|
-
data
|
147
|
+
data << "#{key.downcase}:#{value.sort.join ','}"
|
122
148
|
end
|
123
|
-
data
|
149
|
+
data.sort.join ';'
|
150
|
+
end
|
151
|
+
|
152
|
+
# Identify whether the given fetch type is the default fetch type.
|
153
|
+
# @private
|
154
|
+
#
|
155
|
+
# @param [String,nil] fetch_type Fetch type.
|
156
|
+
#
|
157
|
+
# @return [Boolean] `true` when default value, else `false`.
|
158
|
+
def self.is_default_fetch_type? fetch_type
|
159
|
+
return true if fetch_type.nil?
|
160
|
+
return true if fetch_type === DEFAULT_FETCH_TYPE
|
161
|
+
false
|
162
|
+
end
|
163
|
+
|
164
|
+
# Identify whether a driver hash is empty.
|
165
|
+
# @private
|
166
|
+
#
|
167
|
+
# @param [Hash,nil] driver Driver hash.
|
168
|
+
#
|
169
|
+
# @return [Boolean] `true` when empty, else `false`.
|
170
|
+
def self.is_driver_empty? driver
|
171
|
+
return true if driver.nil?
|
172
|
+
return true unless driver.is_a? Hash
|
173
|
+
return false if driver['name'].to_s.strip != ''
|
174
|
+
return false if driver['code'].to_s.strip != ''
|
175
|
+
return false if driver['pre_code'].to_s.strip != ''
|
176
|
+
return false if !driver['stealth'].nil? && !!driver['stealth']
|
177
|
+
return false if !driver['enable_images'].nil? && !!driver['enable_images']
|
178
|
+
return false if !driver['goto_options'].nil? && driver['goto_options'].is_a?(Hash) && driver['goto_options'].keys.length > 0
|
179
|
+
true
|
180
|
+
end
|
181
|
+
|
182
|
+
# Identify whether a display hash is empty.
|
183
|
+
# @private
|
184
|
+
#
|
185
|
+
# @param [Hash,nil] display Display hash.
|
186
|
+
#
|
187
|
+
# @return [Boolean] `true` when empty, else `false`.
|
188
|
+
def self.is_display_empty? display
|
189
|
+
return true if display.nil?
|
190
|
+
return true unless display.is_a? Hash
|
191
|
+
return false if !display['width'].nil? && display['width'].to_f.ceil > 0
|
192
|
+
return false if !display['height'].nil? && display['height'].to_f.ceil > 0
|
193
|
+
true
|
194
|
+
end
|
195
|
+
|
196
|
+
# Identify whether a screenshot hash is empty.
|
197
|
+
# @private
|
198
|
+
#
|
199
|
+
# @param [Hash,nil] screenshot Screenshot hash.
|
200
|
+
#
|
201
|
+
# @return [Boolean] `true` when empty, else `false`.
|
202
|
+
def self.is_screenshot_empty? screenshot
|
203
|
+
return true if screenshot.nil?
|
204
|
+
return true unless screenshot.is_a? Hash
|
205
|
+
return true if screenshot['take_screenshot'].nil? || !screenshot['take_screenshot']
|
206
|
+
return true if !screenshot['options'].nil? && !screenshot['options'].is_a?(Hash)
|
207
|
+
return false
|
208
|
+
end
|
209
|
+
|
210
|
+
# Identify whether a hash is empty.
|
211
|
+
# @private
|
212
|
+
#
|
213
|
+
# @param [Hash,nil] hash Hash to validate.
|
214
|
+
#
|
215
|
+
# @return [Boolean] `true` when empty, else `false`.
|
216
|
+
def self.is_hash_empty? hash
|
217
|
+
return true if hash.nil?
|
218
|
+
return true unless hash.is_a? Hash
|
219
|
+
return false if hash.keys.length > 0
|
220
|
+
true
|
124
221
|
end
|
125
222
|
|
126
223
|
# Build a job with defaults by using FakeDb engine.
|
@@ -159,7 +256,7 @@ module DhEasy
|
|
159
256
|
# @return [String]
|
160
257
|
def self.time_stamp time = nil
|
161
258
|
time = Time.new if time.nil?
|
162
|
-
time.utc.strftime('%
|
259
|
+
time.utc.strftime('%FT%T.%6N').gsub(/[0.]+\Z/,'') << "Z"
|
163
260
|
end
|
164
261
|
|
165
262
|
# Get current job or create new one from values.
|
@@ -209,7 +306,7 @@ module DhEasy
|
|
209
306
|
# Current fake page gid.
|
210
307
|
# @return [String]
|
211
308
|
def page_gid
|
212
|
-
@page_gid ||= self.
|
309
|
+
@page_gid ||= self.fake_uuid
|
213
310
|
end
|
214
311
|
|
215
312
|
# Set current fake page gid value.
|
@@ -217,6 +314,21 @@ module DhEasy
|
|
217
314
|
@page_gid = value
|
218
315
|
end
|
219
316
|
|
317
|
+
# Current UUID algorithm.
|
318
|
+
# @return [Symbol]
|
319
|
+
def uuid_algorithm
|
320
|
+
@uuid_algorithm ||= DEFAULT_UUID_ALGORITHM
|
321
|
+
end
|
322
|
+
|
323
|
+
# Set current UUID algorithm value.
|
324
|
+
# @raise [ArgumentError] Whenever an invalid algorithm is provided
|
325
|
+
def uuid_algorithm= value
|
326
|
+
unless value.nil? || VALID_UUID_ALGORITHMS.include?(value)
|
327
|
+
raise ArgumentError.new("Invalid UUID algorithm, valid values are :md5, :sha1, :sha256")
|
328
|
+
end
|
329
|
+
@uuid_algorithm = value
|
330
|
+
end
|
331
|
+
|
220
332
|
# Enable page gid override on page or output insert.
|
221
333
|
def enable_page_gid_override
|
222
334
|
@allow_page_gid_override = true
|
@@ -263,14 +375,26 @@ module DhEasy
|
|
263
375
|
# whether page gid can be overridden on page or output insert.
|
264
376
|
# @option opts [Boolean, nil] :allow_job_id_override (false) Specify
|
265
377
|
# whether job id can be overridden on page or output insert.
|
378
|
+
# @option opts [Symbol, nil] :uuid_algorithm (:md5) Specify the
|
379
|
+
# algorithm to be used to generate UUID values.
|
266
380
|
def initialize opts = {}
|
267
381
|
self.job_id = opts[:job_id]
|
268
382
|
self.scraper_name = opts[:scraper_name]
|
269
383
|
self.page_gid = opts[:page_gid]
|
384
|
+
self.uuid_algorithm = opts[:uuid_algorithm]
|
270
385
|
@allow_page_gid_override = opts[:allow_page_gid_override].nil? ? false : !!opts[:allow_page_gid_override]
|
271
386
|
@allow_job_id_override = opts[:allow_job_id_override].nil? ? false : !!opts[:allow_job_id_override]
|
272
387
|
end
|
273
388
|
|
389
|
+
# Generate a fake UUID using the configured uuid algorithm.
|
390
|
+
#
|
391
|
+
# @param seed (nil) Object to use as seed for uuid.
|
392
|
+
#
|
393
|
+
# @return [String]
|
394
|
+
def fake_uuid seed = nil
|
395
|
+
self.class.fake_uuid seed, self.uuid_algorithm
|
396
|
+
end
|
397
|
+
|
274
398
|
# Generate a fake scraper name.
|
275
399
|
#
|
276
400
|
# @return [String]
|
@@ -329,23 +453,42 @@ module DhEasy
|
|
329
453
|
#
|
330
454
|
# @return [String]
|
331
455
|
def generate_page_gid page_data
|
332
|
-
|
333
|
-
|
334
|
-
|
335
|
-
|
336
|
-
|
337
|
-
|
338
|
-
|
339
|
-
|
340
|
-
|
341
|
-
|
342
|
-
|
343
|
-
data
|
344
|
-
data
|
345
|
-
|
346
|
-
|
347
|
-
|
348
|
-
"
|
456
|
+
# ensure page url
|
457
|
+
return "" if page_data['url'].nil? || page_data['url'].to_s.strip === ''
|
458
|
+
|
459
|
+
# calculate extra fields, keep field order to match datahen
|
460
|
+
data = []
|
461
|
+
data << "method:#{page_data['method'].to_s.downcase}"
|
462
|
+
no_url_encode = (!page_data['no_url_encode'].nil? && !!page_data['no_url_encode'])
|
463
|
+
uri = self.class.clean_uri_obj(page_data['url'])
|
464
|
+
url = (no_url_encode ? page_data['url'].to_s.lstrip : uri.to_s)
|
465
|
+
data << "url:#{url}"
|
466
|
+
headers = self.class.format_headers page_data['headers']
|
467
|
+
data << "headers:#{headers}"
|
468
|
+
data << "body:#{page_data['body'].to_s}"
|
469
|
+
no_redirect = (!page_data['no_redirect'].nil? && !!page_data['no_redirect'])
|
470
|
+
data << "no_redirect:#{no_redirect.to_s}"
|
471
|
+
ua_type = (page_data['ua_type'].to_s === '') ? 'desktop' : page_data['ua_type']
|
472
|
+
data << "ua_type:#{ua_type}"
|
473
|
+
|
474
|
+
# complex fields
|
475
|
+
data << "fetch_type:#{page_data['fetch_type']}" unless self.class.is_default_fetch_type? page_data['fetch_type']
|
476
|
+
# keep this cookie logic to match datahen
|
477
|
+
data << "cookie:#{page_data['cookie'].split(/;\s*/).sort.join(';')}" if page_data['cookie'].to_s.strip != ''
|
478
|
+
data << "http2:true" if page_data.has_key?('http2') && !page_data['http2'].nil? && !!page_data['http2']
|
479
|
+
data << "driverName:#{page_data['driver']['name']}" unless self.class.is_driver_empty? page_data['driver']
|
480
|
+
unless self.class.is_display_empty? page_data['display']
|
481
|
+
data << "display:#{page_data['display']['width']}x#{page_data['display']['height']}"
|
482
|
+
end
|
483
|
+
unless self.class.is_screenshot_empty? page_data['screenshot']
|
484
|
+
checksum = self.fake_uuid JSON.generate(page_data['screenshot'])
|
485
|
+
data << "screenshot:#{checksum}"
|
486
|
+
end
|
487
|
+
|
488
|
+
# generate GID
|
489
|
+
seed = data.join('|')
|
490
|
+
checksum = self.fake_uuid seed
|
491
|
+
"#{uri.hostname}-#{checksum}"
|
349
492
|
end
|
350
493
|
|
351
494
|
# Get page keys with key generators to emulate saving on db.
|
@@ -354,18 +497,45 @@ module DhEasy
|
|
354
497
|
# @return [Hash]
|
355
498
|
def page_defaults
|
356
499
|
@page_defaults ||= {
|
500
|
+
'job_id' => lambda{|page| job_id},
|
357
501
|
'url' => nil,
|
358
502
|
'status' => 'to_fetch',
|
359
|
-
'
|
503
|
+
'page_type' => 'default',
|
360
504
|
'method' => 'GET',
|
361
505
|
'headers' => {},
|
362
|
-
'fetch_type' =>
|
506
|
+
'fetch_type' => DEFAULT_FETCH_TYPE,
|
363
507
|
'cookie' => nil,
|
364
508
|
'no_redirect' => false,
|
365
509
|
'body' => nil,
|
366
510
|
'ua_type' => 'desktop',
|
367
511
|
'no_url_encode' => false,
|
368
512
|
'http2' => false,
|
513
|
+
'priority' => 0,
|
514
|
+
'parsing_try_count' => 0,
|
515
|
+
'parsing_fail_count' => 0,
|
516
|
+
'fetching_at' => '0001-01-01T00:00:00Z',
|
517
|
+
'fetching_try_count' => 0,
|
518
|
+
'refetch_count' => 0,
|
519
|
+
'fetched_from' => '',
|
520
|
+
'content_size' => 0,
|
521
|
+
'force_fetch' => false,
|
522
|
+
'driver' => {
|
523
|
+
'name' => '',
|
524
|
+
'pre_code' => '',
|
525
|
+
'code' => '',
|
526
|
+
'goto_options' => nil,
|
527
|
+
'stealth' => false,
|
528
|
+
'enable_images' => false
|
529
|
+
},
|
530
|
+
'display' => {
|
531
|
+
'width' => 0,
|
532
|
+
'height' => 0
|
533
|
+
},
|
534
|
+
'screenshot' => {
|
535
|
+
'take_screenshot' => false,
|
536
|
+
'options' => nil
|
537
|
+
},
|
538
|
+
'driver_log' => nil,
|
369
539
|
'vars' => {}
|
370
540
|
}
|
371
541
|
end
|
@@ -380,17 +550,40 @@ module DhEasy
|
|
380
550
|
def pages
|
381
551
|
return @pages unless @page.nil?
|
382
552
|
|
553
|
+
defaults = self.page_defaults
|
383
554
|
collection = self.class.new_collection PAGE_KEYS,
|
384
|
-
defaults:
|
555
|
+
defaults: defaults
|
385
556
|
collection.bind_event(:before_defaults) do |collection, raw_item|
|
386
557
|
item = DhEasy::Core.deep_stringify_keys raw_item
|
558
|
+
if !item['driver'].nil? && item['driver'].is_a?(Hash)
|
559
|
+
item['driver'] = defaults['driver'].merge item['driver']
|
560
|
+
end
|
561
|
+
if !item['display'].nil? && item['display'].is_a?(Hash)
|
562
|
+
item['display'] = defaults['display'].merge item['display']
|
563
|
+
end
|
564
|
+
if !item['screenshot'].nil? && item['screenshot'].is_a?(Hash)
|
565
|
+
item['screenshot'] = defaults['screenshot'].merge item['screenshot']
|
566
|
+
end
|
387
567
|
item.delete 'job_id' unless allow_job_id_override?
|
388
568
|
item
|
389
569
|
end
|
390
570
|
collection.bind_event(:before_insert) do |collection, item, match|
|
571
|
+
item['driver'] = nil if self.class.is_driver_empty? item['driver']
|
572
|
+
item['display'] = nil if self.class.is_display_empty? item['display']
|
573
|
+
item['screenshot'] = nil if self.class.is_screenshot_empty? item['screenshot']
|
574
|
+
item['headers'] = nil if self.class.is_hash_empty? item['headers']
|
575
|
+
item['vars'] = nil if self.class.is_hash_empty? item['vars']
|
576
|
+
uri = self.class.clean_uri_obj(item['url'])
|
577
|
+
item['hostname'] = uri.hostname
|
578
|
+
uri = nil
|
391
579
|
if item['gid'].nil? || !allow_page_gid_override?
|
392
580
|
item['gid'] = generate_page_gid item
|
393
581
|
end
|
582
|
+
|
583
|
+
# 30 days = 60 * 60 * 24 * 30 = 2592000
|
584
|
+
item['freshness'] ||= self.class.time_stamp (Time.now - 2592000)
|
585
|
+
item['to_fetch'] ||= self.class.time_stamp
|
586
|
+
item['created_at'] ||= self.class.time_stamp
|
394
587
|
item
|
395
588
|
end
|
396
589
|
collection.bind_event(:after_insert) do |collection, item|
|
@@ -406,7 +599,7 @@ module DhEasy
|
|
406
599
|
# @return [String]
|
407
600
|
def generate_output_id data
|
408
601
|
# Generate random UUID to match Datahen behavior
|
409
|
-
self.
|
602
|
+
self.fake_uuid
|
410
603
|
end
|
411
604
|
|
412
605
|
# Get output keys with key generators to emulate saving on db.
|
@@ -293,7 +293,6 @@ module DhEasy
|
|
293
293
|
raise ArgumentError.new("per_page needs to be an Integer between 1 and #{MAX_FIND_OUTPUTS_PER_PAGE}.")
|
294
294
|
end
|
295
295
|
|
296
|
-
count = 0
|
297
296
|
offset = (page - 1) * per_page
|
298
297
|
job = latest_job_by(opts[:scraper_name])
|
299
298
|
fixed_query = query.merge(
|
data/lib/dh_easy/core/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: dh_easy-core
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.3.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Eduardo Rosales
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2021-05-23 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: datahen
|
@@ -241,8 +241,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
241
241
|
- !ruby/object:Gem::Version
|
242
242
|
version: '0'
|
243
243
|
requirements: []
|
244
|
-
|
245
|
-
rubygems_version: 2.7.6
|
244
|
+
rubygems_version: 3.0.3
|
246
245
|
signing_key:
|
247
246
|
specification_version: 4
|
248
247
|
summary: DataHen Easy toolkit core module
|