datahen 0.20.0 → 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/datahen/cli/scraper.rb +9 -6
- data/lib/datahen/cli/scraper_job.rb +3 -2
- data/lib/datahen/client/base.rb +45 -0
- data/lib/datahen/client/job.rb +10 -3
- data/lib/datahen/client/job_output.rb +5 -4
- data/lib/datahen/client/job_page.rb +4 -1
- data/lib/datahen/client/scraper.rb +4 -2
- data/lib/datahen/client/scraper_job.rb +4 -2
- data/lib/datahen/scraper/batch_parser.rb +1 -1
- data/lib/datahen/scraper/executor.rb +9 -2
- data/lib/datahen/scraper/ruby_finisher_executor.rb +2 -0
- data/lib/datahen/scraper/ruby_parser_executor.rb +2 -0
- data/lib/datahen/scraper/ruby_seeder_executor.rb +2 -0
- data/lib/datahen/version.rb +1 -1
- metadata +5 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 4aa3927b9865f2815f64463f1d48b2dadddeaa73d2bd446a98ea9eb9ecb3ff5f
|
4
|
+
data.tar.gz: 91b7a370e740721202e9f14c043cc5c28cb9e6102dcd701c5121506042ba753b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c643100e60ea20686d882377b7e982829f93b4d4d8750342d47370f9d649688e94517462041b08ec1387901fcc8f33a0b0105e5f4ab43cd378dca5768cc190c4
|
7
|
+
data.tar.gz: ba3bae8b462aa6894520dc115452a27fcb3d64571820970ad7b69592670549824ec53daa1c8194392750f746a14ced4baf2e552eafc2684735dc185c57af8e8e
|
data/lib/datahen/cli/scraper.rb
CHANGED
@@ -24,8 +24,9 @@ module Datahen
|
|
24
24
|
option :freshness_type, :aliases => :t, desc: 'Set how fresh the page cache is. Possible values: day, week, month, year. Default: any'
|
25
25
|
option :proxy_type, desc: 'Set the Proxy type. Default: standard'
|
26
26
|
option :force_fetch, :aliases => :f, type: :boolean, desc: 'Set true to force fetch page that is not within freshness criteria. Default: false'
|
27
|
-
option :
|
28
|
-
option :
|
27
|
+
option :parsers, :aliases => :pw, type: :numeric, desc: 'Set how many parser workers to use. Default: 1'
|
28
|
+
option :fetchers, :aliases => :fw, type: :numeric, desc: 'Set how many fetcher workers to use. Default: 1'
|
29
|
+
option :browsers, :aliases => :bw, type: :numeric, desc: 'Set how many browser workers to use. Default: 0'
|
29
30
|
option :disable_scheduler, type: :boolean, desc: 'Set true to disable scheduler. Default: false'
|
30
31
|
option :cancel_current_job, type: :boolean, desc: 'Set true to cancel currently active job if scheduler starts. Default: false'
|
31
32
|
option :schedule, type: :string, desc: 'Set the schedule of the scraper to run. Must be in CRON format.'
|
@@ -52,8 +53,9 @@ module Datahen
|
|
52
53
|
option :freshness_type, :aliases => :t, desc: 'Set how fresh the page cache is. Possible values: day, week, month, year. Default: any'
|
53
54
|
option :proxy_type, desc: 'Set the Proxy type. Default: standard'
|
54
55
|
option :force_fetch, :aliases => :f, type: :boolean, desc: 'Set true to force fetch page that is not within freshness criteria. Default: false'
|
55
|
-
option :
|
56
|
-
option :
|
56
|
+
option :parsers, :aliases => :pw, type: :numeric, desc: 'Set how many parser workers to use. Default: 1'
|
57
|
+
option :fetchers, :aliases => :fw, type: :numeric, desc: 'Set how many fetcher workers to use. Default: 1'
|
58
|
+
option :browsers, :aliases => :bw, type: :numeric, desc: 'Set how many browser workers to use. Default: 0'
|
57
59
|
option :disable_scheduler, type: :boolean, desc: 'Set true to disable scheduler. Default: false'
|
58
60
|
option :cancel_current_job, type: :boolean, desc: 'Set true to cancel currently active job if scheduler starts. Default: false'
|
59
61
|
option :schedule, type: :string, desc: 'Set the schedule of the scraper to run. Must be in CRON format.'
|
@@ -97,8 +99,9 @@ module Datahen
|
|
97
99
|
long_desc <<-LONGDESC
|
98
100
|
Starts a scraper by creating an active scrape job\x5
|
99
101
|
LONGDESC
|
100
|
-
option :
|
101
|
-
option :
|
102
|
+
option :parsers, :aliases => :pw, type: :numeric, desc: 'Set how many parser workers to use. Default: 1'
|
103
|
+
option :fetchers, :aliases => :fw, type: :numeric, desc: 'Set how many fetcher workers to use. Default: 1'
|
104
|
+
option :browsers, :aliases => :bw, type: :numeric, desc: 'Set how many browser workers to use. Default: 0'
|
102
105
|
option :proxy_type, desc: 'Set the Proxy type. Default: standard'
|
103
106
|
option :vars, type: :string, banner: :JSON, desc: 'Set input vars. Must be in json format. i.e: [{"name":"foo", "value":"bar", "secret":false}] '
|
104
107
|
option :max_page_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
|
@@ -99,8 +99,9 @@ module Datahen
|
|
99
99
|
long_desc <<-LONGDESC
|
100
100
|
Updates a scraper's current job.
|
101
101
|
LONGDESC
|
102
|
-
option :
|
103
|
-
option :
|
102
|
+
option :parsers, :aliases => :pw, type: :numeric, desc: 'Set how many parser workers to use. Scraper job must be restarted (paused then resumed) for it to take effect. Default: 1. '
|
103
|
+
option :fetchers, :aliases => :fw, type: :numeric, desc: 'Set how many fetcher workers to use. Scraper job must be restarted (paused then resumed) for it to take effect. Default: 1. '
|
104
|
+
option :browsers, :aliases => :bw, type: :numeric, desc: 'Set how many browser workers to use. Scraper job must be restarted (paused then resumed) for it to take effect. Default: 0. '
|
104
105
|
option :proxy_type, desc: 'Set the Proxy type. Default: standard'
|
105
106
|
option :profile, type: :string, desc: 'Set the profiles (comma separated) to apply to the job. Default: default'
|
106
107
|
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
data/lib/datahen/client/base.rb
CHANGED
@@ -7,6 +7,12 @@ module Datahen
|
|
7
7
|
|
8
8
|
default_timeout 60
|
9
9
|
|
10
|
+
DEFAULT_RETRY_LIMIT = {
|
11
|
+
seeder: nil,
|
12
|
+
parser: nil,
|
13
|
+
finisher: nil
|
14
|
+
}
|
15
|
+
|
10
16
|
def self.env_auth_token
|
11
17
|
ENV['DATAHEN_TOKEN']
|
12
18
|
end
|
@@ -33,6 +39,42 @@ module Datahen
|
|
33
39
|
@auth_token = value
|
34
40
|
end
|
35
41
|
|
42
|
+
def default_retry_limit
|
43
|
+
@default_retry_limit ||= DEFAULT_RETRY_LIMIT.dup
|
44
|
+
end
|
45
|
+
|
46
|
+
def left_merge target, source
|
47
|
+
# validate source and target
|
48
|
+
return {} if target.nil? || !target.is_a?(Hash)
|
49
|
+
return target if source.nil? || !source.is_a?(Hash)
|
50
|
+
|
51
|
+
# left merge source into target
|
52
|
+
target.merge(source.select{|k,v|target.has_key?(k)})
|
53
|
+
end
|
54
|
+
|
55
|
+
def retry times, delay = nil, err_msg = nil
|
56
|
+
limit = times.nil? ? nil : times.to_i
|
57
|
+
delay = delay.nil? ? 5 : delay.to_i
|
58
|
+
count = 0
|
59
|
+
begin
|
60
|
+
yield
|
61
|
+
rescue StandardError => e
|
62
|
+
STDERR.puts(e.inspect)
|
63
|
+
|
64
|
+
# wait before retry (default 5 sec)
|
65
|
+
sleep(delay) if delay > 0
|
66
|
+
|
67
|
+
# raise error when retry limit is reached
|
68
|
+
raise e unless limit.nil? || count < limit
|
69
|
+
|
70
|
+
# retry with a 100+ failsafe to prevent overflow error due integer limit
|
71
|
+
should_aprox = limit.nil? && count > 99
|
72
|
+
count += 1 unless should_aprox
|
73
|
+
puts "#{err_msg.nil? ? '' : "#{err_msg} "}Retry \##{count}#{should_aprox ? '+' : ''}..."
|
74
|
+
retry
|
75
|
+
end
|
76
|
+
end
|
77
|
+
|
36
78
|
def initialize(opts={})
|
37
79
|
@ignore_ssl = opts[:ignore_ssl]
|
38
80
|
self.class.base_uri(env_api_url)
|
@@ -45,6 +87,9 @@ module Datahen
|
|
45
87
|
verify: !ignore_ssl
|
46
88
|
}
|
47
89
|
|
90
|
+
# extract and merge retry limits
|
91
|
+
@default_retry_limit = self.left_merge(DEFAULT_RETRY_LIMIT, opts[:retry_limit])
|
92
|
+
|
48
93
|
query = {}
|
49
94
|
query[:p] = opts[:page] if opts[:page]
|
50
95
|
query[:pp] = opts[:per_page] if opts[:per_page]
|
data/lib/datahen/client/job.rb
CHANGED
@@ -17,7 +17,8 @@ module Datahen
|
|
17
17
|
def update(job_id, opts={})
|
18
18
|
body = {}
|
19
19
|
body[:status] = opts[:status] if opts[:status]
|
20
|
-
body[:
|
20
|
+
body[:parser_worker_count] = opts[:parsers] if opts[:parsers]
|
21
|
+
body[:fetcher_worker_count] = opts[:fetchers] if opts[:fetchers]
|
21
22
|
body[:browser_worker_count] = opts[:browsers] if opts[:browsers]
|
22
23
|
body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
|
23
24
|
body[:profile] = opts[:profile] if opts[:profile]
|
@@ -54,7 +55,10 @@ module Datahen
|
|
54
55
|
|
55
56
|
params = @options.merge({body: body.to_json})
|
56
57
|
|
57
|
-
|
58
|
+
limit = opts.has_key?(:retry_limit) ? opts.fetch(:retry_limit) : self.default_retry_limit[:seeder]
|
59
|
+
self.retry(limit, 5, "Error while updating the seeder.") do
|
60
|
+
self.class.put("/jobs/#{job_id}/seeding_update", params)
|
61
|
+
end
|
58
62
|
end
|
59
63
|
|
60
64
|
def finisher_update(job_id, opts={})
|
@@ -65,7 +69,10 @@ module Datahen
|
|
65
69
|
|
66
70
|
params = @options.merge({body: body.to_json})
|
67
71
|
|
68
|
-
|
72
|
+
limit = opts.has_key?(:retry_limit) ? opts.fetch(:retry_limit) : self.default_retry_limit[:finisher]
|
73
|
+
self.retry(limit, 5, "Error while updating the finisher.") do
|
74
|
+
self.class.put("/jobs/#{job_id}/finisher_update", params)
|
75
|
+
end
|
69
76
|
end
|
70
77
|
|
71
78
|
def profile(job_id, opts={})
|
@@ -5,9 +5,11 @@ module Datahen
|
|
5
5
|
self.class.get("/jobs/#{job_id}/output/collections/#{collection}/records/#{id}", @options)
|
6
6
|
end
|
7
7
|
|
8
|
-
def all(job_id, collection = 'default')
|
9
|
-
|
10
|
-
self.
|
8
|
+
def all(job_id, collection = 'default', opts = {})
|
9
|
+
limit = opts.has_key?(:retry_limit) ? opts.fetch(:retry_limit) : 0
|
10
|
+
self.retry(limit, 10, "Error while updating the seeder.") do
|
11
|
+
self.class.get("/jobs/#{job_id}/output/collections/#{collection}/records", @options)
|
12
|
+
end
|
11
13
|
end
|
12
14
|
|
13
15
|
def collections(job_id)
|
@@ -16,4 +18,3 @@ module Datahen
|
|
16
18
|
end
|
17
19
|
end
|
18
20
|
end
|
19
|
-
|
@@ -68,7 +68,10 @@ module Datahen
|
|
68
68
|
|
69
69
|
params = @options.merge({body: body.to_json})
|
70
70
|
|
71
|
-
|
71
|
+
limit = opts.has_key?(:retry_limit) ? opts.fetch(:retry_limit) : self.default_retry_limit[:parser]
|
72
|
+
self.retry(limit, 5, "Error while updating the parser.") do
|
73
|
+
self.class.put("/jobs/#{job_id}/pages/#{gid}/parsing_update", params)
|
74
|
+
end
|
72
75
|
end
|
73
76
|
|
74
77
|
def find_content(job_id, gid)
|
@@ -18,7 +18,8 @@ module Datahen
|
|
18
18
|
body[:git_branch] = opts[:branch] || opts[:git_branch] || "master" if opts[:branch] || opts[:git_branch]
|
19
19
|
body[:freshness_type] = opts[:freshness_type] if opts[:freshness_type]
|
20
20
|
body[:force_fetch] = opts[:force_fetch] if opts[:force_fetch]
|
21
|
-
body[:
|
21
|
+
body[:parser_worker_count] = opts[:parsers] || opts[:parser_worker_count] if opts[:parsers] || opts[:parser_worker_count]
|
22
|
+
body[:fetcher_worker_count] = opts[:fetchers] || opts[:fetcher_worker_count] if opts[:fetchers] || opts[:fetcher_worker_count]
|
22
23
|
body[:browser_worker_count] = opts[:browsers] || opts[:browser_worker_count] if opts[:browsers] || opts[:browser_worker_count]
|
23
24
|
body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
|
24
25
|
body[:disable_scheduler] = opts[:disable_scheduler] if opts[:disable_scheduler]
|
@@ -42,7 +43,8 @@ module Datahen
|
|
42
43
|
body[:git_branch] = opts[:branch] || opts[:git_branch] if opts[:branch] || opts[:git_branch]
|
43
44
|
body[:freshness_type] = opts[:freshness_type] if opts[:freshness_type]
|
44
45
|
body[:force_fetch] = opts[:force_fetch] if opts.has_key?("force_fetch") || opts.has_key?(:force_fetch)
|
45
|
-
body[:
|
46
|
+
body[:parser_worker_count] = opts[:parsers] || opts[:parser_worker_count] if opts[:parsers] || opts[:parser_worker_count]
|
47
|
+
body[:fetcher_worker_count] = opts[:fetchers] || opts[:fetcher_worker_count] if opts[:fetchers] || opts[:fetcher_worker_count]
|
46
48
|
body[:browser_worker_count] = opts[:browsers] || opts[:browser_worker_count] if opts[:browsers] || opts[:browser_worker_count]
|
47
49
|
body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
|
48
50
|
body[:disable_scheduler] = opts[:disable_scheduler] if opts.has_key?("disable_scheduler") || opts.has_key?(:disable_scheduler)
|
@@ -8,7 +8,8 @@ module Datahen
|
|
8
8
|
|
9
9
|
def create(scraper_name, opts={})
|
10
10
|
body = {}
|
11
|
-
body[:
|
11
|
+
body[:parser_worker_count] = opts[:parsers] if opts[:parsers]
|
12
|
+
body[:fetcher_worker_count] = opts[:fetchers] if opts[:fetchers]
|
12
13
|
body[:browser_worker_count] = opts[:browsers] if opts[:browsers]
|
13
14
|
body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
|
14
15
|
body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
|
@@ -36,7 +37,8 @@ module Datahen
|
|
36
37
|
def update(scraper_name, opts={})
|
37
38
|
body = {}
|
38
39
|
body[:status] = opts[:status] if opts[:status]
|
39
|
-
body[:
|
40
|
+
body[:parser_worker_count] = opts[:parsers] if opts[:parsers]
|
41
|
+
body[:fetcher_worker_count] = opts[:fetchers] if opts[:fetchers]
|
40
42
|
body[:browser_worker_count] = opts[:browsers] if opts[:browsers]
|
41
43
|
body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
|
42
44
|
body[:profile] = opts[:profile] if opts[:profile]
|
@@ -152,7 +152,7 @@ module Datahen
|
|
152
152
|
@page_types = []
|
153
153
|
@parsers = Concurrent::Hash.new
|
154
154
|
@config = YAML.load_file(config_file)
|
155
|
-
self.config['parsers'].each do |v|
|
155
|
+
(self.config['parsers'] || []).each do |v|
|
156
156
|
next if !v['disabled'].nil? && !!v['disabled']
|
157
157
|
@page_types << v['page_type']
|
158
158
|
self.parsers[v['page_type']] = v['file']
|
@@ -5,6 +5,7 @@ module Datahen
|
|
5
5
|
class Executor
|
6
6
|
# Max allowed page size when query outputs (see #find_outputs).
|
7
7
|
MAX_FIND_OUTPUTS_PER_PAGE = 500
|
8
|
+
FIND_OUTPUTS_RETRY_LIMIT = 0
|
8
9
|
|
9
10
|
attr_accessor :filename, :page, :gid, :job_id
|
10
11
|
|
@@ -159,13 +160,18 @@ module Datahen
|
|
159
160
|
options = {
|
160
161
|
query: query,
|
161
162
|
page: page,
|
162
|
-
per_page: per_page
|
163
|
+
per_page: per_page
|
164
|
+
}
|
163
165
|
|
164
166
|
# Get job_id
|
165
167
|
query_job_id = opts[:job_id] || get_job_id(opts[:scraper_name], self.job_id)
|
166
168
|
|
169
|
+
# find outputs
|
170
|
+
retry_limit = opts.has_key?(:retry_limit) ? opts[:retry_limit] : self.class::FIND_OUTPUTS_RETRY_LIMIT
|
167
171
|
client = Client::JobOutput.new(options)
|
168
|
-
response = client.all(query_job_id, collection
|
172
|
+
response = client.all(query_job_id, collection, {
|
173
|
+
retry_limit: retry_limit
|
174
|
+
})
|
169
175
|
|
170
176
|
if response.code != 200
|
171
177
|
raise "response_code: #{response.code}|#{response.parsed_response}"
|
@@ -304,6 +310,7 @@ module Datahen
|
|
304
310
|
end
|
305
311
|
|
306
312
|
# saving to server
|
313
|
+
|
307
314
|
response = update_to_server(
|
308
315
|
job_id: job_id,
|
309
316
|
gid: gid,
|
data/lib/datahen/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: datahen
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.20.0
|
4
|
+
version: 1.0.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Parama Danoesubroto
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2022-08-02 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: thor
|
@@ -277,7 +277,7 @@ metadata:
|
|
277
277
|
allowed_push_host: https://rubygems.org
|
278
278
|
homepage_uri: https://datahen.com
|
279
279
|
source_code_uri: https://github.com/DataHenOfficial/datahen-ruby
|
280
|
-
post_install_message:
|
280
|
+
post_install_message:
|
281
281
|
rdoc_options: []
|
282
282
|
require_paths:
|
283
283
|
- lib
|
@@ -293,7 +293,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
293
293
|
version: '0'
|
294
294
|
requirements: []
|
295
295
|
rubygems_version: 3.0.3
|
296
|
-
signing_key:
|
296
|
+
signing_key:
|
297
297
|
specification_version: 4
|
298
298
|
summary: DataHen toolbelt for developers
|
299
299
|
test_files: []
|