datahen 1.1.1 → 1.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/exe/hen +1 -0
- data/lib/datahen/cli/scraper_page.rb +8 -4
- data/lib/datahen/cli.rb +6 -1
- data/lib/datahen/client/base.rb +11 -3
- data/lib/datahen/client/job.rb +10 -2
- data/lib/datahen/client/job_page.rb +5 -1
- data/lib/datahen/error/custom_retry_error.rb +12 -0
- data/lib/datahen/error.rb +1 -0
- data/lib/datahen/scraper.rb +0 -3
- data/lib/datahen/version.rb +1 -1
- data/lib/datahen.rb +3 -0
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 7f50f72a9ba32a9da0ecf43c684baa35c6ed84302f705ef8f87272a3ea0c9d0d
|
4
|
+
data.tar.gz: 5451f07252a2ba798313c02f2a3183e80d321e95d4b8eacc800b70217dd03077
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 52d5f2c71f63379503b2f054b3ef2ba587d43b35d5dfa5f20c0a1c0547f62a09a1c805ce7452a37e9d064f5cec1ba3aaf833ce09eb1d6583220fcf9ae88c10a3
|
7
|
+
data.tar.gz: 55d3564529cc1c1a3815b936f1b0184d5e62c95af8e2c85e051430b6c38e206a27622d0c4d00e250b4db1f8cdf7c9a0276b1252335d337a888cc614bc3242790
|
data/exe/hen
CHANGED
data/lib/datahen/cli/scraper_page.rb
CHANGED
@@ -131,13 +131,14 @@ module Datahen
|
|
131
131
|
Reparse pages in a scraper's current job. You need to specify either a --gid or --parse-fail or --status or --page-type.\x5
|
132
132
|
LONGDESC
|
133
133
|
option :gid, :aliases => :g, type: :string, desc: 'Reparse a specific GID'
|
134
|
+
option :fetch_fail, type: :boolean, desc: 'Reparse only pages that fails fetching.'
|
134
135
|
option :parse_fail, type: :boolean, desc: 'Reparse only pages that fails parsing.'
|
135
136
|
option :status, type: :string, desc: 'Reparse only pages with a specific status.'
|
136
137
|
option :page_type, type: :string, desc: 'Refetches only pages with a specific page type.'
|
137
138
|
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
138
139
|
def reparse(scraper_name)
|
139
|
-
if !options.key?(:gid) && !options.key?(:parse_fail) && !options.key?(:status) && !options.key?(:page_type)
|
140
|
-
puts "Must specify either a --gid, --parse-fail, --status or --page-type"
|
140
|
+
if !options.key?(:gid) && !options.key?(:fetch_fail) && !options.key?(:parse_fail) && !options.key?(:status) && !options.key?(:page_type)
|
141
|
+
puts "Must specify either a --gid, --fetch-fail, --parse-fail, --status or --page-type"
|
141
142
|
return
|
142
143
|
end
|
143
144
|
|
@@ -155,11 +156,14 @@ module Datahen
|
|
155
156
|
Move pages in a scraper's current job to limbo. You need to specify either a --gid or --status.\x5
|
156
157
|
LONGDESC
|
157
158
|
option :gid, :aliases => :g, type: :string, desc: 'Move a specific GID to limbo'
|
159
|
+
option :fetch_fail, type: :boolean, desc: 'Move pages that fails fetching to limbo.'
|
160
|
+
option :parse_fail, type: :boolean, desc: 'Move pages that fails parsing to limbo.'
|
158
161
|
option :status, type: :string, desc: 'Move pages with a specific status to limbo.'
|
162
|
+
option :page_type, type: :string, desc: 'Move pages with a specific page type to limbo.'
|
159
163
|
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
160
164
|
def limbo(scraper_name)
|
161
|
-
if !options.key?(:gid) && !options.key?(:status)
|
162
|
-
puts "Must specify either a --gid or --status"
|
165
|
+
if !options.key?(:gid) && !options.key?(:fetch_fail) && !options.key?(:parse_fail) && !options.key?(:status) && !options.key?(:page_type)
|
166
|
+
puts "Must specify either a --gid, --fetch-fail, --parse-fail, --status or --page-type"
|
163
167
|
return
|
164
168
|
end
|
165
169
|
|
data/lib/datahen/cli.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
require 'thor'
|
2
|
-
require 'datahen/scraper'
|
2
|
+
require 'datahen'
|
3
3
|
require 'datahen/cli/scraper_var'
|
4
4
|
require 'datahen/cli/scraper_exporter'
|
5
5
|
require 'datahen/cli/scraper_export'
|
@@ -44,5 +44,10 @@ module Datahen
|
|
44
44
|
|
45
45
|
desc "account SUBCOMMAND ...ARGS", "for account related activities"
|
46
46
|
subcommand "account", Account
|
47
|
+
|
48
|
+
desc "version", "Shows the gem version"
|
49
|
+
def version()
|
50
|
+
puts "#{VERSION}"
|
51
|
+
end
|
47
52
|
end
|
48
53
|
end
|
data/lib/datahen/client/base.rb
CHANGED
@@ -21,6 +21,10 @@ module Datahen
|
|
21
21
|
ENV['DATAHEN_IGNORE_SSL'].to_s.strip == '1'
|
22
22
|
end
|
23
23
|
|
24
|
+
def self.random_delay max_seconds = 2
|
25
|
+
(rand * max_seconds * 1000.0).to_i / 1000.0
|
26
|
+
end
|
27
|
+
|
24
28
|
def env_api_url
|
25
29
|
ENV['DATAHEN_API_URL'].nil? ? 'https://app.datahen.com/api/v1' : ENV['DATAHEN_API_URL']
|
26
30
|
end
|
@@ -58,11 +62,15 @@ module Datahen
|
|
58
62
|
count = 0
|
59
63
|
begin
|
60
64
|
yield
|
61
|
-
rescue StandardError => e
|
62
|
-
|
65
|
+
rescue Error::CustomRetryError, StandardError => e
|
66
|
+
is_custom_retry = e.is_a? Error::CustomRetryError
|
67
|
+
real_delay = is_custom_retry ? e.delay : delay
|
68
|
+
err_msg = is_custom_retry ? e.error : e.inspect
|
69
|
+
|
70
|
+
STDERR.puts(err_msg)
|
63
71
|
|
64
72
|
# wait before retry (default 5 sec)
|
65
|
-
sleep(delay) if delay > 0
|
73
|
+
sleep(delay) if real_delay > 0
|
66
74
|
|
67
75
|
# raise error when retry limit is reached
|
68
76
|
raise e unless limit.nil? || count < limit
|
data/lib/datahen/client/job.rb
CHANGED
@@ -57,7 +57,11 @@ module Datahen
|
|
57
57
|
|
58
58
|
limit = opts.has_key?(:retry_limit) ? opts.fetch(:retry_limit) : self.default_retry_limit[:seeder]
|
59
59
|
self.retry(limit, 5, "Error while updating the seeder.") do
|
60
|
-
self.class.put("/jobs/#{job_id}/seeding_update", params)
|
60
|
+
response = self.class.put("/jobs/#{job_id}/seeding_update", params)
|
61
|
+
if response.code == 422 && response.body.to_s =~ /pq:\s*deadlock/i
|
62
|
+
raise CustomRetryError.new(self.class.random_delay(5), response.body.to_s)
|
63
|
+
end
|
64
|
+
response
|
61
65
|
end
|
62
66
|
end
|
63
67
|
|
@@ -71,7 +75,11 @@ module Datahen
|
|
71
75
|
|
72
76
|
limit = opts.has_key?(:retry_limit) ? opts.fetch(:retry_limit) : self.default_retry_limit[:finisher]
|
73
77
|
self.retry(limit, 5, "Error while updating the finisher.") do
|
74
|
-
self.class.put("/jobs/#{job_id}/finisher_update", params)
|
78
|
+
response = self.class.put("/jobs/#{job_id}/finisher_update", params)
|
79
|
+
if response.code == 422 && response.body.to_s =~ /pq:\s*deadlock/
|
80
|
+
raise CustomRetryError.new(self.class.random_delay(5), response.body.to_s)
|
81
|
+
end
|
82
|
+
response
|
75
83
|
end
|
76
84
|
end
|
77
85
|
|
data/lib/datahen/client/job_page.rb
CHANGED
@@ -70,7 +70,11 @@ module Datahen
|
|
70
70
|
|
71
71
|
limit = opts.has_key?(:retry_limit) ? opts.fetch(:retry_limit) : self.default_retry_limit[:parser]
|
72
72
|
self.retry(limit, 5, "Error while updating the parser.") do
|
73
|
-
self.class.put("/jobs/#{job_id}/pages/#{gid}/parsing_update", params)
|
73
|
+
response = self.class.put("/jobs/#{job_id}/pages/#{gid}/parsing_update", params)
|
74
|
+
if response.code == 422 && response.body.to_s =~ /pq:\s*deadlock/i
|
75
|
+
raise Error::CustomRetryError.new(self.class.random_delay(5), response.body.to_s)
|
76
|
+
end
|
77
|
+
response
|
74
78
|
end
|
75
79
|
end
|
76
80
|
|
data/lib/datahen/error.rb
CHANGED
data/lib/datahen/scraper.rb
CHANGED
@@ -1,5 +1,3 @@
|
|
1
|
-
require "datahen/error"
|
2
|
-
require "datahen/plugin"
|
3
1
|
require "datahen/scraper/parser"
|
4
2
|
require "datahen/scraper/batch_parser"
|
5
3
|
require "datahen/scraper/seeder"
|
@@ -8,7 +6,6 @@ require "datahen/scraper/executor"
|
|
8
6
|
require "datahen/scraper/ruby_parser_executor"
|
9
7
|
require "datahen/scraper/ruby_seeder_executor"
|
10
8
|
require "datahen/scraper/ruby_finisher_executor"
|
11
|
-
require "datahen/client"
|
12
9
|
|
13
10
|
module Datahen
|
14
11
|
module Scraper
|
data/lib/datahen/version.rb
CHANGED
data/lib/datahen.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: datahen
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.1.1
|
4
|
+
version: 1.2.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Parama Danoesubroto
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2022-
|
11
|
+
date: 2022-11-26 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: thor
|
@@ -257,6 +257,7 @@ files:
|
|
257
257
|
- lib/datahen/client/scraper_job_var.rb
|
258
258
|
- lib/datahen/client/scraper_var.rb
|
259
259
|
- lib/datahen/error.rb
|
260
|
+
- lib/datahen/error/custom_retry_error.rb
|
260
261
|
- lib/datahen/error/safe_terminate_error.rb
|
261
262
|
- lib/datahen/plugin.rb
|
262
263
|
- lib/datahen/plugin/context_exposer.rb
|