datahen 1.1.1 → 1.2.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/exe/hen +1 -0
- data/lib/datahen/cli/scraper_page.rb +8 -4
- data/lib/datahen/cli.rb +6 -1
- data/lib/datahen/client/base.rb +11 -3
- data/lib/datahen/client/job.rb +10 -2
- data/lib/datahen/client/job_page.rb +5 -1
- data/lib/datahen/error/custom_retry_error.rb +12 -0
- data/lib/datahen/error.rb +1 -0
- data/lib/datahen/scraper.rb +0 -3
- data/lib/datahen/version.rb +1 -1
- data/lib/datahen.rb +3 -0
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 7f50f72a9ba32a9da0ecf43c684baa35c6ed84302f705ef8f87272a3ea0c9d0d
|
4
|
+
data.tar.gz: 5451f07252a2ba798313c02f2a3183e80d321e95d4b8eacc800b70217dd03077
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 52d5f2c71f63379503b2f054b3ef2ba587d43b35d5dfa5f20c0a1c0547f62a09a1c805ce7452a37e9d064f5cec1ba3aaf833ce09eb1d6583220fcf9ae88c10a3
|
7
|
+
data.tar.gz: 55d3564529cc1c1a3815b936f1b0184d5e62c95af8e2c85e051430b6c38e206a27622d0c4d00e250b4db1f8cdf7c9a0276b1252335d337a888cc614bc3242790
|
data/exe/hen
CHANGED
@@ -131,13 +131,14 @@ module Datahen
|
|
131
131
|
Reparse pages in a scraper's current job. You need to specify either a --gid or --parse-fail or --status or --page-type.\x5
|
132
132
|
LONGDESC
|
133
133
|
option :gid, :aliases => :g, type: :string, desc: 'Reparse a specific GID'
|
134
|
+
option :fetch_fail, type: :boolean, desc: 'Reparse only pages that fails fetching.'
|
134
135
|
option :parse_fail, type: :boolean, desc: 'Reparse only pages that fails parsing.'
|
135
136
|
option :status, type: :string, desc: 'Reparse only pages with a specific status.'
|
136
137
|
option :page_type, type: :string, desc: 'Refetches only pages with a specific page type.'
|
137
138
|
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
138
139
|
def reparse(scraper_name)
|
139
|
-
if !options.key?(:gid) && !options.key?(:parse_fail) && !options.key?(:status) && !options.key?(:page_type)
|
140
|
-
puts "Must specify either a --gid, --parse-fail, --status or --page-type"
|
140
|
+
if !options.key?(:gid) && !options.key?(:fetch_fail) && !options.key?(:parse_fail) && !options.key?(:status) && !options.key?(:page_type)
|
141
|
+
puts "Must specify either a --gid, --fetch-fail, --parse-fail, --status or --page-type"
|
141
142
|
return
|
142
143
|
end
|
143
144
|
|
@@ -155,11 +156,14 @@ module Datahen
|
|
155
156
|
Move pages in a scraper's current job to limbo. You need to specify either a --gid or --status.\x5
|
156
157
|
LONGDESC
|
157
158
|
option :gid, :aliases => :g, type: :string, desc: 'Move a specific GID to limbo'
|
159
|
+
option :fetch_fail, type: :boolean, desc: 'Move pages that fails fetching to limbo.'
|
160
|
+
option :parse_fail, type: :boolean, desc: 'Move pages that fails parsing to limbo.'
|
158
161
|
option :status, type: :string, desc: 'Move pages with a specific status to limbo.'
|
162
|
+
option :page_type, type: :string, desc: 'Move pages with a specific page type to limbo.'
|
159
163
|
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
160
164
|
def limbo(scraper_name)
|
161
|
-
if !options.key?(:gid) && !options.key?(:status)
|
162
|
-
puts "Must specify either a --gid or --status"
|
165
|
+
if !options.key?(:gid) && !options.key?(:fetch_fail) && !options.key?(:parse_fail) && !options.key?(:status) && !options.key?(:page_type)
|
166
|
+
puts "Must specify either a --gid, --fetch-fail, --parse-fail, --status or --page-type"
|
163
167
|
return
|
164
168
|
end
|
165
169
|
|
data/lib/datahen/cli.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
require 'thor'
|
2
|
-
require 'datahen
|
2
|
+
require 'datahen'
|
3
3
|
require 'datahen/cli/scraper_var'
|
4
4
|
require 'datahen/cli/scraper_exporter'
|
5
5
|
require 'datahen/cli/scraper_export'
|
@@ -44,5 +44,10 @@ module Datahen
|
|
44
44
|
|
45
45
|
desc "account SUBCOMMAND ...ARGS", "for account related activities"
|
46
46
|
subcommand "account", Account
|
47
|
+
|
48
|
+
desc "version", "Shows the gem version"
|
49
|
+
def version()
|
50
|
+
puts "#{VERSION}"
|
51
|
+
end
|
47
52
|
end
|
48
53
|
end
|
data/lib/datahen/client/base.rb
CHANGED
@@ -21,6 +21,10 @@ module Datahen
|
|
21
21
|
ENV['DATAHEN_IGNORE_SSL'].to_s.strip == '1'
|
22
22
|
end
|
23
23
|
|
24
|
+
def self.random_delay max_seconds = 2
|
25
|
+
(rand * max_seconds * 1000.0).to_i / 1000.0
|
26
|
+
end
|
27
|
+
|
24
28
|
def env_api_url
|
25
29
|
ENV['DATAHEN_API_URL'].nil? ? 'https://app.datahen.com/api/v1' : ENV['DATAHEN_API_URL']
|
26
30
|
end
|
@@ -58,11 +62,15 @@ module Datahen
|
|
58
62
|
count = 0
|
59
63
|
begin
|
60
64
|
yield
|
61
|
-
rescue StandardError => e
|
62
|
-
|
65
|
+
rescue Error::CustomRetryError, StandardError => e
|
66
|
+
is_custom_retry = e.is_a? Error::CustomRetryError
|
67
|
+
real_delay = is_custom_retry ? e.delay : delay
|
68
|
+
err_msg = is_custom_retry ? e.error : e.inspect
|
69
|
+
|
70
|
+
STDERR.puts(err_msg)
|
63
71
|
|
64
72
|
# wait before retry (default 5 sec)
|
65
|
-
sleep(delay) if delay > 0
|
73
|
+
sleep(delay) if real_delay > 0
|
66
74
|
|
67
75
|
# raise error when retry limit is reached
|
68
76
|
raise e unless limit.nil? || count < limit
|
data/lib/datahen/client/job.rb
CHANGED
@@ -57,7 +57,11 @@ module Datahen
|
|
57
57
|
|
58
58
|
limit = opts.has_key?(:retry_limit) ? opts.fetch(:retry_limit) : self.default_retry_limit[:seeder]
|
59
59
|
self.retry(limit, 5, "Error while updating the seeder.") do
|
60
|
-
self.class.put("/jobs/#{job_id}/seeding_update", params)
|
60
|
+
response = self.class.put("/jobs/#{job_id}/seeding_update", params)
|
61
|
+
if response.code == 422 && response.body.to_s =~ /pq:\s*deadlock/i
|
62
|
+
raise CustomRetryError.new(self.class.random_delay(5), response.body.to_s)
|
63
|
+
end
|
64
|
+
response
|
61
65
|
end
|
62
66
|
end
|
63
67
|
|
@@ -71,7 +75,11 @@ module Datahen
|
|
71
75
|
|
72
76
|
limit = opts.has_key?(:retry_limit) ? opts.fetch(:retry_limit) : self.default_retry_limit[:finisher]
|
73
77
|
self.retry(limit, 5, "Error while updating the finisher.") do
|
74
|
-
self.class.put("/jobs/#{job_id}/finisher_update", params)
|
78
|
+
response = self.class.put("/jobs/#{job_id}/finisher_update", params)
|
79
|
+
if response.code == 422 && response.body.to_s =~ /pq:\s*deadlock/
|
80
|
+
raise CustomRetryError.new(self.class.random_delay(5), response.body.to_s)
|
81
|
+
end
|
82
|
+
response
|
75
83
|
end
|
76
84
|
end
|
77
85
|
|
@@ -70,7 +70,11 @@ module Datahen
|
|
70
70
|
|
71
71
|
limit = opts.has_key?(:retry_limit) ? opts.fetch(:retry_limit) : self.default_retry_limit[:parser]
|
72
72
|
self.retry(limit, 5, "Error while updating the parser.") do
|
73
|
-
self.class.put("/jobs/#{job_id}/pages/#{gid}/parsing_update", params)
|
73
|
+
response = self.class.put("/jobs/#{job_id}/pages/#{gid}/parsing_update", params)
|
74
|
+
if response.code == 422 && response.body.to_s =~ /pq:\s*deadlock/i
|
75
|
+
raise Error::CustomRetryError.new(self.class.random_delay(5), response.body.to_s)
|
76
|
+
end
|
77
|
+
response
|
74
78
|
end
|
75
79
|
end
|
76
80
|
|
data/lib/datahen/error.rb
CHANGED
data/lib/datahen/scraper.rb
CHANGED
@@ -1,5 +1,3 @@
|
|
1
|
-
require "datahen/error"
|
2
|
-
require "datahen/plugin"
|
3
1
|
require "datahen/scraper/parser"
|
4
2
|
require "datahen/scraper/batch_parser"
|
5
3
|
require "datahen/scraper/seeder"
|
@@ -8,7 +6,6 @@ require "datahen/scraper/executor"
|
|
8
6
|
require "datahen/scraper/ruby_parser_executor"
|
9
7
|
require "datahen/scraper/ruby_seeder_executor"
|
10
8
|
require "datahen/scraper/ruby_finisher_executor"
|
11
|
-
require "datahen/client"
|
12
9
|
|
13
10
|
module Datahen
|
14
11
|
module Scraper
|
data/lib/datahen/version.rb
CHANGED
data/lib/datahen.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: datahen
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.1.1
|
4
|
+
version: 1.2.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Parama Danoesubroto
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2022-
|
11
|
+
date: 2022-11-26 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: thor
|
@@ -257,6 +257,7 @@ files:
|
|
257
257
|
- lib/datahen/client/scraper_job_var.rb
|
258
258
|
- lib/datahen/client/scraper_var.rb
|
259
259
|
- lib/datahen/error.rb
|
260
|
+
- lib/datahen/error/custom_retry_error.rb
|
260
261
|
- lib/datahen/error/safe_terminate_error.rb
|
261
262
|
- lib/datahen/plugin.rb
|
262
263
|
- lib/datahen/plugin/context_exposer.rb
|