datahen 1.4.0 → 1.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/datahen/cli/job_output.rb +36 -6
- data/lib/datahen/cli/scraper.rb +27 -2
- data/lib/datahen/cli/scraper_job.rb +4 -0
- data/lib/datahen/cli/scraper_page.rb +19 -2
- data/lib/datahen/cli/scraper_task.rb +48 -0
- data/lib/datahen/cli.rb +1 -0
- data/lib/datahen/client/base.rb +15 -2
- data/lib/datahen/client/job.rb +5 -1
- data/lib/datahen/client/job_output.rb +1 -1
- data/lib/datahen/client/job_page.rb +9 -0
- data/lib/datahen/client/job_task.rb +17 -0
- data/lib/datahen/client/scraper.rb +8 -0
- data/lib/datahen/client/scraper_job.rb +8 -0
- data/lib/datahen/client/scraper_task.rb +17 -0
- data/lib/datahen/client.rb +2 -0
- data/lib/datahen/scraper/batch_parser.rb +6 -2
- data/lib/datahen/scraper/executor.rb +7 -2
- data/lib/datahen/scraper/ruby_parser_executor.rb +10 -3
- data/lib/datahen/version.rb +1 -1
- metadata +9 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 96d2bc30d1c96ce684d83efa54b6dff5966db2a1bba7ab4856b11caba2803086
+  data.tar.gz: 985712d5d7e6559ac64b76669241f56d704c754deb06a164e1f449aad10ef29e
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: d9c6bd3e60034339a8354fe4bda365b91f21b6ec68da8f384d7380abcafa5ccce2c2aacd6cc7a8da37378b8681afe58765bcc461211812c623a8958eac7a5f72
+  data.tar.gz: ac5eb5c8de4e4b0a6d28d96179bab4bf347662247b94e775ed0a25e0f0ef00a542f01f8a1a06525b565e7bd1055d5cd30b480a28d28c7ebf5de893b89b9f5e3a
data/lib/datahen/cli/job_output.rb
CHANGED
@@ -20,10 +20,20 @@ module Datahen
         collection = options.fetch(:collection) { 'default' }
         if options[:job]
           client = Client::JobOutput.new(options)
-
+          json = JSON.parse(client.all(options[:job], collection).body)
+          if json['error'] == ""
+            puts "#{JSON.pretty_generate(json['data'])}"
+          else
+            puts "#{JSON.pretty_generate(json['error'])}"
+          end
         else
           client = Client::ScraperJobOutput.new(options)
-
+          json = JSON.parse(client.all(scraper_name, collection).body)
+          if json['error'] == ""
+            puts "#{JSON.pretty_generate(json['data'])}"
+          else
+            puts "#{JSON.pretty_generate(json['error'])}"
+          end
         end
       end
 
@@ -38,10 +48,20 @@ module Datahen
         collection = options.fetch(:collection) { 'default' }
         if options[:job]
           client = Client::JobOutput.new(options)
-
+          json = JSON.parse(client.find(options[:job], collection, id).body)
+          if json['error'] == ""
+            puts "#{JSON.pretty_generate(json['data'])}"
+          else
+            puts "#{JSON.pretty_generate(json['error'])}"
+          end
         else
           client = Client::ScraperJobOutput.new(options)
-
+          json = JSON.parse(client.find(scraper_name, collection, id).body)
+          if json['error'] == ""
+            puts "#{JSON.pretty_generate(json['data'])}"
+          else
+            puts "#{JSON.pretty_generate(json['error'])}"
+          end
         end
       end
 
@@ -56,10 +76,20 @@ module Datahen
 
         if options[:job]
           client = Client::JobOutput.new(options)
-
+          json = JSON.parse(client.collections(options[:job]).body)
+          if json['error'] == ""
+            puts "#{JSON.pretty_generate(json['data'])}"
+          else
+            puts "#{JSON.pretty_generate(json['error'])}"
+          end
        else
          client = Client::ScraperJobOutput.new(options)
-
+          json = JSON.parse(client.collections(scraper_name).body)
+          if json['error'] == ""
+            puts "#{JSON.pretty_generate(json['data'])}"
+          else
+            puts "#{JSON.pretty_generate(json['error'])}"
+          end
         end
       end
 
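The same pattern repeats across all three subcommands: instead of printing the raw HTTP response, each now parses the body and branches on the `error` field of the response envelope. A minimal standalone sketch of that handling, assuming the API wraps results as `{"error": "", "data": ...}` (the envelope shape implied by the diff; the sample bodies below are made up):

```ruby
require 'json'

# Sketch of the new CLI output handling; `body` stands in for an HTTP
# response body string returned by the DataHen API.
def print_envelope(body)
  json = JSON.parse(body)
  if json['error'] == ""
    puts JSON.pretty_generate(json['data'])   # success: pretty-print the records
  else
    puts JSON.pretty_generate(json['error'])  # failure: surface the error message
  end
end

print_envelope('{"error":"","data":[{"_collection":"default","title":"foo"}]}')
print_envelope('{"error":"job not found","data":null}')
```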
data/lib/datahen/cli/scraper.rb
CHANGED
@@ -37,6 +37,10 @@ module Datahen
     option :max_page_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
     option :enable_global_cache, type: :boolean, desc: 'Set true to enable page cache. Default: false'
     option :retry_interval, type: :numeric, desc: 'Set a value to set retry time interval on seconds when refetching a page. Set a value grather than 0 to set it as new time to refetch, 0 means default time. Default: 0'
+    option :soft_fetching_try_limit, type: :numeric, desc: 'Set the soft fetching try limit value.'
+    option :soft_refetch_limit, type: :numeric, desc: 'Set the soft refetch limit value.'
+    option :parsing_try_limit, type: :numeric, desc: 'Set the parsing try limit value.'
+    option :prevent_kb_autoscaler, type: :boolean, desc: 'Set true to prevent the autoscaler from restarting the job. Default: false'
     def create(scraper_name, git_repository)
       # puts "options #{options}"
       client = Client::Scraper.new(options)
@@ -66,6 +70,10 @@ module Datahen
     option :max_page_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
     option :enable_global_cache, type: :boolean, desc: 'Set true to enable page cache. Default: false'
     option :retry_interval, type: :numeric, desc: 'Set a value to set retry time interval on seconds when refetching a page. Set a value grather than 0 to set it as new time to refetch, 0 means default time. Default: 0'
+    option :soft_fetching_try_limit, type: :numeric, desc: 'Set the soft fetching try limit value.'
+    option :soft_refetch_limit, type: :numeric, desc: 'Set the soft refetch limit value.'
+    option :parsing_try_limit, type: :numeric, desc: 'Set the parsing try limit value.'
+    option :prevent_kb_autoscaler, type: :boolean, desc: 'Set true to prevent the autoscaler from restarting the job. Default: false'
     def update(scraper_name)
       client = Client::Scraper.new(options)
       puts "#{client.update(scraper_name, options)}"
@@ -106,6 +114,10 @@ module Datahen
     option :vars, type: :string, banner: :JSON, desc: 'Set input vars. Must be in json format. i.e: [{"name":"foo", "value":"bar", "secret":false}] '
     option :max_page_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
     option :retry_interval, type: :numeric, desc: 'Set a value to set retry time interval on seconds when refetching a page. Set a value grather than 0 to set it as new time to refetch, 0 means default time. Default: 0'
+    option :soft_fetching_try_limit, type: :numeric, desc: 'Set the soft fetching try limit value.'
+    option :soft_refetch_limit, type: :numeric, desc: 'Set the soft refetch limit value.'
+    option :parsing_try_limit, type: :numeric, desc: 'Set the parsing try limit value.'
+    option :prevent_kb_autoscaler, type: :boolean, desc: 'Set true to prevent the autoscaler from restarting the job. Default: false'
     def start(scraper_name)
       client = Client::ScraperJob.new(options)
       puts "Starting a scrape job..."
@@ -188,9 +200,19 @@ module Datahen
     def history(scraper_name)
       client = Client::JobStat.new(options)
       if options[:job]
-
+        json = JSON.parse(client.job_stats_history(options[:job], options).body)
+        if json['error'] == ""
+          puts "#{JSON.pretty_generate(json['data'])}"
+        else
+          puts "#{JSON.pretty_generate(json['error'])}"
+        end
       else
-
+        json = JSON.parse(client.scraper_job_stats_history(scraper_name, options).body)
+        if json['error'] == ""
+          puts "#{JSON.pretty_generate(json['data'])}"
+        else
+          puts "#{JSON.pretty_generate(json['error'])}"
+        end
       end
     end
 
@@ -227,6 +249,9 @@ module Datahen
     desc "var SUBCOMMAND ...ARGS", "for managing scraper's variables"
     subcommand "var", ScraperVar
 
+    desc "task SUBCOMMAND ...ARGS", "manage task on a job"
+    subcommand "task", ScraperTask
+
 
   end
 end
data/lib/datahen/cli/scraper_job.rb
CHANGED
@@ -108,6 +108,10 @@ module Datahen
     option :max_page_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
     option :enable_global_cache, type: :boolean, desc: 'Set true to enable page cache. Default: false'
     option :retry_interval, type: :numeric, desc: 'Set a value to set retry time interval on seconds when refetching a page. Set a value grather than 0 to set it as new time to refetch, 0 means default time. Default: 0'
+    option :soft_fetching_try_limit, type: :numeric, desc: 'Set the soft fetching try limit value.'
+    option :soft_refetch_limit, type: :numeric, desc: 'Set the soft refetch limit value.'
+    option :parsing_try_limit, type: :numeric, desc: 'Set the parsing try limit value.'
+    option :prevent_kb_autoscaler, type: :boolean, desc: 'Set true to prevent the autoscaler from restarting the job. Default: false'
     def update(scraper_name)
       if options[:job]
         client = Client::Job.new(options)
data/lib/datahen/cli/scraper_page.rb
CHANGED
@@ -13,6 +13,10 @@ module Datahen
     LONGDESC
     option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
     option :page_type, :aliases => :t, type: :string, desc: 'Filter by page_type'
+    option :url, :aliases => :u, type: :string, desc: 'Filter by url'
+    option :effective_url, :aliases => :U, type: :string, desc: 'Filter by effective_url'
+    option :body, :aliases => :b, type: :string, desc: 'Filter by body'
+    option :parent_gid, :aliases => :G, type: :string, desc: 'Filter by parent_gid'
     option :page, :aliases => :p, type: :numeric, desc: 'Get the next set of records by page.'
     option :per_page, :aliases => :P, type: :numeric, desc: 'Number of records per page. Max 500 per page.'
     option :fetch_fail, type: :boolean, desc: 'Returns only pages that fails fetching.'
@@ -21,10 +25,20 @@ module Datahen
     def list(scraper_name)
       if options[:job]
         client = Client::JobPage.new(options)
-
+        json = JSON.parse(client.all(options[:job]).body)
+        if json['error'] == ""
+          puts "#{JSON.pretty_generate(json['data'])}"
+        else
+          puts "#{JSON.pretty_generate(json['error'])}"
+        end
       else
         client = Client::ScraperJobPage.new(options)
-
+        json = JSON.parse(client.all(scraper_name).body)
+        if json['error'] == ""
+          puts "#{JSON.pretty_generate(json['data'])}"
+        else
+          puts "#{JSON.pretty_generate(json['error'])}"
+        end
       end
     end
 
@@ -84,6 +98,9 @@ module Datahen
     option :vars, :aliases => :v, type: :string, desc: 'Set user-defined page variables. Must be in json format. i.e: {"Foo":"bar"}'
     option :max_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
     option :retry_interval, type: :numeric, desc: 'Set a value to set retry time interval on seconds when refetching a page. Set a value grather than 0 to set it as new time to refetch, 0 means default time. Default: 0'
+    option :soft_fetching_try_limit, type: :numeric, desc: 'Set the soft fetching try limit value.'
+    option :soft_refetch_limit, type: :numeric, desc: 'Set the soft refetch limit value.'
+    option :parsing_try_limit, type: :numeric, desc: 'Set the parsing try limit value.'
     def update(scraper_name, gid)
       begin
         options[:vars] = JSON.parse(options[:vars]) if options[:vars]
data/lib/datahen/cli/scraper_task.rb
ADDED
@@ -0,0 +1,48 @@
+module Datahen
+  class CLI < Thor
+    class ScraperTask < Thor
+      package_name "scraper task"
+      def self.banner(command, namespace = nil, subcommand = false)
+        "#{basename} #{@package_name} #{command.usage}"
+      end
+
+      desc "list <scraper_name>", "List Tasks on a scraper's current job"
+      long_desc <<-LONGDESC
+        List all tasks in a scraper's current job or given job ID.\x5
+      LONGDESC
+      option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
+      option :page, :aliases => :p, type: :numeric, desc: 'Get the next set of records by page.'
+      option :per_page, :aliases => :P, type: :numeric, desc: 'Number of records per page. Max 500 per page.'
+      option :status, type: :array, desc: 'Returns only tasks with specific status.'
+      option :action, type: :array, desc: 'Returns only tasks with specific action.'
+      option :"include-system", type: :boolean, desc: 'If it is true, will returns all actions. If it is false only tasks with specific action ["refetch", "reparse", "terminate"].'
+      def list(scraper_name)
+        if options[:job]
+          client = Client::JobTask.new(options)
+          puts "#{client.all(options[:job])}"
+        else
+          client = Client::ScraperTask.new(options)
+          puts "#{client.all(scraper_name)}"
+        end
+      end
+
+
+      desc "show <scraper_name> <task_id>", "Show task in scraper's current job"
+      long_desc <<-LONGDESC
+        Shows a task in a scraper's current job or given job ID.\x5
+      LONGDESC
+      option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
+      def show(scraper_name, task_id)
+        if options[:job]
+          client = Client::JobTask.new(options)
+          puts "#{client.find(options[:job], task_id)}"
+        else
+          client = Client::ScraperTask.new(options)
+          puts "#{client.find(scraper_name, task_id)}"
+        end
+      end
+
+    end
+  end
+
+end
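Since the class is registered as a Thor subcommand in scraper.rb above and required in cli.rb below, the new commands should be reachable as `datahen scraper task ...`. A hedged sketch of driving them programmatically; the scraper name, job ID, and task ID are placeholders:

```ruby
require 'datahen'

# Thor CLIs can be started with an argv-style array; these mirror
# `datahen scraper task list ...` and `datahen scraper task show ...`.
Datahen::CLI.start(%w[scraper task list my-scraper --job 123])
Datahen::CLI.start(%w[scraper task show my-scraper task-456])
```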
data/lib/datahen/cli.rb
CHANGED
@@ -11,6 +11,7 @@ require 'datahen/cli/scraper_page'
 require 'datahen/cli/job_output'
 require 'datahen/cli/job'
 require 'datahen/cli/scraper_deployment'
+require 'datahen/cli/scraper_task'
 require 'datahen/cli/scraper'
 require 'datahen/cli/parser'
 require 'datahen/cli/seeder'
data/lib/datahen/client/base.rb
CHANGED
@@ -56,12 +56,18 @@ module Datahen
        target.merge(source.select{|k,v|target.has_key?(k)})
      end
 
-      def retry times, delay = nil, err_msg = nil
+      def retry times, delay = nil, err_msg = nil, stream = false
        limit = times.nil? ? nil : times.to_i
        delay = delay.nil? ? 5 : delay.to_i
        count = 0
        begin
-          yield
+          val = yield
+          if stream
+            return if val.nil?
+            if val['error'] != ""
+              raise StandardError.new(val['error'])
+            end
+          end
        rescue Error::CustomRetryError, StandardError => e
          is_custom_retry = e.is_a? Error::CustomRetryError
          real_delay = is_custom_retry ? e.delay : delay
@@ -81,6 +87,7 @@ module Datahen
          puts "#{err_msg.nil? ? '' : "#{err_msg} "}Retry \##{count}#{should_aprox ? '+' : ''}..."
          retry
        end
+        val
      end
 
      def initialize(opts={})
@@ -105,6 +112,10 @@ module Datahen
        query[:parsefail] = opts[:parse_fail] if opts[:parse_fail]
        query[:status] = opts[:status] if opts[:status]
        query[:page_type] = opts[:page_type] if opts[:page_type]
+        query[:url] = opts[:url] if opts[:url]
+        query[:effective_url] = opts[:effective_url] if opts[:effective_url]
+        query[:body] = opts[:body] if opts[:body]
+        query[:parent_gid] = opts[:parent_gid] if opts[:parent_gid]
        query[:gid] = opts[:gid] if opts[:gid]
        query[:"min-timestamp"] = opts[:"min-timestamp"] if opts[:"min-timestamp"]
        query[:"max-timestamp"] = opts[:"max-timestamp"] if opts[:"max-timestamp"]
@@ -112,6 +123,8 @@ module Datahen
        query[:order] = opts[:order] if opts[:order]
        query[:filter] = opts[:filter] if opts[:filter]
        query[:force] = opts[:force] if opts[:force]
+        query[:action] = opts[:action] if opts[:action]
+        query[:"include-system"] = opts[:"include-system"] if opts[:"include-system"]
 
        if opts[:query]
          if opts[:query].is_a?(Hash)
data/lib/datahen/client/job.rb
CHANGED
@@ -25,6 +25,10 @@ module Datahen
        body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
        body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
        body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
+        body[:soft_fetching_try_limit] = opts[:soft_fetching_try_limit] if opts[:soft_fetching_try_limit]
+        body[:soft_refetch_limit] = opts[:soft_refetch_limit] if opts[:soft_refetch_limit]
+        body[:parsing_try_limit] = opts[:parsing_try_limit] if opts[:parsing_try_limit]
+        body[:prevent_kb_autoscaler] = opts[:prevent_kb_autoscaler] if opts.has_key?("prevent_kb_autoscaler") || opts.has_key?(:prevent_kb_autoscaler)
        params = @options.merge({body: body.to_json})
 
        self.class.put("/jobs/#{job_id}", params)
@@ -97,7 +101,7 @@ module Datahen
      def sync_schema(job_id, opts={})
        params = @options.merge(opts)
 
-        self.class.put("/
+        self.class.put("/jobs/#{job_id}/sync/schema", params)
      end
 
    end
data/lib/datahen/client/job_output.rb
CHANGED
@@ -7,7 +7,7 @@ module Datahen
 
      def all(job_id, collection = 'default', opts = {})
        limit = opts.has_key?(:retry_limit) ? opts.fetch(:retry_limit) : 0
-        self.retry(limit, 10, "Error while updating the seeder.") do
+        self.retry(limit, 10, "Error while updating the seeder.", true) do
          self.class.get("/jobs/#{job_id}/output/collections/#{collection}/records", @options)
        end
      end
data/lib/datahen/client/job_page.rb
CHANGED
@@ -18,6 +18,9 @@ module Datahen
        body[:max_size] = opts[:max_size] if opts[:max_size]
        body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
        body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
+        body[:soft_fetching_try_limit] = opts[:soft_fetching_try_limit] if opts[:soft_fetching_try_limit]
+        body[:soft_refetch_limit] = opts[:soft_refetch_limit] if opts[:soft_refetch_limit]
+        body[:parsing_try_limit] = opts[:parsing_try_limit] if opts[:parsing_try_limit]
 
        params = @options.merge({body: body.to_json})
 
@@ -55,6 +58,7 @@ module Datahen
        body[:parsing_status] = opts.fetch(:parsing_status){ nil }
        body[:log_error] = opts[:log_error] if opts[:log_error]
        body[:keep_outputs] = !!opts[:keep_outputs] if opts.has_key?(:keep_outputs)
+        body[:parsing_try_limit] = opts[:parsing_try_limit] if opts.fetch(:parsing_try_limit){ nil }
 
        params = @options.merge({body: body.to_json})
 
@@ -90,6 +94,11 @@ module Datahen
        params = @options.merge(opts)
        self.class.put("/jobs/#{job_id}/pages/limbo", params)
      end
+
+      def still_alive(job_id, gid, opts={})
+        params = @options.merge(opts)
+        self.class.put("/jobs/#{job_id}/pages/#{gid}/still_alive", params)
+      end
    end
  end
end
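The new `still_alive` endpoint gives long-running fetch/parse work a heartbeat. A hedged usage sketch; the job ID and GID are placeholders and authentication setup is omitted:

```ruby
require 'datahen'

# Hypothetical heartbeat: tell the API this page is still being worked on.
client = Datahen::Client::JobPage.new
client.still_alive(12345, 'www.example.com-abc123')
```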
data/lib/datahen/client/job_task.rb
ADDED
@@ -0,0 +1,17 @@
+module Datahen
+  module Client
+    class JobTask < Datahen::Client::Base
+      def all(job_id, opts={})
+        params = @options.merge(opts)
+        self.class.get("/jobs/#{job_id}/tasks", params)
+      end
+
+      def find(job_id, task_id, opts={})
+        params = @options.merge(opts)
+        self.class.get("/jobs/#{job_id}/tasks/#{task_id}", params)
+      end
+
+    end
+
+  end
+end
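A hedged usage sketch for the new client; the job and task IDs are placeholders and authentication setup is omitted:

```ruby
require 'datahen'

client = Datahen::Client::JobTask.new
puts client.all(12345)              # GET /jobs/12345/tasks
puts client.find(12345, 'task-789') # GET /jobs/12345/tasks/task-789
```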
data/lib/datahen/client/scraper.rb
CHANGED
@@ -32,6 +32,10 @@ module Datahen
        body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
        body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
        body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
+        body[:soft_fetching_try_limit] = opts[:soft_fetching_try_limit] if opts[:soft_fetching_try_limit]
+        body[:soft_refetch_limit] = opts[:soft_refetch_limit] if opts[:soft_refetch_limit]
+        body[:parsing_try_limit] = opts[:parsing_try_limit] if opts[:parsing_try_limit]
+        body[:prevent_kb_autoscaler] = opts[:prevent_kb_autoscaler] if opts.has_key?("prevent_kb_autoscaler") || opts.has_key?(:prevent_kb_autoscaler)
        params = @options.merge({body: body.to_json})
        self.class.post("/scrapers", params)
      end
@@ -57,6 +61,10 @@ module Datahen
        body[:max_page_size] = opts[:max_page_size] if opts.has_key?("max_page_size") || opts.has_key?(:max_page_size)
        body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
        body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
+        body[:soft_fetching_try_limit] = opts[:soft_fetching_try_limit] if opts[:soft_fetching_try_limit]
+        body[:soft_refetch_limit] = opts[:soft_refetch_limit] if opts[:soft_refetch_limit]
+        body[:parsing_try_limit] = opts[:parsing_try_limit] if opts[:parsing_try_limit]
+        body[:prevent_kb_autoscaler] = opts[:prevent_kb_autoscaler] if opts.has_key?("prevent_kb_autoscaler") || opts.has_key?(:prevent_kb_autoscaler)
        params = @options.merge({body: body.to_json})
 
        self.class.put("/scrapers/#{scraper_name}", params)
data/lib/datahen/client/scraper_job.rb
CHANGED
@@ -15,6 +15,10 @@ module Datahen
        body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
        body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
        body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
+        body[:soft_fetching_try_limit] = opts[:soft_fetching_try_limit] if opts[:soft_fetching_try_limit]
+        body[:soft_refetch_limit] = opts[:soft_refetch_limit] if opts[:soft_refetch_limit]
+        body[:parsing_try_limit] = opts[:parsing_try_limit] if opts[:parsing_try_limit]
+        body[:prevent_kb_autoscaler] = opts[:prevent_kb_autoscaler] if opts.has_key?("prevent_kb_autoscaler") || opts.has_key?(:prevent_kb_autoscaler)
        if opts[:vars]
          if opts[:vars].is_a?(Array)
            body[:vars] = opts[:vars]
@@ -45,6 +49,10 @@ module Datahen
        body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
        body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
        body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
+        body[:soft_fetching_try_limit] = opts[:soft_fetching_try_limit] if opts[:soft_fetching_try_limit]
+        body[:soft_refetch_limit] = opts[:soft_refetch_limit] if opts[:soft_refetch_limit]
+        body[:parsing_try_limit] = opts[:parsing_try_limit] if opts[:parsing_try_limit]
+        body[:prevent_kb_autoscaler] = opts[:prevent_kb_autoscaler] if opts.has_key?("prevent_kb_autoscaler") || opts.has_key?(:prevent_kb_autoscaler)
        params = @options.merge({body: body.to_json})
 
        self.class.put("/scrapers/#{scraper_name}/current_job", params)
data/lib/datahen/client/scraper_task.rb
ADDED
@@ -0,0 +1,17 @@
+module Datahen
+  module Client
+    class ScraperTask < Datahen::Client::Base
+      def all(scraper_name, opts={})
+        params = @options.merge(opts)
+        self.class.get("/scrapers/#{scraper_name}/current_job/tasks", params)
+      end
+
+      def find(scraper_name, task_id, opts={})
+        params = @options.merge(opts)
+        self.class.get("/scrapers/#{scraper_name}/current_job/tasks/#{task_id}", params)
+      end
+
+    end
+
+  end
+end
data/lib/datahen/client.rb
CHANGED
@@ -24,6 +24,8 @@ require "datahen/client/scraper_var"
 require "datahen/client/job_var"
 require "datahen/client/scraper_job_var"
 require "datahen/client/job_finisher"
+require "datahen/client/job_task"
+require "datahen/client/scraper_task"
 
 module Datahen
   module Client
data/lib/datahen/scraper/batch_parser.rb
CHANGED
@@ -227,7 +227,11 @@ module Datahen
 
      # add pages
      count = 0
-
+      json = JSON.parse(response.body)
+      if json['error'] != ""
+        return 0
+      end
+      (json['data'] || []).each do |page|
        count += 1
        next if self.loaded_pages.has_key? page['gid']
        self.pages << (self.loaded_pages[page['gid']] = page)
@@ -307,7 +311,7 @@ module Datahen
          is_waiting = true
          puts "[Worker #{Parallel.worker_number}]: Is waiting for a page..."
          if self.second_dequeue_count > 1 && !self.not_found
-            puts "\nWARNING: Your job
+            puts "\nWARNING: Your job might not be optimized. Consider increasing your job's \"parser_dequeue_scale\" if the `to_parse` queue is not empty or near empty \n"
          end
        end
        self.class.wait 1
data/lib/datahen/scraper/executor.rb
CHANGED
@@ -172,11 +172,16 @@ module Datahen
        response = client.all(query_job_id, collection, {
          retry_limit: retry_limit
        })
-
        if response.code != 200
          raise "response_code: #{response.code}|#{response.parsed_response}"
        end
-
+
+        # check stream error
+        json_data = response.body != 'null' ? response.parsed_response : {}
+        if json_data['error'] != ""
+          raise "response_code: #{response.code}|Stream error: #{json_data['error']}"
+        end
+
+        json_data['data'].nil? ? [] : json_data['data']
      end
 
      # Find one output by collection and query with pagination.
data/lib/datahen/scraper/ruby_parser_executor.rb
CHANGED
@@ -33,7 +33,6 @@ module Datahen
          :failed_content,
          :outputs,
          :pages,
-          :page,
          :save_pages,
          :save_outputs,
          :find_output,
@@ -41,7 +40,8 @@ module Datahen
          :refetch,
          :reparse,
          :limbo,
-          :finish
+          :finish,
+          :still_alive
        ].freeze
      end
 
@@ -240,6 +240,12 @@ module Datahen
        @failed_content ||= get_failed_content(job_id, gid)
      end
 
+      def still_alive page_gid = nil
+        page_gid = gid if page_gid.nil?
+        client = Client::JobPage.new()
+        client.still_alive(job_id, page_gid)
+      end
+
      def handle_error(e)
        error = ["Parsing #{e.class}: #{e.to_s} (Job:#{job_id} GID:#{gid})",clean_backtrace(e.backtrace)].join("\n")
 
@@ -247,7 +253,8 @@ module Datahen
          job_id: job_id,
          gid: gid,
          parsing_status: :failed,
-          log_error: error
+          log_error: error,
+          parsing_try_limit: (page || {})['parsing_try_limit'])
      end
 
    end
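With `:still_alive` added to the exposed-methods list, parser scripts can call it directly. A hedged sketch of a parser snippet, assuming `content` and `outputs` are exposed to the script as in prior releases; the extraction regex and collection name are made up:

```ruby
# Inside a DataHen parser script: heartbeat a slow parse so the page is not
# treated as stale. With no argument, still_alive defaults to the current GID.
items = content.to_s.scan(/<li class="product">.*?<\/li>/m) # hypothetical extraction
items.each_with_index do |html, i|
  still_alive if i % 100 == 0 # ping every 100 items
  outputs << { '_collection' => 'products', 'html' => html }
end
```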
data/lib/datahen/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: datahen
 version: !ruby/object:Gem::Version
-  version: 1.
+  version: 1.5.1
 platform: ruby
 authors:
 - Parama Danoesubroto
-autorequire:
+autorequire:
 bindir: exe
 cert_chain: []
-date:
+date: 2024-01-08 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: thor
@@ -227,6 +227,7 @@ files:
 - lib/datahen/cli/scraper_job.rb
 - lib/datahen/cli/scraper_job_var.rb
 - lib/datahen/cli/scraper_page.rb
+- lib/datahen/cli/scraper_task.rb
 - lib/datahen/cli/scraper_var.rb
 - lib/datahen/cli/seeder.rb
 - lib/datahen/client.rb
@@ -245,6 +246,7 @@ files:
 - lib/datahen/client/job_output.rb
 - lib/datahen/client/job_page.rb
 - lib/datahen/client/job_stat.rb
+- lib/datahen/client/job_task.rb
 - lib/datahen/client/job_var.rb
 - lib/datahen/client/scraper.rb
 - lib/datahen/client/scraper_deployment.rb
@@ -255,6 +257,7 @@ files:
 - lib/datahen/client/scraper_job_output.rb
 - lib/datahen/client/scraper_job_page.rb
 - lib/datahen/client/scraper_job_var.rb
+- lib/datahen/client/scraper_task.rb
 - lib/datahen/client/scraper_var.rb
 - lib/datahen/error.rb
 - lib/datahen/error/custom_retry_error.rb
@@ -278,7 +281,7 @@ metadata:
   allowed_push_host: https://rubygems.org
   homepage_uri: https://datahen.com
   source_code_uri: https://github.com/DataHenOfficial/datahen-ruby
-post_install_message:
+post_install_message:
 rdoc_options: []
 require_paths:
 - lib
@@ -293,8 +296,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubygems_version: 3.
-signing_key:
+rubygems_version: 3.0.3
+signing_key:
 specification_version: 4
 summary: DataHen toolbelt for developers
 test_files: []