datahen 1.4.0 → 1.5.2
- checksums.yaml +4 -4
- data/lib/datahen/cli/job_output.rb +36 -6
- data/lib/datahen/cli/scraper.rb +27 -2
- data/lib/datahen/cli/scraper_job.rb +4 -0
- data/lib/datahen/cli/scraper_page.rb +19 -2
- data/lib/datahen/cli/scraper_task.rb +48 -0
- data/lib/datahen/cli.rb +1 -0
- data/lib/datahen/client/base.rb +15 -2
- data/lib/datahen/client/job.rb +5 -1
- data/lib/datahen/client/job_output.rb +1 -1
- data/lib/datahen/client/job_page.rb +9 -0
- data/lib/datahen/client/job_task.rb +17 -0
- data/lib/datahen/client/scraper.rb +8 -0
- data/lib/datahen/client/scraper_job.rb +8 -0
- data/lib/datahen/client/scraper_task.rb +17 -0
- data/lib/datahen/client.rb +2 -0
- data/lib/datahen/scraper/batch_parser.rb +9 -4
- data/lib/datahen/scraper/executor.rb +7 -2
- data/lib/datahen/scraper/ruby_parser_executor.rb +10 -3
- data/lib/datahen/version.rb +1 -1
- metadata +9 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: dc47c55d814a573f9de29c4725aba0cad212dceae5dfa2d330dc980ad4f64253
+  data.tar.gz: 508fe6249ef13f07c835297758f31eb8459080b13c4256c9393d0c8c6ea2b171
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 23c817ae6f20698c95fdae1503bfe9fc6072f617389e979a1389eab746de07632fab432b2a3a5a824c56c9ad3b6a254a1508ecfd7bb691013843e7b93831daf5
+  data.tar.gz: c3a1df12099bc6bf159ac7689d7c7be36dc15416b6ecc3c347e7ef1ba1e8e844a58d9600f00803c7fd73837915b88674aa8e470e504acc6ac89d6fd0897df632
data/lib/datahen/cli/job_output.rb
CHANGED
@@ -20,10 +20,20 @@ module Datahen
       collection = options.fetch(:collection) { 'default' }
       if options[:job]
         client = Client::JobOutput.new(options)
-
+        json = JSON.parse(client.all(options[:job], collection).body)
+        if json['error'] == ""
+          puts "#{JSON.pretty_generate(json['data'])}"
+        else
+          puts "#{JSON.pretty_generate(json['error'])}"
+        end
       else
         client = Client::ScraperJobOutput.new(options)
-
+        json = JSON.parse(client.all(scraper_name, collection).body)
+        if json['error'] == ""
+          puts "#{JSON.pretty_generate(json['data'])}"
+        else
+          puts "#{JSON.pretty_generate(json['error'])}"
+        end
       end
     end

@@ -38,10 +48,20 @@ module Datahen
       collection = options.fetch(:collection) { 'default' }
       if options[:job]
         client = Client::JobOutput.new(options)
-
+        json = JSON.parse(client.find(options[:job], collection, id).body)
+        if json['error'] == ""
+          puts "#{JSON.pretty_generate(json['data'])}"
+        else
+          puts "#{JSON.pretty_generate(json['error'])}"
+        end
       else
         client = Client::ScraperJobOutput.new(options)
-
+        json = JSON.parse(client.find(scraper_name, collection, id).body)
+        if json['error'] == ""
+          puts "#{JSON.pretty_generate(json['data'])}"
+        else
+          puts "#{JSON.pretty_generate(json['error'])}"
+        end
       end
     end

@@ -56,10 +76,20 @@ module Datahen

       if options[:job]
         client = Client::JobOutput.new(options)
-
+        json = JSON.parse(client.collections(options[:job]).body)
+        if json['error'] == ""
+          puts "#{JSON.pretty_generate(json['data'])}"
+        else
+          puts "#{JSON.pretty_generate(json['error'])}"
+        end
       else
         client = Client::ScraperJobOutput.new(options)
-
+        json = JSON.parse(client.collections(scraper_name).body)
+        if json['error'] == ""
+          puts "#{JSON.pretty_generate(json['data'])}"
+        else
+          puts "#{JSON.pretty_generate(json['error'])}"
+        end
       end
     end
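Note on the pattern above: as of 1.5.x these output endpoints respond with a JSON envelope carrying separate data and error fields, and the CLI now prints whichever is populated instead of echoing the raw response. A minimal runnable sketch of that handling (envelope shape inferred from the diff; the sample body is made up):

  require 'json'

  body = '{"data":[{"_id":"abc123"}],"error":""}'  # hypothetical streamed response
  json = JSON.parse(body)
  if json['error'] == ""
    puts JSON.pretty_generate(json['data'])
  else
    puts JSON.pretty_generate(json['error'])
  end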
data/lib/datahen/cli/scraper.rb
CHANGED
@@ -37,6 +37,10 @@ module Datahen
     option :max_page_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
     option :enable_global_cache, type: :boolean, desc: 'Set true to enable page cache. Default: false'
     option :retry_interval, type: :numeric, desc: 'Set a value to set retry time interval on seconds when refetching a page. Set a value grather than 0 to set it as new time to refetch, 0 means default time. Default: 0'
+    option :soft_fetching_try_limit, type: :numeric, desc: 'Set the soft fetching try limit value.'
+    option :soft_refetch_limit, type: :numeric, desc: 'Set the soft refetch limit value.'
+    option :parsing_try_limit, type: :numeric, desc: 'Set the parsing try limit value.'
+    option :prevent_kb_autoscaler, type: :boolean, desc: 'Set true to prevent the autoscaler from restarting the job. Default: false'
     def create(scraper_name, git_repository)
       # puts "options #{options}"
       client = Client::Scraper.new(options)

@@ -66,6 +70,10 @@ module Datahen
     option :max_page_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
     option :enable_global_cache, type: :boolean, desc: 'Set true to enable page cache. Default: false'
     option :retry_interval, type: :numeric, desc: 'Set a value to set retry time interval on seconds when refetching a page. Set a value grather than 0 to set it as new time to refetch, 0 means default time. Default: 0'
+    option :soft_fetching_try_limit, type: :numeric, desc: 'Set the soft fetching try limit value.'
+    option :soft_refetch_limit, type: :numeric, desc: 'Set the soft refetch limit value.'
+    option :parsing_try_limit, type: :numeric, desc: 'Set the parsing try limit value.'
+    option :prevent_kb_autoscaler, type: :boolean, desc: 'Set true to prevent the autoscaler from restarting the job. Default: false'
     def update(scraper_name)
       client = Client::Scraper.new(options)
       puts "#{client.update(scraper_name, options)}"

@@ -106,6 +114,10 @@ module Datahen
     option :vars, type: :string, banner: :JSON, desc: 'Set input vars. Must be in json format. i.e: [{"name":"foo", "value":"bar", "secret":false}] '
     option :max_page_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
     option :retry_interval, type: :numeric, desc: 'Set a value to set retry time interval on seconds when refetching a page. Set a value grather than 0 to set it as new time to refetch, 0 means default time. Default: 0'
+    option :soft_fetching_try_limit, type: :numeric, desc: 'Set the soft fetching try limit value.'
+    option :soft_refetch_limit, type: :numeric, desc: 'Set the soft refetch limit value.'
+    option :parsing_try_limit, type: :numeric, desc: 'Set the parsing try limit value.'
+    option :prevent_kb_autoscaler, type: :boolean, desc: 'Set true to prevent the autoscaler from restarting the job. Default: false'
     def start(scraper_name)
       client = Client::ScraperJob.new(options)
       puts "Starting a scrape job..."

@@ -188,9 +200,19 @@ module Datahen
     def history(scraper_name)
       client = Client::JobStat.new(options)
       if options[:job]
-
+        json = JSON.parse(client.job_stats_history(options[:job], options).body)
+        if json['error'] == ""
+          puts "#{JSON.pretty_generate(json['data'])}"
+        else
+          puts "#{JSON.pretty_generate(json['error'])}"
+        end
       else
-
+        json = JSON.parse(client.scraper_job_stats_history(scraper_name, options).body)
+        if json['error'] == ""
+          puts "#{JSON.pretty_generate(json['data'])}"
+        else
+          puts "#{JSON.pretty_generate(json['error'])}"
+        end
       end
     end

@@ -227,6 +249,9 @@ module Datahen
     desc "var SUBCOMMAND ...ARGS", "for managing scraper's variables"
     subcommand "var", ScraperVar

+    desc "task SUBCOMMAND ...ARGS", "manage task on a job"
+    subcommand "task", ScraperTask
+

   end
 end
data/lib/datahen/cli/scraper_job.rb
CHANGED
@@ -108,6 +108,10 @@ module Datahen
     option :max_page_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
     option :enable_global_cache, type: :boolean, desc: 'Set true to enable page cache. Default: false'
     option :retry_interval, type: :numeric, desc: 'Set a value to set retry time interval on seconds when refetching a page. Set a value grather than 0 to set it as new time to refetch, 0 means default time. Default: 0'
+    option :soft_fetching_try_limit, type: :numeric, desc: 'Set the soft fetching try limit value.'
+    option :soft_refetch_limit, type: :numeric, desc: 'Set the soft refetch limit value.'
+    option :parsing_try_limit, type: :numeric, desc: 'Set the parsing try limit value.'
+    option :prevent_kb_autoscaler, type: :boolean, desc: 'Set true to prevent the autoscaler from restarting the job. Default: false'
     def update(scraper_name)
       if options[:job]
         client = Client::Job.new(options)
data/lib/datahen/cli/scraper_page.rb
CHANGED
@@ -13,6 +13,10 @@ module Datahen
     LONGDESC
     option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
     option :page_type, :aliases => :t, type: :string, desc: 'Filter by page_type'
+    option :url, :aliases => :u, type: :string, desc: 'Filter by url'
+    option :effective_url, :aliases => :U, type: :string, desc: 'Filter by effective_url'
+    option :body, :aliases => :b, type: :string, desc: 'Filter by body'
+    option :parent_gid, :aliases => :G, type: :string, desc: 'Filter by parent_gid'
     option :page, :aliases => :p, type: :numeric, desc: 'Get the next set of records by page.'
     option :per_page, :aliases => :P, type: :numeric, desc: 'Number of records per page. Max 500 per page.'
     option :fetch_fail, type: :boolean, desc: 'Returns only pages that fails fetching.'

@@ -21,10 +25,20 @@ module Datahen
     def list(scraper_name)
       if options[:job]
         client = Client::JobPage.new(options)
-
+        json = JSON.parse(client.all(options[:job]).body)
+        if json['error'] == ""
+          puts "#{JSON.pretty_generate(json['data'])}"
+        else
+          puts "#{JSON.pretty_generate(json['error'])}"
+        end
       else
         client = Client::ScraperJobPage.new(options)
-
+        json = JSON.parse(client.all(scraper_name).body)
+        if json['error'] == ""
+          puts "#{JSON.pretty_generate(json['data'])}"
+        else
+          puts "#{JSON.pretty_generate(json['error'])}"
+        end
       end
     end

@@ -84,6 +98,9 @@ module Datahen
     option :vars, :aliases => :v, type: :string, desc: 'Set user-defined page variables. Must be in json format. i.e: {"Foo":"bar"}'
     option :max_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
     option :retry_interval, type: :numeric, desc: 'Set a value to set retry time interval on seconds when refetching a page. Set a value grather than 0 to set it as new time to refetch, 0 means default time. Default: 0'
+    option :soft_fetching_try_limit, type: :numeric, desc: 'Set the soft fetching try limit value.'
+    option :soft_refetch_limit, type: :numeric, desc: 'Set the soft refetch limit value.'
+    option :parsing_try_limit, type: :numeric, desc: 'Set the parsing try limit value.'
     def update(scraper_name, gid)
       begin
         options[:vars] = JSON.parse(options[:vars]) if options[:vars]
data/lib/datahen/cli/scraper_task.rb
ADDED
@@ -0,0 +1,48 @@
+module Datahen
+  class CLI < Thor
+    class ScraperTask < Thor
+      package_name "scraper task"
+      def self.banner(command, namespace = nil, subcommand = false)
+        "#{basename} #{@package_name} #{command.usage}"
+      end
+
+      desc "list <scraper_name>", "List Tasks on a scraper's current job"
+      long_desc <<-LONGDESC
+        List all tasks in a scraper's current job or given job ID.\x5
+      LONGDESC
+      option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
+      option :page, :aliases => :p, type: :numeric, desc: 'Get the next set of records by page.'
+      option :per_page, :aliases => :P, type: :numeric, desc: 'Number of records per page. Max 500 per page.'
+      option :status, type: :array, desc: 'Returns only tasks with specific status.'
+      option :action, type: :array, desc: 'Returns only tasks with specific action.'
+      option :"include-system", type: :boolean, desc: 'If it is true, will returns all actions. If it is false only tasks with specific action ["refetch", "reparse", "terminate"].'
+      def list(scraper_name)
+        if options[:job]
+          client = Client::JobTask.new(options)
+          puts "#{client.all(options[:job])}"
+        else
+          client = Client::ScraperTask.new(options)
+          puts "#{client.all(scraper_name)}"
+        end
+      end
+
+
+      desc "show <scraper_name> <task_id>", "Show task in scraper's current job"
+      long_desc <<-LONGDESC
+        Shows a task in a scraper's current job or given job ID.\x5
+      LONGDESC
+      option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
+      def show(scraper_name, task_id)
+        if options[:job]
+          client = Client::JobTask.new(options)
+          puts "#{client.find(options[:job], task_id)}"
+        else
+          client = Client::ScraperTask.new(options)
+          puts "#{client.find(scraper_name, task_id)}"
+        end
+      end
+
+    end
+  end
+
+end
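Since scraper.rb registers this class as the task subcommand (see above), usage should look roughly like the following; scraper name, task ID, and job ID are placeholders:

  datahen scraper task list my-scraper --status executing --action refetch
  datahen scraper task show my-scraper 9f8e7d --job 12345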
data/lib/datahen/cli.rb
CHANGED
@@ -11,6 +11,7 @@ require 'datahen/cli/scraper_page'
 require 'datahen/cli/job_output'
 require 'datahen/cli/job'
 require 'datahen/cli/scraper_deployment'
+require 'datahen/cli/scraper_task'
 require 'datahen/cli/scraper'
 require 'datahen/cli/parser'
 require 'datahen/cli/seeder'
data/lib/datahen/client/base.rb
CHANGED
@@ -56,12 +56,18 @@ module Datahen
       target.merge(source.select{|k,v|target.has_key?(k)})
     end

-    def retry times, delay = nil, err_msg = nil
+    def retry times, delay = nil, err_msg = nil, stream = false
       limit = times.nil? ? nil : times.to_i
       delay = delay.nil? ? 5 : delay.to_i
       count = 0
       begin
-        yield
+        val = yield
+        if stream
+          return if val.nil?
+          if val['error'] != ""
+            raise StandardError.new(val['error'])
+          end
+        end
       rescue Error::CustomRetryError, StandardError => e
         is_custom_retry = e.is_a? Error::CustomRetryError
         real_delay = is_custom_retry ? e.delay : delay

@@ -81,6 +87,7 @@ module Datahen
         puts "#{err_msg.nil? ? '' : "#{err_msg} "}Retry \##{count}#{should_aprox ? '+' : ''}..."
         retry
       end
+      val
     end

     def initialize(opts={})

@@ -105,6 +112,10 @@ module Datahen
       query[:parsefail] = opts[:parse_fail] if opts[:parse_fail]
       query[:status] = opts[:status] if opts[:status]
       query[:page_type] = opts[:page_type] if opts[:page_type]
+      query[:url] = opts[:url] if opts[:url]
+      query[:effective_url] = opts[:effective_url] if opts[:effective_url]
+      query[:body] = opts[:body] if opts[:body]
+      query[:parent_gid] = opts[:parent_gid] if opts[:parent_gid]
       query[:gid] = opts[:gid] if opts[:gid]
       query[:"min-timestamp"] = opts[:"min-timestamp"] if opts[:"min-timestamp"]
       query[:"max-timestamp"] = opts[:"max-timestamp"] if opts[:"max-timestamp"]

@@ -112,6 +123,8 @@ module Datahen
       query[:order] = opts[:order] if opts[:order]
       query[:filter] = opts[:filter] if opts[:filter]
       query[:force] = opts[:force] if opts[:force]
+      query[:action] = opts[:action] if opts[:action]
+      query[:"include-system"] = opts[:"include-system"] if opts[:"include-system"]

       if opts[:query]
         if opts[:query].is_a?(Hash)
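The stream flag changes retry's contract: the block's return value is now inspected, a nil value short-circuits, and a non-empty 'error' field is promoted to an exception so the whole block is retried. A self-contained sketch of that control flow (simplified: no CustomRetryError, no unlimited-retry branch):

  def retry_with_stream_check(times, delay = 5, stream = false)
    count = 0
    begin
      val = yield
      if stream
        return if val.nil?                                  # nothing streamed yet
        raise StandardError.new(val['error']) if val['error'] != ""
      end
      val                                                   # mirrors the new trailing `val`
    rescue StandardError => e
      count += 1
      raise if !times.nil? && count > times.to_i            # give up past the limit
      sleep delay
      retry
    end
  end

  attempts = 0
  result = retry_with_stream_check(3, 0, true) do
    attempts += 1
    { 'error' => attempts < 2 ? 'transient' : '', 'data' => [1, 2] }
  end
  p result # => {"error"=>"", "data"=>[1, 2]}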
data/lib/datahen/client/job.rb
CHANGED
@@ -25,6 +25,10 @@ module Datahen
       body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
       body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
       body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
+      body[:soft_fetching_try_limit] = opts[:soft_fetching_try_limit] if opts[:soft_fetching_try_limit]
+      body[:soft_refetch_limit] = opts[:soft_refetch_limit] if opts[:soft_refetch_limit]
+      body[:parsing_try_limit] = opts[:parsing_try_limit] if opts[:parsing_try_limit]
+      body[:prevent_kb_autoscaler] = opts[:prevent_kb_autoscaler] if opts.has_key?("prevent_kb_autoscaler") || opts.has_key?(:prevent_kb_autoscaler)
       params = @options.merge({body: body.to_json})

       self.class.put("/jobs/#{job_id}", params)

@@ -97,7 +101,7 @@ module Datahen
     def sync_schema(job_id, opts={})
       params = @options.merge(opts)

-      self.class.put("/
+      self.class.put("/jobs/#{job_id}/sync/schema", params)
     end

   end
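A hedged usage sketch of the expanded update body; the method name is assumed from the PUT /jobs/:id route, and the job ID and values are placeholders:

  client = Datahen::Client::Job.new
  client.update(12345,
    soft_fetching_try_limit: 3,
    soft_refetch_limit: 2,
    parsing_try_limit: 5,
    prevent_kb_autoscaler: true)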
data/lib/datahen/client/job_output.rb
CHANGED
@@ -7,7 +7,7 @@ module Datahen

     def all(job_id, collection = 'default', opts = {})
       limit = opts.has_key?(:retry_limit) ? opts.fetch(:retry_limit) : 0
-      self.retry(limit, 10, "Error while updating the seeder.") do
+      self.retry(limit, 10, "Error while updating the seeder.", true) do
         self.class.get("/jobs/#{job_id}/output/collections/#{collection}/records", @options)
       end
     end
data/lib/datahen/client/job_page.rb
CHANGED
@@ -18,6 +18,9 @@ module Datahen
       body[:max_size] = opts[:max_size] if opts[:max_size]
       body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
       body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
+      body[:soft_fetching_try_limit] = opts[:soft_fetching_try_limit] if opts[:soft_fetching_try_limit]
+      body[:soft_refetch_limit] = opts[:soft_refetch_limit] if opts[:soft_refetch_limit]
+      body[:parsing_try_limit] = opts[:parsing_try_limit] if opts[:parsing_try_limit]

       params = @options.merge({body: body.to_json})

@@ -55,6 +58,7 @@ module Datahen
       body[:parsing_status] = opts.fetch(:parsing_status){ nil }
       body[:log_error] = opts[:log_error] if opts[:log_error]
       body[:keep_outputs] = !!opts[:keep_outputs] if opts.has_key?(:keep_outputs)
+      body[:parsing_try_limit] = opts[:parsing_try_limit] if opts.fetch(:parsing_try_limit){ nil }

       params = @options.merge({body: body.to_json})

@@ -90,6 +94,11 @@ module Datahen
       params = @options.merge(opts)
       self.class.put("/jobs/#{job_id}/pages/limbo", params)
     end
+
+    def still_alive(job_id, gid, opts={})
+      params = @options.merge(opts)
+      self.class.put("/jobs/#{job_id}/pages/#{gid}/still_alive", params)
+    end
   end
 end
 end
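A short usage sketch of the new keep-alive endpoint (job ID and GID are placeholders):

  client = Datahen::Client::JobPage.new
  client.still_alive(12345, 'www.example.com-abc123')
  # => PUT /jobs/12345/pages/www.example.com-abc123/still_alive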
data/lib/datahen/client/job_task.rb
ADDED
@@ -0,0 +1,17 @@
+module Datahen
+  module Client
+    class JobTask < Datahen::Client::Base
+      def all(job_id, opts={})
+        params = @options.merge(opts)
+        self.class.get("/jobs/#{job_id}/tasks", params)
+      end
+
+      def find(job_id, task_id, opts={})
+        params = @options.merge(opts)
+        self.class.get("/jobs/#{job_id}/tasks/#{task_id}", params)
+      end
+
+    end
+
+  end
+end
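Mirroring how the CLI uses this class above, filters go to the constructor (which funnels them into the query builder shown in base.rb) and the job ID to the call itself; all values here are placeholders:

  client = Datahen::Client::JobTask.new(status: ['executing'], action: ['refetch'])
  puts client.all(12345).body             # GET /jobs/12345/tasks
  puts client.find(12345, '9f8e7d').body  # GET /jobs/12345/tasks/9f8e7d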
data/lib/datahen/client/scraper.rb
CHANGED
@@ -32,6 +32,10 @@ module Datahen
       body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
       body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
       body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
+      body[:soft_fetching_try_limit] = opts[:soft_fetching_try_limit] if opts[:soft_fetching_try_limit]
+      body[:soft_refetch_limit] = opts[:soft_refetch_limit] if opts[:soft_refetch_limit]
+      body[:parsing_try_limit] = opts[:parsing_try_limit] if opts[:parsing_try_limit]
+      body[:prevent_kb_autoscaler] = opts[:prevent_kb_autoscaler] if opts.has_key?("prevent_kb_autoscaler") || opts.has_key?(:prevent_kb_autoscaler)
       params = @options.merge({body: body.to_json})
       self.class.post("/scrapers", params)
     end

@@ -57,6 +61,10 @@ module Datahen
       body[:max_page_size] = opts[:max_page_size] if opts.has_key?("max_page_size") || opts.has_key?(:max_page_size)
       body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
       body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
+      body[:soft_fetching_try_limit] = opts[:soft_fetching_try_limit] if opts[:soft_fetching_try_limit]
+      body[:soft_refetch_limit] = opts[:soft_refetch_limit] if opts[:soft_refetch_limit]
+      body[:parsing_try_limit] = opts[:parsing_try_limit] if opts[:parsing_try_limit]
+      body[:prevent_kb_autoscaler] = opts[:prevent_kb_autoscaler] if opts.has_key?("prevent_kb_autoscaler") || opts.has_key?(:prevent_kb_autoscaler)
       params = @options.merge({body: body.to_json})

       self.class.put("/scrapers/#{scraper_name}", params)
data/lib/datahen/client/scraper_job.rb
CHANGED
@@ -15,6 +15,10 @@ module Datahen
       body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
       body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
       body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
+      body[:soft_fetching_try_limit] = opts[:soft_fetching_try_limit] if opts[:soft_fetching_try_limit]
+      body[:soft_refetch_limit] = opts[:soft_refetch_limit] if opts[:soft_refetch_limit]
+      body[:parsing_try_limit] = opts[:parsing_try_limit] if opts[:parsing_try_limit]
+      body[:prevent_kb_autoscaler] = opts[:prevent_kb_autoscaler] if opts.has_key?("prevent_kb_autoscaler") || opts.has_key?(:prevent_kb_autoscaler)
       if opts[:vars]
         if opts[:vars].is_a?(Array)
           body[:vars] = opts[:vars]

@@ -45,6 +49,10 @@ module Datahen
       body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
       body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
       body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
+      body[:soft_fetching_try_limit] = opts[:soft_fetching_try_limit] if opts[:soft_fetching_try_limit]
+      body[:soft_refetch_limit] = opts[:soft_refetch_limit] if opts[:soft_refetch_limit]
+      body[:parsing_try_limit] = opts[:parsing_try_limit] if opts[:parsing_try_limit]
+      body[:prevent_kb_autoscaler] = opts[:prevent_kb_autoscaler] if opts.has_key?("prevent_kb_autoscaler") || opts.has_key?(:prevent_kb_autoscaler)
       params = @options.merge({body: body.to_json})

       self.class.put("/scrapers/#{scraper_name}/current_job", params)
data/lib/datahen/client/scraper_task.rb
ADDED
@@ -0,0 +1,17 @@
+module Datahen
+  module Client
+    class ScraperTask < Datahen::Client::Base
+      def all(scraper_name, opts={})
+        params = @options.merge(opts)
+        self.class.get("/scrapers/#{scraper_name}/current_job/tasks", params)
+      end
+
+      def find(scraper_name, task_id, opts={})
+        params = @options.merge(opts)
+        self.class.get("/scrapers/#{scraper_name}/current_job/tasks/#{task_id}", params)
+      end
+
+    end
+
+  end
+end
data/lib/datahen/client.rb
CHANGED
@@ -24,6 +24,8 @@ require "datahen/client/scraper_var"
 require "datahen/client/job_var"
 require "datahen/client/scraper_job_var"
 require "datahen/client/job_finisher"
+require "datahen/client/job_task"
+require "datahen/client/scraper_task"

 module Datahen
   module Client
data/lib/datahen/scraper/batch_parser.rb
CHANGED
@@ -219,15 +219,20 @@ module Datahen
       self.dequeuer_is_alive!

       # ensure a valid response or try again
-
-
+      has_empty_response = (response.body.nil? || response.body.empty?)
+      if has_empty_response || response.response.code.to_i != 200
+        self.repeat_puts(has_empty_response ? 'null' : response.body)
         self.recollect_garbage
         return 0
       end

       # add pages
       count = 0
-
+      json = JSON.parse(response.body)
+      if json['error'] != ""
+        return 0
+      end
+      (json['data'] || []).each do |page|
         count += 1
         next if self.loaded_pages.has_key? page['gid']
         self.pages << (self.loaded_pages[page['gid']] = page)

@@ -307,7 +312,7 @@ module Datahen
           is_waiting = true
           puts "[Worker #{Parallel.worker_number}]: Is waiting for a page..."
           if self.second_dequeue_count > 1 && !self.not_found
-            puts "\nWARNING: Your job
+            puts "\nWARNING: Your job might not be optimized. Consider increasing your job's \"parser_dequeue_scale\" if the `to_parse` queue is not empty or near empty \n"
           end
         end
         self.class.wait 1
data/lib/datahen/scraper/executor.rb
CHANGED
@@ -172,11 +172,16 @@ module Datahen
         response = client.all(query_job_id, collection, {
           retry_limit: retry_limit
         })
-
         if response.code != 200
           raise "response_code: #{response.code}|#{response.parsed_response}"
         end
-
+
+        # check stream error
+        json_data = response.body != 'null' ? response.parsed_response : {}
+        if json_data['error'] != ""
+          raise "response_code: #{response.code}|Stream error: #{json_data['error']}"
+        end
+        json_data['data'].nil? ? [] : json_data['data']
       end

       # Find one output by collection and query with pagination.
data/lib/datahen/scraper/ruby_parser_executor.rb
CHANGED
@@ -33,7 +33,6 @@ module Datahen
       :failed_content,
       :outputs,
       :pages,
-      :page,
       :save_pages,
       :save_outputs,
       :find_output,

@@ -41,7 +40,8 @@ module Datahen
       :refetch,
       :reparse,
       :limbo,
-      :finish
+      :finish,
+      :still_alive
     ].freeze
   end

@@ -240,6 +240,12 @@ module Datahen
       @failed_content ||= get_failed_content(job_id, gid)
     end

+    def still_alive page_gid = nil
+      page_gid = gid if page_gid.nil?
+      client = Client::JobPage.new()
+      client.still_alive(job_id, page_gid)
+    end
+
     def handle_error(e)
       error = ["Parsing #{e.class}: #{e.to_s} (Job:#{job_id} GID:#{gid})",clean_backtrace(e.backtrace)].join("\n")

@@ -247,7 +253,8 @@ module Datahen
         job_id: job_id,
         gid: gid,
         parsing_status: :failed,
-        log_error: error
+        log_error: error,
+        parsing_try_limit: (page || {})['parsing_try_limit'])
       end

     end
data/lib/datahen/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: datahen
 version: !ruby/object:Gem::Version
-  version: 1.
+  version: 1.5.2
 platform: ruby
 authors:
 - Parama Danoesubroto
-autorequire:
+autorequire:
 bindir: exe
 cert_chain: []
-date:
+date: 2024-01-09 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: thor

@@ -227,6 +227,7 @@ files:
 - lib/datahen/cli/scraper_job.rb
 - lib/datahen/cli/scraper_job_var.rb
 - lib/datahen/cli/scraper_page.rb
+- lib/datahen/cli/scraper_task.rb
 - lib/datahen/cli/scraper_var.rb
 - lib/datahen/cli/seeder.rb
 - lib/datahen/client.rb

@@ -245,6 +246,7 @@ files:
 - lib/datahen/client/job_output.rb
 - lib/datahen/client/job_page.rb
 - lib/datahen/client/job_stat.rb
+- lib/datahen/client/job_task.rb
 - lib/datahen/client/job_var.rb
 - lib/datahen/client/scraper.rb
 - lib/datahen/client/scraper_deployment.rb

@@ -255,6 +257,7 @@ files:
 - lib/datahen/client/scraper_job_output.rb
 - lib/datahen/client/scraper_job_page.rb
 - lib/datahen/client/scraper_job_var.rb
+- lib/datahen/client/scraper_task.rb
 - lib/datahen/client/scraper_var.rb
 - lib/datahen/error.rb
 - lib/datahen/error/custom_retry_error.rb

@@ -278,7 +281,7 @@ metadata:
   allowed_push_host: https://rubygems.org
   homepage_uri: https://datahen.com
   source_code_uri: https://github.com/DataHenOfficial/datahen-ruby
-post_install_message:
+post_install_message:
 rdoc_options: []
 require_paths:
 - lib

@@ -293,8 +296,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 - !ruby/object:Gem::Version
   version: '0'
 requirements: []
-rubygems_version: 3.
-signing_key:
+rubygems_version: 3.0.3
+signing_key:
 specification_version: 4
 summary: DataHen toolbelt for developers
 test_files: []