datahen 1.4.0 → 1.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: ae63999d11bc052d81e3b1de67a0741702dd980e719dd544b5e689f0383e7a34
-  data.tar.gz: faf53b662afa26409bff83c3007127211863ce33ff45b8c60aab56491fdcafe7
+  metadata.gz: 96d2bc30d1c96ce684d83efa54b6dff5966db2a1bba7ab4856b11caba2803086
+  data.tar.gz: 985712d5d7e6559ac64b76669241f56d704c754deb06a164e1f449aad10ef29e
 SHA512:
-  metadata.gz: b0e7a0ddc975202df66785211cee796e1aef61de921f99ba0481645f59fb65963c03517e1d4f5b471d2ed108087011f20d6b693d6d199e2e96c860412b675415
-  data.tar.gz: 70a13268ba6f3df8f560a1b4d65b261ec4744bbd71eb22b04c25dc5809e70af3541cc30d9dad0e4a4cc22c3a353b90bc24c4a43cfc70e7f82fc2080001596d38
+  metadata.gz: d9c6bd3e60034339a8354fe4bda365b91f21b6ec68da8f384d7380abcafa5ccce2c2aacd6cc7a8da37378b8681afe58765bcc461211812c623a8958eac7a5f72
+  data.tar.gz: ac5eb5c8de4e4b0a6d28d96179bab4bf347662247b94e775ed0a25e0f0ef00a542f01f8a1a06525b565e7bd1055d5cd30b480a28d28c7ebf5de893b89b9f5e3a
@@ -20,10 +20,20 @@ module Datahen
   collection = options.fetch(:collection) { 'default' }
   if options[:job]
     client = Client::JobOutput.new(options)
-    puts "#{client.all(options[:job], collection)}"
+    json = JSON.parse(client.all(options[:job], collection).body)
+    if json['error'] == ""
+      puts "#{JSON.pretty_generate(json['data'])}"
+    else
+      puts "#{JSON.pretty_generate(json['error'])}"
+    end
   else
     client = Client::ScraperJobOutput.new(options)
-    puts "#{client.all(scraper_name, collection)}"
+    json = JSON.parse(client.all(scraper_name, collection).body)
+    if json['error'] == ""
+      puts "#{JSON.pretty_generate(json['data'])}"
+    else
+      puts "#{JSON.pretty_generate(json['error'])}"
+    end
   end
 end
 
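The commands in this hunk now assume the API returns a JSON envelope of the form {"error": "", "data": ...} rather than a bare payload. A minimal sketch of the new parsing logic, using a hypothetical response body:

    require 'json'

    # Hypothetical envelope; in the CLI this string comes from
    # Client::JobOutput#all(...).body.
    body = '{"error": "", "data": [{"_collection": "default", "title": "example"}]}'

    json = JSON.parse(body)
    if json['error'] == ""
      puts JSON.pretty_generate(json['data'])   # pretty-print records on success
    else
      puts JSON.pretty_generate(json['error'])  # surface the API error instead
    end

The same envelope check is repeated for the find, collections, history, and page-list commands below.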
@@ -38,10 +48,20 @@ module Datahen
   collection = options.fetch(:collection) { 'default' }
   if options[:job]
     client = Client::JobOutput.new(options)
-    puts "#{client.find(options[:job], collection, id)}"
+    json = JSON.parse(client.find(options[:job], collection, id).body)
+    if json['error'] == ""
+      puts "#{JSON.pretty_generate(json['data'])}"
+    else
+      puts "#{JSON.pretty_generate(json['error'])}"
+    end
   else
     client = Client::ScraperJobOutput.new(options)
-    puts "#{client.find(scraper_name, collection, id)}"
+    json = JSON.parse(client.find(scraper_name, collection, id).body)
+    if json['error'] == ""
+      puts "#{JSON.pretty_generate(json['data'])}"
+    else
+      puts "#{JSON.pretty_generate(json['error'])}"
+    end
   end
 end
 
@@ -56,10 +76,20 @@ module Datahen
 
   if options[:job]
     client = Client::JobOutput.new(options)
-    puts "#{client.collections(options[:job])}"
+    json = JSON.parse(client.collections(options[:job]).body)
+    if json['error'] == ""
+      puts "#{JSON.pretty_generate(json['data'])}"
+    else
+      puts "#{JSON.pretty_generate(json['error'])}"
+    end
   else
     client = Client::ScraperJobOutput.new(options)
-    puts "#{client.collections(scraper_name)}"
+    json = JSON.parse(client.collections(scraper_name).body)
+    if json['error'] == ""
+      puts "#{JSON.pretty_generate(json['data'])}"
+    else
+      puts "#{JSON.pretty_generate(json['error'])}"
+    end
   end
 end
 
@@ -37,6 +37,10 @@ module Datahen
   option :max_page_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
   option :enable_global_cache, type: :boolean, desc: 'Set true to enable page cache. Default: false'
   option :retry_interval, type: :numeric, desc: 'Set a value to set retry time interval on seconds when refetching a page. Set a value grather than 0 to set it as new time to refetch, 0 means default time. Default: 0'
+  option :soft_fetching_try_limit, type: :numeric, desc: 'Set the soft fetching try limit value.'
+  option :soft_refetch_limit, type: :numeric, desc: 'Set the soft refetch limit value.'
+  option :parsing_try_limit, type: :numeric, desc: 'Set the parsing try limit value.'
+  option :prevent_kb_autoscaler, type: :boolean, desc: 'Set true to prevent the autoscaler from restarting the job. Default: false'
   def create(scraper_name, git_repository)
     # puts "options #{options}"
     client = Client::Scraper.new(options)
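Assuming Thor's default dasherized switch names, the new limits could be passed on the command line like so (hypothetical scraper and values):

    datahen scraper create my-scraper https://github.com/example/my-scraper.git \
      --soft-fetching-try-limit 5 \
      --soft-refetch-limit 2 \
      --parsing-try-limit 3 \
      --prevent-kb-autoscaler true

The same four options are added to scraper update, scraper job update, and job start below.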
@@ -66,6 +70,10 @@ module Datahen
   option :max_page_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
   option :enable_global_cache, type: :boolean, desc: 'Set true to enable page cache. Default: false'
   option :retry_interval, type: :numeric, desc: 'Set a value to set retry time interval on seconds when refetching a page. Set a value grather than 0 to set it as new time to refetch, 0 means default time. Default: 0'
+  option :soft_fetching_try_limit, type: :numeric, desc: 'Set the soft fetching try limit value.'
+  option :soft_refetch_limit, type: :numeric, desc: 'Set the soft refetch limit value.'
+  option :parsing_try_limit, type: :numeric, desc: 'Set the parsing try limit value.'
+  option :prevent_kb_autoscaler, type: :boolean, desc: 'Set true to prevent the autoscaler from restarting the job. Default: false'
   def update(scraper_name)
     client = Client::Scraper.new(options)
     puts "#{client.update(scraper_name, options)}"
@@ -106,6 +114,10 @@ module Datahen
   option :vars, type: :string, banner: :JSON, desc: 'Set input vars. Must be in json format. i.e: [{"name":"foo", "value":"bar", "secret":false}] '
   option :max_page_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
   option :retry_interval, type: :numeric, desc: 'Set a value to set retry time interval on seconds when refetching a page. Set a value grather than 0 to set it as new time to refetch, 0 means default time. Default: 0'
+  option :soft_fetching_try_limit, type: :numeric, desc: 'Set the soft fetching try limit value.'
+  option :soft_refetch_limit, type: :numeric, desc: 'Set the soft refetch limit value.'
+  option :parsing_try_limit, type: :numeric, desc: 'Set the parsing try limit value.'
+  option :prevent_kb_autoscaler, type: :boolean, desc: 'Set true to prevent the autoscaler from restarting the job. Default: false'
   def start(scraper_name)
     client = Client::ScraperJob.new(options)
     puts "Starting a scrape job..."
@@ -188,9 +200,19 @@ module Datahen
 def history(scraper_name)
   client = Client::JobStat.new(options)
   if options[:job]
-    puts "#{client.job_stats_history(options[:job], options)}"
+    json = JSON.parse(client.job_stats_history(options[:job], options).body)
+    if json['error'] == ""
+      puts "#{JSON.pretty_generate(json['data'])}"
+    else
+      puts "#{JSON.pretty_generate(json['error'])}"
+    end
   else
-    puts "#{client.scraper_job_stats_history(scraper_name, options)}"
+    json = JSON.parse(client.scraper_job_stats_history(scraper_name, options).body)
+    if json['error'] == ""
+      puts "#{JSON.pretty_generate(json['data'])}"
+    else
+      puts "#{JSON.pretty_generate(json['error'])}"
+    end
   end
 end
 
@@ -227,6 +249,9 @@ module Datahen
 desc "var SUBCOMMAND ...ARGS", "for managing scraper's variables"
 subcommand "var", ScraperVar
 
+desc "task SUBCOMMAND ...ARGS", "manage task on a job"
+subcommand "task", ScraperTask
+
 
 end
 end
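With the subcommand registered, tasks become reachable under the scraper namespace; for example (hypothetical scraper name and job ID, task_id left as a placeholder):

    datahen scraper task list my-scraper --per-page 100
    datahen scraper task show my-scraper <task_id> --job 12345

The ScraperTask command class itself is added as a new file later in this diff.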
@@ -108,6 +108,10 @@ module Datahen
   option :max_page_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
   option :enable_global_cache, type: :boolean, desc: 'Set true to enable page cache. Default: false'
   option :retry_interval, type: :numeric, desc: 'Set a value to set retry time interval on seconds when refetching a page. Set a value grather than 0 to set it as new time to refetch, 0 means default time. Default: 0'
+  option :soft_fetching_try_limit, type: :numeric, desc: 'Set the soft fetching try limit value.'
+  option :soft_refetch_limit, type: :numeric, desc: 'Set the soft refetch limit value.'
+  option :parsing_try_limit, type: :numeric, desc: 'Set the parsing try limit value.'
+  option :prevent_kb_autoscaler, type: :boolean, desc: 'Set true to prevent the autoscaler from restarting the job. Default: false'
   def update(scraper_name)
     if options[:job]
       client = Client::Job.new(options)
@@ -13,6 +13,10 @@ module Datahen
 LONGDESC
 option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
 option :page_type, :aliases => :t, type: :string, desc: 'Filter by page_type'
+option :url, :aliases => :u, type: :string, desc: 'Filter by url'
+option :effective_url, :aliases => :U, type: :string, desc: 'Filter by effective_url'
+option :body, :aliases => :b, type: :string, desc: 'Filter by body'
+option :parent_gid, :aliases => :G, type: :string, desc: 'Filter by parent_gid'
 option :page, :aliases => :p, type: :numeric, desc: 'Get the next set of records by page.'
 option :per_page, :aliases => :P, type: :numeric, desc: 'Number of records per page. Max 500 per page.'
 option :fetch_fail, type: :boolean, desc: 'Returns only pages that fails fetching.'
@@ -21,10 +25,20 @@ module Datahen
 def list(scraper_name)
   if options[:job]
     client = Client::JobPage.new(options)
-    puts "#{client.all(options[:job])}"
+    json = JSON.parse(client.all(options[:job]).body)
+    if json['error'] == ""
+      puts "#{JSON.pretty_generate(json['data'])}"
+    else
+      puts "#{JSON.pretty_generate(json['error'])}"
+    end
   else
     client = Client::ScraperJobPage.new(options)
-    puts "#{client.all(scraper_name)}"
+    json = JSON.parse(client.all(scraper_name).body)
+    if json['error'] == ""
+      puts "#{JSON.pretty_generate(json['data'])}"
+    else
+      puts "#{JSON.pretty_generate(json['error'])}"
+    end
   end
 end
 
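The new url, effective_url, body, and parent_gid filters combine with the existing ones as plain query parameters (see the query-building change in client/base.rb below); a hypothetical listing call:

    datahen scraper page list my-scraper --page-type products --url "https://example.com/products" --fetch-fail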
@@ -84,6 +98,9 @@ module Datahen
 option :vars, :aliases => :v, type: :string, desc: 'Set user-defined page variables. Must be in json format. i.e: {"Foo":"bar"}'
 option :max_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
 option :retry_interval, type: :numeric, desc: 'Set a value to set retry time interval on seconds when refetching a page. Set a value grather than 0 to set it as new time to refetch, 0 means default time. Default: 0'
+option :soft_fetching_try_limit, type: :numeric, desc: 'Set the soft fetching try limit value.'
+option :soft_refetch_limit, type: :numeric, desc: 'Set the soft refetch limit value.'
+option :parsing_try_limit, type: :numeric, desc: 'Set the parsing try limit value.'
 def update(scraper_name, gid)
   begin
     options[:vars] = JSON.parse(options[:vars]) if options[:vars]
@@ -0,0 +1,48 @@
+module Datahen
+  class CLI < Thor
+    class ScraperTask < Thor
+      package_name "scraper task"
+      def self.banner(command, namespace = nil, subcommand = false)
+        "#{basename} #{@package_name} #{command.usage}"
+      end
+
+      desc "list <scraper_name>", "List Tasks on a scraper's current job"
+      long_desc <<-LONGDESC
+        List all tasks in a scraper's current job or given job ID.\x5
+      LONGDESC
+      option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
+      option :page, :aliases => :p, type: :numeric, desc: 'Get the next set of records by page.'
+      option :per_page, :aliases => :P, type: :numeric, desc: 'Number of records per page. Max 500 per page.'
+      option :status, type: :array, desc: 'Returns only tasks with specific status.'
+      option :action, type: :array, desc: 'Returns only tasks with specific action.'
+      option :"include-system", type: :boolean, desc: 'If it is true, will returns all actions. If it is false only tasks with specific action ["refetch", "reparse", "terminate"].'
+      def list(scraper_name)
+        if options[:job]
+          client = Client::JobTask.new(options)
+          puts "#{client.all(options[:job])}"
+        else
+          client = Client::ScraperTask.new(options)
+          puts "#{client.all(scraper_name)}"
+        end
+      end
+
+
+      desc "show <scraper_name> <task_id>", "Show task in scraper's current job"
+      long_desc <<-LONGDESC
+        Shows a task in a scraper's current job or given job ID.\x5
+      LONGDESC
+      option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
+      def show(scraper_name, task_id)
+        if options[:job]
+          client = Client::JobTask.new(options)
+          puts "#{client.find(options[:job], task_id)}"
+        else
+          client = Client::ScraperTask.new(options)
+          puts "#{client.find(scraper_name, task_id)}"
+        end
+      end
+
+    end
+  end
+
+end
data/lib/datahen/cli.rb CHANGED
@@ -11,6 +11,7 @@ require 'datahen/cli/scraper_page'
 require 'datahen/cli/job_output'
 require 'datahen/cli/job'
 require 'datahen/cli/scraper_deployment'
+require 'datahen/cli/scraper_task'
 require 'datahen/cli/scraper'
 require 'datahen/cli/parser'
 require 'datahen/cli/seeder'
@@ -56,12 +56,18 @@ module Datahen
   target.merge(source.select{|k,v|target.has_key?(k)})
 end
 
-def retry times, delay = nil, err_msg = nil
+def retry times, delay = nil, err_msg = nil, stream = false
   limit = times.nil? ? nil : times.to_i
   delay = delay.nil? ? 5 : delay.to_i
   count = 0
   begin
-    yield
+    val = yield
+    if stream
+      return if val.nil?
+      if val['error'] != ""
+        raise StandardError.new(val['error'])
+      end
+    end
   rescue Error::CustomRetryError, StandardError => e
     is_custom_retry = e.is_a? Error::CustomRetryError
     real_delay = is_custom_retry ? e.delay : delay
@@ -81,6 +87,7 @@ module Datahen
     puts "#{err_msg.nil? ? '' : "#{err_msg} "}Retry \##{count}#{should_aprox ? '+' : ''}..."
     retry
   end
+  val
 end
 
 def initialize(opts={})
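A standalone sketch of the stream-aware retry (simplified: no Error::CustomRetryError handling, fixed messages; names are hypothetical):

    # When stream is true, a non-empty 'error' field in the yielded value is
    # converted into an exception so it goes through the same retry path.
    def retry_with_stream(times, delay = 5, stream: false)
      count = 0
      val = nil
      begin
        val = yield
        if stream
          return if val.nil?
          raise StandardError, val['error'] if val['error'] != ""
        end
      rescue StandardError => e
        count += 1
        raise e if !times.nil? && count > times
        puts "#{e.message} Retry ##{count}..."
        sleep delay
        retry
      end
      val
    end

    # Hypothetical usage: up to 3 retries, 1 second apart.
    retry_with_stream(3, 1, stream: true) { { 'error' => '', 'data' => [] } }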
@@ -105,6 +112,10 @@ module Datahen
 query[:parsefail] = opts[:parse_fail] if opts[:parse_fail]
 query[:status] = opts[:status] if opts[:status]
 query[:page_type] = opts[:page_type] if opts[:page_type]
+query[:url] = opts[:url] if opts[:url]
+query[:effective_url] = opts[:effective_url] if opts[:effective_url]
+query[:body] = opts[:body] if opts[:body]
+query[:parent_gid] = opts[:parent_gid] if opts[:parent_gid]
 query[:gid] = opts[:gid] if opts[:gid]
 query[:"min-timestamp"] = opts[:"min-timestamp"] if opts[:"min-timestamp"]
 query[:"max-timestamp"] = opts[:"max-timestamp"] if opts[:"max-timestamp"]
@@ -112,6 +123,8 @@ module Datahen
 query[:order] = opts[:order] if opts[:order]
 query[:filter] = opts[:filter] if opts[:filter]
 query[:force] = opts[:force] if opts[:force]
+query[:action] = opts[:action] if opts[:action]
+query[:"include-system"] = opts[:"include-system"] if opts[:"include-system"]
 
 if opts[:query]
   if opts[:query].is_a?(Hash)
@@ -25,6 +25,10 @@ module Datahen
 body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
 body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
 body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
+body[:soft_fetching_try_limit] = opts[:soft_fetching_try_limit] if opts[:soft_fetching_try_limit]
+body[:soft_refetch_limit] = opts[:soft_refetch_limit] if opts[:soft_refetch_limit]
+body[:parsing_try_limit] = opts[:parsing_try_limit] if opts[:parsing_try_limit]
+body[:prevent_kb_autoscaler] = opts[:prevent_kb_autoscaler] if opts.has_key?("prevent_kb_autoscaler") || opts.has_key?(:prevent_kb_autoscaler)
 params = @options.merge({body: body.to_json})
 
 self.class.put("/jobs/#{job_id}", params)
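Note the guard pattern here: the numeric fields use truthiness (opts[:parsing_try_limit]), while the boolean prevent_kb_autoscaler is checked with has_key? so that an explicit false still makes it into the request body, matching the existing enable_global_cache handling.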
@@ -97,7 +101,7 @@ module Datahen
 def sync_schema(job_id, opts={})
   params = @options.merge(opts)
 
-  self.class.put("/sync/jobs/#{job_id}/schema", params)
+  self.class.put("/jobs/#{job_id}/sync/schema", params)
 end
 
 end
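The schema sync endpoint moves from /sync/jobs/:job_id/schema to /jobs/:job_id/sync/schema; since the path is built inside Client::Job#sync_schema, callers of that method should be unaffected.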
@@ -7,7 +7,7 @@ module Datahen
 
 def all(job_id, collection = 'default', opts = {})
   limit = opts.has_key?(:retry_limit) ? opts.fetch(:retry_limit) : 0
-  self.retry(limit, 10, "Error while updating the seeder.") do
+  self.retry(limit, 10, "Error while updating the seeder.", true) do
     self.class.get("/jobs/#{job_id}/output/collections/#{collection}/records", @options)
   end
 end
@@ -18,6 +18,9 @@ module Datahen
 body[:max_size] = opts[:max_size] if opts[:max_size]
 body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
 body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
+body[:soft_fetching_try_limit] = opts[:soft_fetching_try_limit] if opts[:soft_fetching_try_limit]
+body[:soft_refetch_limit] = opts[:soft_refetch_limit] if opts[:soft_refetch_limit]
+body[:parsing_try_limit] = opts[:parsing_try_limit] if opts[:parsing_try_limit]
 
 params = @options.merge({body: body.to_json})
 
@@ -55,6 +58,7 @@ module Datahen
 body[:parsing_status] = opts.fetch(:parsing_status){ nil }
 body[:log_error] = opts[:log_error] if opts[:log_error]
 body[:keep_outputs] = !!opts[:keep_outputs] if opts.has_key?(:keep_outputs)
+body[:parsing_try_limit] = opts[:parsing_try_limit] if opts.fetch(:parsing_try_limit){ nil }
 
 params = @options.merge({body: body.to_json})
 
@@ -90,6 +94,11 @@ module Datahen
   params = @options.merge(opts)
   self.class.put("/jobs/#{job_id}/pages/limbo", params)
 end
+
+def still_alive(job_id, gid, opts={})
+  params = @options.merge(opts)
+  self.class.put("/jobs/#{job_id}/pages/#{gid}/still_alive", params)
+end
 end
 end
 end
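A minimal sketch of hitting the new endpoint through the client (hypothetical job ID and GID; assumes an API token is already configured):

    require 'datahen'

    client = Datahen::Client::JobPage.new
    # PUT /jobs/12345/pages/<gid>/still_alive
    response = client.still_alive(12345, 'www.example.com-0123456789abcdef')
    puts response.code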
@@ -0,0 +1,17 @@
+module Datahen
+  module Client
+    class JobTask < Datahen::Client::Base
+      def all(job_id, opts={})
+        params = @options.merge(opts)
+        self.class.get("/jobs/#{job_id}/tasks", params)
+      end
+
+      def find(job_id, task_id, opts={})
+        params = @options.merge(opts)
+        self.class.get("/jobs/#{job_id}/tasks/#{task_id}", params)
+      end
+
+    end
+
+  end
+end
@@ -32,6 +32,10 @@ module Datahen
 body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
 body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
 body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
+body[:soft_fetching_try_limit] = opts[:soft_fetching_try_limit] if opts[:soft_fetching_try_limit]
+body[:soft_refetch_limit] = opts[:soft_refetch_limit] if opts[:soft_refetch_limit]
+body[:parsing_try_limit] = opts[:parsing_try_limit] if opts[:parsing_try_limit]
+body[:prevent_kb_autoscaler] = opts[:prevent_kb_autoscaler] if opts.has_key?("prevent_kb_autoscaler") || opts.has_key?(:prevent_kb_autoscaler)
 params = @options.merge({body: body.to_json})
 self.class.post("/scrapers", params)
 end
@@ -57,6 +61,10 @@ module Datahen
 body[:max_page_size] = opts[:max_page_size] if opts.has_key?("max_page_size") || opts.has_key?(:max_page_size)
 body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
 body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
+body[:soft_fetching_try_limit] = opts[:soft_fetching_try_limit] if opts[:soft_fetching_try_limit]
+body[:soft_refetch_limit] = opts[:soft_refetch_limit] if opts[:soft_refetch_limit]
+body[:parsing_try_limit] = opts[:parsing_try_limit] if opts[:parsing_try_limit]
+body[:prevent_kb_autoscaler] = opts[:prevent_kb_autoscaler] if opts.has_key?("prevent_kb_autoscaler") || opts.has_key?(:prevent_kb_autoscaler)
 params = @options.merge({body: body.to_json})
 
 self.class.put("/scrapers/#{scraper_name}", params)
@@ -15,6 +15,10 @@ module Datahen
 body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
 body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
 body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
+body[:soft_fetching_try_limit] = opts[:soft_fetching_try_limit] if opts[:soft_fetching_try_limit]
+body[:soft_refetch_limit] = opts[:soft_refetch_limit] if opts[:soft_refetch_limit]
+body[:parsing_try_limit] = opts[:parsing_try_limit] if opts[:parsing_try_limit]
+body[:prevent_kb_autoscaler] = opts[:prevent_kb_autoscaler] if opts.has_key?("prevent_kb_autoscaler") || opts.has_key?(:prevent_kb_autoscaler)
 if opts[:vars]
   if opts[:vars].is_a?(Array)
     body[:vars] = opts[:vars]
@@ -45,6 +49,10 @@ module Datahen
 body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
 body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
 body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
+body[:soft_fetching_try_limit] = opts[:soft_fetching_try_limit] if opts[:soft_fetching_try_limit]
+body[:soft_refetch_limit] = opts[:soft_refetch_limit] if opts[:soft_refetch_limit]
+body[:parsing_try_limit] = opts[:parsing_try_limit] if opts[:parsing_try_limit]
+body[:prevent_kb_autoscaler] = opts[:prevent_kb_autoscaler] if opts.has_key?("prevent_kb_autoscaler") || opts.has_key?(:prevent_kb_autoscaler)
 params = @options.merge({body: body.to_json})
 
 self.class.put("/scrapers/#{scraper_name}/current_job", params)
@@ -0,0 +1,17 @@
+module Datahen
+  module Client
+    class ScraperTask < Datahen::Client::Base
+      def all(scraper_name, opts={})
+        params = @options.merge(opts)
+        self.class.get("/scrapers/#{scraper_name}/current_job/tasks", params)
+      end
+
+      def find(scraper_name, task_id, opts={})
+        params = @options.merge(opts)
+        self.class.get("/scrapers/#{scraper_name}/current_job/tasks/#{task_id}", params)
+      end
+
+    end
+
+  end
+end
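Both task clients follow the same Client::Base conventions, so listing tasks could look like this (hypothetical scraper name; the status filter is turned into a query parameter by Base, and an API token is assumed to be configured):

    require 'datahen'

    client = Datahen::Client::ScraperTask.new(status: ['active'])
    puts client.all('my-scraper').body  # tasks on the scraper's current job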
@@ -24,6 +24,8 @@ require "datahen/client/scraper_var"
 require "datahen/client/job_var"
 require "datahen/client/scraper_job_var"
 require "datahen/client/job_finisher"
+require "datahen/client/job_task"
+require "datahen/client/scraper_task"
 
 module Datahen
   module Client
@@ -227,7 +227,11 @@ module Datahen
 
 # add pages
 count = 0
-(JSON.parse(response.body) || []).each do |page|
+json = JSON.parse(response.body)
+if json['error'] != ""
+  return 0
+end
+(json['data'] || []).each do |page|
   count += 1
   next if self.loaded_pages.has_key? page['gid']
   self.pages << (self.loaded_pages[page['gid']] = page)
@@ -307,7 +311,7 @@ module Datahen
 is_waiting = true
 puts "[Worker #{Parallel.worker_number}]: Is waiting for a page..."
 if self.second_dequeue_count > 1 && !self.not_found
-  puts "\nWARNING: Your job is not optimized, increase your job's \"parser_dequeue_scale\"\n"
+  puts "\nWARNING: Your job might not be optimized. Consider increasing your job's \"parser_dequeue_scale\" if the `to_parse` queue is not empty or near empty \n"
 end
 end
 self.class.wait 1
@@ -172,11 +172,16 @@ module Datahen
 response = client.all(query_job_id, collection, {
   retry_limit: retry_limit
 })
-
 if response.code != 200
   raise "response_code: #{response.code}|#{response.parsed_response}"
 end
-(response.body != 'null') ? response.parsed_response : []
+
+# check stream error
+json_data = response.body != 'null' ? response.parsed_response : {}
+if json_data['error'] != ""
+  raise "response_code: #{response.code}|Stream error: #{json_data['error']}"
+end
+json_data['data'].nil? ? [] : json_data['data']
 end
 
 # Find one output by collection and query with pagination.
@@ -33,7 +33,6 @@ module Datahen
 :failed_content,
 :outputs,
 :pages,
-:page,
 :save_pages,
 :save_outputs,
 :find_output,
@@ -41,7 +40,8 @@ module Datahen
 :refetch,
 :reparse,
 :limbo,
-:finish
+:finish,
+:still_alive
 ].freeze
 end
 
@@ -240,6 +240,12 @@ module Datahen
 @failed_content ||= get_failed_content(job_id, gid)
 end
 
+def still_alive page_gid = nil
+  page_gid = gid if page_gid.nil?
+  client = Client::JobPage.new()
+  client.still_alive(job_id, page_gid)
+end
+
 def handle_error(e)
   error = ["Parsing #{e.class}: #{e.to_s} (Job:#{job_id} GID:#{gid})",clean_backtrace(e.backtrace)].join("\n")
 
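Inside a parser script, the newly exposed still_alive can serve as a heartbeat during long parses (a sketch; rows is a hypothetical stand-in for slow page-processing work):

    # `outputs` and `still_alive` are both exposed to parser scripts.
    rows.each_slice(1000) do |batch|
      batch.each { |row| outputs << row }
      still_alive  # tell the platform this page is still being parsed
    end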
@@ -247,7 +253,8 @@ module Datahen
   job_id: job_id,
   gid: gid,
   parsing_status: :failed,
-  log_error: error)
+  log_error: error,
+  parsing_try_limit: (page || {})['parsing_try_limit'])
 end
 
 
@@ -1,3 +1,3 @@
 module Datahen
-  VERSION = "1.4.0"
+  VERSION = "1.5.1"
 end
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: datahen
 version: !ruby/object:Gem::Version
-  version: 1.4.0
+  version: 1.5.1
 platform: ruby
 authors:
 - Parama Danoesubroto
-autorequire: 
+autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-11-01 00:00:00.000000000 Z
+date: 2024-01-08 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: thor
@@ -227,6 +227,7 @@ files:
 - lib/datahen/cli/scraper_job.rb
 - lib/datahen/cli/scraper_job_var.rb
 - lib/datahen/cli/scraper_page.rb
+- lib/datahen/cli/scraper_task.rb
 - lib/datahen/cli/scraper_var.rb
 - lib/datahen/cli/seeder.rb
 - lib/datahen/client.rb
@@ -245,6 +246,7 @@ files:
 - lib/datahen/client/job_output.rb
 - lib/datahen/client/job_page.rb
 - lib/datahen/client/job_stat.rb
+- lib/datahen/client/job_task.rb
 - lib/datahen/client/job_var.rb
 - lib/datahen/client/scraper.rb
 - lib/datahen/client/scraper_deployment.rb
@@ -255,6 +257,7 @@ files:
 - lib/datahen/client/scraper_job_output.rb
 - lib/datahen/client/scraper_job_page.rb
 - lib/datahen/client/scraper_job_var.rb
+- lib/datahen/client/scraper_task.rb
 - lib/datahen/client/scraper_var.rb
 - lib/datahen/error.rb
 - lib/datahen/error/custom_retry_error.rb
@@ -278,7 +281,7 @@ metadata:
 allowed_push_host: https://rubygems.org
 homepage_uri: https://datahen.com
 source_code_uri: https://github.com/DataHenOfficial/datahen-ruby
-post_install_message: 
+post_install_message:
 rdoc_options: []
 require_paths:
 - lib
@@ -293,8 +296,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 - !ruby/object:Gem::Version
   version: '0'
 requirements: []
-rubygems_version: 3.1.4
-signing_key: 
+rubygems_version: 3.0.3
+signing_key:
 specification_version: 4
 summary: DataHen toolbelt for developers
 test_files: []