datahen 1.4.0 → 1.5.1

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: ae63999d11bc052d81e3b1de67a0741702dd980e719dd544b5e689f0383e7a34
-  data.tar.gz: faf53b662afa26409bff83c3007127211863ce33ff45b8c60aab56491fdcafe7
+  metadata.gz: 96d2bc30d1c96ce684d83efa54b6dff5966db2a1bba7ab4856b11caba2803086
+  data.tar.gz: 985712d5d7e6559ac64b76669241f56d704c754deb06a164e1f449aad10ef29e
 SHA512:
-  metadata.gz: b0e7a0ddc975202df66785211cee796e1aef61de921f99ba0481645f59fb65963c03517e1d4f5b471d2ed108087011f20d6b693d6d199e2e96c860412b675415
-  data.tar.gz: 70a13268ba6f3df8f560a1b4d65b261ec4744bbd71eb22b04c25dc5809e70af3541cc30d9dad0e4a4cc22c3a353b90bc24c4a43cfc70e7f82fc2080001596d38
+  metadata.gz: d9c6bd3e60034339a8354fe4bda365b91f21b6ec68da8f384d7380abcafa5ccce2c2aacd6cc7a8da37378b8681afe58765bcc461211812c623a8958eac7a5f72
+  data.tar.gz: ac5eb5c8de4e4b0a6d28d96179bab4bf347662247b94e775ed0a25e0f0ef00a542f01f8a1a06525b565e7bd1055d5cd30b480a28d28c7ebf5de893b89b9f5e3a
@@ -20,10 +20,20 @@ module Datahen
       collection = options.fetch(:collection) { 'default' }
       if options[:job]
         client = Client::JobOutput.new(options)
-        puts "#{client.all(options[:job], collection)}"
+        json = JSON.parse(client.all(options[:job], collection).body)
+        if json['error'] == ""
+          puts "#{JSON.pretty_generate(json['data'])}"
+        else
+          puts "#{JSON.pretty_generate(json['error'])}"
+        end
       else
         client = Client::ScraperJobOutput.new(options)
-        puts "#{client.all(scraper_name, collection)}"
+        json = JSON.parse(client.all(scraper_name, collection).body)
+        if json['error'] == ""
+          puts "#{JSON.pretty_generate(json['data'])}"
+        else
+          puts "#{JSON.pretty_generate(json['error'])}"
+        end
       end
     end
 
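Every command touched in this release unwraps the same streamed-response envelope, so the pattern above repeats throughout the CLI. A minimal standalone sketch of that envelope handling; the helper name and sample payload are illustrative, not part of the gem:

require 'json'

# Hypothetical helper mirroring the unwrap-and-pretty-print pattern the
# CLI commands now share: the response body is assumed to be a JSON
# envelope of the form {"error": "", "data": ...}.
def print_envelope(body)
  json = JSON.parse(body)
  if json['error'] == ""
    puts JSON.pretty_generate(json['data'])   # success: pretty-print the payload
  else
    puts JSON.pretty_generate(json['error'])  # failure: print the error message
  end
end

print_envelope('{"error":"","data":[{"_collection":"default","title":"foo"}]}')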
@@ -38,10 +48,20 @@ module Datahen
       collection = options.fetch(:collection) { 'default' }
       if options[:job]
         client = Client::JobOutput.new(options)
-        puts "#{client.find(options[:job], collection, id)}"
+        json = JSON.parse(client.find(options[:job], collection, id).body)
+        if json['error'] == ""
+          puts "#{JSON.pretty_generate(json['data'])}"
+        else
+          puts "#{JSON.pretty_generate(json['error'])}"
+        end
       else
         client = Client::ScraperJobOutput.new(options)
-        puts "#{client.find(scraper_name, collection, id)}"
+        json = JSON.parse(client.find(scraper_name, collection, id).body)
+        if json['error'] == ""
+          puts "#{JSON.pretty_generate(json['data'])}"
+        else
+          puts "#{JSON.pretty_generate(json['error'])}"
+        end
       end
     end
 
@@ -56,10 +76,20 @@ module Datahen
 
       if options[:job]
         client = Client::JobOutput.new(options)
-        puts "#{client.collections(options[:job])}"
+        json = JSON.parse(client.collections(options[:job]).body)
+        if json['error'] == ""
+          puts "#{JSON.pretty_generate(json['data'])}"
+        else
+          puts "#{JSON.pretty_generate(json['error'])}"
+        end
       else
         client = Client::ScraperJobOutput.new(options)
-        puts "#{client.collections(scraper_name)}"
+        json = JSON.parse(client.collections(scraper_name).body)
+        if json['error'] == ""
+          puts "#{JSON.pretty_generate(json['data'])}"
+        else
+          puts "#{JSON.pretty_generate(json['error'])}"
+        end
       end
     end
 
@@ -37,6 +37,10 @@ module Datahen
     option :max_page_size, type: :numeric, desc: 'Set the max page size when fetching a page. A value greater than 0 sets the limit; 0 means any size. Default: 0'
     option :enable_global_cache, type: :boolean, desc: 'Set true to enable page cache. Default: false'
     option :retry_interval, type: :numeric, desc: 'Set the retry time interval in seconds when refetching a page. A value greater than 0 sets the new refetch time; 0 means the default time. Default: 0'
+    option :soft_fetching_try_limit, type: :numeric, desc: 'Set the soft fetching try limit value.'
+    option :soft_refetch_limit, type: :numeric, desc: 'Set the soft refetch limit value.'
+    option :parsing_try_limit, type: :numeric, desc: 'Set the parsing try limit value.'
+    option :prevent_kb_autoscaler, type: :boolean, desc: 'Set true to prevent the autoscaler from restarting the job. Default: false'
     def create(scraper_name, git_repository)
       # puts "options #{options}"
       client = Client::Scraper.new(options)
@@ -66,6 +70,10 @@ module Datahen
     option :max_page_size, type: :numeric, desc: 'Set the max page size when fetching a page. A value greater than 0 sets the limit; 0 means any size. Default: 0'
     option :enable_global_cache, type: :boolean, desc: 'Set true to enable page cache. Default: false'
     option :retry_interval, type: :numeric, desc: 'Set the retry time interval in seconds when refetching a page. A value greater than 0 sets the new refetch time; 0 means the default time. Default: 0'
+    option :soft_fetching_try_limit, type: :numeric, desc: 'Set the soft fetching try limit value.'
+    option :soft_refetch_limit, type: :numeric, desc: 'Set the soft refetch limit value.'
+    option :parsing_try_limit, type: :numeric, desc: 'Set the parsing try limit value.'
+    option :prevent_kb_autoscaler, type: :boolean, desc: 'Set true to prevent the autoscaler from restarting the job. Default: false'
     def update(scraper_name)
       client = Client::Scraper.new(options)
       puts "#{client.update(scraper_name, options)}"
@@ -106,6 +114,10 @@ module Datahen
     option :vars, type: :string, banner: :JSON, desc: 'Set input vars. Must be in JSON format, i.e.: [{"name":"foo", "value":"bar", "secret":false}]'
     option :max_page_size, type: :numeric, desc: 'Set the max page size when fetching a page. A value greater than 0 sets the limit; 0 means any size. Default: 0'
     option :retry_interval, type: :numeric, desc: 'Set the retry time interval in seconds when refetching a page. A value greater than 0 sets the new refetch time; 0 means the default time. Default: 0'
+    option :soft_fetching_try_limit, type: :numeric, desc: 'Set the soft fetching try limit value.'
+    option :soft_refetch_limit, type: :numeric, desc: 'Set the soft refetch limit value.'
+    option :parsing_try_limit, type: :numeric, desc: 'Set the parsing try limit value.'
+    option :prevent_kb_autoscaler, type: :boolean, desc: 'Set true to prevent the autoscaler from restarting the job. Default: false'
     def start(scraper_name)
       client = Client::ScraperJob.new(options)
       puts "Starting a scrape job..."
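The four new options flow straight through to the job request body (see the Client::Scraper and Client::ScraperJob hunks further below). A sketch of setting them programmatically, assuming Client::ScraperJob#update takes the scraper name plus an options hash, as the current_job PUT hunk below suggests; the token setup, scraper name, and values are illustrative:

require 'datahen'

# Assumes an API token is already configured for Datahen::Client::Base.
client = Datahen::Client::ScraperJob.new
response = client.update('my-scraper',
  soft_fetching_try_limit: 5,   # soft cap on fetch attempts per page
  soft_refetch_limit: 2,        # soft cap on refetches per page
  parsing_try_limit: 3,         # cap on parse attempts per page
  prevent_kb_autoscaler: true)  # keep the autoscaler from restarting the job
puts response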
@@ -188,9 +200,19 @@ module Datahen
     def history(scraper_name)
       client = Client::JobStat.new(options)
       if options[:job]
-        puts "#{client.job_stats_history(options[:job], options)}"
+        json = JSON.parse(client.job_stats_history(options[:job], options).body)
+        if json['error'] == ""
+          puts "#{JSON.pretty_generate(json['data'])}"
+        else
+          puts "#{JSON.pretty_generate(json['error'])}"
+        end
       else
-        puts "#{client.scraper_job_stats_history(scraper_name, options)}"
+        json = JSON.parse(client.scraper_job_stats_history(scraper_name, options).body)
+        if json['error'] == ""
+          puts "#{JSON.pretty_generate(json['data'])}"
+        else
+          puts "#{JSON.pretty_generate(json['error'])}"
+        end
       end
     end
 
@@ -227,6 +249,9 @@ module Datahen
     desc "var SUBCOMMAND ...ARGS", "for managing scraper's variables"
     subcommand "var", ScraperVar
 
+    desc "task SUBCOMMAND ...ARGS", "for managing tasks on a job"
+    subcommand "task", ScraperTask
+
 
   end
 end
@@ -108,6 +108,10 @@ module Datahen
     option :max_page_size, type: :numeric, desc: 'Set the max page size when fetching a page. A value greater than 0 sets the limit; 0 means any size. Default: 0'
     option :enable_global_cache, type: :boolean, desc: 'Set true to enable page cache. Default: false'
     option :retry_interval, type: :numeric, desc: 'Set the retry time interval in seconds when refetching a page. A value greater than 0 sets the new refetch time; 0 means the default time. Default: 0'
+    option :soft_fetching_try_limit, type: :numeric, desc: 'Set the soft fetching try limit value.'
+    option :soft_refetch_limit, type: :numeric, desc: 'Set the soft refetch limit value.'
+    option :parsing_try_limit, type: :numeric, desc: 'Set the parsing try limit value.'
+    option :prevent_kb_autoscaler, type: :boolean, desc: 'Set true to prevent the autoscaler from restarting the job. Default: false'
     def update(scraper_name)
       if options[:job]
         client = Client::Job.new(options)
@@ -13,6 +13,10 @@ module Datahen
     LONGDESC
     option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
     option :page_type, :aliases => :t, type: :string, desc: 'Filter by page_type'
+    option :url, :aliases => :u, type: :string, desc: 'Filter by url'
+    option :effective_url, :aliases => :U, type: :string, desc: 'Filter by effective_url'
+    option :body, :aliases => :b, type: :string, desc: 'Filter by body'
+    option :parent_gid, :aliases => :G, type: :string, desc: 'Filter by parent_gid'
     option :page, :aliases => :p, type: :numeric, desc: 'Get the next set of records by page.'
     option :per_page, :aliases => :P, type: :numeric, desc: 'Number of records per page. Max 500 per page.'
     option :fetch_fail, type: :boolean, desc: 'Returns only pages that fail fetching.'
@@ -21,10 +25,20 @@ module Datahen
     def list(scraper_name)
       if options[:job]
         client = Client::JobPage.new(options)
-        puts "#{client.all(options[:job])}"
+        json = JSON.parse(client.all(options[:job]).body)
+        if json['error'] == ""
+          puts "#{JSON.pretty_generate(json['data'])}"
+        else
+          puts "#{JSON.pretty_generate(json['error'])}"
+        end
       else
         client = Client::ScraperJobPage.new(options)
-        puts "#{client.all(scraper_name)}"
+        json = JSON.parse(client.all(scraper_name).body)
+        if json['error'] == ""
+          puts "#{JSON.pretty_generate(json['data'])}"
+        else
+          puts "#{JSON.pretty_generate(json['error'])}"
+        end
       end
     end
 
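The new page filters can be combined with the existing ones. A sketch of driving the updated `list` command through Thor's entry point; the scraper name and filter values are illustrative:

require 'datahen'

# Equivalent to:
#   datahen scraper page list my-scraper --page-type products --url https://example.com/products
# Thor dasherizes option names, so :page_type becomes --page-type.
Datahen::CLI.start([
  'scraper', 'page', 'list', 'my-scraper',
  '--page-type', 'products',
  '--url', 'https://example.com/products'
])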
@@ -84,6 +98,9 @@ module Datahen
     option :vars, :aliases => :v, type: :string, desc: 'Set user-defined page variables. Must be in JSON format, i.e.: {"Foo":"bar"}'
     option :max_size, type: :numeric, desc: 'Set the max page size when fetching a page. A value greater than 0 sets the limit; 0 means any size. Default: 0'
     option :retry_interval, type: :numeric, desc: 'Set the retry time interval in seconds when refetching a page. A value greater than 0 sets the new refetch time; 0 means the default time. Default: 0'
+    option :soft_fetching_try_limit, type: :numeric, desc: 'Set the soft fetching try limit value.'
+    option :soft_refetch_limit, type: :numeric, desc: 'Set the soft refetch limit value.'
+    option :parsing_try_limit, type: :numeric, desc: 'Set the parsing try limit value.'
     def update(scraper_name, gid)
       begin
         options[:vars] = JSON.parse(options[:vars]) if options[:vars]
data/lib/datahen/cli/scraper_task.rb ADDED
@@ -0,0 +1,48 @@
+module Datahen
+  class CLI < Thor
+    class ScraperTask < Thor
+      package_name "scraper task"
+      def self.banner(command, namespace = nil, subcommand = false)
+        "#{basename} #{@package_name} #{command.usage}"
+      end
+
+      desc "list <scraper_name>", "List tasks on a scraper's current job"
+      long_desc <<-LONGDESC
+        List all tasks in a scraper's current job or a given job ID.\x5
+      LONGDESC
+      option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
+      option :page, :aliases => :p, type: :numeric, desc: 'Get the next set of records by page.'
+      option :per_page, :aliases => :P, type: :numeric, desc: 'Number of records per page. Max 500 per page.'
+      option :status, type: :array, desc: 'Returns only tasks with a specific status.'
+      option :action, type: :array, desc: 'Returns only tasks with a specific action.'
+      option :"include-system", type: :boolean, desc: 'If true, returns tasks for all actions. If false, returns only tasks with one of the actions ["refetch", "reparse", "terminate"].'
+      def list(scraper_name)
+        if options[:job]
+          client = Client::JobTask.new(options)
+          puts "#{client.all(options[:job])}"
+        else
+          client = Client::ScraperTask.new(options)
+          puts "#{client.all(scraper_name)}"
+        end
+      end
+
+
+      desc "show <scraper_name> <task_id>", "Show a task in a scraper's current job"
+      long_desc <<-LONGDESC
+        Shows a task in a scraper's current job or a given job ID.\x5
+      LONGDESC
+      option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
+      def show(scraper_name, task_id)
+        if options[:job]
+          client = Client::JobTask.new(options)
+          puts "#{client.find(options[:job], task_id)}"
+        else
+          client = Client::ScraperTask.new(options)
+          puts "#{client.find(scraper_name, task_id)}"
+        end
+      end
+
+    end
+  end
+
+end
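A sketch of exercising the new subcommand through Thor's entry point; it is registered as `scraper task` in the subcommand hunk above and required in cli.rb just below. The scraper name, job ID, and task ID are illustrative:

require 'datahen'

# Equivalent to: datahen scraper task list my-scraper --job 123
Datahen::CLI.start(['scraper', 'task', 'list', 'my-scraper', '--job', '123'])

# Equivalent to: datahen scraper task show my-scraper task-abc123
Datahen::CLI.start(['scraper', 'task', 'show', 'my-scraper', 'task-abc123'])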
data/lib/datahen/cli.rb CHANGED
@@ -11,6 +11,7 @@ require 'datahen/cli/scraper_page'
 require 'datahen/cli/job_output'
 require 'datahen/cli/job'
 require 'datahen/cli/scraper_deployment'
+require 'datahen/cli/scraper_task'
 require 'datahen/cli/scraper'
 require 'datahen/cli/parser'
 require 'datahen/cli/seeder'
@@ -56,12 +56,18 @@ module Datahen
       target.merge(source.select{|k,v|target.has_key?(k)})
     end
 
-    def retry times, delay = nil, err_msg = nil
+    def retry times, delay = nil, err_msg = nil, stream = false
       limit = times.nil? ? nil : times.to_i
       delay = delay.nil? ? 5 : delay.to_i
       count = 0
       begin
-        yield
+        val = yield
+        if stream
+          return if val.nil?
+          if val['error'] != ""
+            raise StandardError.new(val['error'])
+          end
+        end
       rescue Error::CustomRetryError, StandardError => e
         is_custom_retry = e.is_a? Error::CustomRetryError
         real_delay = is_custom_retry ? e.delay : delay
@@ -81,6 +87,7 @@ module Datahen
         puts "#{err_msg.nil? ? '' : "#{err_msg} "}Retry \##{count}#{should_aprox ? '+' : ''}..."
         retry
       end
+      val
     end
 
     def initialize(opts={})
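With `stream` enabled, `retry` now inspects the yielded response instead of returning it blindly: a nil response returns early, a non-empty `error` field raises (which re-enters the retry loop), and the final value is handed back through the trailing `val`. A condensed standalone sketch of that contract; the method name and fake responses are illustrative:

# Condensed stand-in for the stream-aware retry above; the hashes mimic
# an HTTParty response that supports hash-style access.
def retry_stream(times, delay)
  count = 0
  begin
    val = yield
    return if val.nil?
    raise StandardError, val['error'] if val['error'] != ""
  rescue StandardError => e
    count += 1
    raise e if count > times  # give up once the retry limit is exceeded
    sleep delay
    retry
  end
  val
end

attempts = 0
result = retry_stream(3, 0) do
  attempts += 1
  # First attempt simulates a stream error; the second succeeds.
  attempts < 2 ? { 'error' => 'stream hiccup' } : { 'error' => '', 'data' => [1, 2] }
end
puts result['data'].inspect  # => [1, 2] after one retry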
@@ -105,6 +112,10 @@ module Datahen
       query[:parsefail] = opts[:parse_fail] if opts[:parse_fail]
       query[:status] = opts[:status] if opts[:status]
       query[:page_type] = opts[:page_type] if opts[:page_type]
+      query[:url] = opts[:url] if opts[:url]
+      query[:effective_url] = opts[:effective_url] if opts[:effective_url]
+      query[:body] = opts[:body] if opts[:body]
+      query[:parent_gid] = opts[:parent_gid] if opts[:parent_gid]
       query[:gid] = opts[:gid] if opts[:gid]
       query[:"min-timestamp"] = opts[:"min-timestamp"] if opts[:"min-timestamp"]
       query[:"max-timestamp"] = opts[:"max-timestamp"] if opts[:"max-timestamp"]
@@ -112,6 +123,8 @@ module Datahen
       query[:order] = opts[:order] if opts[:order]
       query[:filter] = opts[:filter] if opts[:filter]
       query[:force] = opts[:force] if opts[:force]
+      query[:action] = opts[:action] if opts[:action]
+      query[:"include-system"] = opts[:"include-system"] if opts[:"include-system"]
 
       if opts[:query]
         if opts[:query].is_a?(Hash)
@@ -25,6 +25,10 @@ module Datahen
       body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
       body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
       body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
+      body[:soft_fetching_try_limit] = opts[:soft_fetching_try_limit] if opts[:soft_fetching_try_limit]
+      body[:soft_refetch_limit] = opts[:soft_refetch_limit] if opts[:soft_refetch_limit]
+      body[:parsing_try_limit] = opts[:parsing_try_limit] if opts[:parsing_try_limit]
+      body[:prevent_kb_autoscaler] = opts[:prevent_kb_autoscaler] if opts.has_key?("prevent_kb_autoscaler") || opts.has_key?(:prevent_kb_autoscaler)
       params = @options.merge({body: body.to_json})
 
       self.class.put("/jobs/#{job_id}", params)
@@ -97,7 +101,7 @@ module Datahen
     def sync_schema(job_id, opts={})
       params = @options.merge(opts)
 
-      self.class.put("/sync/jobs/#{job_id}/schema", params)
+      self.class.put("/jobs/#{job_id}/sync/schema", params)
     end
 
   end
@@ -7,7 +7,7 @@ module Datahen
 
     def all(job_id, collection = 'default', opts = {})
       limit = opts.has_key?(:retry_limit) ? opts.fetch(:retry_limit) : 0
-      self.retry(limit, 10, "Error while updating the seeder.") do
+      self.retry(limit, 10, "Error while updating the seeder.", true) do
         self.class.get("/jobs/#{job_id}/output/collections/#{collection}/records", @options)
       end
     end
@@ -18,6 +18,9 @@ module Datahen
       body[:max_size] = opts[:max_size] if opts[:max_size]
       body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
       body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
+      body[:soft_fetching_try_limit] = opts[:soft_fetching_try_limit] if opts[:soft_fetching_try_limit]
+      body[:soft_refetch_limit] = opts[:soft_refetch_limit] if opts[:soft_refetch_limit]
+      body[:parsing_try_limit] = opts[:parsing_try_limit] if opts[:parsing_try_limit]
 
       params = @options.merge({body: body.to_json})
 
@@ -55,6 +58,7 @@ module Datahen
       body[:parsing_status] = opts.fetch(:parsing_status){ nil }
       body[:log_error] = opts[:log_error] if opts[:log_error]
       body[:keep_outputs] = !!opts[:keep_outputs] if opts.has_key?(:keep_outputs)
+      body[:parsing_try_limit] = opts[:parsing_try_limit] if opts.fetch(:parsing_try_limit){ nil }
 
       params = @options.merge({body: body.to_json})
 
@@ -90,6 +94,11 @@ module Datahen
       params = @options.merge(opts)
       self.class.put("/jobs/#{job_id}/pages/limbo", params)
     end
+
+    def still_alive(job_id, gid, opts={})
+      params = @options.merge(opts)
+      self.class.put("/jobs/#{job_id}/pages/#{gid}/still_alive", params)
+    end
     end
   end
 end
data/lib/datahen/client/job_task.rb ADDED
@@ -0,0 +1,17 @@
+module Datahen
+  module Client
+    class JobTask < Datahen::Client::Base
+      def all(job_id, opts={})
+        params = @options.merge(opts)
+        self.class.get("/jobs/#{job_id}/tasks", params)
+      end
+
+      def find(job_id, task_id, opts={})
+        params = @options.merge(opts)
+        self.class.get("/jobs/#{job_id}/tasks/#{task_id}", params)
+      end
+
+    end
+
+  end
+end
@@ -32,6 +32,10 @@ module Datahen
       body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
       body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
       body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
+      body[:soft_fetching_try_limit] = opts[:soft_fetching_try_limit] if opts[:soft_fetching_try_limit]
+      body[:soft_refetch_limit] = opts[:soft_refetch_limit] if opts[:soft_refetch_limit]
+      body[:parsing_try_limit] = opts[:parsing_try_limit] if opts[:parsing_try_limit]
+      body[:prevent_kb_autoscaler] = opts[:prevent_kb_autoscaler] if opts.has_key?("prevent_kb_autoscaler") || opts.has_key?(:prevent_kb_autoscaler)
       params = @options.merge({body: body.to_json})
       self.class.post("/scrapers", params)
     end
@@ -57,6 +61,10 @@ module Datahen
       body[:max_page_size] = opts[:max_page_size] if opts.has_key?("max_page_size") || opts.has_key?(:max_page_size)
       body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
       body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
+      body[:soft_fetching_try_limit] = opts[:soft_fetching_try_limit] if opts[:soft_fetching_try_limit]
+      body[:soft_refetch_limit] = opts[:soft_refetch_limit] if opts[:soft_refetch_limit]
+      body[:parsing_try_limit] = opts[:parsing_try_limit] if opts[:parsing_try_limit]
+      body[:prevent_kb_autoscaler] = opts[:prevent_kb_autoscaler] if opts.has_key?("prevent_kb_autoscaler") || opts.has_key?(:prevent_kb_autoscaler)
       params = @options.merge({body: body.to_json})
 
       self.class.put("/scrapers/#{scraper_name}", params)
@@ -15,6 +15,10 @@ module Datahen
       body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
       body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
       body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
+      body[:soft_fetching_try_limit] = opts[:soft_fetching_try_limit] if opts[:soft_fetching_try_limit]
+      body[:soft_refetch_limit] = opts[:soft_refetch_limit] if opts[:soft_refetch_limit]
+      body[:parsing_try_limit] = opts[:parsing_try_limit] if opts[:parsing_try_limit]
+      body[:prevent_kb_autoscaler] = opts[:prevent_kb_autoscaler] if opts.has_key?("prevent_kb_autoscaler") || opts.has_key?(:prevent_kb_autoscaler)
       if opts[:vars]
         if opts[:vars].is_a?(Array)
           body[:vars] = opts[:vars]
@@ -45,6 +49,10 @@ module Datahen
       body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
       body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
       body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
+      body[:soft_fetching_try_limit] = opts[:soft_fetching_try_limit] if opts[:soft_fetching_try_limit]
+      body[:soft_refetch_limit] = opts[:soft_refetch_limit] if opts[:soft_refetch_limit]
+      body[:parsing_try_limit] = opts[:parsing_try_limit] if opts[:parsing_try_limit]
+      body[:prevent_kb_autoscaler] = opts[:prevent_kb_autoscaler] if opts.has_key?("prevent_kb_autoscaler") || opts.has_key?(:prevent_kb_autoscaler)
       params = @options.merge({body: body.to_json})
 
       self.class.put("/scrapers/#{scraper_name}/current_job", params)
data/lib/datahen/client/scraper_task.rb ADDED
@@ -0,0 +1,17 @@
+module Datahen
+  module Client
+    class ScraperTask < Datahen::Client::Base
+      def all(scraper_name, opts={})
+        params = @options.merge(opts)
+        self.class.get("/scrapers/#{scraper_name}/current_job/tasks", params)
+      end
+
+      def find(scraper_name, task_id, opts={})
+        params = @options.merge(opts)
+        self.class.get("/scrapers/#{scraper_name}/current_job/tasks/#{task_id}", params)
+      end
+
+    end
+
+  end
+end
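Both task clients expose the same two calls, `all` for listing and `find` for a single task, differing only in whether they address a job by ID or a scraper's current job. A usage sketch; the token setup, IDs, and names are illustrative:

require 'datahen'

# By explicit job ID:
job_tasks = Datahen::Client::JobTask.new
puts job_tasks.all(12345)
puts job_tasks.find(12345, 'task-abc123')

# By scraper's current job:
scraper_tasks = Datahen::Client::ScraperTask.new
puts scraper_tasks.all('my-scraper')
puts scraper_tasks.find('my-scraper', 'task-abc123')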
@@ -24,6 +24,8 @@ require "datahen/client/scraper_var"
 require "datahen/client/job_var"
 require "datahen/client/scraper_job_var"
 require "datahen/client/job_finisher"
+require "datahen/client/job_task"
+require "datahen/client/scraper_task"
 
 module Datahen
   module Client
@@ -227,7 +227,11 @@ module Datahen
 
       # add pages
       count = 0
-      (JSON.parse(response.body) || []).each do |page|
+      json = JSON.parse(response.body)
+      if json['error'] != ""
+        return 0
+      end
+      (json['data'] || []).each do |page|
         count += 1
         next if self.loaded_pages.has_key? page['gid']
         self.pages << (self.loaded_pages[page['gid']] = page)
@@ -307,7 +311,7 @@ module Datahen
           is_waiting = true
           puts "[Worker #{Parallel.worker_number}]: Is waiting for a page..."
           if self.second_dequeue_count > 1 && !self.not_found
-            puts "\nWARNING: Your job is not optimized, increase your job's \"parser_dequeue_scale\"\n"
+            puts "\nWARNING: Your job might not be optimized. Consider increasing your job's \"parser_dequeue_scale\" if the `to_parse` queue is not empty or near empty.\n"
           end
         end
         self.class.wait 1
@@ -172,11 +172,16 @@ module Datahen
       response = client.all(query_job_id, collection, {
         retry_limit: retry_limit
       })
-
       if response.code != 200
         raise "response_code: #{response.code}|#{response.parsed_response}"
       end
-      (response.body != 'null') ? response.parsed_response : []
+
+      # check stream error
+      json_data = response.body != 'null' ? response.parsed_response : {}
+      if json_data['error'] != ""
+        raise "response_code: #{response.code}|Stream error: #{json_data['error']}"
+      end
+      json_data['data'].nil? ? [] : json_data['data']
     end
 
     # Find one output by collection and query with pagination.
@@ -33,7 +33,6 @@ module Datahen
       :failed_content,
       :outputs,
       :pages,
-      :page,
       :save_pages,
       :save_outputs,
       :find_output,
@@ -41,7 +40,8 @@ module Datahen
       :refetch,
       :reparse,
       :limbo,
-      :finish
+      :finish,
+      :still_alive
     ].freeze
   end
 
@@ -240,6 +240,12 @@ module Datahen
       @failed_content ||= get_failed_content(job_id, gid)
     end
 
+    def still_alive page_gid = nil
+      page_gid = gid if page_gid.nil?
+      client = Client::JobPage.new()
+      client.still_alive(job_id, page_gid)
+    end
+
     def handle_error(e)
       error = ["Parsing #{e.class}: #{e.to_s} (Job:#{job_id} GID:#{gid})",clean_backtrace(e.backtrace)].join("\n")
 
@@ -247,7 +253,8 @@ module Datahen
         job_id: job_id,
         gid: gid,
         parsing_status: :failed,
-        log_error: error)
+        log_error: error,
+        parsing_try_limit: (page || {})['parsing_try_limit'])
     end
 
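`still_alive` is also exposed to parser scripts (see the exposed-methods hunk above) and PUTs to `/jobs/:job_id/pages/:gid/still_alive`, which reads as a keep-alive ping for long-running parses; that purpose is inferred from the endpoint name rather than documented here. A sketch of a parser script using it; the data and collection name are illustrative:

# Inside a parser script, where still_alive and save_outputs are exposed
# by the executor.
rows = []  # stand-in for records extracted from the page's content
rows.each_slice(500) do |batch|
  outputs = batch.map { |r| { '_collection' => 'products', 'name' => r['name'] } }
  save_outputs(outputs)
  still_alive  # defaults to the current page's GID; pass a GID to ping another page
end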
@@ -1,3 +1,3 @@
 module Datahen
-  VERSION = "1.4.0"
+  VERSION = "1.5.1"
 end
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: datahen
 version: !ruby/object:Gem::Version
-  version: 1.4.0
+  version: 1.5.1
 platform: ruby
 authors:
 - Parama Danoesubroto
-autorequire:
+autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-11-01 00:00:00.000000000 Z
+date: 2024-01-08 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: thor
@@ -227,6 +227,7 @@ files:
 - lib/datahen/cli/scraper_job.rb
 - lib/datahen/cli/scraper_job_var.rb
 - lib/datahen/cli/scraper_page.rb
+- lib/datahen/cli/scraper_task.rb
 - lib/datahen/cli/scraper_var.rb
 - lib/datahen/cli/seeder.rb
 - lib/datahen/client.rb
@@ -245,6 +246,7 @@ files:
 - lib/datahen/client/job_output.rb
 - lib/datahen/client/job_page.rb
 - lib/datahen/client/job_stat.rb
+- lib/datahen/client/job_task.rb
 - lib/datahen/client/job_var.rb
 - lib/datahen/client/scraper.rb
 - lib/datahen/client/scraper_deployment.rb
@@ -255,6 +257,7 @@ files:
 - lib/datahen/client/scraper_job_output.rb
 - lib/datahen/client/scraper_job_page.rb
 - lib/datahen/client/scraper_job_var.rb
+- lib/datahen/client/scraper_task.rb
 - lib/datahen/client/scraper_var.rb
 - lib/datahen/error.rb
 - lib/datahen/error/custom_retry_error.rb
@@ -278,7 +281,7 @@ metadata:
   allowed_push_host: https://rubygems.org
   homepage_uri: https://datahen.com
   source_code_uri: https://github.com/DataHenOfficial/datahen-ruby
-post_install_message:
+post_install_message:
 rdoc_options: []
 require_paths:
 - lib
@@ -293,8 +296,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
   - !ruby/object:Gem::Version
     version: '0'
 requirements: []
-rubygems_version: 3.1.4
-signing_key:
+rubygems_version: 3.0.3
+signing_key:
 specification_version: 4
 summary: DataHen toolbelt for developers
 test_files: []