datahen 1.3.2 → 1.5.1

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 30a28a5de1830e60d44942ff9605b99557e15804bc9ea3a00939859000cd1169
- data.tar.gz: 6d2c4bff4b795e02def479f6e37f8535045de4ab02e4b1a4fe4f8fd7ca4284ef
+ metadata.gz: 96d2bc30d1c96ce684d83efa54b6dff5966db2a1bba7ab4856b11caba2803086
+ data.tar.gz: 985712d5d7e6559ac64b76669241f56d704c754deb06a164e1f449aad10ef29e
  SHA512:
- metadata.gz: 0642ad5e85396000ad9e493d0f8da0963d29b45b52f845b3555d792e427abdee8e5a80f7a58b57e75929f0b45887209d682e38bfc4087e4f9f0f0133da9259d5
- data.tar.gz: 24f9166b668a755602e63307dbd88c7b8d385b44a697c2b634b161cd1c54bc73b8ef084c59004996df2e9f3770bb63c02bb0fb6846a5326f3d2fe1b0b7537393
+ metadata.gz: d9c6bd3e60034339a8354fe4bda365b91f21b6ec68da8f384d7380abcafa5ccce2c2aacd6cc7a8da37378b8681afe58765bcc461211812c623a8958eac7a5f72
+ data.tar.gz: ac5eb5c8de4e4b0a6d28d96179bab4bf347662247b94e775ed0a25e0f0ef00a542f01f8a1a06525b565e7bd1055d5cd30b480a28d28c7ebf5de893b89b9f5e3a
@@ -20,10 +20,20 @@ module Datahen
  collection = options.fetch(:collection) { 'default' }
  if options[:job]
  client = Client::JobOutput.new(options)
- puts "#{client.all(options[:job], collection)}"
+ json = JSON.parse(client.all(options[:job], collection).body)
+ if json['error'] == ""
+ puts "#{JSON.pretty_generate(json['data'])}"
+ else
+ puts "#{JSON.pretty_generate(json['error'])}"
+ end
  else
  client = Client::ScraperJobOutput.new(options)
- puts "#{client.all(scraper_name, collection)}"
+ json = JSON.parse(client.all(scraper_name, collection).body)
+ if json['error'] == ""
+ puts "#{JSON.pretty_generate(json['data'])}"
+ else
+ puts "#{JSON.pretty_generate(json['error'])}"
+ end
  end
  end
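Note: throughout the 1.5.x CLI, commands stop printing the raw HTTP response and instead unwrap the new response envelope, which carries `error` and `data` keys. The repeated pattern above is equivalent to this small helper (the `print_response` name is hypothetical; the gem inlines the logic at each call site):

    require 'json'

    # Hypothetical helper showing the repeated CLI pattern, assuming the
    # v1.5.x envelope {"error": "", "data": ...} seen in this diff.
    def print_response(response)
      json = JSON.parse(response.body)
      if json['error'] == ""
        puts JSON.pretty_generate(json['data'])   # success: print the payload
      else
        puts JSON.pretty_generate(json['error'])  # failure: print the error
      end
    end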
@@ -38,10 +48,20 @@ module Datahen
  collection = options.fetch(:collection) { 'default' }
  if options[:job]
  client = Client::JobOutput.new(options)
- puts "#{client.find(options[:job], collection, id)}"
+ json = JSON.parse(client.find(options[:job], collection, id).body)
+ if json['error'] == ""
+ puts "#{JSON.pretty_generate(json['data'])}"
+ else
+ puts "#{JSON.pretty_generate(json['error'])}"
+ end
  else
  client = Client::ScraperJobOutput.new(options)
- puts "#{client.find(scraper_name, collection, id)}"
+ json = JSON.parse(client.find(scraper_name, collection, id).body)
+ if json['error'] == ""
+ puts "#{JSON.pretty_generate(json['data'])}"
+ else
+ puts "#{JSON.pretty_generate(json['error'])}"
+ end
  end
  end
 
@@ -56,10 +76,20 @@ module Datahen
 
  if options[:job]
  client = Client::JobOutput.new(options)
- puts "#{client.collections(options[:job])}"
+ json = JSON.parse(client.collections(options[:job]).body)
+ if json['error'] == ""
+ puts "#{JSON.pretty_generate(json['data'])}"
+ else
+ puts "#{JSON.pretty_generate(json['error'])}"
+ end
  else
  client = Client::ScraperJobOutput.new(options)
- puts "#{client.collections(scraper_name)}"
+ json = JSON.parse(client.collections(scraper_name).body)
+ if json['error'] == ""
+ puts "#{JSON.pretty_generate(json['data'])}"
+ else
+ puts "#{JSON.pretty_generate(json['error'])}"
+ end
  end
  end
 
@@ -37,6 +37,10 @@ module Datahen
  option :max_page_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
  option :enable_global_cache, type: :boolean, desc: 'Set true to enable page cache. Default: false'
  option :retry_interval, type: :numeric, desc: 'Set a value to set retry time interval on seconds when refetching a page. Set a value grather than 0 to set it as new time to refetch, 0 means default time. Default: 0'
+ option :soft_fetching_try_limit, type: :numeric, desc: 'Set the soft fetching try limit value.'
+ option :soft_refetch_limit, type: :numeric, desc: 'Set the soft refetch limit value.'
+ option :parsing_try_limit, type: :numeric, desc: 'Set the parsing try limit value.'
+ option :prevent_kb_autoscaler, type: :boolean, desc: 'Set true to prevent the autoscaler from restarting the job. Default: false'
  def create(scraper_name, git_repository)
  # puts "options #{options}"
  client = Client::Scraper.new(options)
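Note: four new job-tuning options are added consistently across `scraper create`, `scraper update`, `scraper start`, and `job update`, and are forwarded to the API body only when set: `soft_fetching_try_limit`, `soft_refetch_limit`, `parsing_try_limit`, and `prevent_kb_autoscaler`. A hedged sketch of setting them through the same client the CLI uses (scraper name and values are illustrative; API credentials are assumed to be configured as usual):

    require 'datahen'

    # Illustrative only: Client::Scraper#update(name, opts) per this diff.
    client = Datahen::Client::Scraper.new({})
    puts client.update('my-scraper',
      soft_fetching_try_limit: 3,   # soft cap on fetch attempts per page
      soft_refetch_limit: 2,        # soft cap on automatic refetches
      parsing_try_limit: 5,         # cap on parse attempts per page
      prevent_kb_autoscaler: true)  # keep the autoscaler from restarting the job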
@@ -66,6 +70,10 @@ module Datahen
  option :max_page_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
  option :enable_global_cache, type: :boolean, desc: 'Set true to enable page cache. Default: false'
  option :retry_interval, type: :numeric, desc: 'Set a value to set retry time interval on seconds when refetching a page. Set a value grather than 0 to set it as new time to refetch, 0 means default time. Default: 0'
+ option :soft_fetching_try_limit, type: :numeric, desc: 'Set the soft fetching try limit value.'
+ option :soft_refetch_limit, type: :numeric, desc: 'Set the soft refetch limit value.'
+ option :parsing_try_limit, type: :numeric, desc: 'Set the parsing try limit value.'
+ option :prevent_kb_autoscaler, type: :boolean, desc: 'Set true to prevent the autoscaler from restarting the job. Default: false'
  def update(scraper_name)
  client = Client::Scraper.new(options)
  puts "#{client.update(scraper_name, options)}"
@@ -106,6 +114,10 @@ module Datahen
  option :vars, type: :string, banner: :JSON, desc: 'Set input vars. Must be in json format. i.e: [{"name":"foo", "value":"bar", "secret":false}] '
  option :max_page_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
  option :retry_interval, type: :numeric, desc: 'Set a value to set retry time interval on seconds when refetching a page. Set a value grather than 0 to set it as new time to refetch, 0 means default time. Default: 0'
+ option :soft_fetching_try_limit, type: :numeric, desc: 'Set the soft fetching try limit value.'
+ option :soft_refetch_limit, type: :numeric, desc: 'Set the soft refetch limit value.'
+ option :parsing_try_limit, type: :numeric, desc: 'Set the parsing try limit value.'
+ option :prevent_kb_autoscaler, type: :boolean, desc: 'Set true to prevent the autoscaler from restarting the job. Default: false'
  def start(scraper_name)
  client = Client::ScraperJob.new(options)
  puts "Starting a scrape job..."
@@ -188,9 +200,19 @@ module Datahen
  def history(scraper_name)
  client = Client::JobStat.new(options)
  if options[:job]
- puts "#{client.job_stats_history(options[:job], options)}"
+ json = JSON.parse(client.job_stats_history(options[:job], options).body)
+ if json['error'] == ""
+ puts "#{JSON.pretty_generate(json['data'])}"
+ else
+ puts "#{JSON.pretty_generate(json['error'])}"
+ end
  else
- puts "#{client.scraper_job_stats_history(scraper_name, options)}"
+ json = JSON.parse(client.scraper_job_stats_history(scraper_name, options).body)
+ if json['error'] == ""
+ puts "#{JSON.pretty_generate(json['data'])}"
+ else
+ puts "#{JSON.pretty_generate(json['error'])}"
+ end
  end
  end
 
@@ -227,6 +249,9 @@ module Datahen
  desc "var SUBCOMMAND ...ARGS", "for managing scraper's variables"
  subcommand "var", ScraperVar
 
+ desc "task SUBCOMMAND ...ARGS", "manage task on a job"
+ subcommand "task", ScraperTask
+
 
  end
  end
@@ -108,6 +108,10 @@ module Datahen
  option :max_page_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
  option :enable_global_cache, type: :boolean, desc: 'Set true to enable page cache. Default: false'
  option :retry_interval, type: :numeric, desc: 'Set a value to set retry time interval on seconds when refetching a page. Set a value grather than 0 to set it as new time to refetch, 0 means default time. Default: 0'
+ option :soft_fetching_try_limit, type: :numeric, desc: 'Set the soft fetching try limit value.'
+ option :soft_refetch_limit, type: :numeric, desc: 'Set the soft refetch limit value.'
+ option :parsing_try_limit, type: :numeric, desc: 'Set the parsing try limit value.'
+ option :prevent_kb_autoscaler, type: :boolean, desc: 'Set true to prevent the autoscaler from restarting the job. Default: false'
  def update(scraper_name)
  if options[:job]
  client = Client::Job.new(options)
@@ -13,6 +13,10 @@ module Datahen
  LONGDESC
  option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
  option :page_type, :aliases => :t, type: :string, desc: 'Filter by page_type'
+ option :url, :aliases => :u, type: :string, desc: 'Filter by url'
+ option :effective_url, :aliases => :U, type: :string, desc: 'Filter by effective_url'
+ option :body, :aliases => :b, type: :string, desc: 'Filter by body'
+ option :parent_gid, :aliases => :G, type: :string, desc: 'Filter by parent_gid'
  option :page, :aliases => :p, type: :numeric, desc: 'Get the next set of records by page.'
  option :per_page, :aliases => :P, type: :numeric, desc: 'Number of records per page. Max 500 per page.'
  option :fetch_fail, type: :boolean, desc: 'Returns only pages that fails fetching.'
@@ -21,53 +25,65 @@ module Datahen
  def list(scraper_name)
  if options[:job]
  client = Client::JobPage.new(options)
- puts "#{client.all(options[:job])}"
+ json = JSON.parse(client.all(options[:job]).body)
+ if json['error'] == ""
+ puts "#{JSON.pretty_generate(json['data'])}"
+ else
+ puts "#{JSON.pretty_generate(json['error'])}"
+ end
  else
  client = Client::ScraperJobPage.new(options)
- puts "#{client.all(scraper_name)}"
+ json = JSON.parse(client.all(scraper_name).body)
+ if json['error'] == ""
+ puts "#{JSON.pretty_generate(json['data'])}"
+ else
+ puts "#{JSON.pretty_generate(json['error'])}"
+ end
  end
  end
 
- desc "add <scraper_name> <url>", "Enqueues a page to a scraper's current job"
+ desc "add <scraper_name> <page_json>", "Enqueues a page to a scraper's current job"
  long_desc <<-LONGDESC
  Enqueues a page to a scraper's current job\x5
  LONGDESC
  option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
- option :method, :aliases => :m, type: :string, desc: 'Set request method. Default: GET'
- option :headers, :aliases => :H, type: :string, banner: :JSON, desc: 'Set request headers. Must be in json format. i.e: {"Foo":"bar"} '
- option :cookie, :aliases => :c, type: :string, desc: 'Set request cookie.'
- option :vars, :aliases => :v, type: :string, banner: :JSON, desc: 'Set user-defined page variables. Must be in json format. i.e: {"Foo":"bar"}'
- option :page_type, :aliases => :t, desc: 'Set page type'
- option :priority, type: :numeric, desc: 'Set fetch priority. The higher the value, the sooner the page gets fetched. Default: 0'
- option :fetch_type, :aliases => :F, desc: 'Set fetch type. Default: http'
- option :body, :aliases => :b, desc: 'Set request body'
- option :force_fetch, :aliases => :f, type: :boolean, desc: 'Set true to force fetch page that is not within freshness criteria. Default: false'
- option :freshness, :aliases => :s, desc: 'Set how fresh the page cache is. Accepts timestap format.'
- option :ua_type, :aliases => :u, desc: 'Set user agent type. Default: desktop'
- option :no_redirect, :aliases => :n, type: :boolean, desc: 'Set true to not follow redirect. Default: false'
- option :max_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
- option :retry_interval, type: :numeric, desc: 'Set a value to set retry time interval on seconds when refetching a page. Set a value grather than 0 to set it as new time to refetch, 0 means default time. Default: 0'
- def add(scraper_name, url)
+ def add(scraper_name, page_json)
  begin
- options[:headers] = JSON.parse(options[:headers]) if options[:headers]
- options[:vars] = JSON.parse(options[:vars]) if options[:vars]
- method = options[:method]
+ page = JSON.parse(page_json)
 
  if options[:job]
  client = Client::JobPage.new(options)
- puts "#{client.enqueue(options[:job], method, url, options)}"
+ puts "#{client.enqueue(options[:job], page, options)}"
  else
  client = Client::ScraperJobPage.new(options)
- puts "#{client.enqueue(scraper_name, method, url, options)}"
+ puts "#{client.enqueue(scraper_name, page, options)}"
  end
 
  rescue JSON::ParserError
- if options[:headers]
- puts "Error: #{options[:headers]} on headers is not a valid JSON"
- end
- if options[:vars]
- puts "Error: #{options[:vars]} on vars is not a valid JSON"
+ puts "Error: Invalid JSON"
+ end
+ end
+
+
+ desc "getgid <scraper_name> <page_json>", "Get the generated GID for a scraper's current job"
+ long_desc <<-LONGDESC
+ Get the generated GID for a scraper's current job.\x5
+ LONGDESC
+ option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
+ def getgid(scraper_name, page_json)
+ begin
+ page = JSON.parse(page_json)
+
+ if options[:job]
+ client = Client::JobPage.new(options)
+ puts "#{client.get_gid(options[:job], page, options)}"
+ else
+ client = Client::ScraperJobPage.new(options)
+ puts "#{client.get_gid(scraper_name, page, options)}"
  end
+
+ rescue JSON::ParserError
+ puts "Error: Invalid JSON"
  end
  end
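Note: `scraper page add` now takes a single JSON page definition instead of a URL plus a dozen request flags, and the new `getgid` command returns the GID a page definition would be assigned without enqueueing it. Assuming the existing `datahen scraper page` namespace, invocations would look like (values illustrative):

    datahen scraper page add my-scraper '{"url":"https://example.com/","method":"GET","page_type":"listings"}'
    datahen scraper page getgid my-scraper '{"url":"https://example.com/","method":"GET"}'

Attributes that previously had dedicated flags (headers, vars, priority, fetch_type, and so on) now travel inside the page JSON itself.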
@@ -82,6 +98,9 @@ module Datahen
  option :vars, :aliases => :v, type: :string, desc: 'Set user-defined page variables. Must be in json format. i.e: {"Foo":"bar"}'
  option :max_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
  option :retry_interval, type: :numeric, desc: 'Set a value to set retry time interval on seconds when refetching a page. Set a value grather than 0 to set it as new time to refetch, 0 means default time. Default: 0'
+ option :soft_fetching_try_limit, type: :numeric, desc: 'Set the soft fetching try limit value.'
+ option :soft_refetch_limit, type: :numeric, desc: 'Set the soft refetch limit value.'
+ option :parsing_try_limit, type: :numeric, desc: 'Set the parsing try limit value.'
  def update(scraper_name, gid)
  begin
  options[:vars] = JSON.parse(options[:vars]) if options[:vars]
@@ -0,0 +1,48 @@
+ module Datahen
+ class CLI < Thor
+ class ScraperTask < Thor
+ package_name "scraper task"
+ def self.banner(command, namespace = nil, subcommand = false)
+ "#{basename} #{@package_name} #{command.usage}"
+ end
+
+ desc "list <scraper_name>", "List Tasks on a scraper's current job"
+ long_desc <<-LONGDESC
+ List all tasks in a scraper's current job or given job ID.\x5
+ LONGDESC
+ option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
+ option :page, :aliases => :p, type: :numeric, desc: 'Get the next set of records by page.'
+ option :per_page, :aliases => :P, type: :numeric, desc: 'Number of records per page. Max 500 per page.'
+ option :status, type: :array, desc: 'Returns only tasks with specific status.'
+ option :action, type: :array, desc: 'Returns only tasks with specific action.'
+ option :"include-system", type: :boolean, desc: 'If it is true, will returns all actions. If it is false only tasks with specific action ["refetch", "reparse", "terminate"].'
+ def list(scraper_name)
+ if options[:job]
+ client = Client::JobTask.new(options)
+ puts "#{client.all(options[:job])}"
+ else
+ client = Client::ScraperTask.new(options)
+ puts "#{client.all(scraper_name)}"
+ end
+ end
+
+
+ desc "show <scraper_name> <task_id>", "Show task in scraper's current job"
+ long_desc <<-LONGDESC
+ Shows a task in a scraper's current job or given job ID.\x5
+ LONGDESC
+ option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
+ def show(scraper_name, task_id)
+ if options[:job]
+ client = Client::JobTask.new(options)
+ puts "#{client.find(options[:job], task_id)}"
+ else
+ client = Client::ScraperTask.new(options)
+ puts "#{client.find(scraper_name, task_id)}"
+ end
+ end
+
+ end
+ end
+
+ end
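Note: this new file adds a `scraper task` Thor subcommand for listing and inspecting job tasks. Assuming the gem's `datahen` executable and the `task` subcommand registered under `scraper` above, usage would be (`<task_id>` stands in for a real ID):

    datahen scraper task list my-scraper --job 12345
    datahen scraper task show my-scraper <task_id>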
data/lib/datahen/cli.rb CHANGED
@@ -11,6 +11,7 @@ require 'datahen/cli/scraper_page'
  require 'datahen/cli/job_output'
  require 'datahen/cli/job'
  require 'datahen/cli/scraper_deployment'
+ require 'datahen/cli/scraper_task'
  require 'datahen/cli/scraper'
  require 'datahen/cli/parser'
  require 'datahen/cli/seeder'
@@ -56,12 +56,18 @@ module Datahen
  target.merge(source.select{|k,v|target.has_key?(k)})
  end
 
- def retry times, delay = nil, err_msg = nil
+ def retry times, delay = nil, err_msg = nil, stream = false
  limit = times.nil? ? nil : times.to_i
  delay = delay.nil? ? 5 : delay.to_i
  count = 0
  begin
- yield
+ val = yield
+ if stream
+ return if val.nil?
+ if val['error'] != ""
+ raise StandardError.new(val['error'])
+ end
+ end
  rescue Error::CustomRetryError, StandardError => e
  is_custom_retry = e.is_a? Error::CustomRetryError
  real_delay = is_custom_retry ? e.delay : delay
@@ -81,6 +87,7 @@ module Datahen
  puts "#{err_msg.nil? ? '' : "#{err_msg} "}Retry \##{count}#{should_aprox ? '+' : ''}..."
  retry
  end
+ val
  end
 
  def initialize(opts={})
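Note: `Base#retry` gains a `stream` flag and now returns the block's value. When streaming, a non-empty `error` field in the yielded envelope is raised as a `StandardError`, so envelope errors go through the same retry path as exceptions. A standalone sketch of the new control flow (not the gem's exact code; error classes and messages are simplified):

    # `times` nil means retry forever, mirroring the gem's limit handling.
    def retry_with_stream(times, delay = 5, stream = false)
      count = 0
      begin
        val = yield
        if stream
          return if val.nil?
          # treat a non-empty envelope error as a failure worth retrying
          raise StandardError, val['error'] if val['error'] != ""
        end
        val
      rescue StandardError => e
        raise if !times.nil? && count >= times.to_i
        count += 1
        sleep delay
        retry
      end
    end

    # Usage: a stream envelope with an empty error passes straight through.
    retry_with_stream(2, 1, true) { { 'error' => '', 'data' => [1, 2, 3] } }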
@@ -105,6 +112,10 @@ module Datahen
  query[:parsefail] = opts[:parse_fail] if opts[:parse_fail]
  query[:status] = opts[:status] if opts[:status]
  query[:page_type] = opts[:page_type] if opts[:page_type]
+ query[:url] = opts[:url] if opts[:url]
+ query[:effective_url] = opts[:effective_url] if opts[:effective_url]
+ query[:body] = opts[:body] if opts[:body]
+ query[:parent_gid] = opts[:parent_gid] if opts[:parent_gid]
  query[:gid] = opts[:gid] if opts[:gid]
  query[:"min-timestamp"] = opts[:"min-timestamp"] if opts[:"min-timestamp"]
  query[:"max-timestamp"] = opts[:"max-timestamp"] if opts[:"max-timestamp"]
@@ -112,6 +123,8 @@ module Datahen
  query[:order] = opts[:order] if opts[:order]
  query[:filter] = opts[:filter] if opts[:filter]
  query[:force] = opts[:force] if opts[:force]
+ query[:action] = opts[:action] if opts[:action]
+ query[:"include-system"] = opts[:"include-system"] if opts[:"include-system"]
 
  if opts[:query]
  if opts[:query].is_a?(Hash)
@@ -25,6 +25,10 @@ module Datahen
  body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
  body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
  body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
+ body[:soft_fetching_try_limit] = opts[:soft_fetching_try_limit] if opts[:soft_fetching_try_limit]
+ body[:soft_refetch_limit] = opts[:soft_refetch_limit] if opts[:soft_refetch_limit]
+ body[:parsing_try_limit] = opts[:parsing_try_limit] if opts[:parsing_try_limit]
+ body[:prevent_kb_autoscaler] = opts[:prevent_kb_autoscaler] if opts.has_key?("prevent_kb_autoscaler") || opts.has_key?(:prevent_kb_autoscaler)
  params = @options.merge({body: body.to_json})
 
  self.class.put("/jobs/#{job_id}", params)
@@ -97,7 +101,7 @@ module Datahen
  def sync_schema(job_id, opts={})
  params = @options.merge(opts)
 
- self.class.put("/sync/jobs/#{job_id}/schema", params)
+ self.class.put("/jobs/#{job_id}/sync/schema", params)
  end
 
  end
@@ -7,7 +7,7 @@ module Datahen
 
  def all(job_id, collection = 'default', opts = {})
  limit = opts.has_key?(:retry_limit) ? opts.fetch(:retry_limit) : 0
- self.retry(limit, 10, "Error while updating the seeder.") do
+ self.retry(limit, 10, "Error while updating the seeder.", true) do
  self.class.get("/jobs/#{job_id}/output/collections/#{collection}/records", @options)
  end
  end
@@ -18,34 +18,27 @@ module Datahen
  body[:max_size] = opts[:max_size] if opts[:max_size]
  body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
  body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
+ body[:soft_fetching_try_limit] = opts[:soft_fetching_try_limit] if opts[:soft_fetching_try_limit]
+ body[:soft_refetch_limit] = opts[:soft_refetch_limit] if opts[:soft_refetch_limit]
+ body[:parsing_try_limit] = opts[:parsing_try_limit] if opts[:parsing_try_limit]
 
  params = @options.merge({body: body.to_json})
 
  self.class.put("/jobs/#{job_id}/pages/#{gid}", params)
  end
 
- def enqueue(job_id, method, url, opts={})
- body = {}
- body[:method] = method != "" ? method : "GET"
- body[:url] = url
- body[:page_type] = opts[:page_type] if opts[:page_type]
- body[:priority] = opts[:priority] if opts[:priority]
- body[:fetch_type] = opts[:fetch_type] if opts[:fetch_type]
- body[:body] = opts[:body] if opts[:body]
- body[:headers] = opts[:headers] if opts[:headers]
- body[:vars] = opts[:vars] if opts[:vars]
- body[:force_fetch] = opts[:force_fetch] if opts[:force_fetch]
- body[:freshness] = opts[:freshness] if opts[:freshness]
- body[:ua_type] = opts[:ua_type] if opts[:ua_type]
- body[:no_redirect] = opts[:no_redirect] if opts[:no_redirect]
- body[:cookie] = opts[:cookie] if opts[:cookie]
- body[:max_size] = opts[:max_size] if opts[:max_size]
- body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
- body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
-
- params = @options.merge({body: body.to_json})
+ def enqueue(job_id, page, opts={})
+ params = @options.merge(opts).merge({body: page.to_json})
 
  self.class.post("/jobs/#{job_id}/pages", params)
+
+ end
+
+ def get_gid(job_id, page, opts={})
+
+ params = @options.merge(opts).merge({body: page.to_json})
+
+ self.class.post("/jobs/#{job_id}/generate_gid", params)
  end
 
  def dequeue(job_id, limit, page_types, parse_fetching_failed, opts = {})
@@ -65,6 +58,7 @@ module Datahen
  body[:parsing_status] = opts.fetch(:parsing_status){ nil }
  body[:log_error] = opts[:log_error] if opts[:log_error]
  body[:keep_outputs] = !!opts[:keep_outputs] if opts.has_key?(:keep_outputs)
+ body[:parsing_try_limit] = opts[:parsing_try_limit] if opts.fetch(:parsing_try_limit){ nil }
 
  params = @options.merge({body: body.to_json})
 
@@ -100,6 +94,11 @@ module Datahen
  params = @options.merge(opts)
  self.class.put("/jobs/#{job_id}/pages/limbo", params)
  end
+
+ def still_alive(job_id, gid, opts={})
+ params = @options.merge(opts)
+ self.class.put("/jobs/#{job_id}/pages/#{gid}/still_alive", params)
+ end
  end
  end
  end
@@ -0,0 +1,17 @@
+ module Datahen
+ module Client
+ class JobTask < Datahen::Client::Base
+ def all(job_id, opts={})
+ params = @options.merge(opts)
+ self.class.get("/jobs/#{job_id}/tasks", params)
+ end
+
+ def find(job_id, task_id, opts={})
+ params = @options.merge(opts)
+ self.class.get("/jobs/#{job_id}/tasks/#{task_id}", params)
+ end
+
+ end
+
+ end
+ end
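Note: the new task endpoints are thin wrappers over `GET /jobs/:job_id/tasks`. A hedged usage sketch, passing filters through the client constructor the way the CLI does (job ID and filter values are illustrative; credentials assumed configured):

    require 'datahen'

    # Illustrative: list refetch tasks for a job, then fetch one by ID.
    client = Datahen::Client::JobTask.new({ action: ['refetch'], :"include-system" => true })
    puts client.all(12345)
    puts client.find(12345, 'some-task-id')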
@@ -32,6 +32,10 @@ module Datahen
  body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
  body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
  body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
+ body[:soft_fetching_try_limit] = opts[:soft_fetching_try_limit] if opts[:soft_fetching_try_limit]
+ body[:soft_refetch_limit] = opts[:soft_refetch_limit] if opts[:soft_refetch_limit]
+ body[:parsing_try_limit] = opts[:parsing_try_limit] if opts[:parsing_try_limit]
+ body[:prevent_kb_autoscaler] = opts[:prevent_kb_autoscaler] if opts.has_key?("prevent_kb_autoscaler") || opts.has_key?(:prevent_kb_autoscaler)
  params = @options.merge({body: body.to_json})
  self.class.post("/scrapers", params)
  end
@@ -57,6 +61,10 @@ module Datahen
  body[:max_page_size] = opts[:max_page_size] if opts.has_key?("max_page_size") || opts.has_key?(:max_page_size)
  body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
  body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
+ body[:soft_fetching_try_limit] = opts[:soft_fetching_try_limit] if opts[:soft_fetching_try_limit]
+ body[:soft_refetch_limit] = opts[:soft_refetch_limit] if opts[:soft_refetch_limit]
+ body[:parsing_try_limit] = opts[:parsing_try_limit] if opts[:parsing_try_limit]
+ body[:prevent_kb_autoscaler] = opts[:prevent_kb_autoscaler] if opts.has_key?("prevent_kb_autoscaler") || opts.has_key?(:prevent_kb_autoscaler)
  params = @options.merge({body: body.to_json})
 
  self.class.put("/scrapers/#{scraper_name}", params)
@@ -15,6 +15,10 @@ module Datahen
  body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
  body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
  body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
+ body[:soft_fetching_try_limit] = opts[:soft_fetching_try_limit] if opts[:soft_fetching_try_limit]
+ body[:soft_refetch_limit] = opts[:soft_refetch_limit] if opts[:soft_refetch_limit]
+ body[:parsing_try_limit] = opts[:parsing_try_limit] if opts[:parsing_try_limit]
+ body[:prevent_kb_autoscaler] = opts[:prevent_kb_autoscaler] if opts.has_key?("prevent_kb_autoscaler") || opts.has_key?(:prevent_kb_autoscaler)
  if opts[:vars]
  if opts[:vars].is_a?(Array)
  body[:vars] = opts[:vars]
@@ -45,6 +49,10 @@ module Datahen
  body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
  body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
  body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
+ body[:soft_fetching_try_limit] = opts[:soft_fetching_try_limit] if opts[:soft_fetching_try_limit]
+ body[:soft_refetch_limit] = opts[:soft_refetch_limit] if opts[:soft_refetch_limit]
+ body[:parsing_try_limit] = opts[:parsing_try_limit] if opts[:parsing_try_limit]
+ body[:prevent_kb_autoscaler] = opts[:prevent_kb_autoscaler] if opts.has_key?("prevent_kb_autoscaler") || opts.has_key?(:prevent_kb_autoscaler)
  params = @options.merge({body: body.to_json})
 
  self.class.put("/scrapers/#{scraper_name}/current_job", params)
@@ -47,30 +47,19 @@ module Datahen
  self.class.put("/scrapers/#{scraper_name}/current_job/pages/limbo", params)
  end
 
- def enqueue(scraper_name, method, url, opts={})
- body = {}
- body[:method] = method != "" ? method : "GET"
- body[:url] = url
- body[:page_type] = opts[:page_type] if opts[:page_type]
- body[:priority] = opts[:priority] if opts[:priority]
- body[:fetch_type] = opts[:fetch_type] if opts[:fetch_type]
- body[:body] = opts[:body] if opts[:body]
- body[:headers] = opts[:headers] if opts[:headers]
- body[:vars] = opts[:vars] if opts[:vars]
- body[:force_fetch] = opts[:force_fetch] if opts[:force_fetch]
- body[:freshness] = opts[:freshness] if opts[:freshness]
- body[:ua_type] = opts[:ua_type] if opts[:ua_type]
- body[:no_redirect] = opts[:no_redirect] if opts[:no_redirect]
- body[:cookie] = opts[:cookie] if opts[:cookie]
- body[:max_size] = opts[:max_size] if opts[:max_size]
- body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
- body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
-
- params = @options.merge({body: body.to_json})
+ def enqueue(scraper_name, page, opts={})
+ params = @options.merge(opts).merge({body: page.to_json})
 
  self.class.post("/scrapers/#{scraper_name}/current_job/pages", params)
  end
 
+ def get_gid(scraper_name, page, opts={})
+
+ params = @options.merge(opts).merge({body: page.to_json})
+
+ self.class.post("/scrapers/#{scraper_name}/current_job/generate_gid", params)
+ end
+
  def find_content(scraper_name, gid)
  self.class.get("/scrapers/#{scraper_name}/current_job/pages/#{gid}/content", @options)
  end
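Note: `get_gid` POSTs the same page JSON to the new `generate_gid` endpoint, so callers can learn the GID a page definition would receive without actually enqueueing it. A hedged sketch (page fields illustrative; credentials assumed configured):

    require 'datahen'

    page = { 'url' => 'https://example.com/', 'method' => 'GET' }
    client = Datahen::Client::ScraperJobPage.new({})
    puts client.get_gid('my-scraper', page)   # same body as enqueue, no side effects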
@@ -0,0 +1,17 @@
+ module Datahen
+ module Client
+ class ScraperTask < Datahen::Client::Base
+ def all(scraper_name, opts={})
+ params = @options.merge(opts)
+ self.class.get("/scrapers/#{scraper_name}/current_job/tasks", params)
+ end
+
+ def find(scraper_name, task_id, opts={})
+ params = @options.merge(opts)
+ self.class.get("/scrapers/#{scraper_name}/current_job/tasks/#{task_id}", params)
+ end
+
+ end
+
+ end
+ end
@@ -24,6 +24,8 @@ require "datahen/client/scraper_var"
  require "datahen/client/job_var"
  require "datahen/client/scraper_job_var"
  require "datahen/client/job_finisher"
+ require "datahen/client/job_task"
+ require "datahen/client/scraper_task"
 
  module Datahen
  module Client
@@ -227,7 +227,11 @@ module Datahen
 
  # add pages
  count = 0
- (JSON.parse(response.body) || []).each do |page|
+ json = JSON.parse(response.body)
+ if json['error'] != ""
+ return 0
+ end
+ (json['data'] || []).each do |page|
  count += 1
  next if self.loaded_pages.has_key? page['gid']
  self.pages << (self.loaded_pages[page['gid']] = page)
@@ -307,7 +311,7 @@ module Datahen
  is_waiting = true
  puts "[Worker #{Parallel.worker_number}]: Is waiting for a page..."
  if self.second_dequeue_count > 1 && !self.not_found
- puts "\nWARNING: Your job is not optimized, increase your job's \"parser_dequeue_scale\"\n"
+ puts "\nWARNING: Your job might not be optimized. Consider increasing your job's \"parser_dequeue_scale\" if the `to_parse` queue is not empty or near empty \n"
  end
  end
  self.class.wait 1
@@ -172,11 +172,16 @@ module Datahen
  response = client.all(query_job_id, collection, {
  retry_limit: retry_limit
  })
-
  if response.code != 200
  raise "response_code: #{response.code}|#{response.parsed_response}"
  end
- (response.body != 'null') ? response.parsed_response : []
+
+ # check stream error
+ json_data = response.body != 'null' ? response.parsed_response : {}
+ if json_data['error'] != ""
+ raise "response_code: #{response.code}|Stream error: #{json_data['error']}"
+ end
+ json_data['data'].nil? ? [] : json_data['data']
  end
 
  # Find one output by collection and query with pagination.
@@ -33,7 +33,6 @@ module Datahen
  :failed_content,
  :outputs,
  :pages,
- :page,
  :save_pages,
  :save_outputs,
  :find_output,
@@ -41,7 +40,8 @@ module Datahen
  :refetch,
  :reparse,
  :limbo,
- :finish
+ :finish,
+ :still_alive
  ].freeze
  end
 
@@ -240,6 +240,12 @@ module Datahen
  @failed_content ||= get_failed_content(job_id, gid)
  end
 
+ def still_alive page_gid = nil
+ page_gid = gid if page_gid.nil?
+ client = Client::JobPage.new()
+ client.still_alive(job_id, page_gid)
+ end
+
  def handle_error(e)
  error = ["Parsing #{e.class}: #{e.to_s} (Job:#{job_id} GID:#{gid})",clean_backtrace(e.backtrace)].join("\n")
 
@@ -247,7 +253,8 @@ module Datahen
  job_id: job_id,
  gid: gid,
  parsing_status: :failed,
- log_error: error)
+ log_error: error,
+ parsing_try_limit: (page || {})['parsing_try_limit'])
  end
 
  end
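Note: executors now expose `still_alive` (added above), which PUTs to `/jobs/:job_id/pages/:gid/still_alive`; the name suggests a heartbeat that keeps a slow page from being timed out, though the diff only shows the wiring. Inside a parser script it might be used like this (hedged sketch; `huge_list` is hypothetical, while `outputs` is among the methods exposed to scripts per the list above):

    # Heartbeat while parsing a very large page; still_alive defaults
    # to the current page's GID when called without an argument.
    huge_list.each_slice(1000) do |batch|
      still_alive
      batch.each { |item| outputs << { '_collection' => 'items', 'value' => item } }
    end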
@@ -1,3 +1,3 @@
  module Datahen
- VERSION = "1.3.2"
+ VERSION = "1.5.1"
  end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: datahen
  version: !ruby/object:Gem::Version
- version: 1.3.2
+ version: 1.5.1
  platform: ruby
  authors:
  - Parama Danoesubroto
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2023-06-08 00:00:00.000000000 Z
+ date: 2024-01-08 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: thor
@@ -227,6 +227,7 @@ files:
  - lib/datahen/cli/scraper_job.rb
  - lib/datahen/cli/scraper_job_var.rb
  - lib/datahen/cli/scraper_page.rb
+ - lib/datahen/cli/scraper_task.rb
  - lib/datahen/cli/scraper_var.rb
  - lib/datahen/cli/seeder.rb
  - lib/datahen/client.rb
@@ -245,6 +246,7 @@ files:
  - lib/datahen/client/job_output.rb
  - lib/datahen/client/job_page.rb
  - lib/datahen/client/job_stat.rb
+ - lib/datahen/client/job_task.rb
  - lib/datahen/client/job_var.rb
  - lib/datahen/client/scraper.rb
  - lib/datahen/client/scraper_deployment.rb
@@ -255,6 +257,7 @@ files:
  - lib/datahen/client/scraper_job_output.rb
  - lib/datahen/client/scraper_job_page.rb
  - lib/datahen/client/scraper_job_var.rb
+ - lib/datahen/client/scraper_task.rb
  - lib/datahen/client/scraper_var.rb
  - lib/datahen/error.rb
  - lib/datahen/error/custom_retry_error.rb