datahen 1.3.2 → 1.5.1
- checksums.yaml +4 -4
- data/lib/datahen/cli/job_output.rb +36 -6
- data/lib/datahen/cli/scraper.rb +27 -2
- data/lib/datahen/cli/scraper_job.rb +4 -0
- data/lib/datahen/cli/scraper_page.rb +47 -28
- data/lib/datahen/cli/scraper_task.rb +48 -0
- data/lib/datahen/cli.rb +1 -0
- data/lib/datahen/client/base.rb +15 -2
- data/lib/datahen/client/job.rb +5 -1
- data/lib/datahen/client/job_output.rb +1 -1
- data/lib/datahen/client/job_page.rb +19 -20
- data/lib/datahen/client/job_task.rb +17 -0
- data/lib/datahen/client/scraper.rb +8 -0
- data/lib/datahen/client/scraper_job.rb +8 -0
- data/lib/datahen/client/scraper_job_page.rb +9 -20
- data/lib/datahen/client/scraper_task.rb +17 -0
- data/lib/datahen/client.rb +2 -0
- data/lib/datahen/scraper/batch_parser.rb +6 -2
- data/lib/datahen/scraper/executor.rb +7 -2
- data/lib/datahen/scraper/ruby_parser_executor.rb +10 -3
- data/lib/datahen/version.rb +1 -1
- metadata +5 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 96d2bc30d1c96ce684d83efa54b6dff5966db2a1bba7ab4856b11caba2803086
+  data.tar.gz: 985712d5d7e6559ac64b76669241f56d704c754deb06a164e1f449aad10ef29e
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: d9c6bd3e60034339a8354fe4bda365b91f21b6ec68da8f384d7380abcafa5ccce2c2aacd6cc7a8da37378b8681afe58765bcc461211812c623a8958eac7a5f72
+  data.tar.gz: ac5eb5c8de4e4b0a6d28d96179bab4bf347662247b94e775ed0a25e0f0ef00a542f01f8a1a06525b565e7bd1055d5cd30b480a28d28c7ebf5de893b89b9f5e3a
data/lib/datahen/cli/job_output.rb
CHANGED
@@ -20,10 +20,20 @@ module Datahen
       collection = options.fetch(:collection) { 'default' }
       if options[:job]
         client = Client::JobOutput.new(options)
-
+        json = JSON.parse(client.all(options[:job], collection).body)
+        if json['error'] == ""
+          puts "#{JSON.pretty_generate(json['data'])}"
+        else
+          puts "#{JSON.pretty_generate(json['error'])}"
+        end
       else
         client = Client::ScraperJobOutput.new(options)
-
+        json = JSON.parse(client.all(scraper_name, collection).body)
+        if json['error'] == ""
+          puts "#{JSON.pretty_generate(json['data'])}"
+        else
+          puts "#{JSON.pretty_generate(json['error'])}"
+        end
       end
     end

@@ -38,10 +48,20 @@ module Datahen
       collection = options.fetch(:collection) { 'default' }
       if options[:job]
         client = Client::JobOutput.new(options)
-
+        json = JSON.parse(client.find(options[:job], collection, id).body)
+        if json['error'] == ""
+          puts "#{JSON.pretty_generate(json['data'])}"
+        else
+          puts "#{JSON.pretty_generate(json['error'])}"
+        end
       else
         client = Client::ScraperJobOutput.new(options)
-
+        json = JSON.parse(client.find(scraper_name, collection, id).body)
+        if json['error'] == ""
+          puts "#{JSON.pretty_generate(json['data'])}"
+        else
+          puts "#{JSON.pretty_generate(json['error'])}"
+        end
       end
     end

@@ -56,10 +76,20 @@ module Datahen

       if options[:job]
         client = Client::JobOutput.new(options)
-
+        json = JSON.parse(client.collections(options[:job]).body)
+        if json['error'] == ""
+          puts "#{JSON.pretty_generate(json['data'])}"
+        else
+          puts "#{JSON.pretty_generate(json['error'])}"
+        end
       else
         client = Client::ScraperJobOutput.new(options)
-
+        json = JSON.parse(client.collections(scraper_name).body)
+        if json['error'] == ""
+          puts "#{JSON.pretty_generate(json['data'])}"
+        else
+          puts "#{JSON.pretty_generate(json['error'])}"
+        end
       end
     end
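The change above is the pattern for most of this release: list, find, and collection responses now arrive as a JSON envelope with "data" and "error" keys, and the CLI prints whichever is populated. A minimal sketch of consuming the same envelope through the client classes (the scraper name is illustrative, and an API token is assumed to be configured):

    require 'datahen'
    require 'json'

    client = Datahen::Client::ScraperJobOutput.new
    json = JSON.parse(client.all('my-scraper', 'default').body)
    if json['error'] == ""
      puts JSON.pretty_generate(json['data'])   # records in the collection
    else
      puts JSON.pretty_generate(json['error'])  # stream error from the API
    end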
data/lib/datahen/cli/scraper.rb
CHANGED
@@ -37,6 +37,10 @@ module Datahen
     option :max_page_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
     option :enable_global_cache, type: :boolean, desc: 'Set true to enable page cache. Default: false'
     option :retry_interval, type: :numeric, desc: 'Set a value to set retry time interval on seconds when refetching a page. Set a value grather than 0 to set it as new time to refetch, 0 means default time. Default: 0'
+    option :soft_fetching_try_limit, type: :numeric, desc: 'Set the soft fetching try limit value.'
+    option :soft_refetch_limit, type: :numeric, desc: 'Set the soft refetch limit value.'
+    option :parsing_try_limit, type: :numeric, desc: 'Set the parsing try limit value.'
+    option :prevent_kb_autoscaler, type: :boolean, desc: 'Set true to prevent the autoscaler from restarting the job. Default: false'
     def create(scraper_name, git_repository)
       # puts "options #{options}"
       client = Client::Scraper.new(options)

@@ -66,6 +70,10 @@ module Datahen
     option :max_page_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
     option :enable_global_cache, type: :boolean, desc: 'Set true to enable page cache. Default: false'
     option :retry_interval, type: :numeric, desc: 'Set a value to set retry time interval on seconds when refetching a page. Set a value grather than 0 to set it as new time to refetch, 0 means default time. Default: 0'
+    option :soft_fetching_try_limit, type: :numeric, desc: 'Set the soft fetching try limit value.'
+    option :soft_refetch_limit, type: :numeric, desc: 'Set the soft refetch limit value.'
+    option :parsing_try_limit, type: :numeric, desc: 'Set the parsing try limit value.'
+    option :prevent_kb_autoscaler, type: :boolean, desc: 'Set true to prevent the autoscaler from restarting the job. Default: false'
     def update(scraper_name)
       client = Client::Scraper.new(options)
       puts "#{client.update(scraper_name, options)}"

@@ -106,6 +114,10 @@ module Datahen
     option :vars, type: :string, banner: :JSON, desc: 'Set input vars. Must be in json format. i.e: [{"name":"foo", "value":"bar", "secret":false}] '
     option :max_page_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
     option :retry_interval, type: :numeric, desc: 'Set a value to set retry time interval on seconds when refetching a page. Set a value grather than 0 to set it as new time to refetch, 0 means default time. Default: 0'
+    option :soft_fetching_try_limit, type: :numeric, desc: 'Set the soft fetching try limit value.'
+    option :soft_refetch_limit, type: :numeric, desc: 'Set the soft refetch limit value.'
+    option :parsing_try_limit, type: :numeric, desc: 'Set the parsing try limit value.'
+    option :prevent_kb_autoscaler, type: :boolean, desc: 'Set true to prevent the autoscaler from restarting the job. Default: false'
     def start(scraper_name)
       client = Client::ScraperJob.new(options)
       puts "Starting a scrape job..."

@@ -188,9 +200,19 @@ module Datahen
     def history(scraper_name)
       client = Client::JobStat.new(options)
       if options[:job]
-
+        json = JSON.parse(client.job_stats_history(options[:job], options).body)
+        if json['error'] == ""
+          puts "#{JSON.pretty_generate(json['data'])}"
+        else
+          puts "#{JSON.pretty_generate(json['error'])}"
+        end
       else
-
+        json = JSON.parse(client.scraper_job_stats_history(scraper_name, options).body)
+        if json['error'] == ""
+          puts "#{JSON.pretty_generate(json['data'])}"
+        else
+          puts "#{JSON.pretty_generate(json['error'])}"
+        end
       end
     end

@@ -227,6 +249,9 @@ module Datahen
     desc "var SUBCOMMAND ...ARGS", "for managing scraper's variables"
     subcommand "var", ScraperVar

+    desc "task SUBCOMMAND ...ARGS", "manage task on a job"
+    subcommand "task", ScraperTask
+

   end
 end
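The four new settings (soft_fetching_try_limit, soft_refetch_limit, parsing_try_limit, prevent_kb_autoscaler) flow straight into the job body, as the client diffs further down show. A hedged sketch of applying them to a scraper's current job through Client::ScraperJob#update (scraper name and values are illustrative):

    require 'datahen'

    client = Datahen::Client::ScraperJob.new
    # Each key is only added to the request body when present.
    puts client.update('my-scraper',
      soft_fetching_try_limit: 5,
      soft_refetch_limit: 3,
      parsing_try_limit: 5,
      prevent_kb_autoscaler: true)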
data/lib/datahen/cli/scraper_job.rb
CHANGED
@@ -108,6 +108,10 @@ module Datahen
     option :max_page_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
     option :enable_global_cache, type: :boolean, desc: 'Set true to enable page cache. Default: false'
     option :retry_interval, type: :numeric, desc: 'Set a value to set retry time interval on seconds when refetching a page. Set a value grather than 0 to set it as new time to refetch, 0 means default time. Default: 0'
+    option :soft_fetching_try_limit, type: :numeric, desc: 'Set the soft fetching try limit value.'
+    option :soft_refetch_limit, type: :numeric, desc: 'Set the soft refetch limit value.'
+    option :parsing_try_limit, type: :numeric, desc: 'Set the parsing try limit value.'
+    option :prevent_kb_autoscaler, type: :boolean, desc: 'Set true to prevent the autoscaler from restarting the job. Default: false'
     def update(scraper_name)
       if options[:job]
         client = Client::Job.new(options)
data/lib/datahen/cli/scraper_page.rb
CHANGED
@@ -13,6 +13,10 @@ module Datahen
     LONGDESC
     option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
     option :page_type, :aliases => :t, type: :string, desc: 'Filter by page_type'
+    option :url, :aliases => :u, type: :string, desc: 'Filter by url'
+    option :effective_url, :aliases => :U, type: :string, desc: 'Filter by effective_url'
+    option :body, :aliases => :b, type: :string, desc: 'Filter by body'
+    option :parent_gid, :aliases => :G, type: :string, desc: 'Filter by parent_gid'
     option :page, :aliases => :p, type: :numeric, desc: 'Get the next set of records by page.'
     option :per_page, :aliases => :P, type: :numeric, desc: 'Number of records per page. Max 500 per page.'
     option :fetch_fail, type: :boolean, desc: 'Returns only pages that fails fetching.'

@@ -21,53 +25,65 @@ module Datahen
     def list(scraper_name)
       if options[:job]
         client = Client::JobPage.new(options)
-
+        json = JSON.parse(client.all(options[:job]).body)
+        if json['error'] == ""
+          puts "#{JSON.pretty_generate(json['data'])}"
+        else
+          puts "#{JSON.pretty_generate(json['error'])}"
+        end
       else
         client = Client::ScraperJobPage.new(options)
-
+        json = JSON.parse(client.all(scraper_name).body)
+        if json['error'] == ""
+          puts "#{JSON.pretty_generate(json['data'])}"
+        else
+          puts "#{JSON.pretty_generate(json['error'])}"
+        end
       end
     end

-    desc "add <scraper_name> <
+    desc "add <scraper_name> <page_json>", "Enqueues a page to a scraper's current job"
     long_desc <<-LONGDESC
       Enqueues a page to a scraper's current job\x5
     LONGDESC
     option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
-
-    option :headers, :aliases => :H, type: :string, banner: :JSON, desc: 'Set request headers. Must be in json format. i.e: {"Foo":"bar"} '
-    option :cookie, :aliases => :c, type: :string, desc: 'Set request cookie.'
-    option :vars, :aliases => :v, type: :string, banner: :JSON, desc: 'Set user-defined page variables. Must be in json format. i.e: {"Foo":"bar"}'
-    option :page_type, :aliases => :t, desc: 'Set page type'
-    option :priority, type: :numeric, desc: 'Set fetch priority. The higher the value, the sooner the page gets fetched. Default: 0'
-    option :fetch_type, :aliases => :F, desc: 'Set fetch type. Default: http'
-    option :body, :aliases => :b, desc: 'Set request body'
-    option :force_fetch, :aliases => :f, type: :boolean, desc: 'Set true to force fetch page that is not within freshness criteria. Default: false'
-    option :freshness, :aliases => :s, desc: 'Set how fresh the page cache is. Accepts timestap format.'
-    option :ua_type, :aliases => :u, desc: 'Set user agent type. Default: desktop'
-    option :no_redirect, :aliases => :n, type: :boolean, desc: 'Set true to not follow redirect. Default: false'
-    option :max_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
-    option :retry_interval, type: :numeric, desc: 'Set a value to set retry time interval on seconds when refetching a page. Set a value grather than 0 to set it as new time to refetch, 0 means default time. Default: 0'
-    def add(scraper_name, url)
+    def add(scraper_name, page_json)
       begin
-
-        options[:vars] = JSON.parse(options[:vars]) if options[:vars]
-        method = options[:method]
+        page = JSON.parse(page_json)

         if options[:job]
           client = Client::JobPage.new(options)
-          puts "#{client.enqueue(options[:job],
+          puts "#{client.enqueue(options[:job], page, options)}"
         else
           client = Client::ScraperJobPage.new(options)
-          puts "#{client.enqueue(scraper_name,
+          puts "#{client.enqueue(scraper_name, page, options)}"
         end

       rescue JSON::ParserError
-
-
-
-
-
+        puts "Error: Invalid JSON"
+      end
+    end
+
+
+    desc "getgid <scraper_name> <page_json>", "Get the generated GID for a scraper's current job"
+    long_desc <<-LONGDESC
+      Get the generated GID for a scraper's current job.\x5
+    LONGDESC
+    option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
+    def getgid(scraper_name, page_json)
+      begin
+        page = JSON.parse(page_json)
+
+        if options[:job]
+          client = Client::JobPage.new(options)
+          puts "#{client.get_gid(options[:job], page, options)}"
+        else
+          client = Client::ScraperJobPage.new(options)
+          puts "#{client.get_gid(scraper_name, page, options)}"
       end
+
+    rescue JSON::ParserError
+      puts "Error: Invalid JSON"
     end
   end

@@ -82,6 +98,9 @@ module Datahen
     option :vars, :aliases => :v, type: :string, desc: 'Set user-defined page variables. Must be in json format. i.e: {"Foo":"bar"}'
     option :max_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
     option :retry_interval, type: :numeric, desc: 'Set a value to set retry time interval on seconds when refetching a page. Set a value grather than 0 to set it as new time to refetch, 0 means default time. Default: 0'
+    option :soft_fetching_try_limit, type: :numeric, desc: 'Set the soft fetching try limit value.'
+    option :soft_refetch_limit, type: :numeric, desc: 'Set the soft refetch limit value.'
+    option :parsing_try_limit, type: :numeric, desc: 'Set the parsing try limit value.'
     def update(scraper_name, gid)
       begin
         options[:vars] = JSON.parse(options[:vars]) if options[:vars]
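scraper page add now takes a single page JSON argument in place of a URL plus per-field flags, and the client posts that object verbatim (see the enqueue rewrite in client/job_page.rb and client/scraper_job_page.rb below). A sketch of enqueueing a page under the new signature (URL, page_type, and vars are illustrative):

    require 'datahen'

    page = {
      "url"       => "https://example.com/",
      "method"    => "GET",
      "page_type" => "listings",
      "vars"      => { "foo" => "bar" }
    }
    client = Datahen::Client::ScraperJobPage.new
    puts client.enqueue('my-scraper', page)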
data/lib/datahen/cli/scraper_task.rb
ADDED
@@ -0,0 +1,48 @@
+module Datahen
+  class CLI < Thor
+    class ScraperTask < Thor
+      package_name "scraper task"
+      def self.banner(command, namespace = nil, subcommand = false)
+        "#{basename} #{@package_name} #{command.usage}"
+      end
+
+      desc "list <scraper_name>", "List Tasks on a scraper's current job"
+      long_desc <<-LONGDESC
+        List all tasks in a scraper's current job or given job ID.\x5
+      LONGDESC
+      option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
+      option :page, :aliases => :p, type: :numeric, desc: 'Get the next set of records by page.'
+      option :per_page, :aliases => :P, type: :numeric, desc: 'Number of records per page. Max 500 per page.'
+      option :status, type: :array, desc: 'Returns only tasks with specific status.'
+      option :action, type: :array, desc: 'Returns only tasks with specific action.'
+      option :"include-system", type: :boolean, desc: 'If it is true, will returns all actions. If it is false only tasks with specific action ["refetch", "reparse", "terminate"].'
+      def list(scraper_name)
+        if options[:job]
+          client = Client::JobTask.new(options)
+          puts "#{client.all(options[:job])}"
+        else
+          client = Client::ScraperTask.new(options)
+          puts "#{client.all(scraper_name)}"
+        end
+      end
+
+
+      desc "show <scraper_name> <task_id>", "Show task in scraper's current job"
+      long_desc <<-LONGDESC
+        Shows a task in a scraper's current job or given job ID.\x5
+      LONGDESC
+      option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
+      def show(scraper_name, task_id)
+        if options[:job]
+          client = Client::JobTask.new(options)
+          puts "#{client.find(options[:job], task_id)}"
+        else
+          client = Client::ScraperTask.new(options)
+          puts "#{client.find(scraper_name, task_id)}"
+        end
+      end
+
+    end
+  end
+
+end
data/lib/datahen/cli.rb
CHANGED
@@ -11,6 +11,7 @@ require 'datahen/cli/scraper_page'
 require 'datahen/cli/job_output'
 require 'datahen/cli/job'
 require 'datahen/cli/scraper_deployment'
+require 'datahen/cli/scraper_task'
 require 'datahen/cli/scraper'
 require 'datahen/cli/parser'
 require 'datahen/cli/seeder'
data/lib/datahen/client/base.rb
CHANGED
@@ -56,12 +56,18 @@ module Datahen
       target.merge(source.select{|k,v|target.has_key?(k)})
     end

-    def retry times, delay = nil, err_msg = nil
+    def retry times, delay = nil, err_msg = nil, stream = false
       limit = times.nil? ? nil : times.to_i
       delay = delay.nil? ? 5 : delay.to_i
       count = 0
       begin
-        yield
+        val = yield
+        if stream
+          return if val.nil?
+          if val['error'] != ""
+            raise StandardError.new(val['error'])
+          end
+        end
       rescue Error::CustomRetryError, StandardError => e
         is_custom_retry = e.is_a? Error::CustomRetryError
         real_delay = is_custom_retry ? e.delay : delay

@@ -81,6 +87,7 @@ module Datahen
         puts "#{err_msg.nil? ? '' : "#{err_msg} "}Retry \##{count}#{should_aprox ? '+' : ''}..."
         retry
       end
+      val
     end

     def initialize(opts={})

@@ -105,6 +112,10 @@ module Datahen
       query[:parsefail] = opts[:parse_fail] if opts[:parse_fail]
       query[:status] = opts[:status] if opts[:status]
       query[:page_type] = opts[:page_type] if opts[:page_type]
+      query[:url] = opts[:url] if opts[:url]
+      query[:effective_url] = opts[:effective_url] if opts[:effective_url]
+      query[:body] = opts[:body] if opts[:body]
+      query[:parent_gid] = opts[:parent_gid] if opts[:parent_gid]
       query[:gid] = opts[:gid] if opts[:gid]
       query[:"min-timestamp"] = opts[:"min-timestamp"] if opts[:"min-timestamp"]
       query[:"max-timestamp"] = opts[:"max-timestamp"] if opts[:"max-timestamp"]

@@ -112,6 +123,8 @@ module Datahen
       query[:order] = opts[:order] if opts[:order]
       query[:filter] = opts[:filter] if opts[:filter]
       query[:force] = opts[:force] if opts[:force]
+      query[:action] = opts[:action] if opts[:action]
+      query[:"include-system"] = opts[:"include-system"] if opts[:"include-system"]

       if opts[:query]
         if opts[:query].is_a?(Hash)
data/lib/datahen/client/job.rb
CHANGED
@@ -25,6 +25,10 @@ module Datahen
       body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
       body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
       body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
+      body[:soft_fetching_try_limit] = opts[:soft_fetching_try_limit] if opts[:soft_fetching_try_limit]
+      body[:soft_refetch_limit] = opts[:soft_refetch_limit] if opts[:soft_refetch_limit]
+      body[:parsing_try_limit] = opts[:parsing_try_limit] if opts[:parsing_try_limit]
+      body[:prevent_kb_autoscaler] = opts[:prevent_kb_autoscaler] if opts.has_key?("prevent_kb_autoscaler") || opts.has_key?(:prevent_kb_autoscaler)
       params = @options.merge({body: body.to_json})

       self.class.put("/jobs/#{job_id}", params)

@@ -97,7 +101,7 @@ module Datahen
     def sync_schema(job_id, opts={})
       params = @options.merge(opts)

-      self.class.put("/
+      self.class.put("/jobs/#{job_id}/sync/schema", params)
     end

   end
data/lib/datahen/client/job_output.rb
CHANGED
@@ -7,7 +7,7 @@ module Datahen

     def all(job_id, collection = 'default', opts = {})
       limit = opts.has_key?(:retry_limit) ? opts.fetch(:retry_limit) : 0
-      self.retry(limit, 10, "Error while updating the seeder.") do
+      self.retry(limit, 10, "Error while updating the seeder.", true) do
         self.class.get("/jobs/#{job_id}/output/collections/#{collection}/records", @options)
       end
     end
data/lib/datahen/client/job_page.rb
CHANGED
@@ -18,34 +18,27 @@ module Datahen
       body[:max_size] = opts[:max_size] if opts[:max_size]
       body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
       body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
+      body[:soft_fetching_try_limit] = opts[:soft_fetching_try_limit] if opts[:soft_fetching_try_limit]
+      body[:soft_refetch_limit] = opts[:soft_refetch_limit] if opts[:soft_refetch_limit]
+      body[:parsing_try_limit] = opts[:parsing_try_limit] if opts[:parsing_try_limit]

       params = @options.merge({body: body.to_json})

       self.class.put("/jobs/#{job_id}/pages/#{gid}", params)
     end

-    def enqueue(job_id,
-
-      body[:method] = method != "" ? method : "GET"
-      body[:url] = url
-      body[:page_type] = opts[:page_type] if opts[:page_type]
-      body[:priority] = opts[:priority] if opts[:priority]
-      body[:fetch_type] = opts[:fetch_type] if opts[:fetch_type]
-      body[:body] = opts[:body] if opts[:body]
-      body[:headers] = opts[:headers] if opts[:headers]
-      body[:vars] = opts[:vars] if opts[:vars]
-      body[:force_fetch] = opts[:force_fetch] if opts[:force_fetch]
-      body[:freshness] = opts[:freshness] if opts[:freshness]
-      body[:ua_type] = opts[:ua_type] if opts[:ua_type]
-      body[:no_redirect] = opts[:no_redirect] if opts[:no_redirect]
-      body[:cookie] = opts[:cookie] if opts[:cookie]
-      body[:max_size] = opts[:max_size] if opts[:max_size]
-      body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
-      body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
-
-      params = @options.merge({body: body.to_json})
+    def enqueue(job_id, page, opts={})
+      params = @options.merge(opts).merge({body: page.to_json})

       self.class.post("/jobs/#{job_id}/pages", params)
+
+    end
+
+    def get_gid(job_id, page, opts={})
+
+      params = @options.merge(opts).merge({body: page.to_json})
+
+      self.class.post("/jobs/#{job_id}/generate_gid", params)
     end

     def dequeue(job_id, limit, page_types, parse_fetching_failed, opts = {})

@@ -65,6 +58,7 @@ module Datahen
       body[:parsing_status] = opts.fetch(:parsing_status){ nil }
       body[:log_error] = opts[:log_error] if opts[:log_error]
       body[:keep_outputs] = !!opts[:keep_outputs] if opts.has_key?(:keep_outputs)
+      body[:parsing_try_limit] = opts[:parsing_try_limit] if opts.fetch(:parsing_try_limit){ nil }

       params = @options.merge({body: body.to_json})

@@ -100,6 +94,11 @@ module Datahen
       params = @options.merge(opts)
       self.class.put("/jobs/#{job_id}/pages/limbo", params)
     end
+
+    def still_alive(job_id, gid, opts={})
+      params = @options.merge(opts)
+      self.class.put("/jobs/#{job_id}/pages/#{gid}/still_alive", params)
+    end
   end
 end
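still_alive gives long-running parses a heartbeat endpoint (PUT /jobs/:id/pages/:gid/still_alive). A sketch of calling it directly through the client (job ID and GID are illustrative):

    require 'datahen'

    client = Datahen::Client::JobPage.new
    client.still_alive(12345, 'www.example.com-abc123')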
data/lib/datahen/client/job_task.rb
ADDED
@@ -0,0 +1,17 @@
+module Datahen
+  module Client
+    class JobTask < Datahen::Client::Base
+      def all(job_id, opts={})
+        params = @options.merge(opts)
+        self.class.get("/jobs/#{job_id}/tasks", params)
+      end
+
+      def find(job_id, task_id, opts={})
+        params = @options.merge(opts)
+        self.class.get("/jobs/#{job_id}/tasks/#{task_id}", params)
+      end
+
+    end
+
+  end
+end
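JobTask and its scraper-level twin below expose the task endpoints the new CLI subcommand uses. A sketch against a specific job, including the new action and include-system query filters added in client/base.rb above (IDs and filter values are illustrative):

    require 'datahen'

    client = Datahen::Client::JobTask.new({ action: ['refetch'], :"include-system" => true })
    puts client.all(12345)
    puts client.find(12345, 'task-id')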
data/lib/datahen/client/scraper.rb
CHANGED
@@ -32,6 +32,10 @@ module Datahen
       body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
       body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
       body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
+      body[:soft_fetching_try_limit] = opts[:soft_fetching_try_limit] if opts[:soft_fetching_try_limit]
+      body[:soft_refetch_limit] = opts[:soft_refetch_limit] if opts[:soft_refetch_limit]
+      body[:parsing_try_limit] = opts[:parsing_try_limit] if opts[:parsing_try_limit]
+      body[:prevent_kb_autoscaler] = opts[:prevent_kb_autoscaler] if opts.has_key?("prevent_kb_autoscaler") || opts.has_key?(:prevent_kb_autoscaler)
       params = @options.merge({body: body.to_json})
       self.class.post("/scrapers", params)
     end

@@ -57,6 +61,10 @@ module Datahen
       body[:max_page_size] = opts[:max_page_size] if opts.has_key?("max_page_size") || opts.has_key?(:max_page_size)
       body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
       body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
+      body[:soft_fetching_try_limit] = opts[:soft_fetching_try_limit] if opts[:soft_fetching_try_limit]
+      body[:soft_refetch_limit] = opts[:soft_refetch_limit] if opts[:soft_refetch_limit]
+      body[:parsing_try_limit] = opts[:parsing_try_limit] if opts[:parsing_try_limit]
+      body[:prevent_kb_autoscaler] = opts[:prevent_kb_autoscaler] if opts.has_key?("prevent_kb_autoscaler") || opts.has_key?(:prevent_kb_autoscaler)
       params = @options.merge({body: body.to_json})

       self.class.put("/scrapers/#{scraper_name}", params)
data/lib/datahen/client/scraper_job.rb
CHANGED
@@ -15,6 +15,10 @@ module Datahen
       body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
       body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
       body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
+      body[:soft_fetching_try_limit] = opts[:soft_fetching_try_limit] if opts[:soft_fetching_try_limit]
+      body[:soft_refetch_limit] = opts[:soft_refetch_limit] if opts[:soft_refetch_limit]
+      body[:parsing_try_limit] = opts[:parsing_try_limit] if opts[:parsing_try_limit]
+      body[:prevent_kb_autoscaler] = opts[:prevent_kb_autoscaler] if opts.has_key?("prevent_kb_autoscaler") || opts.has_key?(:prevent_kb_autoscaler)
       if opts[:vars]
         if opts[:vars].is_a?(Array)
           body[:vars] = opts[:vars]

@@ -45,6 +49,10 @@ module Datahen
       body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
       body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
       body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
+      body[:soft_fetching_try_limit] = opts[:soft_fetching_try_limit] if opts[:soft_fetching_try_limit]
+      body[:soft_refetch_limit] = opts[:soft_refetch_limit] if opts[:soft_refetch_limit]
+      body[:parsing_try_limit] = opts[:parsing_try_limit] if opts[:parsing_try_limit]
+      body[:prevent_kb_autoscaler] = opts[:prevent_kb_autoscaler] if opts.has_key?("prevent_kb_autoscaler") || opts.has_key?(:prevent_kb_autoscaler)
       params = @options.merge({body: body.to_json})

       self.class.put("/scrapers/#{scraper_name}/current_job", params)
data/lib/datahen/client/scraper_job_page.rb
CHANGED
@@ -47,30 +47,19 @@ module Datahen
       self.class.put("/scrapers/#{scraper_name}/current_job/pages/limbo", params)
     end

-    def enqueue(scraper_name,
-
-      body[:method] = method != "" ? method : "GET"
-      body[:url] = url
-      body[:page_type] = opts[:page_type] if opts[:page_type]
-      body[:priority] = opts[:priority] if opts[:priority]
-      body[:fetch_type] = opts[:fetch_type] if opts[:fetch_type]
-      body[:body] = opts[:body] if opts[:body]
-      body[:headers] = opts[:headers] if opts[:headers]
-      body[:vars] = opts[:vars] if opts[:vars]
-      body[:force_fetch] = opts[:force_fetch] if opts[:force_fetch]
-      body[:freshness] = opts[:freshness] if opts[:freshness]
-      body[:ua_type] = opts[:ua_type] if opts[:ua_type]
-      body[:no_redirect] = opts[:no_redirect] if opts[:no_redirect]
-      body[:cookie] = opts[:cookie] if opts[:cookie]
-      body[:max_size] = opts[:max_size] if opts[:max_size]
-      body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
-      body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
-
-      params = @options.merge({body: body.to_json})
+    def enqueue(scraper_name, page, opts={})
+      params = @options.merge(opts).merge({body: page.to_json})

       self.class.post("/scrapers/#{scraper_name}/current_job/pages", params)
     end

+    def get_gid(scraper_name, page, opts={})
+
+      params = @options.merge(opts).merge({body: page.to_json})
+
+      self.class.post("/scrapers/#{scraper_name}/current_job/generate_gid", params)
+    end
+
     def find_content(scraper_name, gid)
       self.class.get("/scrapers/#{scraper_name}/current_job/pages/#{gid}/content", @options)
     end
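get_gid mirrors enqueue but posts to the generate_gid endpoint, so a page's GID can be computed without enqueueing it. A sketch (page fields illustrative):

    require 'datahen'

    page = { "url" => "https://example.com/", "method" => "GET" }
    client = Datahen::Client::ScraperJobPage.new
    puts client.get_gid('my-scraper', page)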
data/lib/datahen/client/scraper_task.rb
ADDED
@@ -0,0 +1,17 @@
+module Datahen
+  module Client
+    class ScraperTask < Datahen::Client::Base
+      def all(scraper_name, opts={})
+        params = @options.merge(opts)
+        self.class.get("/scrapers/#{scraper_name}/current_job/tasks", params)
+      end
+
+      def find(scraper_name, task_id, opts={})
+        params = @options.merge(opts)
+        self.class.get("/scrapers/#{scraper_name}/current_job/tasks/#{task_id}", params)
+      end
+
+    end
+
+  end
+end
data/lib/datahen/client.rb
CHANGED
@@ -24,6 +24,8 @@ require "datahen/client/scraper_var"
 require "datahen/client/job_var"
 require "datahen/client/scraper_job_var"
 require "datahen/client/job_finisher"
+require "datahen/client/job_task"
+require "datahen/client/scraper_task"

 module Datahen
   module Client
data/lib/datahen/scraper/batch_parser.rb
CHANGED
@@ -227,7 +227,11 @@ module Datahen

       # add pages
       count = 0
-
+      json = JSON.parse(response.body)
+      if json['error'] != ""
+        return 0
+      end
+      (json['data'] || []).each do |page|
         count += 1
         next if self.loaded_pages.has_key? page['gid']
         self.pages << (self.loaded_pages[page['gid']] = page)

@@ -307,7 +311,7 @@ module Datahen
           is_waiting = true
           puts "[Worker #{Parallel.worker_number}]: Is waiting for a page..."
           if self.second_dequeue_count > 1 && !self.not_found
-            puts "\nWARNING: Your job
+            puts "\nWARNING: Your job might not be optimized. Consider increasing your job's \"parser_dequeue_scale\" if the `to_parse` queue is not empty or near empty \n"
           end
         end
         self.class.wait 1
data/lib/datahen/scraper/executor.rb
CHANGED
@@ -172,11 +172,16 @@ module Datahen
       response = client.all(query_job_id, collection, {
         retry_limit: retry_limit
       })
-
       if response.code != 200
         raise "response_code: #{response.code}|#{response.parsed_response}"
       end
-
+
+      # check stream error
+      json_data = response.body != 'null' ? response.parsed_response : {}
+      if json_data['error'] != ""
+        raise "response_code: #{response.code}|Stream error: #{json_data['error']}"
+      end
+
       json_data['data'].nil? ? [] : json_data['data']
     end

     # Find one output by collection and query with pagination.
data/lib/datahen/scraper/ruby_parser_executor.rb
CHANGED
@@ -33,7 +33,6 @@ module Datahen
       :failed_content,
       :outputs,
       :pages,
-      :page,
       :save_pages,
       :save_outputs,
       :find_output,

@@ -41,7 +40,8 @@ module Datahen
       :refetch,
       :reparse,
       :limbo,
-      :finish
+      :finish,
+      :still_alive
     ].freeze
   end

@@ -240,6 +240,12 @@ module Datahen
       @failed_content ||= get_failed_content(job_id, gid)
     end

+    def still_alive page_gid = nil
+      page_gid = gid if page_gid.nil?
+      client = Client::JobPage.new()
+      client.still_alive(job_id, page_gid)
+    end
+
     def handle_error(e)
       error = ["Parsing #{e.class}: #{e.to_s} (Job:#{job_id} GID:#{gid})",clean_backtrace(e.backtrace)].join("\n")

@@ -247,7 +253,8 @@ module Datahen
         job_id: job_id,
         gid: gid,
         parsing_status: :failed,
-        log_error: error
+        log_error: error,
+        parsing_try_limit: (page || {})['parsing_try_limit'])
     end

   end
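Because still_alive joins the exposed-method list, parser scripts can call it directly to keep a slow parse from being treated as dead. A sketch of how that might look inside a parser script (the batch processing around it is illustrative):

    # Inside a DataHen parser script:
    rows.each_slice(1000) do |batch|
      batch.each { |row| outputs << { "_collection" => "products", "row" => row } }
      still_alive  # heartbeat for the current page's GID
    end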
data/lib/datahen/version.rb
CHANGED
@@ -1,3 +1,3 @@
 module Datahen
-  VERSION = "1.3.2"
+  VERSION = "1.5.1"
 end
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: datahen
 version: !ruby/object:Gem::Version
-  version: 1.3.2
+  version: 1.5.1
 platform: ruby
 authors:
 - Parama Danoesubroto
 autorequire:
 bindir: exe
 cert_chain: []
-date:
+date: 2024-01-08 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: thor

@@ -227,6 +227,7 @@ files:
 - lib/datahen/cli/scraper_job.rb
 - lib/datahen/cli/scraper_job_var.rb
 - lib/datahen/cli/scraper_page.rb
+- lib/datahen/cli/scraper_task.rb
 - lib/datahen/cli/scraper_var.rb
 - lib/datahen/cli/seeder.rb
 - lib/datahen/client.rb

@@ -245,6 +246,7 @@ files:
 - lib/datahen/client/job_output.rb
 - lib/datahen/client/job_page.rb
 - lib/datahen/client/job_stat.rb
+- lib/datahen/client/job_task.rb
 - lib/datahen/client/job_var.rb
 - lib/datahen/client/scraper.rb
 - lib/datahen/client/scraper_deployment.rb

@@ -255,6 +257,7 @@ files:
 - lib/datahen/client/scraper_job_output.rb
 - lib/datahen/client/scraper_job_page.rb
 - lib/datahen/client/scraper_job_var.rb
+- lib/datahen/client/scraper_task.rb
 - lib/datahen/client/scraper_var.rb
 - lib/datahen/error.rb
 - lib/datahen/error/custom_retry_error.rb