datahen 0.13.0 → 0.14.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/datahen/cli/job.rb +14 -2
- data/lib/datahen/cli/parser.rb +9 -5
- data/lib/datahen/cli/scraper.rb +4 -3
- data/lib/datahen/cli/scraper_export.rb +2 -3
- data/lib/datahen/cli/scraper_finisher.rb +8 -2
- data/lib/datahen/cli/scraper_job.rb +42 -11
- data/lib/datahen/cli/scraper_job_var.rb +33 -10
- data/lib/datahen/cli/scraper_page.rb +17 -5
- data/lib/datahen/cli/seeder.rb +4 -2
- data/lib/datahen/client.rb +2 -0
- data/lib/datahen/client/base.rb +2 -0
- data/lib/datahen/client/job.rb +8 -2
- data/lib/datahen/client/job_finisher.rb +16 -0
- data/lib/datahen/client/job_page.rb +11 -0
- data/lib/datahen/client/job_stat.rb +12 -4
- data/lib/datahen/client/job_var.rb +28 -0
- data/lib/datahen/client/scraper_job.rb +6 -2
- data/lib/datahen/client/scraper_job_page.rb +3 -5
- data/lib/datahen/scraper/executor.rb +13 -7
- data/lib/datahen/scraper/parser.rb +10 -4
- data/lib/datahen/scraper/ruby_parser_executor.rb +7 -4
- data/lib/datahen/scraper/ruby_seeder_executor.rb +4 -1
- data/lib/datahen/scraper/seeder.rb +7 -3
- data/lib/datahen/version.rb +1 -1
- metadata +5 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b00adfb4f357beeae276a130cc5e0ee1d34ddd8bdef4a0374f4b55f89f894460
|
4
|
+
data.tar.gz: 6363d591d93d99addcaaea6d964b2fb07e4d8222f873c3a8f6496f48c97b1483
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 75e9f1b1e5ba61563c3ff9d12071cb76d77382353003776d88817940623e37aeb300df2114260f86a5a18a59e1dc9f74a33c00029e183c0681c616dce109d6c6
|
7
|
+
data.tar.gz: 8e2a07b11d20fe88c93aa0af0abf10ceb6593d5d2db63d9b5c3744c6f8b59754274a3949b6331d7d71a3018f7cf65e1a7d31e2ab6f4dbbcde494b33357dab40a
|
data/lib/datahen/cli/job.rb
CHANGED
@@ -18,12 +18,24 @@ module Datahen
|
|
18
18
|
puts "#{client.all()}"
|
19
19
|
end
|
20
20
|
|
21
|
-
desc "show <job_id>", "Show a job"
|
21
|
+
desc "show <job_id>", "Show a job (Defaults to showing data from cached job)"
|
22
|
+
option :live, type: :boolean, desc: 'Get data from the live job, not cached job.'
|
22
23
|
def show(job_id)
|
23
24
|
client = Client::Job.new(options)
|
24
|
-
puts "#{client.find(job_id)}"
|
25
|
+
puts "#{client.find(job_id, options)}"
|
25
26
|
end
|
26
27
|
|
28
|
+
desc "stats <job_id>", "Get the stat for a job (Defaults to showing data from cached stats)"
|
29
|
+
long_desc <<-LONGDESC
|
30
|
+
Get stats for a scraper's current job\n
|
31
|
+
LONGDESC
|
32
|
+
option :live, type: :boolean, desc: 'Get data from the live stats, not cached stats.'
|
33
|
+
def stats(job_id)
|
34
|
+
client = Client::JobStat.new(options)
|
35
|
+
puts "#{client.job_current_stats(job_id, options)}"
|
36
|
+
end
|
37
|
+
|
38
|
+
|
27
39
|
end
|
28
40
|
end
|
29
41
|
|
data/lib/datahen/cli/parser.rb
CHANGED
@@ -10,12 +10,13 @@ module Datahen
|
|
10
10
|
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
11
11
|
option :global, :aliases => :g, type: :boolean, default: false, desc: 'Use globalpage instead of a job page'
|
12
12
|
option :vars, :aliases => :v, type: :string, desc: 'Set user-defined page variables. Must be in json format. i.e: {"Foo":"bar"}'
|
13
|
+
option :"keep-outputs", :aliases => :ko, type: :boolean, default: false, desc: "Don't delete existing outputs"
|
13
14
|
def try_parse(scraper_name, parser_file, gid)
|
14
|
-
begin
|
15
|
-
|
15
|
+
begin
|
16
|
+
|
16
17
|
if options[:job]
|
17
18
|
job_id = options[:job]
|
18
|
-
elsif options[:global]
|
19
|
+
elsif options[:global]
|
19
20
|
job_id = nil
|
20
21
|
else
|
21
22
|
job = Client::ScraperJob.new(options).find(scraper_name)
|
@@ -24,7 +25,7 @@ module Datahen
|
|
24
25
|
|
25
26
|
|
26
27
|
vars = JSON.parse(options[:vars]) if options[:vars]
|
27
|
-
puts Datahen::Scraper::Parser.exec_parser_page(parser_file, gid, job_id, false, vars)
|
28
|
+
puts Datahen::Scraper::Parser.exec_parser_page(parser_file, gid, job_id, false, vars, options[:"keep-outputs"])
|
28
29
|
|
29
30
|
rescue JSON::ParserError
|
30
31
|
if options[:vars]
|
@@ -40,6 +41,8 @@ module Datahen
|
|
40
41
|
<GID>: Global ID of the page.\x5
|
41
42
|
LONGDESC
|
42
43
|
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
44
|
+
option :vars, :aliases => :v, type: :string, desc: 'Set user-defined page variables. Must be in json format. i.e: {"Foo":"bar"}'
|
45
|
+
option :"keep-outputs", :aliases => :ko, type: :boolean, default: false, desc: "Don't delete existing outputs"
|
43
46
|
def exec_parse(scraper_name, parser_file, *gids)
|
44
47
|
gids.each do |gid|
|
45
48
|
begin
|
@@ -52,7 +55,8 @@ module Datahen
|
|
52
55
|
job_id = job['id']
|
53
56
|
end
|
54
57
|
|
55
|
-
|
58
|
+
vars = JSON.parse(options[:vars]) if options[:vars]
|
59
|
+
puts Datahen::Scraper::Parser.exec_parser_page(parser_file, gid, job_id, true, vars, options[:"keep-outputs"])
|
56
60
|
rescue => e
|
57
61
|
puts e
|
58
62
|
end
|
data/lib/datahen/cli/scraper.rb
CHANGED
@@ -140,17 +140,18 @@ module Datahen
|
|
140
140
|
end
|
141
141
|
end
|
142
142
|
|
143
|
-
desc "stats <scraper_name>", "Get the
|
143
|
+
desc "stats <scraper_name>", "Get the stat for a current job (Defaults to showing data from cached stats)"
|
144
144
|
long_desc <<-LONGDESC
|
145
145
|
Get stats for a scraper's current job\n
|
146
146
|
LONGDESC
|
147
147
|
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
148
|
+
option :live, type: :boolean, desc: 'Get data from the live stats, not cached stats.'
|
148
149
|
def stats(scraper_name)
|
149
150
|
client = Client::JobStat.new(options)
|
150
151
|
if options[:job]
|
151
|
-
puts "#{client.job_current_stats(options[:job])}"
|
152
|
+
puts "#{client.job_current_stats(options[:job], options)}"
|
152
153
|
else
|
153
|
-
puts "#{client.scraper_job_current_stats(scraper_name)}"
|
154
|
+
puts "#{client.scraper_job_current_stats(scraper_name, options)}"
|
154
155
|
end
|
155
156
|
end
|
156
157
|
|
@@ -12,7 +12,6 @@ module Datahen
|
|
12
12
|
puts "#{client.find(export_id)}"
|
13
13
|
end
|
14
14
|
|
15
|
-
|
16
15
|
desc "list", "Gets a list of exports"
|
17
16
|
long_desc <<-LONGDESC
|
18
17
|
List exports.
|
@@ -34,13 +33,13 @@ module Datahen
|
|
34
33
|
def download(export_id)
|
35
34
|
client = Client::ScraperExport.new(options)
|
36
35
|
result = JSON.parse(client.download(export_id).to_s)
|
37
|
-
|
36
|
+
|
38
37
|
if result['signed_url']
|
39
38
|
puts "Download url: \"#{result['signed_url']}\""
|
40
39
|
`open "#{result['signed_url']}"`
|
41
40
|
else
|
42
41
|
puts "Exported file does not exist"
|
43
|
-
end
|
42
|
+
end
|
44
43
|
end
|
45
44
|
|
46
45
|
|
@@ -11,9 +11,15 @@ module Datahen
|
|
11
11
|
long_desc <<-LONGDESC
|
12
12
|
Reset finisher on a scraper's current job.\x5
|
13
13
|
LONGDESC
|
14
|
+
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
14
15
|
def reset(scraper_name)
|
15
|
-
|
16
|
-
|
16
|
+
if options[:job]
|
17
|
+
client = Client::JobFinisher.new(options)
|
18
|
+
puts "#{client.reset(options[:job])}"
|
19
|
+
else
|
20
|
+
client = Client::ScraperFinisher.new(options)
|
21
|
+
puts "#{client.reset(scraper_name)}"
|
22
|
+
end
|
17
23
|
end
|
18
24
|
end
|
19
25
|
end
|
@@ -6,10 +6,17 @@ module Datahen
|
|
6
6
|
"#{basename} #{@package_name} #{command.usage}"
|
7
7
|
end
|
8
8
|
|
9
|
-
desc "show <scraper_name>", "Show a scraper's current job"
|
9
|
+
desc "show <scraper_name>", "Show a scraper's current job (Defaults to showing data from cached job)"
|
10
|
+
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
11
|
+
option :live, type: :boolean, desc: 'Get data from the live job, not cached job.'
|
10
12
|
def show(scraper_name)
|
11
|
-
|
12
|
-
|
13
|
+
if options[:job]
|
14
|
+
client = Client::Job.new(options)
|
15
|
+
puts "#{client.find(options[:job], options)}"
|
16
|
+
else
|
17
|
+
client = Client::ScraperJob.new(options)
|
18
|
+
puts "#{client.find(scraper_name, options)}"
|
19
|
+
end
|
13
20
|
end
|
14
21
|
|
15
22
|
|
@@ -29,27 +36,45 @@ module Datahen
|
|
29
36
|
long_desc <<-LONGDESC
|
30
37
|
Cancels a scraper's current job
|
31
38
|
LONGDESC
|
39
|
+
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
32
40
|
def cancel(scraper_name)
|
33
|
-
|
34
|
-
|
41
|
+
if options[:job]
|
42
|
+
client = Client::Job.new(options)
|
43
|
+
puts "#{client.cancel(options[:job])}"
|
44
|
+
else
|
45
|
+
client = Client::ScraperJob.new(options)
|
46
|
+
puts "#{client.cancel(scraper_name)}"
|
47
|
+
end
|
35
48
|
end
|
36
49
|
|
37
50
|
desc "resume <scraper_name>", "resumes a scraper's current job"
|
38
51
|
long_desc <<-LONGDESC
|
39
52
|
Resumes a scraper's current job
|
40
53
|
LONGDESC
|
54
|
+
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
41
55
|
def resume(scraper_name)
|
42
|
-
|
43
|
-
|
56
|
+
if options[:job]
|
57
|
+
client = Client::Job.new(options)
|
58
|
+
puts "#{client.resume(options[:job])}"
|
59
|
+
else
|
60
|
+
client = Client::ScraperJob.new(options)
|
61
|
+
puts "#{client.resume(scraper_name)}"
|
62
|
+
end
|
44
63
|
end
|
45
64
|
|
46
65
|
desc "pause <scraper_name>", "pauses a scraper's current job"
|
47
66
|
long_desc <<-LONGDESC
|
48
67
|
pauses a scraper's current job
|
49
68
|
LONGDESC
|
69
|
+
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
50
70
|
def pause(scraper_name)
|
51
|
-
|
52
|
-
|
71
|
+
if options[:job]
|
72
|
+
client = Client::Job.new(options)
|
73
|
+
puts "#{client.pause(options[:job])}"
|
74
|
+
else
|
75
|
+
client = Client::ScraperJob.new(options)
|
76
|
+
puts "#{client.pause(scraper_name)}"
|
77
|
+
end
|
53
78
|
end
|
54
79
|
|
55
80
|
|
@@ -60,9 +85,15 @@ module Datahen
|
|
60
85
|
option :workers, :aliases => :w, type: :numeric, desc: 'Set how many standard workers to use. Scraper job must be restarted(paused then resumed, or cancelled then resumed) for it to take effect. Default: 1. '
|
61
86
|
option :browsers, type: :numeric, desc: 'Set how many browser workers to use. Scraper job must be restarted(paused then resumed, or cancelled then resumed) for it to take effect. Default: 0. '
|
62
87
|
option :proxy_type, desc: 'Set the Proxy type. Default: standard'
|
88
|
+
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
63
89
|
def update(scraper_name)
|
64
|
-
|
65
|
-
|
90
|
+
if options[:job]
|
91
|
+
client = Client::Job.new(options)
|
92
|
+
puts "#{client.update(options[:job], options)}"
|
93
|
+
else
|
94
|
+
client = Client::ScraperJob.new(options)
|
95
|
+
puts "#{client.update(scraper_name, options)}"
|
96
|
+
end
|
66
97
|
end
|
67
98
|
|
68
99
|
desc "var SUBCOMMAND ...ARGS", "for managing scraper's job variables"
|
@@ -13,9 +13,15 @@ module Datahen
|
|
13
13
|
LONGDESC
|
14
14
|
option :page, :aliases => :p, type: :numeric, desc: 'Get the next set of records by page.'
|
15
15
|
option :per_page, :aliases => :P, type: :numeric, desc: 'Number of records per page. Max 500 per page.'
|
16
|
+
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
16
17
|
def list(scraper_name)
|
17
|
-
|
18
|
-
|
18
|
+
if options[:job]
|
19
|
+
client = Client::JobVar.new(options)
|
20
|
+
puts "#{client.all(options[:job])}"
|
21
|
+
else
|
22
|
+
client = Client::ScraperJobVar.new(options)
|
23
|
+
puts "#{client.all(scraper_name)}"
|
24
|
+
end
|
19
25
|
end
|
20
26
|
|
21
27
|
desc "set <scraper_name> <var_name> <value>", "Set an environment var on the scrape job"
|
@@ -24,23 +30,40 @@ module Datahen
|
|
24
30
|
<var_name>: Var name can only consist of alphabets, numbers, underscores. Name must be unique to your scrape job, otherwise it will be overwritten.\x5
|
25
31
|
<value>: Value of variable.\x5
|
26
32
|
LONGDESC
|
27
|
-
option :secret, type: :boolean, desc: 'Set true to make it decrypt the value. Default: false'
|
33
|
+
option :secret, type: :boolean, desc: 'Set true to make it decrypt the value. Default: false'
|
34
|
+
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
28
35
|
def set(scraper_name, var_name, value)
|
29
|
-
|
30
|
-
|
31
|
-
|
36
|
+
if options[:job]
|
37
|
+
client = Client::JobVar.new(options)
|
38
|
+
puts "#{client.set(options[:job], var_name, value, options)}"
|
39
|
+
else
|
40
|
+
client = Client::ScraperJobVar.new(options)
|
41
|
+
puts "#{client.set(scraper_name, var_name, value, options)}"
|
42
|
+
end
|
32
43
|
end
|
33
44
|
|
34
45
|
desc "show <scraper_name> <var_name>", "Show an environment variable on the scrape job"
|
46
|
+
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
35
47
|
def show(scraper_name, var_name)
|
36
|
-
|
37
|
-
|
48
|
+
if options[:job]
|
49
|
+
client = Client::JobVar.new(options)
|
50
|
+
puts "#{client.find(options[:job], var_name)}"
|
51
|
+
else
|
52
|
+
client = Client::ScraperJobVar.new(options)
|
53
|
+
puts "#{client.find(scraper_name, var_name)}"
|
54
|
+
end
|
38
55
|
end
|
39
56
|
|
40
57
|
desc "unset <scraper_name> <var_name>", "Deletes an environment variable on the scrape job"
|
58
|
+
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
41
59
|
def unset(scraper_name, var_name)
|
42
|
-
|
43
|
-
|
60
|
+
if options[:job]
|
61
|
+
client = Client::JobVar.new(options)
|
62
|
+
puts "#{client.unset(options[:job], var_name)}"
|
63
|
+
else
|
64
|
+
client = Client::ScraperJobVar.new(options)
|
65
|
+
puts "#{client.unset(scraper_name, var_name)}"
|
66
|
+
end
|
44
67
|
end
|
45
68
|
end
|
46
69
|
end
|
@@ -105,13 +105,19 @@ module Datahen
|
|
105
105
|
option :fetch_fail, type: :boolean, desc: 'Refetches only pages that fails fetching.'
|
106
106
|
option :parse_fail, type: :boolean, desc: 'Refetches only pages that fails parsing.'
|
107
107
|
option :status, type: :string, desc: 'Refetches only pages with a specific status.'
|
108
|
+
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
108
109
|
def refetch(scraper_name)
|
109
110
|
if !options.key?(:gid) && !options.key?(:fetch_fail) && !options.key?(:parse_fail) && !options.key?(:status)
|
110
111
|
puts "Must specify either a --gid, --fetch-fail, --parse-fail or --status"
|
111
112
|
return
|
112
113
|
end
|
113
|
-
|
114
|
-
|
114
|
+
if options[:job]
|
115
|
+
client = Client::JobPage.new(options)
|
116
|
+
puts "#{client.refetch(options[:job])}"
|
117
|
+
else
|
118
|
+
client = Client::ScraperJobPage.new(options)
|
119
|
+
puts "#{client.refetch(scraper_name)}"
|
120
|
+
end
|
115
121
|
end
|
116
122
|
|
117
123
|
desc "reparse <scraper_name>", "Reparse Pages on a scraper's current job"
|
@@ -121,6 +127,7 @@ module Datahen
|
|
121
127
|
option :gid, :aliases => :g, type: :string, desc: 'Reparse a specific GID'
|
122
128
|
option :parse_fail, type: :boolean, desc: 'Reparse only pages that fails parsing.'
|
123
129
|
option :status, type: :string, desc: 'Reparse only pages with a specific status.'
|
130
|
+
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
124
131
|
def reparse(scraper_name)
|
125
132
|
begin
|
126
133
|
options[:vars] = JSON.parse(options[:vars]) if options[:vars]
|
@@ -130,8 +137,13 @@ module Datahen
|
|
130
137
|
return
|
131
138
|
end
|
132
139
|
|
133
|
-
|
134
|
-
|
140
|
+
if options[:job]
|
141
|
+
client = Client::JobPage.new(options)
|
142
|
+
puts "#{client.reparse(options[:job])}"
|
143
|
+
else
|
144
|
+
client = Client::ScraperJobPage.new(options)
|
145
|
+
puts "#{client.reparse(scraper_name)}"
|
146
|
+
end
|
135
147
|
|
136
148
|
rescue JSON::ParserError
|
137
149
|
if options[:vars]
|
@@ -218,7 +230,7 @@ module Datahen
|
|
218
230
|
end
|
219
231
|
end
|
220
232
|
|
221
|
-
desc "failedcontent <gid>", "Show a page's failed content in scraper's current job"
|
233
|
+
desc "failedcontent <scraper_name> <gid>", "Show a page's failed content in scraper's current job"
|
222
234
|
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
223
235
|
def failedcontent(scraper_name, gid)
|
224
236
|
result = nil
|
data/lib/datahen/cli/seeder.rb
CHANGED
@@ -7,6 +7,7 @@ module Datahen
|
|
7
7
|
<seeder_file>: Seeder script file will be executed.\x5
|
8
8
|
LONGDESC
|
9
9
|
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
10
|
+
option :"keep-outputs", :aliases => :ko, type: :boolean, default: false, desc: "Don't delete existing outputs"
|
10
11
|
def try_seed(scraper_name, seeder_file)
|
11
12
|
if options[:job]
|
12
13
|
job_id = options[:job]
|
@@ -14,8 +15,8 @@ module Datahen
|
|
14
15
|
job = Client::ScraperJob.new(options).find(scraper_name)
|
15
16
|
job_id = job['id']
|
16
17
|
end
|
17
|
-
|
18
|
-
puts Datahen::Scraper::Seeder.exec_seeder(seeder_file, job_id, false)
|
18
|
+
|
19
|
+
puts Datahen::Scraper::Seeder.exec_seeder(seeder_file, job_id, false, options[:"keep-outputs"])
|
19
20
|
end
|
20
21
|
|
21
22
|
desc "exec <scraper_name> <seeder_file>", "Executes a seeder script onto a scraper's current job."
|
@@ -24,6 +25,7 @@ module Datahen
|
|
24
25
|
<seeder_file>: Seeder script file that will be executed on the scraper's current job.\x5
|
25
26
|
LONGDESC
|
26
27
|
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
28
|
+
option :"keep-outputs", :aliases => :ko, type: :boolean, default: false, desc: "Don't delete existing outputs"
|
27
29
|
def exec_parse(scraper_name, seeder_file)
|
28
30
|
if options[:job]
|
29
31
|
job_id = options[:job]
|
data/lib/datahen/client.rb
CHANGED
@@ -20,7 +20,9 @@ require "datahen/client/job_stat"
|
|
20
20
|
require "datahen/client/backblaze_content"
|
21
21
|
require "datahen/client/env_var"
|
22
22
|
require "datahen/client/scraper_var"
|
23
|
+
require "datahen/client/job_var"
|
23
24
|
require "datahen/client/scraper_job_var"
|
25
|
+
require "datahen/client/job_finisher"
|
24
26
|
|
25
27
|
|
26
28
|
module Datahen
|
data/lib/datahen/client/base.rb
CHANGED
data/lib/datahen/client/job.rb
CHANGED
@@ -6,8 +6,12 @@ module Datahen
|
|
6
6
|
self.class.get("/jobs", params)
|
7
7
|
end
|
8
8
|
|
9
|
-
def find(job_id)
|
10
|
-
|
9
|
+
def find(job_id, opts={})
|
10
|
+
if opts[:live]
|
11
|
+
self.class.get("/jobs/#{job_id}", @options)
|
12
|
+
else
|
13
|
+
self.class.get("/cached/jobs/#{job_id}", @options)
|
14
|
+
end
|
11
15
|
end
|
12
16
|
|
13
17
|
def update(job_id, opts={})
|
@@ -15,6 +19,7 @@ module Datahen
|
|
15
19
|
body[:status] = opts[:status] if opts[:status]
|
16
20
|
body[:standard_worker_count] = opts[:workers] if opts[:workers]
|
17
21
|
body[:browser_worker_count] = opts[:browsers] if opts[:browsers]
|
22
|
+
body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
|
18
23
|
params = @options.merge({body: body.to_json})
|
19
24
|
|
20
25
|
self.class.put("/jobs/#{job_id}", params)
|
@@ -41,6 +46,7 @@ module Datahen
|
|
41
46
|
body[:pages] = opts.fetch(:pages) {[]}
|
42
47
|
body[:seeding_status] = opts.fetch(:seeding_status){ nil }
|
43
48
|
body[:log_error] = opts[:log_error] if opts[:log_error]
|
49
|
+
body[:keep_outputs] = !!opts[:keep_outputs] if opts.has_key?(:keep_outputs)
|
44
50
|
|
45
51
|
params = @options.merge({body: body.to_json})
|
46
52
|
|
@@ -0,0 +1,16 @@
|
|
1
|
+
module Datahen
|
2
|
+
module Client
|
3
|
+
class JobFinisher < Datahen::Client::Base
|
4
|
+
# Reset finisher on a scraper's current job.
|
5
|
+
#
|
6
|
+
# @param [Integer] job_id Job ID
|
7
|
+
# @param [Hash] opts ({}) API custom parameters.
|
8
|
+
#
|
9
|
+
# @return [HTTParty::Response]
|
10
|
+
def reset(job_id, opts={})
|
11
|
+
params = @options.merge(opts)
|
12
|
+
self.class.put("/jobs/#{job_id}/finisher/reset", params)
|
13
|
+
end
|
14
|
+
end
|
15
|
+
end
|
16
|
+
end
|
@@ -48,6 +48,7 @@ module Datahen
|
|
48
48
|
body[:pages] = opts.fetch(:pages) {[]}
|
49
49
|
body[:parsing_status] = opts.fetch(:parsing_status){ nil }
|
50
50
|
body[:log_error] = opts[:log_error] if opts[:log_error]
|
51
|
+
body[:keep_outputs] = !!opts[:keep_outputs] if opts.has_key?(:keep_outputs)
|
51
52
|
|
52
53
|
params = @options.merge({body: body.to_json})
|
53
54
|
|
@@ -61,6 +62,16 @@ module Datahen
|
|
61
62
|
def find_failed_content(job_id, gid)
|
62
63
|
self.class.get("/jobs/#{job_id}/pages/#{gid}/failed_content", @options)
|
63
64
|
end
|
65
|
+
|
66
|
+
def reparse(job_id, opts={})
|
67
|
+
params = @options.merge(opts)
|
68
|
+
self.class.put("/jobs/#{job_id}/pages/reparse", params)
|
69
|
+
end
|
70
|
+
|
71
|
+
def refetch(job_id, opts={})
|
72
|
+
params = @options.merge(opts)
|
73
|
+
self.class.put("/jobs/#{job_id}/pages/refetch", params)
|
74
|
+
end
|
64
75
|
end
|
65
76
|
end
|
66
77
|
end
|
@@ -2,12 +2,20 @@ module Datahen
|
|
2
2
|
module Client
|
3
3
|
class JobStat < Datahen::Client::Base
|
4
4
|
|
5
|
-
def job_current_stats(job_id)
|
6
|
-
|
5
|
+
def job_current_stats(job_id, opts={})
|
6
|
+
if opts[:live]
|
7
|
+
self.class.get("/jobs/#{job_id}/stats/current", @options)
|
8
|
+
else
|
9
|
+
self.class.get("/cached/jobs/#{job_id}/stats/current", @options)
|
10
|
+
end
|
7
11
|
end
|
8
12
|
|
9
|
-
def scraper_job_current_stats(scraper_name)
|
10
|
-
|
13
|
+
def scraper_job_current_stats(scraper_name, opts={})
|
14
|
+
if opts[:live]
|
15
|
+
self.class.get("/scrapers/#{scraper_name}/current_job/stats/current", @options)
|
16
|
+
else
|
17
|
+
self.class.get("/cached/scrapers/#{scraper_name}/current_job/stats/current", @options)
|
18
|
+
end
|
11
19
|
end
|
12
20
|
|
13
21
|
def job_stats_history(job_id)
|
@@ -0,0 +1,28 @@
|
|
1
|
+
module Datahen
|
2
|
+
module Client
|
3
|
+
class JobVar < Datahen::Client::Base
|
4
|
+
|
5
|
+
def find(job_id, var_name)
|
6
|
+
self.class.get("/jobs/#{job_id}/vars/#{var_name}", @options)
|
7
|
+
end
|
8
|
+
|
9
|
+
def all(job_id, opts={})
|
10
|
+
params = @options.merge opts
|
11
|
+
self.class.get("/jobs/#{job_id}/vars", params)
|
12
|
+
end
|
13
|
+
|
14
|
+
def set(job_id, var_name, value, opts={})
|
15
|
+
body = {}
|
16
|
+
body[:value] = value
|
17
|
+
body[:secret] = opts[:secret] if opts[:secret]
|
18
|
+
params = @options.merge({body: body.to_json})
|
19
|
+
self.class.put("/jobs/#{job_id}/vars/#{var_name}", params)
|
20
|
+
end
|
21
|
+
|
22
|
+
def unset(job_id, var_name, opts={})
|
23
|
+
params = @options.merge(opts)
|
24
|
+
self.class.delete("/jobs/#{job_id}/vars/#{var_name}", params)
|
25
|
+
end
|
26
|
+
end
|
27
|
+
end
|
28
|
+
end
|
@@ -15,8 +15,12 @@ module Datahen
|
|
15
15
|
self.class.post("/scrapers/#{scraper_name}/jobs", params)
|
16
16
|
end
|
17
17
|
|
18
|
-
def find(scraper_name)
|
19
|
-
|
18
|
+
def find(scraper_name, opts={})
|
19
|
+
if opts[:live]
|
20
|
+
self.class.get("/scrapers/#{scraper_name}/current_job", @options)
|
21
|
+
else
|
22
|
+
self.class.get("/cached/scrapers/#{scraper_name}/current_job", @options)
|
23
|
+
end
|
20
24
|
end
|
21
25
|
|
22
26
|
def update(scraper_name, opts={})
|
@@ -26,6 +26,9 @@ module Datahen
|
|
26
26
|
self.class.put("/scrapers/#{scraper_name}/current_job/pages/refetch", params)
|
27
27
|
end
|
28
28
|
|
29
|
+
# Deprecated, please use Datahen::Client::JobPage#refetch instead.
|
30
|
+
#
|
31
|
+
# @note This method will be removed at some point in the future.
|
29
32
|
def refetch_by_job(job_id, opts={})
|
30
33
|
params = @options.merge(opts)
|
31
34
|
self.class.put("/jobs/#{job_id}/pages/refetch", params)
|
@@ -36,11 +39,6 @@ module Datahen
|
|
36
39
|
self.class.put("/scrapers/#{scraper_name}/current_job/pages/reparse", params)
|
37
40
|
end
|
38
41
|
|
39
|
-
def reparse_by_job(job_id, opts={})
|
40
|
-
params = @options.merge(opts)
|
41
|
-
self.class.put("/jobs/#{job_id}/pages/reparse", params)
|
42
|
-
end
|
43
|
-
|
44
42
|
def enqueue(scraper_name, method, url, opts={})
|
45
43
|
body = {}
|
46
44
|
body[:method] = method != "" ? method : "GET"
|
@@ -60,12 +60,17 @@ module Datahen
|
|
60
60
|
|
61
61
|
def init_global_page()
|
62
62
|
client = Client::GlobalPage.new()
|
63
|
-
client.find(gid)
|
63
|
+
global_page = client.find(gid)
|
64
|
+
unless global_page.code == 200
|
65
|
+
raise "GID #{gid} not found. Aborting execution!"
|
66
|
+
else
|
67
|
+
global_page
|
68
|
+
end
|
64
69
|
end
|
65
70
|
|
66
|
-
def get_content(gid)
|
67
|
-
client = Client::
|
68
|
-
content_json = client.find_content(gid)
|
71
|
+
def get_content(job_id, gid)
|
72
|
+
client = Client::JobPage.new()
|
73
|
+
content_json = client.find_content(job_id, gid)
|
69
74
|
|
70
75
|
if content_json['available']
|
71
76
|
signed_url = content_json['signed_url']
|
@@ -75,7 +80,7 @@ module Datahen
|
|
75
80
|
end
|
76
81
|
end
|
77
82
|
|
78
|
-
def get_failed_content(gid)
|
83
|
+
def get_failed_content(job_id, gid)
|
79
84
|
client = Client::JobPage.new()
|
80
85
|
content_json = client.find_failed_content(job_id, gid)
|
81
86
|
|
@@ -287,11 +292,12 @@ module Datahen
|
|
287
292
|
end
|
288
293
|
|
289
294
|
# behave differently if it is a real save
|
295
|
+
save_status = status
|
290
296
|
if save
|
291
297
|
log_msg = "Saving #{log_msgs.join(' and ')}."
|
292
298
|
puts "#{log_msg}"
|
293
299
|
else
|
294
|
-
|
300
|
+
save_status = "#{status}_try"
|
295
301
|
end
|
296
302
|
|
297
303
|
# saving to server
|
@@ -300,7 +306,7 @@ module Datahen
|
|
300
306
|
gid: gid,
|
301
307
|
pages: pages_slice,
|
302
308
|
outputs: outputs_slice,
|
303
|
-
status:
|
309
|
+
status: save_status)
|
304
310
|
|
305
311
|
if response.code == 200
|
306
312
|
if save
|
@@ -1,18 +1,24 @@
|
|
1
1
|
module Datahen
|
2
2
|
module Scraper
|
3
3
|
class Parser
|
4
|
-
def self.exec_parser_page(filename, gid, job_id=nil, save=false, vars = {})
|
4
|
+
def self.exec_parser_page(filename, gid, job_id=nil, save=false, vars = {}, keep_outputs=false)
|
5
5
|
extname = File.extname(filename)
|
6
6
|
case extname
|
7
7
|
when '.rb'
|
8
|
-
executor = RubyParserExecutor.new(
|
8
|
+
executor = RubyParserExecutor.new(
|
9
|
+
filename: filename,
|
10
|
+
gid: gid,
|
11
|
+
job_id: job_id,
|
12
|
+
vars: vars,
|
13
|
+
keep_outputs: keep_outputs
|
14
|
+
)
|
9
15
|
executor.exec_parser(save)
|
10
16
|
else
|
11
17
|
puts "Unable to find a parser executor for file type \"#{extname}\""
|
12
18
|
end
|
13
19
|
end
|
14
20
|
|
15
|
-
|
21
|
+
|
16
22
|
end
|
17
23
|
end
|
18
|
-
end
|
24
|
+
end
|
@@ -15,6 +15,7 @@ module Datahen
|
|
15
15
|
@gid = options.fetch(:gid) { raise "GID is required"}
|
16
16
|
@job_id = options.fetch(:job_id)
|
17
17
|
@page_vars = options.fetch(:vars) { {} }
|
18
|
+
@keep_outputs = !!(options.fetch(:keep_outputs) { false })
|
18
19
|
end
|
19
20
|
|
20
21
|
def self.exposed_methods
|
@@ -66,7 +67,9 @@ module Datahen
|
|
66
67
|
response = parsing_update(
|
67
68
|
job_id: job_id,
|
68
69
|
gid: gid,
|
69
|
-
parsing_status: :starting
|
70
|
+
parsing_status: :starting,
|
71
|
+
keep_outputs: @keep_outputs
|
72
|
+
)
|
70
73
|
|
71
74
|
if response.code == 200
|
72
75
|
puts "Page Parsing Status Updated."
|
@@ -165,7 +168,7 @@ module Datahen
|
|
165
168
|
handle_error(e) if save
|
166
169
|
raise e
|
167
170
|
end
|
168
|
-
|
171
|
+
|
169
172
|
if refetch_self
|
170
173
|
refetch_page gid
|
171
174
|
elsif reparse_self
|
@@ -178,11 +181,11 @@ module Datahen
|
|
178
181
|
end
|
179
182
|
|
180
183
|
def content
|
181
|
-
@content ||= get_content(gid)
|
184
|
+
@content ||= get_content(job_id, gid)
|
182
185
|
end
|
183
186
|
|
184
187
|
def failed_content
|
185
|
-
@failed_content ||= get_failed_content(gid)
|
188
|
+
@failed_content ||= get_failed_content(job_id, gid)
|
186
189
|
end
|
187
190
|
|
188
191
|
def handle_error(e)
|
@@ -6,6 +6,7 @@ module Datahen
|
|
6
6
|
def initialize(options={})
|
7
7
|
@filename = options.fetch(:filename) { raise "Filename is required"}
|
8
8
|
@job_id = options[:job_id]
|
9
|
+
@keep_outputs = !!(options.fetch(:keep_outputs) { false })
|
9
10
|
end
|
10
11
|
|
11
12
|
def self.exposed_methods
|
@@ -81,7 +82,9 @@ module Datahen
|
|
81
82
|
|
82
83
|
response = seeding_update(
|
83
84
|
job_id: job_id,
|
84
|
-
seeding_status: :starting
|
85
|
+
seeding_status: :starting,
|
86
|
+
keep_outputs: @keep_outputs
|
87
|
+
)
|
85
88
|
|
86
89
|
if response.code == 200
|
87
90
|
puts "Seeding Status Updated."
|
@@ -2,11 +2,15 @@ module Datahen
|
|
2
2
|
module Scraper
|
3
3
|
class Seeder
|
4
4
|
|
5
|
-
def self.exec_seeder(filename, job_id=nil, save=false)
|
5
|
+
def self.exec_seeder(filename, job_id=nil, save=false, keep_outputs=false)
|
6
6
|
extname = File.extname(filename)
|
7
7
|
case extname
|
8
8
|
when '.rb'
|
9
|
-
executor = RubySeederExecutor.new(
|
9
|
+
executor = RubySeederExecutor.new(
|
10
|
+
filename: filename,
|
11
|
+
job_id: job_id,
|
12
|
+
keep_outputs: keep_outputs
|
13
|
+
)
|
10
14
|
executor.exec_seeder(save)
|
11
15
|
else
|
12
16
|
puts "Unable to find a seeder executor for file type \"#{extname}\""
|
@@ -15,4 +19,4 @@ module Datahen
|
|
15
19
|
|
16
20
|
end
|
17
21
|
end
|
18
|
-
end
|
22
|
+
end
|
data/lib/datahen/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: datahen
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.13.0
|
4
|
+
version: 0.14.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Parama Danoesubroto
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-
|
11
|
+
date: 2020-08-02 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: thor
|
@@ -215,10 +215,12 @@ files:
|
|
215
215
|
- lib/datahen/client/global_page.rb
|
216
216
|
- lib/datahen/client/job.rb
|
217
217
|
- lib/datahen/client/job_export.rb
|
218
|
+
- lib/datahen/client/job_finisher.rb
|
218
219
|
- lib/datahen/client/job_log.rb
|
219
220
|
- lib/datahen/client/job_output.rb
|
220
221
|
- lib/datahen/client/job_page.rb
|
221
222
|
- lib/datahen/client/job_stat.rb
|
223
|
+
- lib/datahen/client/job_var.rb
|
222
224
|
- lib/datahen/client/scraper.rb
|
223
225
|
- lib/datahen/client/scraper_deployment.rb
|
224
226
|
- lib/datahen/client/scraper_export.rb
|
@@ -262,7 +264,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
262
264
|
- !ruby/object:Gem::Version
|
263
265
|
version: '0'
|
264
266
|
requirements: []
|
265
|
-
rubygems_version: 3.
|
267
|
+
rubygems_version: 3.1.2
|
266
268
|
signing_key:
|
267
269
|
specification_version: 4
|
268
270
|
summary: DataHen toolbelt for developers
|