datahen 0.13.0 → 0.14.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/datahen/cli/job.rb +14 -2
- data/lib/datahen/cli/parser.rb +9 -5
- data/lib/datahen/cli/scraper.rb +4 -3
- data/lib/datahen/cli/scraper_export.rb +2 -3
- data/lib/datahen/cli/scraper_finisher.rb +8 -2
- data/lib/datahen/cli/scraper_job.rb +42 -11
- data/lib/datahen/cli/scraper_job_var.rb +33 -10
- data/lib/datahen/cli/scraper_page.rb +17 -5
- data/lib/datahen/cli/seeder.rb +4 -2
- data/lib/datahen/client.rb +2 -0
- data/lib/datahen/client/base.rb +2 -0
- data/lib/datahen/client/job.rb +8 -2
- data/lib/datahen/client/job_finisher.rb +16 -0
- data/lib/datahen/client/job_page.rb +11 -0
- data/lib/datahen/client/job_stat.rb +12 -4
- data/lib/datahen/client/job_var.rb +28 -0
- data/lib/datahen/client/scraper_job.rb +6 -2
- data/lib/datahen/client/scraper_job_page.rb +3 -5
- data/lib/datahen/scraper/executor.rb +13 -7
- data/lib/datahen/scraper/parser.rb +10 -4
- data/lib/datahen/scraper/ruby_parser_executor.rb +7 -4
- data/lib/datahen/scraper/ruby_seeder_executor.rb +4 -1
- data/lib/datahen/scraper/seeder.rb +7 -3
- data/lib/datahen/version.rb +1 -1
- metadata +5 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: b00adfb4f357beeae276a130cc5e0ee1d34ddd8bdef4a0374f4b55f89f894460
+  data.tar.gz: 6363d591d93d99addcaaea6d964b2fb07e4d8222f873c3a8f6496f48c97b1483
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 75e9f1b1e5ba61563c3ff9d12071cb76d77382353003776d88817940623e37aeb300df2114260f86a5a18a59e1dc9f74a33c00029e183c0681c616dce109d6c6
+  data.tar.gz: 8e2a07b11d20fe88c93aa0af0abf10ceb6593d5d2db63d9b5c3744c6f8b59754274a3949b6331d7d71a3018f7cf65e1a7d31e2ab6f4dbbcde494b33357dab40a
data/lib/datahen/cli/job.rb
CHANGED
@@ -18,12 +18,24 @@ module Datahen
       puts "#{client.all()}"
     end

-    desc "show <job_id>", "Show a job"
+    desc "show <job_id>", "Show a job (Defaults to showing data from cached job)"
+    option :live, type: :boolean, desc: 'Get data from the live job, not cached job.'
     def show(job_id)
       client = Client::Job.new(options)
-      puts "#{client.find(job_id)}"
+      puts "#{client.find(job_id, options)}"
     end

+    desc "stats <job_id>", "Get the stat for a job (Defaults to showing data from cached stats)"
+    long_desc <<-LONGDESC
+      Get stats for a scraper's current job\n
+    LONGDESC
+    option :live, type: :boolean, desc: 'Get data from the live stats, not cached stats.'
+    def stats(job_id)
+      client = Client::JobStat.new(options)
+      puts "#{client.job_current_stats(job_id, options)}"
+    end
+
+
   end
 end
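Paired with the `Client::Job#find` change further down (data/lib/datahen/client/job.rb), the new `--live` flag only selects between the cached and live API endpoints. A minimal sketch of the same lookup made directly against the client, assuming an API token is already configured for the gem and using a made-up job ID:

```ruby
require 'datahen'

job_id = 12345 # hypothetical job ID

client = Datahen::Client::Job.new
puts client.find(job_id)             # default: GET /cached/jobs/12345
puts client.find(job_id, live: true) # --live:  GET /jobs/12345
```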
data/lib/datahen/cli/parser.rb
CHANGED
@@ -10,12 +10,13 @@ module Datahen
     option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
     option :global, :aliases => :g, type: :boolean, default: false, desc: 'Use globalpage instead of a job page'
     option :vars, :aliases => :v, type: :string, desc: 'Set user-defined page variables. Must be in json format. i.e: {"Foo":"bar"}'
+    option :"keep-outputs", :aliases => :ko, type: :boolean, default: false, desc: "Don't delete existing outputs"
     def try_parse(scraper_name, parser_file, gid)
-      begin
-
+      begin
+
       if options[:job]
         job_id = options[:job]
-      elsif options[:global]
+      elsif options[:global]
         job_id = nil
       else
         job = Client::ScraperJob.new(options).find(scraper_name)
@@ -24,7 +25,7 @@ module Datahen


       vars = JSON.parse(options[:vars]) if options[:vars]
-      puts Datahen::Scraper::Parser.exec_parser_page(parser_file, gid, job_id, false, vars)
+      puts Datahen::Scraper::Parser.exec_parser_page(parser_file, gid, job_id, false, vars, options[:"keep-outputs"])

     rescue JSON::ParserError
       if options[:vars]
@@ -40,6 +41,8 @@ module Datahen
       <GID>: Global ID of the page.\x5
     LONGDESC
     option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
+    option :vars, :aliases => :v, type: :string, desc: 'Set user-defined page variables. Must be in json format. i.e: {"Foo":"bar"}'
+    option :"keep-outputs", :aliases => :ko, type: :boolean, default: false, desc: "Don't delete existing outputs"
     def exec_parse(scraper_name, parser_file, *gids)
       gids.each do |gid|
         begin
@@ -52,7 +55,8 @@ module Datahen
           job_id = job['id']
         end

-
+          vars = JSON.parse(options[:vars]) if options[:vars]
+          puts Datahen::Scraper::Parser.exec_parser_page(parser_file, gid, job_id, true, vars, options[:"keep-outputs"])
         rescue => e
           puts e
         end
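The `--keep-outputs` value is threaded straight through to `Datahen::Scraper::Parser.exec_parser_page`, whose new signature appears in the data/lib/datahen/scraper/parser.rb diff below. A sketch of the equivalent direct call, with placeholder file, GID, and job ID:

```ruby
require 'datahen'

parser_file = "parsers/details.rb" # placeholder parser script path
gid = "www.example.com-abc123"     # placeholder page GID
job_id = 12345                     # hypothetical job ID

# save=false mirrors `parser try_parse`; the last argument is the new
# keep_outputs flag, so existing outputs are not deleted first.
puts Datahen::Scraper::Parser.exec_parser_page(parser_file, gid, job_id, false, {}, true)
```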
data/lib/datahen/cli/scraper.rb
CHANGED
@@ -140,17 +140,18 @@ module Datahen
       end
     end

-    desc "stats <scraper_name>", "Get the
+    desc "stats <scraper_name>", "Get the stat for a current job (Defaults to showing data from cached stats)"
     long_desc <<-LONGDESC
       Get stats for a scraper's current job\n
     LONGDESC
     option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
+    option :live, type: :boolean, desc: 'Get data from the live stats, not cached stats.'
     def stats(scraper_name)
       client = Client::JobStat.new(options)
       if options[:job]
-        puts "#{client.job_current_stats(options[:job])}"
+        puts "#{client.job_current_stats(options[:job], options)}"
       else
-        puts "#{client.scraper_job_current_stats(scraper_name)}"
+        puts "#{client.scraper_job_current_stats(scraper_name, options)}"
       end
     end

data/lib/datahen/cli/scraper_export.rb
CHANGED
@@ -12,7 +12,6 @@ module Datahen
       puts "#{client.find(export_id)}"
     end

-
     desc "list", "Gets a list of exports"
     long_desc <<-LONGDESC
       List exports.
@@ -34,13 +33,13 @@ module Datahen
     def download(export_id)
       client = Client::ScraperExport.new(options)
       result = JSON.parse(client.download(export_id).to_s)
-
+
       if result['signed_url']
         puts "Download url: \"#{result['signed_url']}\""
         `open "#{result['signed_url']}"`
       else
         puts "Exported file does not exist"
-      end
+      end
     end


data/lib/datahen/cli/scraper_finisher.rb
CHANGED
@@ -11,9 +11,15 @@ module Datahen
     long_desc <<-LONGDESC
       Reset finisher on a scraper's current job.\x5
     LONGDESC
+    option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
     def reset(scraper_name)
-
-
+      if options[:job]
+        client = Client::JobFinisher.new(options)
+        puts "#{client.reset(options[:job])}"
+      else
+        client = Client::ScraperFinisher.new(options)
+        puts "#{client.reset(scraper_name)}"
+      end
     end
   end
 end
data/lib/datahen/cli/scraper_job.rb
CHANGED
@@ -6,10 +6,17 @@ module Datahen
       "#{basename} #{@package_name} #{command.usage}"
     end

-    desc "show <scraper_name>", "Show a scraper's current job"
+    desc "show <scraper_name>", "Show a scraper's current job (Defaults to showing data from cached job)"
+    option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
+    option :live, type: :boolean, desc: 'Get data from the live job, not cached job.'
     def show(scraper_name)
-
-
+      if options[:job]
+        client = Client::Job.new(options)
+        puts "#{client.find(options[:job], options)}"
+      else
+        client = Client::ScraperJob.new(options)
+        puts "#{client.find(scraper_name, options)}"
+      end
     end


@@ -29,27 +36,45 @@ module Datahen
     long_desc <<-LONGDESC
       Cancels a scraper's current job
     LONGDESC
+    option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
     def cancel(scraper_name)
-
-
+      if options[:job]
+        client = Client::Job.new(options)
+        puts "#{client.cancel(options[:job])}"
+      else
+        client = Client::ScraperJob.new(options)
+        puts "#{client.cancel(scraper_name)}"
+      end
     end

     desc "resume <scraper_name>", "resumes a scraper's current job"
     long_desc <<-LONGDESC
       Resumes a scraper's current job
     LONGDESC
+    option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
     def resume(scraper_name)
-
-
+      if options[:job]
+        client = Client::Job.new(options)
+        puts "#{client.resume(options[:job])}"
+      else
+        client = Client::ScraperJob.new(options)
+        puts "#{client.resume(scraper_name)}"
+      end
     end

     desc "pause <scraper_name>", "pauses a scraper's current job"
     long_desc <<-LONGDESC
       pauses a scraper's current job
     LONGDESC
+    option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
     def pause(scraper_name)
-
-
+      if options[:job]
+        client = Client::Job.new(options)
+        puts "#{client.pause(options[:job])}"
+      else
+        client = Client::ScraperJob.new(options)
+        puts "#{client.pause(scraper_name)}"
+      end
     end


@@ -60,9 +85,15 @@ module Datahen
     option :workers, :aliases => :w, type: :numeric, desc: 'Set how many standard workers to use. Scraper job must be restarted(paused then resumed, or cancelled then resumed) for it to take effect. Default: 1. '
     option :browsers, type: :numeric, desc: 'Set how many browser workers to use. Scraper job must be restarted(paused then resumed, or cancelled then resumed) for it to take effect. Default: 0. '
     option :proxy_type, desc: 'Set the Proxy type. Default: standard'
+    option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
     def update(scraper_name)
-
-
+      if options[:job]
+        client = Client::Job.new(options)
+        puts "#{client.update(options[:job], options)}"
+      else
+        client = Client::ScraperJob.new(options)
+        puts "#{client.update(scraper_name, options)}"
+      end
     end

     desc "var SUBCOMMAND ...ARGS", "for managing scraper's job variables"
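Every lifecycle command above now shares one dispatch rule: with `--job` it talks to `/jobs/:id` through `Client::Job`, otherwise it targets the scraper's current job through `Client::ScraperJob`. A sketch of the two equivalent paths, with placeholder identifiers and an assumed configured API token:

```ruby
require 'datahen'

# Pause one specific job by ID (what `--job 12345` resolves to) ...
Datahen::Client::Job.new.pause(12345)

# ... versus pausing whatever job is currently active for the scraper.
Datahen::Client::ScraperJob.new.pause("my-scraper")
```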
data/lib/datahen/cli/scraper_job_var.rb
CHANGED
@@ -13,9 +13,15 @@ module Datahen
     LONGDESC
     option :page, :aliases => :p, type: :numeric, desc: 'Get the next set of records by page.'
     option :per_page, :aliases => :P, type: :numeric, desc: 'Number of records per page. Max 500 per page.'
+    option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
     def list(scraper_name)
-
-
+      if options[:job]
+        client = Client::JobVar.new(options)
+        puts "#{client.all(options[:job])}"
+      else
+        client = Client::ScraperJobVar.new(options)
+        puts "#{client.all(scraper_name)}"
+      end
     end

     desc "set <scraper_name> <var_name> <value>", "Set an environment var on the scrape job"
@@ -24,23 +30,40 @@ module Datahen
       <var_name>: Var name can only consist of alphabets, numbers, underscores. Name must be unique to your scrape job, otherwise it will be overwritten.\x5
       <value>: Value of variable.\x5
     LONGDESC
-    option :secret, type: :boolean, desc: 'Set true to make it decrypt the value. Default: false'
+    option :secret, type: :boolean, desc: 'Set true to make it decrypt the value. Default: false'
+    option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
     def set(scraper_name, var_name, value)
-
-
-
+      if options[:job]
+        client = Client::JobVar.new(options)
+        puts "#{client.set(options[:job], var_name, value, options)}"
+      else
+        client = Client::ScraperJobVar.new(options)
+        puts "#{client.set(scraper_name, var_name, value, options)}"
+      end
     end

     desc "show <scraper_name> <var_name>", "Show an environment variable on the scrape job"
+    option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
     def show(scraper_name, var_name)
-
-
+      if options[:job]
+        client = Client::JobVar.new(options)
+        puts "#{client.find(options[:job], var_name)}"
+      else
+        client = Client::ScraperJobVar.new(options)
+        puts "#{client.find(scraper_name, var_name)}"
+      end
     end

     desc "unset <scraper_name> <var_name>", "Deletes an environment variable on the scrape job"
+    option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
     def unset(scraper_name, var_name)
-
-
+      if options[:job]
+        client = Client::JobVar.new(options)
+        puts "#{client.unset(options[:job], var_name)}"
+      else
+        client = Client::ScraperJobVar.new(options)
+        puts "#{client.unset(scraper_name, var_name)}"
+      end
     end
   end
 end
data/lib/datahen/cli/scraper_page.rb
CHANGED
@@ -105,13 +105,19 @@ module Datahen
     option :fetch_fail, type: :boolean, desc: 'Refetches only pages that fails fetching.'
     option :parse_fail, type: :boolean, desc: 'Refetches only pages that fails parsing.'
     option :status, type: :string, desc: 'Refetches only pages with a specific status.'
+    option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
     def refetch(scraper_name)
       if !options.key?(:gid) && !options.key?(:fetch_fail) && !options.key?(:parse_fail) && !options.key?(:status)
         puts "Must specify either a --gid, --fetch-fail, --parse-fail or --status"
         return
       end
-
-
+      if options[:job]
+        client = Client::JobPage.new(options)
+        puts "#{client.refetch(options[:job])}"
+      else
+        client = Client::ScraperJobPage.new(options)
+        puts "#{client.refetch(scraper_name)}"
+      end
     end

     desc "reparse <scraper_name>", "Reparse Pages on a scraper's current job"
@@ -121,6 +127,7 @@ module Datahen
     option :gid, :aliases => :g, type: :string, desc: 'Reparse a specific GID'
     option :parse_fail, type: :boolean, desc: 'Reparse only pages that fails parsing.'
     option :status, type: :string, desc: 'Reparse only pages with a specific status.'
+    option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
     def reparse(scraper_name)
       begin
         options[:vars] = JSON.parse(options[:vars]) if options[:vars]
@@ -130,8 +137,13 @@ module Datahen
         return
       end

-
-
+      if options[:job]
+        client = Client::JobPage.new(options)
+        puts "#{client.reparse(options[:job])}"
+      else
+        client = Client::ScraperJobPage.new(options)
+        puts "#{client.reparse(scraper_name)}"
+      end

     rescue JSON::ParserError
       if options[:vars]
@@ -218,7 +230,7 @@ module Datahen
       end
     end

-    desc "failedcontent <gid>", "Show a page's failed content in scraper's current job"
+    desc "failedcontent <scraper_name> <gid>", "Show a page's failed content in scraper's current job"
     option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
     def failedcontent(scraper_name, gid)
       result = nil
data/lib/datahen/cli/seeder.rb
CHANGED
@@ -7,6 +7,7 @@ module Datahen
       <seeder_file>: Seeder script file will be executed.\x5
     LONGDESC
     option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
+    option :"keep-outputs", :aliases => :ko, type: :boolean, default: false, desc: "Don't delete existing outputs"
     def try_seed(scraper_name, seeder_file)
       if options[:job]
         job_id = options[:job]
@@ -14,8 +15,8 @@ module Datahen
         job = Client::ScraperJob.new(options).find(scraper_name)
         job_id = job['id']
       end
-
-      puts Datahen::Scraper::Seeder.exec_seeder(seeder_file, job_id, false)
+
+      puts Datahen::Scraper::Seeder.exec_seeder(seeder_file, job_id, false, options[:"keep-outputs"])
     end

     desc "exec <scraper_name> <seeder_file>", "Executes a seeder script onto a scraper's current job."
@@ -24,6 +25,7 @@ module Datahen
       <seeder_file>: Seeder script file that will be executed on the scraper's current job.\x5
     LONGDESC
     option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
+    option :"keep-outputs", :aliases => :ko, type: :boolean, default: false, desc: "Don't delete existing outputs"
     def exec_parse(scraper_name, seeder_file)
       if options[:job]
         job_id = options[:job]
data/lib/datahen/client.rb
CHANGED
@@ -20,7 +20,9 @@ require "datahen/client/job_stat"
 require "datahen/client/backblaze_content"
 require "datahen/client/env_var"
 require "datahen/client/scraper_var"
+require "datahen/client/job_var"
 require "datahen/client/scraper_job_var"
+require "datahen/client/job_finisher"


 module Datahen
data/lib/datahen/client/base.rb
CHANGED
data/lib/datahen/client/job.rb
CHANGED
@@ -6,8 +6,12 @@ module Datahen
       self.class.get("/jobs", params)
     end

-    def find(job_id)
-
+    def find(job_id, opts={})
+      if opts[:live]
+        self.class.get("/jobs/#{job_id}", @options)
+      else
+        self.class.get("/cached/jobs/#{job_id}", @options)
+      end
     end

     def update(job_id, opts={})
@@ -15,6 +19,7 @@ module Datahen
       body[:status] = opts[:status] if opts[:status]
       body[:standard_worker_count] = opts[:workers] if opts[:workers]
       body[:browser_worker_count] = opts[:browsers] if opts[:browsers]
+      body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
       params = @options.merge({body: body.to_json})

       self.class.put("/jobs/#{job_id}", params)
@@ -41,6 +46,7 @@ module Datahen
       body[:pages] = opts.fetch(:pages) {[]}
       body[:seeding_status] = opts.fetch(:seeding_status){ nil }
       body[:log_error] = opts[:log_error] if opts[:log_error]
+      body[:keep_outputs] = !!opts[:keep_outputs] if opts.has_key?(:keep_outputs)

       params = @options.merge({body: body.to_json})

data/lib/datahen/client/job_finisher.rb
ADDED
@@ -0,0 +1,16 @@
+module Datahen
+  module Client
+    class JobFinisher < Datahen::Client::Base
+      # Reset finisher on a scraper's current job.
+      #
+      # @param [Integer] job_id Job ID
+      # @param [Hash] opts ({}) API custom parameters.
+      #
+      # @return [HTTParty::Response]
+      def reset(job_id, opts={})
+        params = @options.merge(opts)
+        self.class.put("/jobs/#{job_id}/finisher/reset", params)
+      end
+    end
+  end
+end
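A short usage sketch for the new client (the job ID is a placeholder; `reset` issues the PUT shown above and returns an `HTTParty::Response`):

```ruby
require 'datahen'

finisher = Datahen::Client::JobFinisher.new
response = finisher.reset(12345) # PUT /jobs/12345/finisher/reset
puts response.code
```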
data/lib/datahen/client/job_page.rb
CHANGED
@@ -48,6 +48,7 @@ module Datahen
       body[:pages] = opts.fetch(:pages) {[]}
       body[:parsing_status] = opts.fetch(:parsing_status){ nil }
       body[:log_error] = opts[:log_error] if opts[:log_error]
+      body[:keep_outputs] = !!opts[:keep_outputs] if opts.has_key?(:keep_outputs)

       params = @options.merge({body: body.to_json})

@@ -61,6 +62,16 @@ module Datahen
     def find_failed_content(job_id, gid)
       self.class.get("/jobs/#{job_id}/pages/#{gid}/failed_content", @options)
     end
+
+    def reparse(job_id, opts={})
+      params = @options.merge(opts)
+      self.class.put("/jobs/#{job_id}/pages/reparse", params)
+    end
+
+    def refetch(job_id, opts={})
+      params = @options.merge(opts)
+      self.class.put("/jobs/#{job_id}/pages/refetch", params)
+    end
   end
 end
 end
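The new `reparse`/`refetch` methods are what the CLI's `--job` branches call; both hit the job-scoped endpoints directly. A hedged sketch with a placeholder job ID:

```ruby
require 'datahen'

pages = Datahen::Client::JobPage.new
pages.reparse(12345) # PUT /jobs/12345/pages/reparse
pages.refetch(12345) # PUT /jobs/12345/pages/refetch
```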
data/lib/datahen/client/job_stat.rb
CHANGED
@@ -2,12 +2,20 @@ module Datahen
   module Client
     class JobStat < Datahen::Client::Base

-      def job_current_stats(job_id)
-
+      def job_current_stats(job_id, opts={})
+        if opts[:live]
+          self.class.get("/jobs/#{job_id}/stats/current", @options)
+        else
+          self.class.get("/cached/jobs/#{job_id}/stats/current", @options)
+        end
       end

-      def scraper_job_current_stats(scraper_name)
-
+      def scraper_job_current_stats(scraper_name, opts={})
+        if opts[:live]
+          self.class.get("/scrapers/#{scraper_name}/current_job/stats/current", @options)
+        else
+          self.class.get("/cached/scrapers/#{scraper_name}/current_job/stats/current", @options)
+        end
       end

       def job_stats_history(job_id)
data/lib/datahen/client/job_var.rb
ADDED
@@ -0,0 +1,28 @@
+module Datahen
+  module Client
+    class JobVar < Datahen::Client::Base
+
+      def find(job_id, var_name)
+        self.class.get("/jobs/#{job_id}/vars/#{var_name}", @options)
+      end
+
+      def all(job_id, opts={})
+        params = @options.merge opts
+        self.class.get("/jobs/#{job_id}/vars", params)
+      end
+
+      def set(job_id, var_name, value, opts={})
+        body = {}
+        body[:value] = value
+        body[:secret] = opts[:secret] if opts[:secret]
+        params = @options.merge({body: body.to_json})
+        self.class.put("/jobs/#{job_id}/vars/#{var_name}", params)
+      end
+
+      def unset(job_id, var_name, opts={})
+        params = @options.merge(opts)
+        self.class.delete("/jobs/#{job_id}/vars/#{var_name}", params)
+      end
+    end
+  end
+end
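`JobVar` mirrors the existing `ScraperJobVar` client but addresses `/jobs/:id/vars` directly. A usage sketch with placeholder values:

```ruby
require 'datahen'

vars = Datahen::Client::JobVar.new
job_id = 12345 # hypothetical job ID

vars.set(job_id, "api_key", "s3cret", secret: true) # PUT with {"value":"s3cret","secret":true}
puts vars.find(job_id, "api_key")                   # GET one var
puts vars.all(job_id)                               # GET all vars
vars.unset(job_id, "api_key")                       # DELETE
```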
data/lib/datahen/client/scraper_job.rb
CHANGED
@@ -15,8 +15,12 @@ module Datahen
       self.class.post("/scrapers/#{scraper_name}/jobs", params)
     end

-    def find(scraper_name)
-
+    def find(scraper_name, opts={})
+      if opts[:live]
+        self.class.get("/scrapers/#{scraper_name}/current_job", @options)
+      else
+        self.class.get("/cached/scrapers/#{scraper_name}/current_job", @options)
+      end
     end

     def update(scraper_name, opts={})
data/lib/datahen/client/scraper_job_page.rb
CHANGED
@@ -26,6 +26,9 @@ module Datahen
       self.class.put("/scrapers/#{scraper_name}/current_job/pages/refetch", params)
     end

+    # Deprecated, please use Datahen::Client::JobVar#refetch instead.
+    #
+    # @note This method will be removed at some point in the future.
     def refetch_by_job(job_id, opts={})
       params = @options.merge(opts)
       self.class.put("/jobs/#{job_id}/pages/refetch", params)
@@ -36,11 +39,6 @@ module Datahen
       self.class.put("/scrapers/#{scraper_name}/current_job/pages/reparse", params)
     end

-    def reparse_by_job(job_id, opts={})
-      params = @options.merge(opts)
-      self.class.put("/jobs/#{job_id}/pages/reparse", params)
-    end
-
     def enqueue(scraper_name, method, url, opts={})
       body = {}
       body[:method] = method != "" ? method : "GET"
data/lib/datahen/scraper/executor.rb
CHANGED
@@ -60,12 +60,17 @@ module Datahen

     def init_global_page()
       client = Client::GlobalPage.new()
-      client.find(gid)
+      global_page = client.find(gid)
+      unless global_page.code == 200
+        raise "GID #{gid} not found. Aborting execution!"
+      else
+        global_page
+      end
     end

-    def get_content(gid)
-      client = Client::
-      content_json = client.find_content(gid)
+    def get_content(job_id, gid)
+      client = Client::JobPage.new()
+      content_json = client.find_content(job_id, gid)

       if content_json['available']
         signed_url = content_json['signed_url']
@@ -75,7 +80,7 @@ module Datahen
       end
     end

-    def get_failed_content(gid)
+    def get_failed_content(job_id, gid)
       client = Client::JobPage.new()
       content_json = client.find_failed_content(job_id, gid)

@@ -287,11 +292,12 @@ module Datahen
       end

       # behave differently if it is a real save
+      save_status = status
       if save
         log_msg = "Saving #{log_msgs.join(' and ')}."
         puts "#{log_msg}"
       else
-
+        save_status = "#{status}_try"
       end

       # saving to server
@@ -300,7 +306,7 @@ module Datahen
         gid: gid,
         pages: pages_slice,
         outputs: outputs_slice,
-        status:
+        status: save_status)

       if response.code == 200
         if save
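The `save_status` change means a dry run (`save == false`) now reports a distinct `"<status>_try"` value to the server instead of the real status. A reduced sketch of just that branch, not the full executor method:

```ruby
# Reduced illustration of the branch above.
def effective_status(status, save)
  save ? status : "#{status}_try"
end

effective_status("parsing", true)  # => "parsing"
effective_status("parsing", false) # => "parsing_try"
```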
data/lib/datahen/scraper/parser.rb
CHANGED
@@ -1,18 +1,24 @@
 module Datahen
   module Scraper
     class Parser
-      def self.exec_parser_page(filename, gid, job_id=nil, save=false, vars = {})
+      def self.exec_parser_page(filename, gid, job_id=nil, save=false, vars = {}, keep_outputs=false)
         extname = File.extname(filename)
         case extname
         when '.rb'
-          executor = RubyParserExecutor.new(
+          executor = RubyParserExecutor.new(
+            filename: filename,
+            gid: gid,
+            job_id: job_id,
+            vars: vars,
+            keep_outputs: keep_outputs
+          )
           executor.exec_parser(save)
         else
           puts "Unable to find a parser executor for file type \"#{extname}\""
         end
       end

-
+
     end
   end
-end
+end
data/lib/datahen/scraper/ruby_parser_executor.rb
CHANGED
@@ -15,6 +15,7 @@ module Datahen
       @gid = options.fetch(:gid) { raise "GID is required"}
       @job_id = options.fetch(:job_id)
       @page_vars = options.fetch(:vars) { {} }
+      @keep_outputs = !!(options.fetch(:keep_outputs) { false })
     end

     def self.exposed_methods
@@ -66,7 +67,9 @@ module Datahen
       response = parsing_update(
         job_id: job_id,
         gid: gid,
-        parsing_status: :starting
+        parsing_status: :starting,
+        keep_outputs: @keep_outputs
+      )

       if response.code == 200
         puts "Page Parsing Status Updated."
@@ -165,7 +168,7 @@ module Datahen
       handle_error(e) if save
       raise e
     end
-
+
     if refetch_self
       refetch_page gid
     elsif reparse_self
@@ -178,11 +181,11 @@ module Datahen
     end

     def content
-      @content ||= get_content(gid)
+      @content ||= get_content(job_id, gid)
     end

     def failed_content
-      @failed_content ||= get_failed_content(gid)
+      @failed_content ||= get_failed_content(job_id, gid)
     end

     def handle_error(e)
data/lib/datahen/scraper/ruby_seeder_executor.rb
CHANGED
@@ -6,6 +6,7 @@ module Datahen
     def initialize(options={})
       @filename = options.fetch(:filename) { raise "Filename is required"}
       @job_id = options[:job_id]
+      @keep_outputs = !!(options.fetch(:keep_outputs) { false })
     end

     def self.exposed_methods
@@ -81,7 +82,9 @@ module Datahen

       response = seeding_update(
         job_id: job_id,
-        seeding_status: :starting
+        seeding_status: :starting,
+        keep_outputs: @keep_outputs
+      )

       if response.code == 200
         puts "Seeding Status Updated."
data/lib/datahen/scraper/seeder.rb
CHANGED
@@ -2,11 +2,15 @@ module Datahen
   module Scraper
     class Seeder

-      def self.exec_seeder(filename, job_id=nil, save=false)
+      def self.exec_seeder(filename, job_id=nil, save=false, keep_outputs=false)
         extname = File.extname(filename)
         case extname
         when '.rb'
-          executor = RubySeederExecutor.new(
+          executor = RubySeederExecutor.new(
+            filename: filename,
+            job_id: job_id,
+            keep_outputs: keep_outputs
+          )
           executor.exec_seeder(save)
         else
           puts "Unable to find a seeder executor for file type \"#{extname}\""
@@ -15,4 +19,4 @@ module Datahen

     end
   end
-end
+end
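Same pattern as the parser: the seeder CLI's `--keep-outputs` flag becomes the new fourth argument of `exec_seeder`. A sketch with a placeholder seeder script and job ID:

```ruby
require 'datahen'

# save=false mirrors `seeder try_seed`; true keeps existing outputs.
puts Datahen::Scraper::Seeder.exec_seeder("seeder/seeder.rb", 12345, false, true)
```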
data/lib/datahen/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: datahen
 version: !ruby/object:Gem::Version
-  version: 0.
+  version: 0.14.3
 platform: ruby
 authors:
 - Parama Danoesubroto
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2020-
+date: 2020-08-02 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: thor
@@ -215,10 +215,12 @@ files:
 - lib/datahen/client/global_page.rb
 - lib/datahen/client/job.rb
 - lib/datahen/client/job_export.rb
+- lib/datahen/client/job_finisher.rb
 - lib/datahen/client/job_log.rb
 - lib/datahen/client/job_output.rb
 - lib/datahen/client/job_page.rb
 - lib/datahen/client/job_stat.rb
+- lib/datahen/client/job_var.rb
 - lib/datahen/client/scraper.rb
 - lib/datahen/client/scraper_deployment.rb
 - lib/datahen/client/scraper_export.rb
@@ -262,7 +264,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 - !ruby/object:Gem::Version
   version: '0'
 requirements: []
-rubygems_version: 3.
+rubygems_version: 3.1.2
 signing_key:
 specification_version: 4
 summary: DataHen toolbelt for developers