datahen 0.13.0 → 0.13.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/datahen/cli/parser.rb +9 -5
- data/lib/datahen/cli/scraper_export.rb +2 -3
- data/lib/datahen/cli/scraper_finisher.rb +8 -2
- data/lib/datahen/cli/scraper_job.rb +32 -8
- data/lib/datahen/cli/scraper_job_var.rb +33 -10
- data/lib/datahen/cli/scraper_page.rb +16 -4
- data/lib/datahen/cli/seeder.rb +4 -2
- data/lib/datahen/client.rb +2 -0
- data/lib/datahen/client/job.rb +2 -0
- data/lib/datahen/client/job_finisher.rb +16 -0
- data/lib/datahen/client/job_page.rb +11 -0
- data/lib/datahen/client/job_var.rb +28 -0
- data/lib/datahen/client/scraper_job_page.rb +3 -5
- data/lib/datahen/scraper/executor.rb +4 -4
- data/lib/datahen/scraper/parser.rb +10 -4
- data/lib/datahen/scraper/ruby_parser_executor.rb +7 -4
- data/lib/datahen/scraper/ruby_seeder_executor.rb +4 -1
- data/lib/datahen/scraper/seeder.rb +7 -3
- data/lib/datahen/version.rb +1 -1
- metadata +4 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: c05f6ab973fe74a21e7f68411a66b97444575cc75c5812e99c5e0da4ffe05d56
|
|
4
|
+
data.tar.gz: dac1d2be8f6281d3da328abd5f844036765991764bb1e3ffad4b72ee4c18eac6
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 48c15f9830308488d434dce7b7cf4888795724d88d4ca9de63cc2deca8397798b815145430af2143eb646366bfc4e97f37e563315214258b434276919e724ac7
|
|
7
|
+
data.tar.gz: faee0369f81ab45dbb6c62258a6214d530d51d5dcb0f6b6ebd7028bda2c48ccbe630b41006ffcc96dba773a28e84e724b6b1f2266ef7866605e5423f1ced61d2
|
data/lib/datahen/cli/parser.rb
CHANGED
|
@@ -10,12 +10,13 @@ module Datahen
|
|
|
10
10
|
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
|
11
11
|
option :global, :aliases => :g, type: :boolean, default: false, desc: 'Use globalpage instead of a job page'
|
|
12
12
|
option :vars, :aliases => :v, type: :string, desc: 'Set user-defined page variables. Must be in json format. i.e: {"Foo":"bar"}'
|
|
13
|
+
option :"keep-outputs", :aliases => :ko, type: :boolean, default: false, desc: "Don't delete existing outputs"
|
|
13
14
|
def try_parse(scraper_name, parser_file, gid)
|
|
14
|
-
begin
|
|
15
|
-
|
|
15
|
+
begin
|
|
16
|
+
|
|
16
17
|
if options[:job]
|
|
17
18
|
job_id = options[:job]
|
|
18
|
-
elsif options[:global]
|
|
19
|
+
elsif options[:global]
|
|
19
20
|
job_id = nil
|
|
20
21
|
else
|
|
21
22
|
job = Client::ScraperJob.new(options).find(scraper_name)
|
|
@@ -24,7 +25,7 @@ module Datahen
|
|
|
24
25
|
|
|
25
26
|
|
|
26
27
|
vars = JSON.parse(options[:vars]) if options[:vars]
|
|
27
|
-
puts Datahen::Scraper::Parser.exec_parser_page(parser_file, gid, job_id, false, vars)
|
|
28
|
+
puts Datahen::Scraper::Parser.exec_parser_page(parser_file, gid, job_id, false, vars, options[:"keep-outputs"])
|
|
28
29
|
|
|
29
30
|
rescue JSON::ParserError
|
|
30
31
|
if options[:vars]
|
|
@@ -40,6 +41,8 @@ module Datahen
|
|
|
40
41
|
<GID>: Global ID of the page.\x5
|
|
41
42
|
LONGDESC
|
|
42
43
|
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
|
44
|
+
option :vars, :aliases => :v, type: :string, desc: 'Set user-defined page variables. Must be in json format. i.e: {"Foo":"bar"}'
|
|
45
|
+
option :"keep-outputs", :aliases => :ko, type: :boolean, default: false, desc: "Don't delete existing outputs"
|
|
43
46
|
def exec_parse(scraper_name, parser_file, *gids)
|
|
44
47
|
gids.each do |gid|
|
|
45
48
|
begin
|
|
@@ -52,7 +55,8 @@ module Datahen
|
|
|
52
55
|
job_id = job['id']
|
|
53
56
|
end
|
|
54
57
|
|
|
55
|
-
|
|
58
|
+
vars = JSON.parse(options[:vars]) if options[:vars]
|
|
59
|
+
puts Datahen::Scraper::Parser.exec_parser_page(parser_file, gid, job_id, true, vars, options[:"keep-outputs"])
|
|
56
60
|
rescue => e
|
|
57
61
|
puts e
|
|
58
62
|
end
|
|
@@ -12,7 +12,6 @@ module Datahen
|
|
|
12
12
|
puts "#{client.find(export_id)}"
|
|
13
13
|
end
|
|
14
14
|
|
|
15
|
-
|
|
16
15
|
desc "list", "Gets a list of exports"
|
|
17
16
|
long_desc <<-LONGDESC
|
|
18
17
|
List exports.
|
|
@@ -34,13 +33,13 @@ module Datahen
|
|
|
34
33
|
def download(export_id)
|
|
35
34
|
client = Client::ScraperExport.new(options)
|
|
36
35
|
result = JSON.parse(client.download(export_id).to_s)
|
|
37
|
-
|
|
36
|
+
|
|
38
37
|
if result['signed_url']
|
|
39
38
|
puts "Download url: \"#{result['signed_url']}\""
|
|
40
39
|
`open "#{result['signed_url']}"`
|
|
41
40
|
else
|
|
42
41
|
puts "Exported file does not exist"
|
|
43
|
-
end
|
|
42
|
+
end
|
|
44
43
|
end
|
|
45
44
|
|
|
46
45
|
|
|
@@ -11,9 +11,15 @@ module Datahen
|
|
|
11
11
|
long_desc <<-LONGDESC
|
|
12
12
|
Reset finisher on a scraper's current job.\x5
|
|
13
13
|
LONGDESC
|
|
14
|
+
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
|
14
15
|
def reset(scraper_name)
|
|
15
|
-
|
|
16
|
-
|
|
16
|
+
if options[:job]
|
|
17
|
+
client = Client::JobFinisher.new(options)
|
|
18
|
+
puts "#{client.reset(options[:job])}"
|
|
19
|
+
else
|
|
20
|
+
client = Client::ScraperFinisher.new(options)
|
|
21
|
+
puts "#{client.reset(scraper_name)}"
|
|
22
|
+
end
|
|
17
23
|
end
|
|
18
24
|
end
|
|
19
25
|
end
|
|
@@ -29,27 +29,45 @@ module Datahen
|
|
|
29
29
|
long_desc <<-LONGDESC
|
|
30
30
|
Cancels a scraper's current job
|
|
31
31
|
LONGDESC
|
|
32
|
+
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
|
32
33
|
def cancel(scraper_name)
|
|
33
|
-
|
|
34
|
-
|
|
34
|
+
if options[:job]
|
|
35
|
+
client = Client::Job.new(options)
|
|
36
|
+
puts "#{client.cancel(options[:job])}"
|
|
37
|
+
else
|
|
38
|
+
client = Client::ScraperJob.new(options)
|
|
39
|
+
puts "#{client.cancel(scraper_name)}"
|
|
40
|
+
end
|
|
35
41
|
end
|
|
36
42
|
|
|
37
43
|
desc "resume <scraper_name>", "resumes a scraper's current job"
|
|
38
44
|
long_desc <<-LONGDESC
|
|
39
45
|
Resumes a scraper's current job
|
|
40
46
|
LONGDESC
|
|
47
|
+
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
|
41
48
|
def resume(scraper_name)
|
|
42
|
-
|
|
43
|
-
|
|
49
|
+
if options[:job]
|
|
50
|
+
client = Client::Job.new(options)
|
|
51
|
+
puts "#{client.resume(options[:job])}"
|
|
52
|
+
else
|
|
53
|
+
client = Client::ScraperJob.new(options)
|
|
54
|
+
puts "#{client.resume(scraper_name)}"
|
|
55
|
+
end
|
|
44
56
|
end
|
|
45
57
|
|
|
46
58
|
desc "pause <scraper_name>", "pauses a scraper's current job"
|
|
47
59
|
long_desc <<-LONGDESC
|
|
48
60
|
pauses a scraper's current job
|
|
49
61
|
LONGDESC
|
|
62
|
+
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
|
50
63
|
def pause(scraper_name)
|
|
51
|
-
|
|
52
|
-
|
|
64
|
+
if options[:job]
|
|
65
|
+
client = Client::Job.new(options)
|
|
66
|
+
puts "#{client.pause(options[:job])}"
|
|
67
|
+
else
|
|
68
|
+
client = Client::ScraperJob.new(options)
|
|
69
|
+
puts "#{client.pause(scraper_name)}"
|
|
70
|
+
end
|
|
53
71
|
end
|
|
54
72
|
|
|
55
73
|
|
|
@@ -60,9 +78,15 @@ module Datahen
|
|
|
60
78
|
option :workers, :aliases => :w, type: :numeric, desc: 'Set how many standard workers to use. Scraper job must be restarted(paused then resumed, or cancelled then resumed) for it to take effect. Default: 1. '
|
|
61
79
|
option :browsers, type: :numeric, desc: 'Set how many browser workers to use. Scraper job must be restarted(paused then resumed, or cancelled then resumed) for it to take effect. Default: 0. '
|
|
62
80
|
option :proxy_type, desc: 'Set the Proxy type. Default: standard'
|
|
81
|
+
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
|
63
82
|
def update(scraper_name)
|
|
64
|
-
|
|
65
|
-
|
|
83
|
+
if options[:job]
|
|
84
|
+
client = Client::Job.new(options)
|
|
85
|
+
puts "#{client.update(options[:job], options)}"
|
|
86
|
+
else
|
|
87
|
+
client = Client::ScraperJob.new(options)
|
|
88
|
+
puts "#{client.update(scraper_name, options)}"
|
|
89
|
+
end
|
|
66
90
|
end
|
|
67
91
|
|
|
68
92
|
desc "var SUBCOMMAND ...ARGS", "for managing scraper's job variables"
|
|
@@ -13,9 +13,15 @@ module Datahen
|
|
|
13
13
|
LONGDESC
|
|
14
14
|
option :page, :aliases => :p, type: :numeric, desc: 'Get the next set of records by page.'
|
|
15
15
|
option :per_page, :aliases => :P, type: :numeric, desc: 'Number of records per page. Max 500 per page.'
|
|
16
|
+
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
|
16
17
|
def list(scraper_name)
|
|
17
|
-
|
|
18
|
-
|
|
18
|
+
if options[:job]
|
|
19
|
+
client = Client::JobVar.new(options)
|
|
20
|
+
puts "#{client.all(options[:job])}"
|
|
21
|
+
else
|
|
22
|
+
client = Client::ScraperJobVar.new(options)
|
|
23
|
+
puts "#{client.all(scraper_name)}"
|
|
24
|
+
end
|
|
19
25
|
end
|
|
20
26
|
|
|
21
27
|
desc "set <scraper_name> <var_name> <value>", "Set an environment var on the scrape job"
|
|
@@ -24,23 +30,40 @@ module Datahen
|
|
|
24
30
|
<var_name>: Var name can only consist of alphabets, numbers, underscores. Name must be unique to your scrape job, otherwise it will be overwritten.\x5
|
|
25
31
|
<value>: Value of variable.\x5
|
|
26
32
|
LONGDESC
|
|
27
|
-
option :secret, type: :boolean, desc: 'Set true to make it decrypt the value. Default: false'
|
|
33
|
+
option :secret, type: :boolean, desc: 'Set true to make it decrypt the value. Default: false'
|
|
34
|
+
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
|
28
35
|
def set(scraper_name, var_name, value)
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
36
|
+
if options[:job]
|
|
37
|
+
client = Client::JobVar.new(options)
|
|
38
|
+
puts "#{client.set(options[:job], var_name, value, options)}"
|
|
39
|
+
else
|
|
40
|
+
client = Client::ScraperJobVar.new(options)
|
|
41
|
+
puts "#{client.set(scraper_name, var_name, value, options)}"
|
|
42
|
+
end
|
|
32
43
|
end
|
|
33
44
|
|
|
34
45
|
desc "show <scraper_name> <var_name>", "Show an environment variable on the scrape job"
|
|
46
|
+
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
|
35
47
|
def show(scraper_name, var_name)
|
|
36
|
-
|
|
37
|
-
|
|
48
|
+
if options[:job]
|
|
49
|
+
client = Client::JobVar.new(options)
|
|
50
|
+
puts "#{client.find(options[:job], var_name)}"
|
|
51
|
+
else
|
|
52
|
+
client = Client::ScraperJobVar.new(options)
|
|
53
|
+
puts "#{client.find(scraper_name, var_name)}"
|
|
54
|
+
end
|
|
38
55
|
end
|
|
39
56
|
|
|
40
57
|
desc "unset <scraper_name> <var_name>", "Deletes an environment variable on the scrape job"
|
|
58
|
+
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
|
41
59
|
def unset(scraper_name, var_name)
|
|
42
|
-
|
|
43
|
-
|
|
60
|
+
if options[:job]
|
|
61
|
+
client = Client::JobVar.new(options)
|
|
62
|
+
puts "#{client.unset(options[:job], var_name)}"
|
|
63
|
+
else
|
|
64
|
+
client = Client::ScraperJobVar.new(options)
|
|
65
|
+
puts "#{client.unset(scraper_name, var_name)}"
|
|
66
|
+
end
|
|
44
67
|
end
|
|
45
68
|
end
|
|
46
69
|
end
|
|
@@ -105,13 +105,19 @@ module Datahen
|
|
|
105
105
|
option :fetch_fail, type: :boolean, desc: 'Refetches only pages that fails fetching.'
|
|
106
106
|
option :parse_fail, type: :boolean, desc: 'Refetches only pages that fails parsing.'
|
|
107
107
|
option :status, type: :string, desc: 'Refetches only pages with a specific status.'
|
|
108
|
+
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
|
108
109
|
def refetch(scraper_name)
|
|
109
110
|
if !options.key?(:gid) && !options.key?(:fetch_fail) && !options.key?(:parse_fail) && !options.key?(:status)
|
|
110
111
|
puts "Must specify either a --gid, --fetch-fail, --parse-fail or --status"
|
|
111
112
|
return
|
|
112
113
|
end
|
|
113
|
-
|
|
114
|
-
|
|
114
|
+
if options[:job]
|
|
115
|
+
client = Client::JobPage.new(options)
|
|
116
|
+
puts "#{client.refetch(options[:job])}"
|
|
117
|
+
else
|
|
118
|
+
client = Client::ScraperJobPage.new(options)
|
|
119
|
+
puts "#{client.refetch(scraper_name)}"
|
|
120
|
+
end
|
|
115
121
|
end
|
|
116
122
|
|
|
117
123
|
desc "reparse <scraper_name>", "Reparse Pages on a scraper's current job"
|
|
@@ -121,6 +127,7 @@ module Datahen
|
|
|
121
127
|
option :gid, :aliases => :g, type: :string, desc: 'Reparse a specific GID'
|
|
122
128
|
option :parse_fail, type: :boolean, desc: 'Reparse only pages that fails parsing.'
|
|
123
129
|
option :status, type: :string, desc: 'Reparse only pages with a specific status.'
|
|
130
|
+
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
|
124
131
|
def reparse(scraper_name)
|
|
125
132
|
begin
|
|
126
133
|
options[:vars] = JSON.parse(options[:vars]) if options[:vars]
|
|
@@ -130,8 +137,13 @@ module Datahen
|
|
|
130
137
|
return
|
|
131
138
|
end
|
|
132
139
|
|
|
133
|
-
|
|
134
|
-
|
|
140
|
+
if options[:job]
|
|
141
|
+
client = Client::JobPage.new(options)
|
|
142
|
+
puts "#{client.reparse(options[:job])}"
|
|
143
|
+
else
|
|
144
|
+
client = Client::ScraperJobPage.new(options)
|
|
145
|
+
puts "#{client.reparse(scraper_name)}"
|
|
146
|
+
end
|
|
135
147
|
|
|
136
148
|
rescue JSON::ParserError
|
|
137
149
|
if options[:vars]
|
data/lib/datahen/cli/seeder.rb
CHANGED
|
@@ -7,6 +7,7 @@ module Datahen
|
|
|
7
7
|
<seeder_file>: Seeder script file will be executed.\x5
|
|
8
8
|
LONGDESC
|
|
9
9
|
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
|
10
|
+
option :"keep-outputs", :aliases => :ko, type: :boolean, default: false, desc: "Don't delete existing outputs"
|
|
10
11
|
def try_seed(scraper_name, seeder_file)
|
|
11
12
|
if options[:job]
|
|
12
13
|
job_id = options[:job]
|
|
@@ -14,8 +15,8 @@ module Datahen
|
|
|
14
15
|
job = Client::ScraperJob.new(options).find(scraper_name)
|
|
15
16
|
job_id = job['id']
|
|
16
17
|
end
|
|
17
|
-
|
|
18
|
-
puts Datahen::Scraper::Seeder.exec_seeder(seeder_file, job_id, false)
|
|
18
|
+
|
|
19
|
+
puts Datahen::Scraper::Seeder.exec_seeder(seeder_file, job_id, false, options[:"keep-outputs"])
|
|
19
20
|
end
|
|
20
21
|
|
|
21
22
|
desc "exec <scraper_name> <seeder_file>", "Executes a seeder script onto a scraper's current job."
|
|
@@ -24,6 +25,7 @@ module Datahen
|
|
|
24
25
|
<seeder_file>: Seeder script file that will be executed on the scraper's current job.\x5
|
|
25
26
|
LONGDESC
|
|
26
27
|
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
|
28
|
+
option :"keep-outputs", :aliases => :ko, type: :boolean, default: false, desc: "Don't delete existing outputs"
|
|
27
29
|
def exec_parse(scraper_name, seeder_file)
|
|
28
30
|
if options[:job]
|
|
29
31
|
job_id = options[:job]
|
data/lib/datahen/client.rb
CHANGED
|
@@ -20,7 +20,9 @@ require "datahen/client/job_stat"
|
|
|
20
20
|
require "datahen/client/backblaze_content"
|
|
21
21
|
require "datahen/client/env_var"
|
|
22
22
|
require "datahen/client/scraper_var"
|
|
23
|
+
require "datahen/client/job_var"
|
|
23
24
|
require "datahen/client/scraper_job_var"
|
|
25
|
+
require "datahen/client/job_finisher"
|
|
24
26
|
|
|
25
27
|
|
|
26
28
|
module Datahen
|
data/lib/datahen/client/job.rb
CHANGED
|
@@ -15,6 +15,7 @@ module Datahen
|
|
|
15
15
|
body[:status] = opts[:status] if opts[:status]
|
|
16
16
|
body[:standard_worker_count] = opts[:workers] if opts[:workers]
|
|
17
17
|
body[:browser_worker_count] = opts[:browsers] if opts[:browsers]
|
|
18
|
+
body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
|
|
18
19
|
params = @options.merge({body: body.to_json})
|
|
19
20
|
|
|
20
21
|
self.class.put("/jobs/#{job_id}", params)
|
|
@@ -41,6 +42,7 @@ module Datahen
|
|
|
41
42
|
body[:pages] = opts.fetch(:pages) {[]}
|
|
42
43
|
body[:seeding_status] = opts.fetch(:seeding_status){ nil }
|
|
43
44
|
body[:log_error] = opts[:log_error] if opts[:log_error]
|
|
45
|
+
body[:keep_outputs] = !!opts[:keep_outputs] if opts.has_key?(:keep_outputs)
|
|
44
46
|
|
|
45
47
|
params = @options.merge({body: body.to_json})
|
|
46
48
|
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
module Datahen
|
|
2
|
+
module Client
|
|
3
|
+
class JobFinisher < Datahen::Client::Base
|
|
4
|
+
# Reset finisher on a scraper's current job.
|
|
5
|
+
#
|
|
6
|
+
# @param [Integer] job_id Job ID
|
|
7
|
+
# @param [Hash] opts ({}) API custom parameters.
|
|
8
|
+
#
|
|
9
|
+
# @return [HTTParty::Response]
|
|
10
|
+
def reset(job_id, opts={})
|
|
11
|
+
params = @options.merge(opts)
|
|
12
|
+
self.class.put("/jobs/#{job_id}/finisher/reset", params)
|
|
13
|
+
end
|
|
14
|
+
end
|
|
15
|
+
end
|
|
16
|
+
end
|
|
@@ -48,6 +48,7 @@ module Datahen
|
|
|
48
48
|
body[:pages] = opts.fetch(:pages) {[]}
|
|
49
49
|
body[:parsing_status] = opts.fetch(:parsing_status){ nil }
|
|
50
50
|
body[:log_error] = opts[:log_error] if opts[:log_error]
|
|
51
|
+
body[:keep_outputs] = !!opts[:keep_outputs] if opts.has_key?(:keep_outputs)
|
|
51
52
|
|
|
52
53
|
params = @options.merge({body: body.to_json})
|
|
53
54
|
|
|
@@ -61,6 +62,16 @@ module Datahen
|
|
|
61
62
|
def find_failed_content(job_id, gid)
|
|
62
63
|
self.class.get("/jobs/#{job_id}/pages/#{gid}/failed_content", @options)
|
|
63
64
|
end
|
|
65
|
+
|
|
66
|
+
def reparse(job_id, opts={})
|
|
67
|
+
params = @options.merge(opts)
|
|
68
|
+
self.class.put("/jobs/#{job_id}/pages/reparse", params)
|
|
69
|
+
end
|
|
70
|
+
|
|
71
|
+
def refetch(job_id, opts={})
|
|
72
|
+
params = @options.merge(opts)
|
|
73
|
+
self.class.put("/jobs/#{job_id}/pages/refetch", params)
|
|
74
|
+
end
|
|
64
75
|
end
|
|
65
76
|
end
|
|
66
77
|
end
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
module Datahen
|
|
2
|
+
module Client
|
|
3
|
+
class JobVar < Datahen::Client::Base
|
|
4
|
+
|
|
5
|
+
def find(job_id, var_name)
|
|
6
|
+
self.class.get("/jobs/#{job_id}/vars/#{var_name}", @options)
|
|
7
|
+
end
|
|
8
|
+
|
|
9
|
+
def all(job_id, opts={})
|
|
10
|
+
params = @options.merge opts
|
|
11
|
+
self.class.get("/jobs/#{job_id}/vars", params)
|
|
12
|
+
end
|
|
13
|
+
|
|
14
|
+
def set(job_id, var_name, value, opts={})
|
|
15
|
+
body = {}
|
|
16
|
+
body[:value] = value
|
|
17
|
+
body[:secret] = opts[:secret] if opts[:secret]
|
|
18
|
+
params = @options.merge({body: body.to_json})
|
|
19
|
+
self.class.put("/jobs/#{job_id}/vars/#{var_name}", params)
|
|
20
|
+
end
|
|
21
|
+
|
|
22
|
+
def unset(job_id, var_name, opts={})
|
|
23
|
+
params = @options.merge(opts)
|
|
24
|
+
self.class.delete("/jobs/#{job_id}/vars/#{var_name}", params)
|
|
25
|
+
end
|
|
26
|
+
end
|
|
27
|
+
end
|
|
28
|
+
end
|
|
@@ -26,6 +26,9 @@ module Datahen
|
|
|
26
26
|
self.class.put("/scrapers/#{scraper_name}/current_job/pages/refetch", params)
|
|
27
27
|
end
|
|
28
28
|
|
|
29
|
+
# Deprecated, please use Datahen::Client::JobPage#refetch instead.
|
|
30
|
+
#
|
|
31
|
+
# @note This method will be removed at some point in the future.
|
|
29
32
|
def refetch_by_job(job_id, opts={})
|
|
30
33
|
params = @options.merge(opts)
|
|
31
34
|
self.class.put("/jobs/#{job_id}/pages/refetch", params)
|
|
@@ -36,11 +39,6 @@ module Datahen
|
|
|
36
39
|
self.class.put("/scrapers/#{scraper_name}/current_job/pages/reparse", params)
|
|
37
40
|
end
|
|
38
41
|
|
|
39
|
-
def reparse_by_job(job_id, opts={})
|
|
40
|
-
params = @options.merge(opts)
|
|
41
|
-
self.class.put("/jobs/#{job_id}/pages/reparse", params)
|
|
42
|
-
end
|
|
43
|
-
|
|
44
42
|
def enqueue(scraper_name, method, url, opts={})
|
|
45
43
|
body = {}
|
|
46
44
|
body[:method] = method != "" ? method : "GET"
|
|
@@ -63,9 +63,9 @@ module Datahen
|
|
|
63
63
|
client.find(gid)
|
|
64
64
|
end
|
|
65
65
|
|
|
66
|
-
def get_content(gid)
|
|
67
|
-
client = Client::
|
|
68
|
-
content_json = client.find_content(gid)
|
|
66
|
+
def get_content(job_id, gid)
|
|
67
|
+
client = Client::JobPage.new()
|
|
68
|
+
content_json = client.find_content(job_id, gid)
|
|
69
69
|
|
|
70
70
|
if content_json['available']
|
|
71
71
|
signed_url = content_json['signed_url']
|
|
@@ -75,7 +75,7 @@ module Datahen
|
|
|
75
75
|
end
|
|
76
76
|
end
|
|
77
77
|
|
|
78
|
-
def get_failed_content(gid)
|
|
78
|
+
def get_failed_content(job_id, gid)
|
|
79
79
|
client = Client::JobPage.new()
|
|
80
80
|
content_json = client.find_failed_content(job_id, gid)
|
|
81
81
|
|
|
@@ -1,18 +1,24 @@
|
|
|
1
1
|
module Datahen
|
|
2
2
|
module Scraper
|
|
3
3
|
class Parser
|
|
4
|
-
def self.exec_parser_page(filename, gid, job_id=nil, save=false, vars = {})
|
|
4
|
+
def self.exec_parser_page(filename, gid, job_id=nil, save=false, vars = {}, keep_outputs=false)
|
|
5
5
|
extname = File.extname(filename)
|
|
6
6
|
case extname
|
|
7
7
|
when '.rb'
|
|
8
|
-
executor = RubyParserExecutor.new(
|
|
8
|
+
executor = RubyParserExecutor.new(
|
|
9
|
+
filename: filename,
|
|
10
|
+
gid: gid,
|
|
11
|
+
job_id: job_id,
|
|
12
|
+
vars: vars,
|
|
13
|
+
keep_outputs: keep_outputs
|
|
14
|
+
)
|
|
9
15
|
executor.exec_parser(save)
|
|
10
16
|
else
|
|
11
17
|
puts "Unable to find a parser executor for file type \"#{extname}\""
|
|
12
18
|
end
|
|
13
19
|
end
|
|
14
20
|
|
|
15
|
-
|
|
21
|
+
|
|
16
22
|
end
|
|
17
23
|
end
|
|
18
|
-
end
|
|
24
|
+
end
|
|
@@ -15,6 +15,7 @@ module Datahen
|
|
|
15
15
|
@gid = options.fetch(:gid) { raise "GID is required"}
|
|
16
16
|
@job_id = options.fetch(:job_id)
|
|
17
17
|
@page_vars = options.fetch(:vars) { {} }
|
|
18
|
+
@keep_outputs = !!(options.fetch(:keep_outputs) { false })
|
|
18
19
|
end
|
|
19
20
|
|
|
20
21
|
def self.exposed_methods
|
|
@@ -66,7 +67,9 @@ module Datahen
|
|
|
66
67
|
response = parsing_update(
|
|
67
68
|
job_id: job_id,
|
|
68
69
|
gid: gid,
|
|
69
|
-
parsing_status: :starting
|
|
70
|
+
parsing_status: :starting,
|
|
71
|
+
keep_outputs: @keep_outputs
|
|
72
|
+
)
|
|
70
73
|
|
|
71
74
|
if response.code == 200
|
|
72
75
|
puts "Page Parsing Status Updated."
|
|
@@ -165,7 +168,7 @@ module Datahen
|
|
|
165
168
|
handle_error(e) if save
|
|
166
169
|
raise e
|
|
167
170
|
end
|
|
168
|
-
|
|
171
|
+
|
|
169
172
|
if refetch_self
|
|
170
173
|
refetch_page gid
|
|
171
174
|
elsif reparse_self
|
|
@@ -178,11 +181,11 @@ module Datahen
|
|
|
178
181
|
end
|
|
179
182
|
|
|
180
183
|
def content
|
|
181
|
-
@content ||= get_content(gid)
|
|
184
|
+
@content ||= get_content(job_id, gid)
|
|
182
185
|
end
|
|
183
186
|
|
|
184
187
|
def failed_content
|
|
185
|
-
@failed_content ||= get_failed_content(gid)
|
|
188
|
+
@failed_content ||= get_failed_content(job_id, gid)
|
|
186
189
|
end
|
|
187
190
|
|
|
188
191
|
def handle_error(e)
|
|
@@ -6,6 +6,7 @@ module Datahen
|
|
|
6
6
|
def initialize(options={})
|
|
7
7
|
@filename = options.fetch(:filename) { raise "Filename is required"}
|
|
8
8
|
@job_id = options[:job_id]
|
|
9
|
+
@keep_outputs = !!(options.fetch(:keep_outputs) { false })
|
|
9
10
|
end
|
|
10
11
|
|
|
11
12
|
def self.exposed_methods
|
|
@@ -81,7 +82,9 @@ module Datahen
|
|
|
81
82
|
|
|
82
83
|
response = seeding_update(
|
|
83
84
|
job_id: job_id,
|
|
84
|
-
seeding_status: :starting
|
|
85
|
+
seeding_status: :starting,
|
|
86
|
+
keep_outputs: @keep_outputs
|
|
87
|
+
)
|
|
85
88
|
|
|
86
89
|
if response.code == 200
|
|
87
90
|
puts "Seeding Status Updated."
|
|
@@ -2,11 +2,15 @@ module Datahen
|
|
|
2
2
|
module Scraper
|
|
3
3
|
class Seeder
|
|
4
4
|
|
|
5
|
-
def self.exec_seeder(filename, job_id=nil, save=false)
|
|
5
|
+
def self.exec_seeder(filename, job_id=nil, save=false, keep_outputs=false)
|
|
6
6
|
extname = File.extname(filename)
|
|
7
7
|
case extname
|
|
8
8
|
when '.rb'
|
|
9
|
-
executor = RubySeederExecutor.new(
|
|
9
|
+
executor = RubySeederExecutor.new(
|
|
10
|
+
filename: filename,
|
|
11
|
+
job_id: job_id,
|
|
12
|
+
keep_outputs: keep_outputs
|
|
13
|
+
)
|
|
10
14
|
executor.exec_seeder(save)
|
|
11
15
|
else
|
|
12
16
|
puts "Unable to find a seeder executor for file type \"#{extname}\""
|
|
@@ -15,4 +19,4 @@ module Datahen
|
|
|
15
19
|
|
|
16
20
|
end
|
|
17
21
|
end
|
|
18
|
-
end
|
|
22
|
+
end
|
data/lib/datahen/version.rb
CHANGED
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: datahen
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.13.0
|
|
4
|
+
version: 0.13.7
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Parama Danoesubroto
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2020-
|
|
11
|
+
date: 2020-05-06 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: thor
|
|
@@ -215,10 +215,12 @@ files:
|
|
|
215
215
|
- lib/datahen/client/global_page.rb
|
|
216
216
|
- lib/datahen/client/job.rb
|
|
217
217
|
- lib/datahen/client/job_export.rb
|
|
218
|
+
- lib/datahen/client/job_finisher.rb
|
|
218
219
|
- lib/datahen/client/job_log.rb
|
|
219
220
|
- lib/datahen/client/job_output.rb
|
|
220
221
|
- lib/datahen/client/job_page.rb
|
|
221
222
|
- lib/datahen/client/job_stat.rb
|
|
223
|
+
- lib/datahen/client/job_var.rb
|
|
222
224
|
- lib/datahen/client/scraper.rb
|
|
223
225
|
- lib/datahen/client/scraper_deployment.rb
|
|
224
226
|
- lib/datahen/client/scraper_export.rb
|