datahen 0.13.0 → 0.13.7
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/datahen/cli/parser.rb +9 -5
- data/lib/datahen/cli/scraper_export.rb +2 -3
- data/lib/datahen/cli/scraper_finisher.rb +8 -2
- data/lib/datahen/cli/scraper_job.rb +32 -8
- data/lib/datahen/cli/scraper_job_var.rb +33 -10
- data/lib/datahen/cli/scraper_page.rb +16 -4
- data/lib/datahen/cli/seeder.rb +4 -2
- data/lib/datahen/client.rb +2 -0
- data/lib/datahen/client/job.rb +2 -0
- data/lib/datahen/client/job_finisher.rb +16 -0
- data/lib/datahen/client/job_page.rb +11 -0
- data/lib/datahen/client/job_var.rb +28 -0
- data/lib/datahen/client/scraper_job_page.rb +3 -5
- data/lib/datahen/scraper/executor.rb +4 -4
- data/lib/datahen/scraper/parser.rb +10 -4
- data/lib/datahen/scraper/ruby_parser_executor.rb +7 -4
- data/lib/datahen/scraper/ruby_seeder_executor.rb +4 -1
- data/lib/datahen/scraper/seeder.rb +7 -3
- data/lib/datahen/version.rb +1 -1
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c05f6ab973fe74a21e7f68411a66b97444575cc75c5812e99c5e0da4ffe05d56
|
4
|
+
data.tar.gz: dac1d2be8f6281d3da328abd5f844036765991764bb1e3ffad4b72ee4c18eac6
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 48c15f9830308488d434dce7b7cf4888795724d88d4ca9de63cc2deca8397798b815145430af2143eb646366bfc4e97f37e563315214258b434276919e724ac7
|
7
|
+
data.tar.gz: faee0369f81ab45dbb6c62258a6214d530d51d5dcb0f6b6ebd7028bda2c48ccbe630b41006ffcc96dba773a28e84e724b6b1f2266ef7866605e5423f1ced61d2
|
data/lib/datahen/cli/parser.rb
CHANGED
@@ -10,12 +10,13 @@ module Datahen
|
|
10
10
|
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
11
11
|
option :global, :aliases => :g, type: :boolean, default: false, desc: 'Use globalpage instead of a job page'
|
12
12
|
option :vars, :aliases => :v, type: :string, desc: 'Set user-defined page variables. Must be in json format. i.e: {"Foo":"bar"}'
|
13
|
+
option :"keep-outputs", :aliases => :ko, type: :boolean, default: false, desc: "Don't delete existing outputs"
|
13
14
|
def try_parse(scraper_name, parser_file, gid)
|
14
|
-
begin
|
15
|
-
|
15
|
+
begin
|
16
|
+
|
16
17
|
if options[:job]
|
17
18
|
job_id = options[:job]
|
18
|
-
elsif options[:global]
|
19
|
+
elsif options[:global]
|
19
20
|
job_id = nil
|
20
21
|
else
|
21
22
|
job = Client::ScraperJob.new(options).find(scraper_name)
|
@@ -24,7 +25,7 @@ module Datahen
|
|
24
25
|
|
25
26
|
|
26
27
|
vars = JSON.parse(options[:vars]) if options[:vars]
|
27
|
-
puts Datahen::Scraper::Parser.exec_parser_page(parser_file, gid, job_id, false, vars)
|
28
|
+
puts Datahen::Scraper::Parser.exec_parser_page(parser_file, gid, job_id, false, vars, options[:"keep-outputs"])
|
28
29
|
|
29
30
|
rescue JSON::ParserError
|
30
31
|
if options[:vars]
|
@@ -40,6 +41,8 @@ module Datahen
|
|
40
41
|
<GID>: Global ID of the page.\x5
|
41
42
|
LONGDESC
|
42
43
|
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
44
|
+
option :vars, :aliases => :v, type: :string, desc: 'Set user-defined page variables. Must be in json format. i.e: {"Foo":"bar"}'
|
45
|
+
option :"keep-outputs", :aliases => :ko, type: :boolean, default: false, desc: "Don't delete existing outputs"
|
43
46
|
def exec_parse(scraper_name, parser_file, *gids)
|
44
47
|
gids.each do |gid|
|
45
48
|
begin
|
@@ -52,7 +55,8 @@ module Datahen
|
|
52
55
|
job_id = job['id']
|
53
56
|
end
|
54
57
|
|
55
|
-
|
58
|
+
vars = JSON.parse(options[:vars]) if options[:vars]
|
59
|
+
puts Datahen::Scraper::Parser.exec_parser_page(parser_file, gid, job_id, true, vars, options[:"keep-outputs"])
|
56
60
|
rescue => e
|
57
61
|
puts e
|
58
62
|
end
|
@@ -12,7 +12,6 @@ module Datahen
|
|
12
12
|
puts "#{client.find(export_id)}"
|
13
13
|
end
|
14
14
|
|
15
|
-
|
16
15
|
desc "list", "Gets a list of exports"
|
17
16
|
long_desc <<-LONGDESC
|
18
17
|
List exports.
|
@@ -34,13 +33,13 @@ module Datahen
|
|
34
33
|
def download(export_id)
|
35
34
|
client = Client::ScraperExport.new(options)
|
36
35
|
result = JSON.parse(client.download(export_id).to_s)
|
37
|
-
|
36
|
+
|
38
37
|
if result['signed_url']
|
39
38
|
puts "Download url: \"#{result['signed_url']}\""
|
40
39
|
`open "#{result['signed_url']}"`
|
41
40
|
else
|
42
41
|
puts "Exported file does not exist"
|
43
|
-
end
|
42
|
+
end
|
44
43
|
end
|
45
44
|
|
46
45
|
|
@@ -11,9 +11,15 @@ module Datahen
|
|
11
11
|
long_desc <<-LONGDESC
|
12
12
|
Reset finisher on a scraper's current job.\x5
|
13
13
|
LONGDESC
|
14
|
+
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
14
15
|
def reset(scraper_name)
|
15
|
-
|
16
|
-
|
16
|
+
if options[:job]
|
17
|
+
client = Client::JobFinisher.new(options)
|
18
|
+
puts "#{client.reset(options[:job])}"
|
19
|
+
else
|
20
|
+
client = Client::ScraperFinisher.new(options)
|
21
|
+
puts "#{client.reset(scraper_name)}"
|
22
|
+
end
|
17
23
|
end
|
18
24
|
end
|
19
25
|
end
|
@@ -29,27 +29,45 @@ module Datahen
|
|
29
29
|
long_desc <<-LONGDESC
|
30
30
|
Cancels a scraper's current job
|
31
31
|
LONGDESC
|
32
|
+
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
32
33
|
def cancel(scraper_name)
|
33
|
-
|
34
|
-
|
34
|
+
if options[:job]
|
35
|
+
client = Client::Job.new(options)
|
36
|
+
puts "#{client.cancel(options[:job])}"
|
37
|
+
else
|
38
|
+
client = Client::ScraperJob.new(options)
|
39
|
+
puts "#{client.cancel(scraper_name)}"
|
40
|
+
end
|
35
41
|
end
|
36
42
|
|
37
43
|
desc "resume <scraper_name>", "resumes a scraper's current job"
|
38
44
|
long_desc <<-LONGDESC
|
39
45
|
Resumes a scraper's current job
|
40
46
|
LONGDESC
|
47
|
+
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
41
48
|
def resume(scraper_name)
|
42
|
-
|
43
|
-
|
49
|
+
if options[:job]
|
50
|
+
client = Client::Job.new(options)
|
51
|
+
puts "#{client.resume(options[:job])}"
|
52
|
+
else
|
53
|
+
client = Client::ScraperJob.new(options)
|
54
|
+
puts "#{client.resume(scraper_name)}"
|
55
|
+
end
|
44
56
|
end
|
45
57
|
|
46
58
|
desc "pause <scraper_name>", "pauses a scraper's current job"
|
47
59
|
long_desc <<-LONGDESC
|
48
60
|
pauses a scraper's current job
|
49
61
|
LONGDESC
|
62
|
+
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
50
63
|
def pause(scraper_name)
|
51
|
-
|
52
|
-
|
64
|
+
if options[:job]
|
65
|
+
client = Client::Job.new(options)
|
66
|
+
puts "#{client.pause(options[:job])}"
|
67
|
+
else
|
68
|
+
client = Client::ScraperJob.new(options)
|
69
|
+
puts "#{client.pause(scraper_name)}"
|
70
|
+
end
|
53
71
|
end
|
54
72
|
|
55
73
|
|
@@ -60,9 +78,15 @@ module Datahen
|
|
60
78
|
option :workers, :aliases => :w, type: :numeric, desc: 'Set how many standard workers to use. Scraper job must be restarted(paused then resumed, or cancelled then resumed) for it to take effect. Default: 1. '
|
61
79
|
option :browsers, type: :numeric, desc: 'Set how many browser workers to use. Scraper job must be restarted(paused then resumed, or cancelled then resumed) for it to take effect. Default: 0. '
|
62
80
|
option :proxy_type, desc: 'Set the Proxy type. Default: standard'
|
81
|
+
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
63
82
|
def update(scraper_name)
|
64
|
-
|
65
|
-
|
83
|
+
if options[:job]
|
84
|
+
client = Client::Job.new(options)
|
85
|
+
puts "#{client.update(options[:job], options)}"
|
86
|
+
else
|
87
|
+
client = Client::ScraperJob.new(options)
|
88
|
+
puts "#{client.update(scraper_name, options)}"
|
89
|
+
end
|
66
90
|
end
|
67
91
|
|
68
92
|
desc "var SUBCOMMAND ...ARGS", "for managing scraper's job variables"
|
@@ -13,9 +13,15 @@ module Datahen
|
|
13
13
|
LONGDESC
|
14
14
|
option :page, :aliases => :p, type: :numeric, desc: 'Get the next set of records by page.'
|
15
15
|
option :per_page, :aliases => :P, type: :numeric, desc: 'Number of records per page. Max 500 per page.'
|
16
|
+
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
16
17
|
def list(scraper_name)
|
17
|
-
|
18
|
-
|
18
|
+
if options[:job]
|
19
|
+
client = Client::JobVar.new(options)
|
20
|
+
puts "#{client.all(options[:job])}"
|
21
|
+
else
|
22
|
+
client = Client::ScraperJobVar.new(options)
|
23
|
+
puts "#{client.all(scraper_name)}"
|
24
|
+
end
|
19
25
|
end
|
20
26
|
|
21
27
|
desc "set <scraper_name> <var_name> <value>", "Set an environment var on the scrape job"
|
@@ -24,23 +30,40 @@ module Datahen
|
|
24
30
|
<var_name>: Var name can only consist of alphabets, numbers, underscores. Name must be unique to your scrape job, otherwise it will be overwritten.\x5
|
25
31
|
<value>: Value of variable.\x5
|
26
32
|
LONGDESC
|
27
|
-
option :secret, type: :boolean, desc: 'Set true to make it decrypt the value. Default: false'
|
33
|
+
option :secret, type: :boolean, desc: 'Set true to make it decrypt the value. Default: false'
|
34
|
+
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
28
35
|
def set(scraper_name, var_name, value)
|
29
|
-
|
30
|
-
|
31
|
-
|
36
|
+
if options[:job]
|
37
|
+
client = Client::JobVar.new(options)
|
38
|
+
puts "#{client.set(options[:job], var_name, value, options)}"
|
39
|
+
else
|
40
|
+
client = Client::ScraperJobVar.new(options)
|
41
|
+
puts "#{client.set(scraper_name, var_name, value, options)}"
|
42
|
+
end
|
32
43
|
end
|
33
44
|
|
34
45
|
desc "show <scraper_name> <var_name>", "Show an environment variable on the scrape job"
|
46
|
+
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
35
47
|
def show(scraper_name, var_name)
|
36
|
-
|
37
|
-
|
48
|
+
if options[:job]
|
49
|
+
client = Client::JobVar.new(options)
|
50
|
+
puts "#{client.find(options[:job], var_name)}"
|
51
|
+
else
|
52
|
+
client = Client::ScraperJobVar.new(options)
|
53
|
+
puts "#{client.find(scraper_name, var_name)}"
|
54
|
+
end
|
38
55
|
end
|
39
56
|
|
40
57
|
desc "unset <scraper_name> <var_name>", "Deletes an environment variable on the scrape job"
|
58
|
+
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
41
59
|
def unset(scraper_name, var_name)
|
42
|
-
|
43
|
-
|
60
|
+
if options[:job]
|
61
|
+
client = Client::JobVar.new(options)
|
62
|
+
puts "#{client.unset(options[:job], var_name)}"
|
63
|
+
else
|
64
|
+
client = Client::ScraperJobVar.new(options)
|
65
|
+
puts "#{client.unset(scraper_name, var_name)}"
|
66
|
+
end
|
44
67
|
end
|
45
68
|
end
|
46
69
|
end
|
@@ -105,13 +105,19 @@ module Datahen
|
|
105
105
|
option :fetch_fail, type: :boolean, desc: 'Refetches only pages that fails fetching.'
|
106
106
|
option :parse_fail, type: :boolean, desc: 'Refetches only pages that fails parsing.'
|
107
107
|
option :status, type: :string, desc: 'Refetches only pages with a specific status.'
|
108
|
+
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
108
109
|
def refetch(scraper_name)
|
109
110
|
if !options.key?(:gid) && !options.key?(:fetch_fail) && !options.key?(:parse_fail) && !options.key?(:status)
|
110
111
|
puts "Must specify either a --gid, --fetch-fail, --parse-fail or --status"
|
111
112
|
return
|
112
113
|
end
|
113
|
-
|
114
|
-
|
114
|
+
if options[:job]
|
115
|
+
client = Client::JobPage.new(options)
|
116
|
+
puts "#{client.refetch(options[:job])}"
|
117
|
+
else
|
118
|
+
client = Client::ScraperJobPage.new(options)
|
119
|
+
puts "#{client.refetch(scraper_name)}"
|
120
|
+
end
|
115
121
|
end
|
116
122
|
|
117
123
|
desc "reparse <scraper_name>", "Reparse Pages on a scraper's current job"
|
@@ -121,6 +127,7 @@ module Datahen
|
|
121
127
|
option :gid, :aliases => :g, type: :string, desc: 'Reparse a specific GID'
|
122
128
|
option :parse_fail, type: :boolean, desc: 'Reparse only pages that fails parsing.'
|
123
129
|
option :status, type: :string, desc: 'Reparse only pages with a specific status.'
|
130
|
+
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
124
131
|
def reparse(scraper_name)
|
125
132
|
begin
|
126
133
|
options[:vars] = JSON.parse(options[:vars]) if options[:vars]
|
@@ -130,8 +137,13 @@ module Datahen
|
|
130
137
|
return
|
131
138
|
end
|
132
139
|
|
133
|
-
|
134
|
-
|
140
|
+
if options[:job]
|
141
|
+
client = Client::JobPage.new(options)
|
142
|
+
puts "#{client.reparse(options[:job])}"
|
143
|
+
else
|
144
|
+
client = Client::ScraperJobPage.new(options)
|
145
|
+
puts "#{client.reparse(scraper_name)}"
|
146
|
+
end
|
135
147
|
|
136
148
|
rescue JSON::ParserError
|
137
149
|
if options[:vars]
|
data/lib/datahen/cli/seeder.rb
CHANGED
@@ -7,6 +7,7 @@ module Datahen
|
|
7
7
|
<seeder_file>: Seeder script file will be executed.\x5
|
8
8
|
LONGDESC
|
9
9
|
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
10
|
+
option :"keep-outputs", :aliases => :ko, type: :boolean, default: false, desc: "Don't delete existing outputs"
|
10
11
|
def try_seed(scraper_name, seeder_file)
|
11
12
|
if options[:job]
|
12
13
|
job_id = options[:job]
|
@@ -14,8 +15,8 @@ module Datahen
|
|
14
15
|
job = Client::ScraperJob.new(options).find(scraper_name)
|
15
16
|
job_id = job['id']
|
16
17
|
end
|
17
|
-
|
18
|
-
puts Datahen::Scraper::Seeder.exec_seeder(seeder_file, job_id, false)
|
18
|
+
|
19
|
+
puts Datahen::Scraper::Seeder.exec_seeder(seeder_file, job_id, false, options[:"keep-outputs"])
|
19
20
|
end
|
20
21
|
|
21
22
|
desc "exec <scraper_name> <seeder_file>", "Executes a seeder script onto a scraper's current job."
|
@@ -24,6 +25,7 @@ module Datahen
|
|
24
25
|
<seeder_file>: Seeder script file that will be executed on the scraper's current job.\x5
|
25
26
|
LONGDESC
|
26
27
|
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
28
|
+
option :"keep-outputs", :aliases => :ko, type: :boolean, default: false, desc: "Don't delete existing outputs"
|
27
29
|
def exec_parse(scraper_name, seeder_file)
|
28
30
|
if options[:job]
|
29
31
|
job_id = options[:job]
|
data/lib/datahen/client.rb
CHANGED
@@ -20,7 +20,9 @@ require "datahen/client/job_stat"
|
|
20
20
|
require "datahen/client/backblaze_content"
|
21
21
|
require "datahen/client/env_var"
|
22
22
|
require "datahen/client/scraper_var"
|
23
|
+
require "datahen/client/job_var"
|
23
24
|
require "datahen/client/scraper_job_var"
|
25
|
+
require "datahen/client/job_finisher"
|
24
26
|
|
25
27
|
|
26
28
|
module Datahen
|
data/lib/datahen/client/job.rb
CHANGED
@@ -15,6 +15,7 @@ module Datahen
|
|
15
15
|
body[:status] = opts[:status] if opts[:status]
|
16
16
|
body[:standard_worker_count] = opts[:workers] if opts[:workers]
|
17
17
|
body[:browser_worker_count] = opts[:browsers] if opts[:browsers]
|
18
|
+
body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
|
18
19
|
params = @options.merge({body: body.to_json})
|
19
20
|
|
20
21
|
self.class.put("/jobs/#{job_id}", params)
|
@@ -41,6 +42,7 @@ module Datahen
|
|
41
42
|
body[:pages] = opts.fetch(:pages) {[]}
|
42
43
|
body[:seeding_status] = opts.fetch(:seeding_status){ nil }
|
43
44
|
body[:log_error] = opts[:log_error] if opts[:log_error]
|
45
|
+
body[:keep_outputs] = !!opts[:keep_outputs] if opts.has_key?(:keep_outputs)
|
44
46
|
|
45
47
|
params = @options.merge({body: body.to_json})
|
46
48
|
|
@@ -0,0 +1,16 @@
|
|
1
|
+
module Datahen
|
2
|
+
module Client
|
3
|
+
class JobFinisher < Datahen::Client::Base
|
4
|
+
# Reset finisher on a specific job.
|
5
|
+
#
|
6
|
+
# @param [Integer] job_id Job ID
|
7
|
+
# @param [Hash] opts ({}) API custom parameters.
|
8
|
+
#
|
9
|
+
# @return [HTTParty::Response]
|
10
|
+
def reset(job_id, opts={})
|
11
|
+
params = @options.merge(opts)
|
12
|
+
self.class.put("/jobs/#{job_id}/finisher/reset", params)
|
13
|
+
end
|
14
|
+
end
|
15
|
+
end
|
16
|
+
end
|
@@ -48,6 +48,7 @@ module Datahen
|
|
48
48
|
body[:pages] = opts.fetch(:pages) {[]}
|
49
49
|
body[:parsing_status] = opts.fetch(:parsing_status){ nil }
|
50
50
|
body[:log_error] = opts[:log_error] if opts[:log_error]
|
51
|
+
body[:keep_outputs] = !!opts[:keep_outputs] if opts.has_key?(:keep_outputs)
|
51
52
|
|
52
53
|
params = @options.merge({body: body.to_json})
|
53
54
|
|
@@ -61,6 +62,16 @@ module Datahen
|
|
61
62
|
def find_failed_content(job_id, gid)
|
62
63
|
self.class.get("/jobs/#{job_id}/pages/#{gid}/failed_content", @options)
|
63
64
|
end
|
65
|
+
|
66
|
+
def reparse(job_id, opts={})
|
67
|
+
params = @options.merge(opts)
|
68
|
+
self.class.put("/jobs/#{job_id}/pages/reparse", params)
|
69
|
+
end
|
70
|
+
|
71
|
+
def refetch(job_id, opts={})
|
72
|
+
params = @options.merge(opts)
|
73
|
+
self.class.put("/jobs/#{job_id}/pages/refetch", params)
|
74
|
+
end
|
64
75
|
end
|
65
76
|
end
|
66
77
|
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
module Datahen
|
2
|
+
module Client
|
3
|
+
class JobVar < Datahen::Client::Base
|
4
|
+
|
5
|
+
def find(job_id, var_name)
|
6
|
+
self.class.get("/jobs/#{job_id}/vars/#{var_name}", @options)
|
7
|
+
end
|
8
|
+
|
9
|
+
def all(job_id, opts={})
|
10
|
+
params = @options.merge opts
|
11
|
+
self.class.get("/jobs/#{job_id}/vars", params)
|
12
|
+
end
|
13
|
+
|
14
|
+
def set(job_id, var_name, value, opts={})
|
15
|
+
body = {}
|
16
|
+
body[:value] = value
|
17
|
+
body[:secret] = opts[:secret] if opts[:secret]
|
18
|
+
params = @options.merge({body: body.to_json})
|
19
|
+
self.class.put("/jobs/#{job_id}/vars/#{var_name}", params)
|
20
|
+
end
|
21
|
+
|
22
|
+
def unset(job_id, var_name, opts={})
|
23
|
+
params = @options.merge(opts)
|
24
|
+
self.class.delete("/jobs/#{job_id}/vars/#{var_name}", params)
|
25
|
+
end
|
26
|
+
end
|
27
|
+
end
|
28
|
+
end
|
@@ -26,6 +26,9 @@ module Datahen
|
|
26
26
|
self.class.put("/scrapers/#{scraper_name}/current_job/pages/refetch", params)
|
27
27
|
end
|
28
28
|
|
29
|
+
# Deprecated, please use Datahen::Client::JobPage#refetch instead.
|
30
|
+
#
|
31
|
+
# @note This method will be removed at some point in the future.
|
29
32
|
def refetch_by_job(job_id, opts={})
|
30
33
|
params = @options.merge(opts)
|
31
34
|
self.class.put("/jobs/#{job_id}/pages/refetch", params)
|
@@ -36,11 +39,6 @@ module Datahen
|
|
36
39
|
self.class.put("/scrapers/#{scraper_name}/current_job/pages/reparse", params)
|
37
40
|
end
|
38
41
|
|
39
|
-
def reparse_by_job(job_id, opts={})
|
40
|
-
params = @options.merge(opts)
|
41
|
-
self.class.put("/jobs/#{job_id}/pages/reparse", params)
|
42
|
-
end
|
43
|
-
|
44
42
|
def enqueue(scraper_name, method, url, opts={})
|
45
43
|
body = {}
|
46
44
|
body[:method] = method != "" ? method : "GET"
|
@@ -63,9 +63,9 @@ module Datahen
|
|
63
63
|
client.find(gid)
|
64
64
|
end
|
65
65
|
|
66
|
-
def get_content(gid)
|
67
|
-
client = Client::
|
68
|
-
content_json = client.find_content(gid)
|
66
|
+
def get_content(job_id, gid)
|
67
|
+
client = Client::JobPage.new()
|
68
|
+
content_json = client.find_content(job_id, gid)
|
69
69
|
|
70
70
|
if content_json['available']
|
71
71
|
signed_url = content_json['signed_url']
|
@@ -75,7 +75,7 @@ module Datahen
|
|
75
75
|
end
|
76
76
|
end
|
77
77
|
|
78
|
-
def get_failed_content(gid)
|
78
|
+
def get_failed_content(job_id, gid)
|
79
79
|
client = Client::JobPage.new()
|
80
80
|
content_json = client.find_failed_content(job_id, gid)
|
81
81
|
|
@@ -1,18 +1,24 @@
|
|
1
1
|
module Datahen
|
2
2
|
module Scraper
|
3
3
|
class Parser
|
4
|
-
def self.exec_parser_page(filename, gid, job_id=nil, save=false, vars = {})
|
4
|
+
def self.exec_parser_page(filename, gid, job_id=nil, save=false, vars = {}, keep_outputs=false)
|
5
5
|
extname = File.extname(filename)
|
6
6
|
case extname
|
7
7
|
when '.rb'
|
8
|
-
executor = RubyParserExecutor.new(
|
8
|
+
executor = RubyParserExecutor.new(
|
9
|
+
filename: filename,
|
10
|
+
gid: gid,
|
11
|
+
job_id: job_id,
|
12
|
+
vars: vars,
|
13
|
+
keep_outputs: keep_outputs
|
14
|
+
)
|
9
15
|
executor.exec_parser(save)
|
10
16
|
else
|
11
17
|
puts "Unable to find a parser executor for file type \"#{extname}\""
|
12
18
|
end
|
13
19
|
end
|
14
20
|
|
15
|
-
|
21
|
+
|
16
22
|
end
|
17
23
|
end
|
18
|
-
end
|
24
|
+
end
|
@@ -15,6 +15,7 @@ module Datahen
|
|
15
15
|
@gid = options.fetch(:gid) { raise "GID is required"}
|
16
16
|
@job_id = options.fetch(:job_id)
|
17
17
|
@page_vars = options.fetch(:vars) { {} }
|
18
|
+
@keep_outputs = !!(options.fetch(:keep_outputs) { false })
|
18
19
|
end
|
19
20
|
|
20
21
|
def self.exposed_methods
|
@@ -66,7 +67,9 @@ module Datahen
|
|
66
67
|
response = parsing_update(
|
67
68
|
job_id: job_id,
|
68
69
|
gid: gid,
|
69
|
-
parsing_status: :starting
|
70
|
+
parsing_status: :starting,
|
71
|
+
keep_outputs: @keep_outputs
|
72
|
+
)
|
70
73
|
|
71
74
|
if response.code == 200
|
72
75
|
puts "Page Parsing Status Updated."
|
@@ -165,7 +168,7 @@ module Datahen
|
|
165
168
|
handle_error(e) if save
|
166
169
|
raise e
|
167
170
|
end
|
168
|
-
|
171
|
+
|
169
172
|
if refetch_self
|
170
173
|
refetch_page gid
|
171
174
|
elsif reparse_self
|
@@ -178,11 +181,11 @@ module Datahen
|
|
178
181
|
end
|
179
182
|
|
180
183
|
def content
|
181
|
-
@content ||= get_content(gid)
|
184
|
+
@content ||= get_content(job_id, gid)
|
182
185
|
end
|
183
186
|
|
184
187
|
def failed_content
|
185
|
-
@failed_content ||= get_failed_content(gid)
|
188
|
+
@failed_content ||= get_failed_content(job_id, gid)
|
186
189
|
end
|
187
190
|
|
188
191
|
def handle_error(e)
|
@@ -6,6 +6,7 @@ module Datahen
|
|
6
6
|
def initialize(options={})
|
7
7
|
@filename = options.fetch(:filename) { raise "Filename is required"}
|
8
8
|
@job_id = options[:job_id]
|
9
|
+
@keep_outputs = !!(options.fetch(:keep_outputs) { false })
|
9
10
|
end
|
10
11
|
|
11
12
|
def self.exposed_methods
|
@@ -81,7 +82,9 @@ module Datahen
|
|
81
82
|
|
82
83
|
response = seeding_update(
|
83
84
|
job_id: job_id,
|
84
|
-
seeding_status: :starting
|
85
|
+
seeding_status: :starting,
|
86
|
+
keep_outputs: @keep_outputs
|
87
|
+
)
|
85
88
|
|
86
89
|
if response.code == 200
|
87
90
|
puts "Seeding Status Updated."
|
@@ -2,11 +2,15 @@ module Datahen
|
|
2
2
|
module Scraper
|
3
3
|
class Seeder
|
4
4
|
|
5
|
-
def self.exec_seeder(filename, job_id=nil, save=false)
|
5
|
+
def self.exec_seeder(filename, job_id=nil, save=false, keep_outputs=false)
|
6
6
|
extname = File.extname(filename)
|
7
7
|
case extname
|
8
8
|
when '.rb'
|
9
|
-
executor = RubySeederExecutor.new(
|
9
|
+
executor = RubySeederExecutor.new(
|
10
|
+
filename: filename,
|
11
|
+
job_id: job_id,
|
12
|
+
keep_outputs: keep_outputs
|
13
|
+
)
|
10
14
|
executor.exec_seeder(save)
|
11
15
|
else
|
12
16
|
puts "Unable to find a seeder executor for file type \"#{extname}\""
|
@@ -15,4 +19,4 @@ module Datahen
|
|
15
19
|
|
16
20
|
end
|
17
21
|
end
|
18
|
-
end
|
22
|
+
end
|
data/lib/datahen/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: datahen
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.13.0
|
4
|
+
version: 0.13.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Parama Danoesubroto
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-
|
11
|
+
date: 2020-05-06 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: thor
|
@@ -215,10 +215,12 @@ files:
|
|
215
215
|
- lib/datahen/client/global_page.rb
|
216
216
|
- lib/datahen/client/job.rb
|
217
217
|
- lib/datahen/client/job_export.rb
|
218
|
+
- lib/datahen/client/job_finisher.rb
|
218
219
|
- lib/datahen/client/job_log.rb
|
219
220
|
- lib/datahen/client/job_output.rb
|
220
221
|
- lib/datahen/client/job_page.rb
|
221
222
|
- lib/datahen/client/job_stat.rb
|
223
|
+
- lib/datahen/client/job_var.rb
|
222
224
|
- lib/datahen/client/scraper.rb
|
223
225
|
- lib/datahen/client/scraper_deployment.rb
|
224
226
|
- lib/datahen/client/scraper_export.rb
|