datahen 0.13.0 → 0.14.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: c3d1584f235873f6d22ae107c60e7b50a6d8ab5918f45a4052bef80e58b7cbf7
4
- data.tar.gz: b6b6ce4871017eddb70fd279c92246e498e57c3d7ca2c0bd72c225bdcdb5e119
3
+ metadata.gz: b00adfb4f357beeae276a130cc5e0ee1d34ddd8bdef4a0374f4b55f89f894460
4
+ data.tar.gz: 6363d591d93d99addcaaea6d964b2fb07e4d8222f873c3a8f6496f48c97b1483
5
5
  SHA512:
6
- metadata.gz: 713f5907efc90be21ba04b83c060f0f656d1f72176191399e1ff575952cad2b54c834ddfd6722441e933857d12539f55553adf2b4d71adead952465f0ccf1005
7
- data.tar.gz: bd7809ed5fbd6d12dc8680f31f194ecfb51ba0138765a88c9dde4745be2bd46533da97885f63524edca09cee1ab2d4cc3d7fd96bf04a2b6a6930a046646a8093
6
+ metadata.gz: 75e9f1b1e5ba61563c3ff9d12071cb76d77382353003776d88817940623e37aeb300df2114260f86a5a18a59e1dc9f74a33c00029e183c0681c616dce109d6c6
7
+ data.tar.gz: 8e2a07b11d20fe88c93aa0af0abf10ceb6593d5d2db63d9b5c3744c6f8b59754274a3949b6331d7d71a3018f7cf65e1a7d31e2ab6f4dbbcde494b33357dab40a
@@ -18,12 +18,24 @@ module Datahen
18
18
  puts "#{client.all()}"
19
19
  end
20
20
 
21
- desc "show <job_id>", "Show a job"
21
+ desc "show <job_id>", "Show a job (Defaults to showing data from cached job)"
22
+ option :live, type: :boolean, desc: 'Get data from the live job, not cached job.'
22
23
  def show(job_id)
23
24
  client = Client::Job.new(options)
24
- puts "#{client.find(job_id)}"
25
+ puts "#{client.find(job_id, options)}"
25
26
  end
26
27
 
28
+ desc "stats <job_id>", "Get the stat for a job (Defaults to showing data from cached stats)"
29
+ long_desc <<-LONGDESC
30
+ Get stats for a scraper's current job\n
31
+ LONGDESC
32
+ option :live, type: :boolean, desc: 'Get data from the live stats, not cached stats.'
33
+ def stats(job_id)
34
+ client = Client::JobStat.new(options)
35
+ puts "#{client.job_current_stats(job_id, options)}"
36
+ end
37
+
38
+
27
39
  end
28
40
  end
29
41
 
@@ -10,12 +10,13 @@ module Datahen
10
10
  option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
11
11
  option :global, :aliases => :g, type: :boolean, default: false, desc: 'Use globalpage instead of a job page'
12
12
  option :vars, :aliases => :v, type: :string, desc: 'Set user-defined page variables. Must be in json format. i.e: {"Foo":"bar"}'
13
+ option :"keep-outputs", :aliases => :ko, type: :boolean, default: false, desc: "Don't delete existing outputs"
13
14
  def try_parse(scraper_name, parser_file, gid)
14
- begin
15
-
15
+ begin
16
+
16
17
  if options[:job]
17
18
  job_id = options[:job]
18
- elsif options[:global]
19
+ elsif options[:global]
19
20
  job_id = nil
20
21
  else
21
22
  job = Client::ScraperJob.new(options).find(scraper_name)
@@ -24,7 +25,7 @@ module Datahen
24
25
 
25
26
 
26
27
  vars = JSON.parse(options[:vars]) if options[:vars]
27
- puts Datahen::Scraper::Parser.exec_parser_page(parser_file, gid, job_id, false, vars)
28
+ puts Datahen::Scraper::Parser.exec_parser_page(parser_file, gid, job_id, false, vars, options[:"keep-outputs"])
28
29
 
29
30
  rescue JSON::ParserError
30
31
  if options[:vars]
@@ -40,6 +41,8 @@ module Datahen
40
41
  <GID>: Global ID of the page.\x5
41
42
  LONGDESC
42
43
  option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
44
+ option :vars, :aliases => :v, type: :string, desc: 'Set user-defined page variables. Must be in json format. i.e: {"Foo":"bar"}'
45
+ option :"keep-outputs", :aliases => :ko, type: :boolean, default: false, desc: "Don't delete existing outputs"
43
46
  def exec_parse(scraper_name, parser_file, *gids)
44
47
  gids.each do |gid|
45
48
  begin
@@ -52,7 +55,8 @@ module Datahen
52
55
  job_id = job['id']
53
56
  end
54
57
 
55
- puts Datahen::Scraper::Parser.exec_parser_page(parser_file, gid, job_id, true)
58
+ vars = JSON.parse(options[:vars]) if options[:vars]
59
+ puts Datahen::Scraper::Parser.exec_parser_page(parser_file, gid, job_id, true, vars, options[:"keep-outputs"])
56
60
  rescue => e
57
61
  puts e
58
62
  end
@@ -140,17 +140,18 @@ module Datahen
140
140
  end
141
141
  end
142
142
 
143
- desc "stats <scraper_name>", "Get the current stat for a job"
143
+ desc "stats <scraper_name>", "Get the stat for a current job (Defaults to showing data from cached stats)"
144
144
  long_desc <<-LONGDESC
145
145
  Get stats for a scraper's current job\n
146
146
  LONGDESC
147
147
  option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
148
+ option :live, type: :boolean, desc: 'Get data from the live stats, not cached stats.'
148
149
  def stats(scraper_name)
149
150
  client = Client::JobStat.new(options)
150
151
  if options[:job]
151
- puts "#{client.job_current_stats(options[:job])}"
152
+ puts "#{client.job_current_stats(options[:job], options)}"
152
153
  else
153
- puts "#{client.scraper_job_current_stats(scraper_name)}"
154
+ puts "#{client.scraper_job_current_stats(scraper_name, options)}"
154
155
  end
155
156
  end
156
157
 
@@ -12,7 +12,6 @@ module Datahen
12
12
  puts "#{client.find(export_id)}"
13
13
  end
14
14
 
15
-
16
15
  desc "list", "Gets a list of exports"
17
16
  long_desc <<-LONGDESC
18
17
  List exports.
@@ -34,13 +33,13 @@ module Datahen
34
33
  def download(export_id)
35
34
  client = Client::ScraperExport.new(options)
36
35
  result = JSON.parse(client.download(export_id).to_s)
37
-
36
+
38
37
  if result['signed_url']
39
38
  puts "Download url: \"#{result['signed_url']}\""
40
39
  `open "#{result['signed_url']}"`
41
40
  else
42
41
  puts "Exported file does not exist"
43
- end
42
+ end
44
43
  end
45
44
 
46
45
 
@@ -11,9 +11,15 @@ module Datahen
11
11
  long_desc <<-LONGDESC
12
12
  Reset finisher on a scraper's current job.\x5
13
13
  LONGDESC
14
+ option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
14
15
  def reset(scraper_name)
15
- client = Client::ScraperFinisher.new(options)
16
- puts "#{client.reset(scraper_name)}"
16
+ if options[:job]
17
+ client = Client::JobFinisher.new(options)
18
+ puts "#{client.reset(options[:job])}"
19
+ else
20
+ client = Client::ScraperFinisher.new(options)
21
+ puts "#{client.reset(scraper_name)}"
22
+ end
17
23
  end
18
24
  end
19
25
  end
@@ -6,10 +6,17 @@ module Datahen
6
6
  "#{basename} #{@package_name} #{command.usage}"
7
7
  end
8
8
 
9
- desc "show <scraper_name>", "Show a scraper's current job"
9
+ desc "show <scraper_name>", "Show a scraper's current job (Defaults to showing data from cached job)"
10
+ option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
11
+ option :live, type: :boolean, desc: 'Get data from the live job, not cached job.'
10
12
  def show(scraper_name)
11
- client = Client::ScraperJob.new(options)
12
- puts "#{client.find(scraper_name)}"
13
+ if options[:job]
14
+ client = Client::Job.new(options)
15
+ puts "#{client.find(options[:job], options)}"
16
+ else
17
+ client = Client::ScraperJob.new(options)
18
+ puts "#{client.find(scraper_name, options)}"
19
+ end
13
20
  end
14
21
 
15
22
 
@@ -29,27 +36,45 @@ module Datahen
29
36
  long_desc <<-LONGDESC
30
37
  Cancels a scraper's current job
31
38
  LONGDESC
39
+ option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
32
40
  def cancel(scraper_name)
33
- client = Client::ScraperJob.new(options)
34
- puts "#{client.cancel(scraper_name)}"
41
+ if options[:job]
42
+ client = Client::Job.new(options)
43
+ puts "#{client.cancel(options[:job])}"
44
+ else
45
+ client = Client::ScraperJob.new(options)
46
+ puts "#{client.cancel(scraper_name)}"
47
+ end
35
48
  end
36
49
 
37
50
  desc "resume <scraper_name>", "resumes a scraper's current job"
38
51
  long_desc <<-LONGDESC
39
52
  Resumes a scraper's current job
40
53
  LONGDESC
54
+ option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
41
55
  def resume(scraper_name)
42
- client = Client::ScraperJob.new(options)
43
- puts "#{client.resume(scraper_name)}"
56
+ if options[:job]
57
+ client = Client::Job.new(options)
58
+ puts "#{client.resume(options[:job])}"
59
+ else
60
+ client = Client::ScraperJob.new(options)
61
+ puts "#{client.resume(scraper_name)}"
62
+ end
44
63
  end
45
64
 
46
65
  desc "pause <scraper_name>", "pauses a scraper's current job"
47
66
  long_desc <<-LONGDESC
48
67
  pauses a scraper's current job
49
68
  LONGDESC
69
+ option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
50
70
  def pause(scraper_name)
51
- client = Client::ScraperJob.new(options)
52
- puts "#{client.pause(scraper_name)}"
71
+ if options[:job]
72
+ client = Client::Job.new(options)
73
+ puts "#{client.pause(options[:job])}"
74
+ else
75
+ client = Client::ScraperJob.new(options)
76
+ puts "#{client.pause(scraper_name)}"
77
+ end
53
78
  end
54
79
 
55
80
 
@@ -60,9 +85,15 @@ module Datahen
60
85
  option :workers, :aliases => :w, type: :numeric, desc: 'Set how many standard workers to use. Scraper job must be restarted(paused then resumed, or cancelled then resumed) for it to take effect. Default: 1. '
61
86
  option :browsers, type: :numeric, desc: 'Set how many browser workers to use. Scraper job must be restarted(paused then resumed, or cancelled then resumed) for it to take effect. Default: 0. '
62
87
  option :proxy_type, desc: 'Set the Proxy type. Default: standard'
88
+ option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
63
89
  def update(scraper_name)
64
- client = Client::ScraperJob.new(options)
65
- puts "#{client.update(scraper_name, options)}"
90
+ if options[:job]
91
+ client = Client::Job.new(options)
92
+ puts "#{client.update(options[:job], options)}"
93
+ else
94
+ client = Client::ScraperJob.new(options)
95
+ puts "#{client.update(scraper_name, options)}"
96
+ end
66
97
  end
67
98
 
68
99
  desc "var SUBCOMMAND ...ARGS", "for managing scraper's job variables"
@@ -13,9 +13,15 @@ module Datahen
13
13
  LONGDESC
14
14
  option :page, :aliases => :p, type: :numeric, desc: 'Get the next set of records by page.'
15
15
  option :per_page, :aliases => :P, type: :numeric, desc: 'Number of records per page. Max 500 per page.'
16
+ option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
16
17
  def list(scraper_name)
17
- client = Client::ScraperJobVar.new(options)
18
- puts "#{client.all(scraper_name)}"
18
+ if options[:job]
19
+ client = Client::JobVar.new(options)
20
+ puts "#{client.all(options[:job])}"
21
+ else
22
+ client = Client::ScraperJobVar.new(options)
23
+ puts "#{client.all(scraper_name)}"
24
+ end
19
25
  end
20
26
 
21
27
  desc "set <scraper_name> <var_name> <value>", "Set an environment var on the scrape job"
@@ -24,23 +30,40 @@ module Datahen
24
30
  <var_name>: Var name can only consist of alphabets, numbers, underscores. Name must be unique to your scrape job, otherwise it will be overwritten.\x5
25
31
  <value>: Value of variable.\x5
26
32
  LONGDESC
27
- option :secret, type: :boolean, desc: 'Set true to make it decrypt the value. Default: false'
33
+ option :secret, type: :boolean, desc: 'Set true to make it decrypt the value. Default: false'
34
+ option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
28
35
  def set(scraper_name, var_name, value)
29
- # puts "options #{options}"
30
- client = Client::ScraperJobVar.new(options)
31
- puts "#{client.set(scraper_name, var_name, value, options)}"
36
+ if options[:job]
37
+ client = Client::JobVar.new(options)
38
+ puts "#{client.set(options[:job], var_name, value, options)}"
39
+ else
40
+ client = Client::ScraperJobVar.new(options)
41
+ puts "#{client.set(scraper_name, var_name, value, options)}"
42
+ end
32
43
  end
33
44
 
34
45
  desc "show <scraper_name> <var_name>", "Show an environment variable on the scrape job"
46
+ option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
35
47
  def show(scraper_name, var_name)
36
- client = Client::ScraperJobVar.new(options)
37
- puts "#{client.find(scraper_name, var_name)}"
48
+ if options[:job]
49
+ client = Client::JobVar.new(options)
50
+ puts "#{client.find(options[:job], var_name)}"
51
+ else
52
+ client = Client::ScraperJobVar.new(options)
53
+ puts "#{client.find(scraper_name, var_name)}"
54
+ end
38
55
  end
39
56
 
40
57
  desc "unset <scraper_name> <var_name>", "Deletes an environment variable on the scrape job"
58
+ option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
41
59
  def unset(scraper_name, var_name)
42
- client = Client::ScraperJobVar.new(options)
43
- puts "#{client.unset(scraper_name, var_name)}"
60
+ if options[:job]
61
+ client = Client::JobVar.new(options)
62
+ puts "#{client.unset(options[:job], var_name)}"
63
+ else
64
+ client = Client::ScraperJobVar.new(options)
65
+ puts "#{client.unset(scraper_name, var_name)}"
66
+ end
44
67
  end
45
68
  end
46
69
  end
@@ -105,13 +105,19 @@ module Datahen
105
105
  option :fetch_fail, type: :boolean, desc: 'Refetches only pages that fails fetching.'
106
106
  option :parse_fail, type: :boolean, desc: 'Refetches only pages that fails parsing.'
107
107
  option :status, type: :string, desc: 'Refetches only pages with a specific status.'
108
+ option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
108
109
  def refetch(scraper_name)
109
110
  if !options.key?(:gid) && !options.key?(:fetch_fail) && !options.key?(:parse_fail) && !options.key?(:status)
110
111
  puts "Must specify either a --gid, --fetch-fail, --parse-fail or --status"
111
112
  return
112
113
  end
113
- client = Client::ScraperJobPage.new(options)
114
- puts "#{client.refetch(scraper_name)}"
114
+ if options[:job]
115
+ client = Client::JobPage.new(options)
116
+ puts "#{client.refetch(options[:job])}"
117
+ else
118
+ client = Client::ScraperJobPage.new(options)
119
+ puts "#{client.refetch(scraper_name)}"
120
+ end
115
121
  end
116
122
 
117
123
  desc "reparse <scraper_name>", "Reparse Pages on a scraper's current job"
@@ -121,6 +127,7 @@ module Datahen
121
127
  option :gid, :aliases => :g, type: :string, desc: 'Reparse a specific GID'
122
128
  option :parse_fail, type: :boolean, desc: 'Reparse only pages that fails parsing.'
123
129
  option :status, type: :string, desc: 'Reparse only pages with a specific status.'
130
+ option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
124
131
  def reparse(scraper_name)
125
132
  begin
126
133
  options[:vars] = JSON.parse(options[:vars]) if options[:vars]
@@ -130,8 +137,13 @@ module Datahen
130
137
  return
131
138
  end
132
139
 
133
- client = Client::ScraperJobPage.new(options)
134
- puts "#{client.reparse(scraper_name)}"
140
+ if options[:job]
141
+ client = Client::JobPage.new(options)
142
+ puts "#{client.reparse(options[:job])}"
143
+ else
144
+ client = Client::ScraperJobPage.new(options)
145
+ puts "#{client.reparse(scraper_name)}"
146
+ end
135
147
 
136
148
  rescue JSON::ParserError
137
149
  if options[:vars]
@@ -218,7 +230,7 @@ module Datahen
218
230
  end
219
231
  end
220
232
 
221
- desc "failedcontent <gid>", "Show a page's failed content in scraper's current job"
233
+ desc "failedcontent <scraper_name> <gid>", "Show a page's failed content in scraper's current job"
222
234
  option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
223
235
  def failedcontent(scraper_name, gid)
224
236
  result = nil
@@ -7,6 +7,7 @@ module Datahen
7
7
  <seeder_file>: Seeder script file will be executed.\x5
8
8
  LONGDESC
9
9
  option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
10
+ option :"keep-outputs", :aliases => :ko, type: :boolean, default: false, desc: "Don't delete existing outputs"
10
11
  def try_seed(scraper_name, seeder_file)
11
12
  if options[:job]
12
13
  job_id = options[:job]
@@ -14,8 +15,8 @@ module Datahen
14
15
  job = Client::ScraperJob.new(options).find(scraper_name)
15
16
  job_id = job['id']
16
17
  end
17
-
18
- puts Datahen::Scraper::Seeder.exec_seeder(seeder_file, job_id, false)
18
+
19
+ puts Datahen::Scraper::Seeder.exec_seeder(seeder_file, job_id, false, options[:"keep-outputs"])
19
20
  end
20
21
 
21
22
  desc "exec <scraper_name> <seeder_file>", "Executes a seeder script onto a scraper's current job."
@@ -24,6 +25,7 @@ module Datahen
24
25
  <seeder_file>: Seeder script file that will be executed on the scraper's current job.\x5
25
26
  LONGDESC
26
27
  option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
28
+ option :"keep-outputs", :aliases => :ko, type: :boolean, default: false, desc: "Don't delete existing outputs"
27
29
  def exec_parse(scraper_name, seeder_file)
28
30
  if options[:job]
29
31
  job_id = options[:job]
@@ -20,7 +20,9 @@ require "datahen/client/job_stat"
20
20
  require "datahen/client/backblaze_content"
21
21
  require "datahen/client/env_var"
22
22
  require "datahen/client/scraper_var"
23
+ require "datahen/client/job_var"
23
24
  require "datahen/client/scraper_job_var"
25
+ require "datahen/client/job_finisher"
24
26
 
25
27
 
26
28
  module Datahen
@@ -5,6 +5,8 @@ module Datahen
5
5
  class Base
6
6
  include HTTParty
7
7
 
8
+ default_timeout 60
9
+
8
10
  def self.env_auth_token
9
11
  ENV['DATAHEN_TOKEN']
10
12
  end
@@ -6,8 +6,12 @@ module Datahen
6
6
  self.class.get("/jobs", params)
7
7
  end
8
8
 
9
- def find(job_id)
10
- self.class.get("/jobs/#{job_id}", @options)
9
+ def find(job_id, opts={})
10
+ if opts[:live]
11
+ self.class.get("/jobs/#{job_id}", @options)
12
+ else
13
+ self.class.get("/cached/jobs/#{job_id}", @options)
14
+ end
11
15
  end
12
16
 
13
17
  def update(job_id, opts={})
@@ -15,6 +19,7 @@ module Datahen
15
19
  body[:status] = opts[:status] if opts[:status]
16
20
  body[:standard_worker_count] = opts[:workers] if opts[:workers]
17
21
  body[:browser_worker_count] = opts[:browsers] if opts[:browsers]
22
+ body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
18
23
  params = @options.merge({body: body.to_json})
19
24
 
20
25
  self.class.put("/jobs/#{job_id}", params)
@@ -41,6 +46,7 @@ module Datahen
41
46
  body[:pages] = opts.fetch(:pages) {[]}
42
47
  body[:seeding_status] = opts.fetch(:seeding_status){ nil }
43
48
  body[:log_error] = opts[:log_error] if opts[:log_error]
49
+ body[:keep_outputs] = !!opts[:keep_outputs] if opts.has_key?(:keep_outputs)
44
50
 
45
51
  params = @options.merge({body: body.to_json})
46
52
 
@@ -0,0 +1,16 @@
1
+ module Datahen
2
+ module Client
3
+ class JobFinisher < Datahen::Client::Base
4
+ # Reset finisher on a scraper's current job.
5
+ #
6
+ # @param [Integer] job_id Job ID
7
+ # @param [Hash] opts ({}) API custom parameters.
8
+ #
9
+ # @return [HTTParty::Response]
10
+ def reset(job_id, opts={})
11
+ params = @options.merge(opts)
12
+ self.class.put("/jobs/#{job_id}/finisher/reset", params)
13
+ end
14
+ end
15
+ end
16
+ end
@@ -48,6 +48,7 @@ module Datahen
48
48
  body[:pages] = opts.fetch(:pages) {[]}
49
49
  body[:parsing_status] = opts.fetch(:parsing_status){ nil }
50
50
  body[:log_error] = opts[:log_error] if opts[:log_error]
51
+ body[:keep_outputs] = !!opts[:keep_outputs] if opts.has_key?(:keep_outputs)
51
52
 
52
53
  params = @options.merge({body: body.to_json})
53
54
 
@@ -61,6 +62,16 @@ module Datahen
61
62
  def find_failed_content(job_id, gid)
62
63
  self.class.get("/jobs/#{job_id}/pages/#{gid}/failed_content", @options)
63
64
  end
65
+
66
+ def reparse(job_id, opts={})
67
+ params = @options.merge(opts)
68
+ self.class.put("/jobs/#{job_id}/pages/reparse", params)
69
+ end
70
+
71
+ def refetch(job_id, opts={})
72
+ params = @options.merge(opts)
73
+ self.class.put("/jobs/#{job_id}/pages/refetch", params)
74
+ end
64
75
  end
65
76
  end
66
77
  end
@@ -2,12 +2,20 @@ module Datahen
2
2
  module Client
3
3
  class JobStat < Datahen::Client::Base
4
4
 
5
- def job_current_stats(job_id)
6
- self.class.get("/jobs/#{job_id}/stats/current", @options)
5
+ def job_current_stats(job_id, opts={})
6
+ if opts[:live]
7
+ self.class.get("/jobs/#{job_id}/stats/current", @options)
8
+ else
9
+ self.class.get("/cached/jobs/#{job_id}/stats/current", @options)
10
+ end
7
11
  end
8
12
 
9
- def scraper_job_current_stats(scraper_name)
10
- self.class.get("/scrapers/#{scraper_name}/current_job/stats/current", @options)
13
+ def scraper_job_current_stats(scraper_name, opts={})
14
+ if opts[:live]
15
+ self.class.get("/scrapers/#{scraper_name}/current_job/stats/current", @options)
16
+ else
17
+ self.class.get("/cached/scrapers/#{scraper_name}/current_job/stats/current", @options)
18
+ end
11
19
  end
12
20
 
13
21
  def job_stats_history(job_id)
@@ -0,0 +1,28 @@
1
+ module Datahen
2
+ module Client
3
+ class JobVar < Datahen::Client::Base
4
+
5
+ def find(job_id, var_name)
6
+ self.class.get("/jobs/#{job_id}/vars/#{var_name}", @options)
7
+ end
8
+
9
+ def all(job_id, opts={})
10
+ params = @options.merge opts
11
+ self.class.get("/jobs/#{job_id}/vars", params)
12
+ end
13
+
14
+ def set(job_id, var_name, value, opts={})
15
+ body = {}
16
+ body[:value] = value
17
+ body[:secret] = opts[:secret] if opts[:secret]
18
+ params = @options.merge({body: body.to_json})
19
+ self.class.put("/jobs/#{job_id}/vars/#{var_name}", params)
20
+ end
21
+
22
+ def unset(job_id, var_name, opts={})
23
+ params = @options.merge(opts)
24
+ self.class.delete("/jobs/#{job_id}/vars/#{var_name}", params)
25
+ end
26
+ end
27
+ end
28
+ end
@@ -15,8 +15,12 @@ module Datahen
15
15
  self.class.post("/scrapers/#{scraper_name}/jobs", params)
16
16
  end
17
17
 
18
- def find(scraper_name)
19
- self.class.get("/scrapers/#{scraper_name}/current_job", @options)
18
+ def find(scraper_name, opts={})
19
+ if opts[:live]
20
+ self.class.get("/scrapers/#{scraper_name}/current_job", @options)
21
+ else
22
+ self.class.get("/cached/scrapers/#{scraper_name}/current_job", @options)
23
+ end
20
24
  end
21
25
 
22
26
  def update(scraper_name, opts={})
@@ -26,6 +26,9 @@ module Datahen
26
26
  self.class.put("/scrapers/#{scraper_name}/current_job/pages/refetch", params)
27
27
  end
28
28
 
29
+ # Deprecated, please use Datahen::Client::JobPage#refetch instead.
30
+ #
31
+ # @note This method will be removed at some point in the future.
29
32
  def refetch_by_job(job_id, opts={})
30
33
  params = @options.merge(opts)
31
34
  self.class.put("/jobs/#{job_id}/pages/refetch", params)
@@ -36,11 +39,6 @@ module Datahen
36
39
  self.class.put("/scrapers/#{scraper_name}/current_job/pages/reparse", params)
37
40
  end
38
41
 
39
- def reparse_by_job(job_id, opts={})
40
- params = @options.merge(opts)
41
- self.class.put("/jobs/#{job_id}/pages/reparse", params)
42
- end
43
-
44
42
  def enqueue(scraper_name, method, url, opts={})
45
43
  body = {}
46
44
  body[:method] = method != "" ? method : "GET"
@@ -60,12 +60,17 @@ module Datahen
60
60
 
61
61
  def init_global_page()
62
62
  client = Client::GlobalPage.new()
63
- client.find(gid)
63
+ global_page = client.find(gid)
64
+ unless global_page.code == 200
65
+ raise "GID #{gid} not found. Aborting execution!"
66
+ else
67
+ global_page
68
+ end
64
69
  end
65
70
 
66
- def get_content(gid)
67
- client = Client::GlobalPage.new()
68
- content_json = client.find_content(gid)
71
+ def get_content(job_id, gid)
72
+ client = Client::JobPage.new()
73
+ content_json = client.find_content(job_id, gid)
69
74
 
70
75
  if content_json['available']
71
76
  signed_url = content_json['signed_url']
@@ -75,7 +80,7 @@ module Datahen
75
80
  end
76
81
  end
77
82
 
78
- def get_failed_content(gid)
83
+ def get_failed_content(job_id, gid)
79
84
  client = Client::JobPage.new()
80
85
  content_json = client.find_failed_content(job_id, gid)
81
86
 
@@ -287,11 +292,12 @@ module Datahen
287
292
  end
288
293
 
289
294
  # behave differently if it is a real save
295
+ save_status = status
290
296
  if save
291
297
  log_msg = "Saving #{log_msgs.join(' and ')}."
292
298
  puts "#{log_msg}"
293
299
  else
294
- status = "#{status}_try"
300
+ save_status = "#{status}_try"
295
301
  end
296
302
 
297
303
  # saving to server
@@ -300,7 +306,7 @@ module Datahen
300
306
  gid: gid,
301
307
  pages: pages_slice,
302
308
  outputs: outputs_slice,
303
- status: status)
309
+ status: save_status)
304
310
 
305
311
  if response.code == 200
306
312
  if save
@@ -1,18 +1,24 @@
1
1
  module Datahen
2
2
  module Scraper
3
3
  class Parser
4
- def self.exec_parser_page(filename, gid, job_id=nil, save=false, vars = {})
4
+ def self.exec_parser_page(filename, gid, job_id=nil, save=false, vars = {}, keep_outputs=false)
5
5
  extname = File.extname(filename)
6
6
  case extname
7
7
  when '.rb'
8
- executor = RubyParserExecutor.new(filename: filename, gid: gid, job_id: job_id, vars: vars)
8
+ executor = RubyParserExecutor.new(
9
+ filename: filename,
10
+ gid: gid,
11
+ job_id: job_id,
12
+ vars: vars,
13
+ keep_outputs: keep_outputs
14
+ )
9
15
  executor.exec_parser(save)
10
16
  else
11
17
  puts "Unable to find a parser executor for file type \"#{extname}\""
12
18
  end
13
19
  end
14
20
 
15
-
21
+
16
22
  end
17
23
  end
18
- end
24
+ end
@@ -15,6 +15,7 @@ module Datahen
15
15
  @gid = options.fetch(:gid) { raise "GID is required"}
16
16
  @job_id = options.fetch(:job_id)
17
17
  @page_vars = options.fetch(:vars) { {} }
18
+ @keep_outputs = !!(options.fetch(:keep_outputs) { false })
18
19
  end
19
20
 
20
21
  def self.exposed_methods
@@ -66,7 +67,9 @@ module Datahen
66
67
  response = parsing_update(
67
68
  job_id: job_id,
68
69
  gid: gid,
69
- parsing_status: :starting)
70
+ parsing_status: :starting,
71
+ keep_outputs: @keep_outputs
72
+ )
70
73
 
71
74
  if response.code == 200
72
75
  puts "Page Parsing Status Updated."
@@ -165,7 +168,7 @@ module Datahen
165
168
  handle_error(e) if save
166
169
  raise e
167
170
  end
168
-
171
+
169
172
  if refetch_self
170
173
  refetch_page gid
171
174
  elsif reparse_self
@@ -178,11 +181,11 @@ module Datahen
178
181
  end
179
182
 
180
183
  def content
181
- @content ||= get_content(gid)
184
+ @content ||= get_content(job_id, gid)
182
185
  end
183
186
 
184
187
  def failed_content
185
- @failed_content ||= get_failed_content(gid)
188
+ @failed_content ||= get_failed_content(job_id, gid)
186
189
  end
187
190
 
188
191
  def handle_error(e)
@@ -6,6 +6,7 @@ module Datahen
6
6
  def initialize(options={})
7
7
  @filename = options.fetch(:filename) { raise "Filename is required"}
8
8
  @job_id = options[:job_id]
9
+ @keep_outputs = !!(options.fetch(:keep_outputs) { false })
9
10
  end
10
11
 
11
12
  def self.exposed_methods
@@ -81,7 +82,9 @@ module Datahen
81
82
 
82
83
  response = seeding_update(
83
84
  job_id: job_id,
84
- seeding_status: :starting)
85
+ seeding_status: :starting,
86
+ keep_outputs: @keep_outputs
87
+ )
85
88
 
86
89
  if response.code == 200
87
90
  puts "Seeding Status Updated."
@@ -2,11 +2,15 @@ module Datahen
2
2
  module Scraper
3
3
  class Seeder
4
4
 
5
- def self.exec_seeder(filename, job_id=nil, save=false)
5
+ def self.exec_seeder(filename, job_id=nil, save=false, keep_outputs=false)
6
6
  extname = File.extname(filename)
7
7
  case extname
8
8
  when '.rb'
9
- executor = RubySeederExecutor.new(filename: filename, job_id: job_id)
9
+ executor = RubySeederExecutor.new(
10
+ filename: filename,
11
+ job_id: job_id,
12
+ keep_outputs: keep_outputs
13
+ )
10
14
  executor.exec_seeder(save)
11
15
  else
12
16
  puts "Unable to find a seeder executor for file type \"#{extname}\""
@@ -15,4 +19,4 @@ module Datahen
15
19
 
16
20
  end
17
21
  end
18
- end
22
+ end
@@ -1,3 +1,3 @@
1
1
  module Datahen
2
- VERSION = "0.13.0"
2
+ VERSION = "0.14.3"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: datahen
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.13.0
4
+ version: 0.14.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Parama Danoesubroto
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-03-02 00:00:00.000000000 Z
11
+ date: 2020-08-02 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: thor
@@ -215,10 +215,12 @@ files:
215
215
  - lib/datahen/client/global_page.rb
216
216
  - lib/datahen/client/job.rb
217
217
  - lib/datahen/client/job_export.rb
218
+ - lib/datahen/client/job_finisher.rb
218
219
  - lib/datahen/client/job_log.rb
219
220
  - lib/datahen/client/job_output.rb
220
221
  - lib/datahen/client/job_page.rb
221
222
  - lib/datahen/client/job_stat.rb
223
+ - lib/datahen/client/job_var.rb
222
224
  - lib/datahen/client/scraper.rb
223
225
  - lib/datahen/client/scraper_deployment.rb
224
226
  - lib/datahen/client/scraper_export.rb
@@ -262,7 +264,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
262
264
  - !ruby/object:Gem::Version
263
265
  version: '0'
264
266
  requirements: []
265
- rubygems_version: 3.0.3
267
+ rubygems_version: 3.1.2
266
268
  signing_key:
267
269
  specification_version: 4
268
270
  summary: DataHen toolbelt for developers