datahen 0.11.1 → 0.14.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 80934b5ed446c59f749866cd5f497ef66f90e5af5509f3cd459d295352e537aa
4
- data.tar.gz: 7d7c4ebae89f5cbcaa1d56950d6b4c36089c10c20d54ec2e8e11fee6cc81433e
3
+ metadata.gz: 3ff2ed2cd4772450c01e3e88248ae89441de709198fdd177d3e572bbc5f0e474
4
+ data.tar.gz: 5701717fcba8a05b6f3e027d9bce33a3830fa20dabe3413255779899478cb4ab
5
5
  SHA512:
6
- metadata.gz: f1c68e2c7bb7f40e5c40c960d0dc412da03b7bc9f82e4d3b60f08e49fa1a796b34246b547bfb9ae67677aaf15944ff9d00f97dd97fa898309db0ee37864cd2b1
7
- data.tar.gz: 65f4a182ecc8e9638c1c22c43d03b4487588c112f33e75d85f2ffbee1f29e2d55bf67f861c3b6b34a97ca1014c39a0570c93b8f704c19fe28e7b5266d5a1795a
6
+ metadata.gz: 949ad06a090a4ac8c2ef5b4e053ed4b7668c051be15b6959a2948614e771c25e18774d9ee97fe1f5c03c130986b671a8b26ac253f592a993fa4ad393bcad7673
7
+ data.tar.gz: b73cfc6c070314f97cbc7917d571de67031247aac42f3474b2e71d04e8b3d650fc380a0ce3ca65c1d8339bf8743d94b666ecccca4431f7b89df4e7485a03a382
@@ -12,28 +12,15 @@ module Datahen
12
12
  def content(gid)
13
13
  client = Client::GlobalPage.new(options)
14
14
  result = JSON.parse(client.find_content(gid).to_s)
15
-
15
+
16
16
  if result['available'] == true
17
17
  puts "Preview content url: \"#{result['preview_url']}\""
18
18
  `open "#{result['preview_url']}"`
19
19
  else
20
20
  puts "Content does not exist"
21
- end
21
+ end
22
22
  end
23
23
 
24
- desc "failedcontent <gid>", "Show failed content of a globalpage"
25
- def failedcontent(gid)
26
- client = Client::GlobalPage.new(options)
27
- result = JSON.parse(client.find_failed_content(gid).to_s)
28
-
29
- if result['available'] == true
30
- puts "Preview failed content url: \"#{result['preview_url']}\""
31
- `open "#{result['preview_url']}"`
32
- else
33
- puts "Failed Content does not exist"
34
- end
35
- end
36
-
37
24
  end
38
25
  end
39
26
  end
@@ -18,12 +18,24 @@ module Datahen
18
18
  puts "#{client.all()}"
19
19
  end
20
20
 
21
- desc "show <job_id>", "Show a job"
21
+ desc "show <job_id>", "Show a job (Defaults to showing data from cached job)"
22
+ option :live, type: :boolean, desc: 'Get data from the live job, not cached job.'
22
23
  def show(job_id)
23
24
  client = Client::Job.new(options)
24
- puts "#{client.find(job_id)}"
25
+ puts "#{client.find(job_id, options)}"
25
26
  end
26
27
 
28
+ desc "stats <job_id>", "Get the stat for a job (Defaults to showing data from cached stats)"
29
+ long_desc <<-LONGDESC
30
+ Get stats for a scraper's current job\n
31
+ LONGDESC
32
+ option :live, type: :boolean, desc: 'Get data from the live stats, not cached stats.'
33
+ def stats(job_id)
34
+ client = Client::JobStat.new(options)
35
+ puts "#{client.job_current_stats(job_id, options)}"
36
+ end
37
+
38
+
27
39
  end
28
40
  end
29
41
 
@@ -10,12 +10,13 @@ module Datahen
10
10
  option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
11
11
  option :global, :aliases => :g, type: :boolean, default: false, desc: 'Use globalpage instead of a job page'
12
12
  option :vars, :aliases => :v, type: :string, desc: 'Set user-defined page variables. Must be in json format. i.e: {"Foo":"bar"}'
13
+ option :"keep-outputs", :aliases => :ko, type: :boolean, default: false, desc: "Don't delete existing outputs"
13
14
  def try_parse(scraper_name, parser_file, gid)
14
- begin
15
-
15
+ begin
16
+
16
17
  if options[:job]
17
18
  job_id = options[:job]
18
- elsif options[:global]
19
+ elsif options[:global]
19
20
  job_id = nil
20
21
  else
21
22
  job = Client::ScraperJob.new(options).find(scraper_name)
@@ -24,7 +25,7 @@ module Datahen
24
25
 
25
26
 
26
27
  vars = JSON.parse(options[:vars]) if options[:vars]
27
- puts Datahen::Scraper::Parser.exec_parser_page(parser_file, gid, job_id, false, vars)
28
+ puts Datahen::Scraper::Parser.exec_parser_page(parser_file, gid, job_id, false, vars, options[:"keep-outputs"])
28
29
 
29
30
  rescue JSON::ParserError
30
31
  if options[:vars]
@@ -40,6 +41,8 @@ module Datahen
40
41
  <GID>: Global ID of the page.\x5
41
42
  LONGDESC
42
43
  option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
44
+ option :vars, :aliases => :v, type: :string, desc: 'Set user-defined page variables. Must be in json format. i.e: {"Foo":"bar"}'
45
+ option :"keep-outputs", :aliases => :ko, type: :boolean, default: false, desc: "Don't delete existing outputs"
43
46
  def exec_parse(scraper_name, parser_file, *gids)
44
47
  gids.each do |gid|
45
48
  begin
@@ -52,7 +55,8 @@ module Datahen
52
55
  job_id = job['id']
53
56
  end
54
57
 
55
- puts Datahen::Scraper::Parser.exec_parser_page(parser_file, gid, job_id, true)
58
+ vars = JSON.parse(options[:vars]) if options[:vars]
59
+ puts Datahen::Scraper::Parser.exec_parser_page(parser_file, gid, job_id, true, vars, options[:"keep-outputs"])
56
60
  rescue => e
57
61
  puts e
58
62
  end
@@ -60,7 +60,7 @@ module Datahen
60
60
  desc "show <scraper_name>", "Show a scraper"
61
61
  def show(scraper_name)
62
62
  client = Client::Scraper.new(options)
63
- puts "#{client.find(scraper_name)}"
63
+ puts "#{client.find(scraper_name, options)}"
64
64
  end
65
65
 
66
66
  desc "delete <scraper_name>", "Delete a scraper and related records"
@@ -102,6 +102,7 @@ module Datahen
102
102
  option :head, :aliases => :H, desc: 'Show the oldest log entries. If not set, newest entries is shown'
103
103
  option :parsing, :aliases => :p, type: :boolean, desc: 'Show only log entries related to parsing errors'
104
104
  option :seeding, :aliases => :s, type: :boolean, desc: 'Show only log entries related to seeding errors'
105
+ option :finisher, :aliases => :f, type: :boolean, desc: 'Show only log entries related to finisher errors'
105
106
  option :more, :aliases => :m, desc: 'Show next set of log entries. Enter the `More token`'
106
107
  option :per_page, :aliases => :P, type: :numeric, desc: 'Number of records per page. Max 5000 per page.'
107
108
  def log(scraper_name)
@@ -111,6 +112,7 @@ module Datahen
111
112
  query["order"] = options.delete(:head) if options[:head]
112
113
  query["job_type"] = "parsing" if options[:parsing]
113
114
  query["job_type"] = "seeding" if options[:seeding]
115
+ query["job_type"] = "finisher executing" if options[:finisher]
114
116
  query["page_token"] = options.delete(:more) if options[:more]
115
117
  query["per_page"] = options.delete(:per_page) if options[:per_page]
116
118
 
@@ -138,17 +140,18 @@ module Datahen
138
140
  end
139
141
  end
140
142
 
141
- desc "stats <scraper_name>", "Get the current stat for a job"
143
+ desc "stats <scraper_name>", "Get the stat for a current job (Defaults to showing data from cached stats)"
142
144
  long_desc <<-LONGDESC
143
145
  Get stats for a scraper's current job\n
144
146
  LONGDESC
145
147
  option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
148
+ option :live, type: :boolean, desc: 'Get data from the live stats, not cached stats.'
146
149
  def stats(scraper_name)
147
150
  client = Client::JobStat.new(options)
148
151
  if options[:job]
149
- puts "#{client.job_current_stats(options[:job])}"
152
+ puts "#{client.job_current_stats(options[:job], options)}"
150
153
  else
151
- puts "#{client.scraper_job_current_stats(scraper_name)}"
154
+ puts "#{client.scraper_job_current_stats(scraper_name, options)}"
152
155
  end
153
156
  end
154
157
 
@@ -12,7 +12,6 @@ module Datahen
12
12
  puts "#{client.find(export_id)}"
13
13
  end
14
14
 
15
-
16
15
  desc "list", "Gets a list of exports"
17
16
  long_desc <<-LONGDESC
18
17
  List exports.
@@ -34,13 +33,13 @@ module Datahen
34
33
  def download(export_id)
35
34
  client = Client::ScraperExport.new(options)
36
35
  result = JSON.parse(client.download(export_id).to_s)
37
-
36
+
38
37
  if result['signed_url']
39
38
  puts "Download url: \"#{result['signed_url']}\""
40
39
  `open "#{result['signed_url']}"`
41
40
  else
42
41
  puts "Exported file does not exist"
43
- end
42
+ end
44
43
  end
45
44
 
46
45
 
@@ -11,9 +11,15 @@ module Datahen
11
11
  long_desc <<-LONGDESC
12
12
  Reset finisher on a scraper's current job.\x5
13
13
  LONGDESC
14
+ option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
14
15
  def reset(scraper_name)
15
- client = Client::ScraperFinisher.new(options)
16
- puts "#{client.reset(scraper_name)}"
16
+ if options[:job]
17
+ client = Client::JobFinisher.new(options)
18
+ puts "#{client.reset(options[:job])}"
19
+ else
20
+ client = Client::ScraperFinisher.new(options)
21
+ puts "#{client.reset(scraper_name)}"
22
+ end
17
23
  end
18
24
  end
19
25
  end
@@ -6,10 +6,11 @@ module Datahen
6
6
  "#{basename} #{@package_name} #{command.usage}"
7
7
  end
8
8
 
9
- desc "show <scraper_name>", "Show a scraper's current job"
9
+ desc "show <scraper_name>", "Show a scraper's current job (Defaults to showing data from cached job)"
10
+ option :live, type: :boolean, desc: 'Get data from the live job, not cached job.'
10
11
  def show(scraper_name)
11
12
  client = Client::ScraperJob.new(options)
12
- puts "#{client.find(scraper_name)}"
13
+ puts "#{client.find(scraper_name, options)}"
13
14
  end
14
15
 
15
16
 
@@ -29,27 +30,45 @@ module Datahen
29
30
  long_desc <<-LONGDESC
30
31
  Cancels a scraper's current job
31
32
  LONGDESC
33
+ option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
32
34
  def cancel(scraper_name)
33
- client = Client::ScraperJob.new(options)
34
- puts "#{client.cancel(scraper_name)}"
35
+ if options[:job]
36
+ client = Client::Job.new(options)
37
+ puts "#{client.cancel(options[:job])}"
38
+ else
39
+ client = Client::ScraperJob.new(options)
40
+ puts "#{client.cancel(scraper_name)}"
41
+ end
35
42
  end
36
43
 
37
44
  desc "resume <scraper_name>", "resumes a scraper's current job"
38
45
  long_desc <<-LONGDESC
39
46
  Resumes a scraper's current job
40
47
  LONGDESC
48
+ option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
41
49
  def resume(scraper_name)
42
- client = Client::ScraperJob.new(options)
43
- puts "#{client.resume(scraper_name)}"
50
+ if options[:job]
51
+ client = Client::Job.new(options)
52
+ puts "#{client.resume(options[:job])}"
53
+ else
54
+ client = Client::ScraperJob.new(options)
55
+ puts "#{client.resume(scraper_name)}"
56
+ end
44
57
  end
45
58
 
46
59
  desc "pause <scraper_name>", "pauses a scraper's current job"
47
60
  long_desc <<-LONGDESC
48
61
  pauses a scraper's current job
49
62
  LONGDESC
63
+ option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
50
64
  def pause(scraper_name)
51
- client = Client::ScraperJob.new(options)
52
- puts "#{client.pause(scraper_name)}"
65
+ if options[:job]
66
+ client = Client::Job.new(options)
67
+ puts "#{client.pause(options[:job])}"
68
+ else
69
+ client = Client::ScraperJob.new(options)
70
+ puts "#{client.pause(scraper_name)}"
71
+ end
53
72
  end
54
73
 
55
74
 
@@ -60,9 +79,15 @@ module Datahen
60
79
  option :workers, :aliases => :w, type: :numeric, desc: 'Set how many standard workers to use. Scraper job must be restarted(paused then resumed, or cancelled then resumed) for it to take effect. Default: 1. '
61
80
  option :browsers, type: :numeric, desc: 'Set how many browser workers to use. Scraper job must be restarted(paused then resumed, or cancelled then resumed) for it to take effect. Default: 0. '
62
81
  option :proxy_type, desc: 'Set the Proxy type. Default: standard'
82
+ option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
63
83
  def update(scraper_name)
64
- client = Client::ScraperJob.new(options)
65
- puts "#{client.update(scraper_name, options)}"
84
+ if options[:job]
85
+ client = Client::Job.new(options)
86
+ puts "#{client.update(options[:job], options)}"
87
+ else
88
+ client = Client::ScraperJob.new(options)
89
+ puts "#{client.update(scraper_name, options)}"
90
+ end
66
91
  end
67
92
 
68
93
  desc "var SUBCOMMAND ...ARGS", "for managing scraper's job variables"
@@ -13,9 +13,15 @@ module Datahen
13
13
  LONGDESC
14
14
  option :page, :aliases => :p, type: :numeric, desc: 'Get the next set of records by page.'
15
15
  option :per_page, :aliases => :P, type: :numeric, desc: 'Number of records per page. Max 500 per page.'
16
+ option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
16
17
  def list(scraper_name)
17
- client = Client::ScraperJobVar.new(options)
18
- puts "#{client.all(scraper_name)}"
18
+ if options[:job]
19
+ client = Client::JobVar.new(options)
20
+ puts "#{client.all(options[:job])}"
21
+ else
22
+ client = Client::ScraperJobVar.new(options)
23
+ puts "#{client.all(scraper_name)}"
24
+ end
19
25
  end
20
26
 
21
27
  desc "set <scraper_name> <var_name> <value>", "Set an environment var on the scrape job"
@@ -24,23 +30,40 @@ module Datahen
24
30
  <var_name>: Var name can only consist of alphabets, numbers, underscores. Name must be unique to your scrape job, otherwise it will be overwritten.\x5
25
31
  <value>: Value of variable.\x5
26
32
  LONGDESC
27
- option :secret, type: :boolean, desc: 'Set true to make it decrypt the value. Default: false'
33
+ option :secret, type: :boolean, desc: 'Set true to make it decrypt the value. Default: false'
34
+ option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
28
35
  def set(scraper_name, var_name, value)
29
- # puts "options #{options}"
30
- client = Client::ScraperJobVar.new(options)
31
- puts "#{client.set(scraper_name, var_name, value, options)}"
36
+ if options[:job]
37
+ client = Client::JobVar.new(options)
38
+ puts "#{client.set(options[:job], var_name, value, options)}"
39
+ else
40
+ client = Client::ScraperJobVar.new(options)
41
+ puts "#{client.set(scraper_name, var_name, value, options)}"
42
+ end
32
43
  end
33
44
 
34
45
  desc "show <scraper_name> <var_name>", "Show an environment variable on the scrape job"
46
+ option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
35
47
  def show(scraper_name, var_name)
36
- client = Client::ScraperJobVar.new(options)
37
- puts "#{client.find(scraper_name, var_name)}"
48
+ if options[:job]
49
+ client = Client::JobVar.new(options)
50
+ puts "#{client.find(options[:job], var_name)}"
51
+ else
52
+ client = Client::ScraperJobVar.new(options)
53
+ puts "#{client.find(scraper_name, var_name)}"
54
+ end
38
55
  end
39
56
 
40
57
  desc "unset <scraper_name> <var_name>", "Deletes an environment variable on the scrape job"
58
+ option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
41
59
  def unset(scraper_name, var_name)
42
- client = Client::ScraperJobVar.new(options)
43
- puts "#{client.unset(scraper_name, var_name)}"
60
+ if options[:job]
61
+ client = Client::JobVar.new(options)
62
+ puts "#{client.unset(options[:job], var_name)}"
63
+ else
64
+ client = Client::ScraperJobVar.new(options)
65
+ puts "#{client.unset(scraper_name, var_name)}"
66
+ end
44
67
  end
45
68
  end
46
69
  end
@@ -17,6 +17,7 @@ module Datahen
17
17
  option :per_page, :aliases => :P, type: :numeric, desc: 'Number of records per page. Max 500 per page.'
18
18
  option :fetch_fail, type: :boolean, desc: 'Returns only pages that fails fetching.'
19
19
  option :parse_fail, type: :boolean, desc: 'Returns only pages that fails parsing.'
20
+ option :status, type: :string, desc: 'Returns only pages with specific status.'
20
21
  def list(scraper_name)
21
22
  if options[:job]
22
23
  client = Client::JobPage.new(options)
@@ -104,13 +105,19 @@ module Datahen
104
105
  option :fetch_fail, type: :boolean, desc: 'Refetches only pages that fails fetching.'
105
106
  option :parse_fail, type: :boolean, desc: 'Refetches only pages that fails parsing.'
106
107
  option :status, type: :string, desc: 'Refetches only pages with a specific status.'
108
+ option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
107
109
  def refetch(scraper_name)
108
110
  if !options.key?(:gid) && !options.key?(:fetch_fail) && !options.key?(:parse_fail) && !options.key?(:status)
109
111
  puts "Must specify either a --gid, --fetch-fail, --parse-fail or --status"
110
112
  return
111
113
  end
112
- client = Client::ScraperJobPage.new(options)
113
- puts "#{client.refetch(scraper_name)}"
114
+ if options[:job]
115
+ client = Client::JobPage.new(options)
116
+ puts "#{client.refetch(options[:job])}"
117
+ else
118
+ client = Client::ScraperJobPage.new(options)
119
+ puts "#{client.refetch(scraper_name)}"
120
+ end
114
121
  end
115
122
 
116
123
  desc "reparse <scraper_name>", "Reparse Pages on a scraper's current job"
@@ -120,6 +127,7 @@ module Datahen
120
127
  option :gid, :aliases => :g, type: :string, desc: 'Reparse a specific GID'
121
128
  option :parse_fail, type: :boolean, desc: 'Reparse only pages that fails parsing.'
122
129
  option :status, type: :string, desc: 'Reparse only pages with a specific status.'
130
+ option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
123
131
  def reparse(scraper_name)
124
132
  begin
125
133
  options[:vars] = JSON.parse(options[:vars]) if options[:vars]
@@ -129,8 +137,13 @@ module Datahen
129
137
  return
130
138
  end
131
139
 
132
- client = Client::ScraperJobPage.new(options)
133
- puts "#{client.reparse(scraper_name)}"
140
+ if options[:job]
141
+ client = Client::JobPage.new(options)
142
+ puts "#{client.reparse(options[:job])}"
143
+ else
144
+ client = Client::ScraperJobPage.new(options)
145
+ puts "#{client.reparse(scraper_name)}"
146
+ end
134
147
 
135
148
  rescue JSON::ParserError
136
149
  if options[:vars]
@@ -197,6 +210,46 @@ module Datahen
197
210
  end
198
211
  end
199
212
 
213
+ desc "content <scraper_name> <gid>", "Show a page's content in scraper's current job"
214
+ option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
215
+ def content(scraper_name, gid)
216
+ result = nil
217
+ if options[:job]
218
+ client = Client::JobPage.new(options)
219
+ result = JSON.parse(client.find_content(options[:job], gid).to_s)
220
+ else
221
+ client = Client::ScraperJobPage.new(options)
222
+ result = JSON.parse(client.find_content(scraper_name, gid).to_s)
223
+ end
224
+
225
+ if result['available'] == true
226
+ puts "Preview content url: \"#{result['preview_url']}\""
227
+ `open "#{result['preview_url']}"`
228
+ else
229
+ puts "Content does not exist"
230
+ end
231
+ end
232
+
233
+ desc "failedcontent <gid>", "Show a page's failed content in scraper's current job"
234
+ option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
235
+ def failedcontent(scraper_name, gid)
236
+ result = nil
237
+ if options[:job]
238
+ client = Client::JobPage.new(options)
239
+ result = JSON.parse(client.find_failed_content(options[:job], gid).to_s)
240
+ else
241
+ client = Client::ScraperJobPage.new(options)
242
+ result = JSON.parse(client.find_failed_content(scraper_name, gid).to_s)
243
+ end
244
+
245
+ if result['available'] == true
246
+ puts "Preview failed content url: \"#{result['preview_url']}\""
247
+ `open "#{result['preview_url']}"`
248
+ else
249
+ puts "Failed Content does not exist"
250
+ end
251
+ end
252
+
200
253
  end
201
254
  end
202
255
 
@@ -7,6 +7,7 @@ module Datahen
7
7
  <seeder_file>: Seeder script file will be executed.\x5
8
8
  LONGDESC
9
9
  option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
10
+ option :"keep-outputs", :aliases => :ko, type: :boolean, default: false, desc: "Don't delete existing outputs"
10
11
  def try_seed(scraper_name, seeder_file)
11
12
  if options[:job]
12
13
  job_id = options[:job]
@@ -14,8 +15,8 @@ module Datahen
14
15
  job = Client::ScraperJob.new(options).find(scraper_name)
15
16
  job_id = job['id']
16
17
  end
17
-
18
- puts Datahen::Scraper::Seeder.exec_seeder(seeder_file, job_id, false)
18
+
19
+ puts Datahen::Scraper::Seeder.exec_seeder(seeder_file, job_id, false, options[:"keep-outputs"])
19
20
  end
20
21
 
21
22
  desc "exec <scraper_name> <seeder_file>", "Executes a seeder script onto a scraper's current job."
@@ -24,6 +25,7 @@ module Datahen
24
25
  <seeder_file>: Seeder script file that will be executed on the scraper's current job.\x5
25
26
  LONGDESC
26
27
  option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
28
+ option :"keep-outputs", :aliases => :ko, type: :boolean, default: false, desc: "Don't delete existing outputs"
27
29
  def exec_parse(scraper_name, seeder_file)
28
30
  if options[:job]
29
31
  job_id = options[:job]
@@ -20,7 +20,9 @@ require "datahen/client/job_stat"
20
20
  require "datahen/client/backblaze_content"
21
21
  require "datahen/client/env_var"
22
22
  require "datahen/client/scraper_var"
23
+ require "datahen/client/job_var"
23
24
  require "datahen/client/scraper_job_var"
25
+ require "datahen/client/job_finisher"
24
26
 
25
27
 
26
28
  module Datahen
@@ -51,10 +51,10 @@ module Datahen
51
51
  query[:status] = opts[:status] if opts[:status]
52
52
  query[:page_type] = opts[:page_type] if opts[:page_type]
53
53
  query[:gid] = opts[:gid] if opts[:gid]
54
- query[:"min-timestamp"] = opts[:"min-timestamp"]
55
- query[:"max-timestamp"] = opts[:"max-timestamp"]
56
- query[:limit] = opts[:limit]
57
- query[:order] = opts[:order]
54
+ query[:"min-timestamp"] = opts[:"min-timestamp"] if opts[:"min-timestamp"]
55
+ query[:"max-timestamp"] = opts[:"max-timestamp"] if opts[:"max-timestamp"]
56
+ query[:limit] = opts[:limit] if opts[:limit]
57
+ query[:order] = opts[:order] if opts[:order]
58
58
 
59
59
  if opts[:query]
60
60
  if opts[:query].is_a?(Hash)
@@ -8,11 +8,6 @@ module Datahen
8
8
  def find_content(gid)
9
9
  self.class.get("/global_pages/#{gid}/content", @options)
10
10
  end
11
-
12
- def find_failed_content(gid)
13
- self.class.get("/global_pages/#{gid}/failed_content", @options)
14
- end
15
11
  end
16
12
  end
17
13
  end
18
-
@@ -6,8 +6,12 @@ module Datahen
6
6
  self.class.get("/jobs", params)
7
7
  end
8
8
 
9
- def find(job_id)
10
- self.class.get("/jobs/#{job_id}", @options)
9
+ def find(job_id, opts={})
10
+ if opts[:live]
11
+ self.class.get("/jobs/#{job_id}", @options)
12
+ else
13
+ self.class.get("/cached/jobs/#{job_id}", @options)
14
+ end
11
15
  end
12
16
 
13
17
  def update(job_id, opts={})
@@ -15,6 +19,7 @@ module Datahen
15
19
  body[:status] = opts[:status] if opts[:status]
16
20
  body[:standard_worker_count] = opts[:workers] if opts[:workers]
17
21
  body[:browser_worker_count] = opts[:browsers] if opts[:browsers]
22
+ body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
18
23
  params = @options.merge({body: body.to_json})
19
24
 
20
25
  self.class.put("/jobs/#{job_id}", params)
@@ -41,6 +46,7 @@ module Datahen
41
46
  body[:pages] = opts.fetch(:pages) {[]}
42
47
  body[:seeding_status] = opts.fetch(:seeding_status){ nil }
43
48
  body[:log_error] = opts[:log_error] if opts[:log_error]
49
+ body[:keep_outputs] = !!opts[:keep_outputs] if opts.has_key?(:keep_outputs)
44
50
 
45
51
  params = @options.merge({body: body.to_json})
46
52
 
@@ -0,0 +1,16 @@
1
+ module Datahen
2
+ module Client
3
+ class JobFinisher < Datahen::Client::Base
4
+ # Reset finisher on a specific job, identified by its job ID.
5
+ #
6
+ # @param [Integer] job_id Job ID
7
+ # @param [Hash] opts ({}) API custom parameters.
8
+ #
9
+ # @return [HTTParty::Response]
10
+ def reset(job_id, opts={})
11
+ params = @options.merge(opts)
12
+ self.class.put("/jobs/#{job_id}/finisher/reset", params)
13
+ end
14
+ end
15
+ end
16
+ end
@@ -48,11 +48,30 @@ module Datahen
48
48
  body[:pages] = opts.fetch(:pages) {[]}
49
49
  body[:parsing_status] = opts.fetch(:parsing_status){ nil }
50
50
  body[:log_error] = opts[:log_error] if opts[:log_error]
51
+ body[:keep_outputs] = !!opts[:keep_outputs] if opts.has_key?(:keep_outputs)
51
52
 
52
53
  params = @options.merge({body: body.to_json})
53
54
 
54
55
  self.class.put("/jobs/#{job_id}/pages/#{gid}/parsing_update", params)
55
56
  end
57
+
58
+ def find_content(job_id, gid)
59
+ self.class.get("/jobs/#{job_id}/pages/#{gid}/content", @options)
60
+ end
61
+
62
+ def find_failed_content(job_id, gid)
63
+ self.class.get("/jobs/#{job_id}/pages/#{gid}/failed_content", @options)
64
+ end
65
+
66
+ def reparse(job_id, opts={})
67
+ params = @options.merge(opts)
68
+ self.class.put("/jobs/#{job_id}/pages/reparse", params)
69
+ end
70
+
71
+ def refetch(job_id, opts={})
72
+ params = @options.merge(opts)
73
+ self.class.put("/jobs/#{job_id}/pages/refetch", params)
74
+ end
56
75
  end
57
76
  end
58
77
  end
@@ -2,12 +2,20 @@ module Datahen
2
2
  module Client
3
3
  class JobStat < Datahen::Client::Base
4
4
 
5
- def job_current_stats(job_id)
6
- self.class.get("/jobs/#{job_id}/stats/current", @options)
5
+ def job_current_stats(job_id, opts={})
6
+ if opts[:live]
7
+ self.class.get("/jobs/#{job_id}/stats/current", @options)
8
+ else
9
+ self.class.get("/cached/jobs/#{job_id}/stats/current", @options)
10
+ end
7
11
  end
8
12
 
9
- def scraper_job_current_stats(scraper_name)
10
- self.class.get("/scrapers/#{scraper_name}/current_job/stats/current", @options)
13
+ def scraper_job_current_stats(scraper_name, opts={})
14
+ if opts[:live]
15
+ self.class.get("/scrapers/#{scraper_name}/current_job/stats/current", @options)
16
+ else
17
+ self.class.get("/cached/scrapers/#{scraper_name}/current_job/stats/current", @options)
18
+ end
11
19
  end
12
20
 
13
21
  def job_stats_history(job_id)
@@ -0,0 +1,28 @@
1
+ module Datahen
2
+ module Client
3
+ class JobVar < Datahen::Client::Base
4
+
5
+ def find(job_id, var_name)
6
+ self.class.get("/jobs/#{job_id}/vars/#{var_name}", @options)
7
+ end
8
+
9
+ def all(job_id, opts={})
10
+ params = @options.merge opts
11
+ self.class.get("/jobs/#{job_id}/vars", params)
12
+ end
13
+
14
+ def set(job_id, var_name, value, opts={})
15
+ body = {}
16
+ body[:value] = value
17
+ body[:secret] = opts[:secret] if opts[:secret]
18
+ params = @options.merge({body: body.to_json})
19
+ self.class.put("/jobs/#{job_id}/vars/#{var_name}", params)
20
+ end
21
+
22
+ def unset(job_id, var_name, opts={})
23
+ params = @options.merge(opts)
24
+ self.class.delete("/jobs/#{job_id}/vars/#{var_name}", params)
25
+ end
26
+ end
27
+ end
28
+ end
@@ -15,8 +15,12 @@ module Datahen
15
15
  self.class.post("/scrapers/#{scraper_name}/jobs", params)
16
16
  end
17
17
 
18
- def find(scraper_name)
19
- self.class.get("/scrapers/#{scraper_name}/current_job", @options)
18
+ def find(scraper_name, opts={})
19
+ if opts[:live]
20
+ self.class.get("/scrapers/#{scraper_name}/current_job", @options)
21
+ else
22
+ self.class.get("/cached/scrapers/#{scraper_name}/current_job", @options)
23
+ end
20
24
  end
21
25
 
22
26
  def update(scraper_name, opts={})
@@ -26,6 +26,9 @@ module Datahen
26
26
  self.class.put("/scrapers/#{scraper_name}/current_job/pages/refetch", params)
27
27
  end
28
28
 
29
+ # Deprecated, please use Datahen::Client::JobPage#refetch instead.
30
+ #
31
+ # @note This method will be removed at some point in the future.
29
32
  def refetch_by_job(job_id, opts={})
30
33
  params = @options.merge(opts)
31
34
  self.class.put("/jobs/#{job_id}/pages/refetch", params)
@@ -36,11 +39,6 @@ module Datahen
36
39
  self.class.put("/scrapers/#{scraper_name}/current_job/pages/reparse", params)
37
40
  end
38
41
 
39
- def reparse_by_job(job_id, opts={})
40
- params = @options.merge(opts)
41
- self.class.put("/jobs/#{job_id}/pages/reparse", params)
42
- end
43
-
44
42
  def enqueue(scraper_name, method, url, opts={})
45
43
  body = {}
46
44
  body[:method] = method != "" ? method : "GET"
@@ -62,6 +60,14 @@ module Datahen
62
60
  self.class.post("/scrapers/#{scraper_name}/current_job/pages", params)
63
61
  end
64
62
 
63
+ def find_content(scraper_name, gid)
64
+ self.class.get("/scrapers/#{scraper_name}/current_job/pages/#{gid}/content", @options)
65
+ end
66
+
67
+ def find_failed_content(scraper_name, gid)
68
+ self.class.get("/scrapers/#{scraper_name}/current_job/pages/#{gid}/failed_content", @options)
69
+ end
70
+
65
71
  end
66
72
  end
67
73
  end
@@ -63,9 +63,9 @@ module Datahen
63
63
  client.find(gid)
64
64
  end
65
65
 
66
- def get_content(gid)
67
- client = Client::GlobalPage.new()
68
- content_json = client.find_content(gid)
66
+ def get_content(job_id, gid)
67
+ client = Client::JobPage.new()
68
+ content_json = client.find_content(job_id, gid)
69
69
 
70
70
  if content_json['available']
71
71
  signed_url = content_json['signed_url']
@@ -75,9 +75,9 @@ module Datahen
75
75
  end
76
76
  end
77
77
 
78
- def get_failed_content(gid)
79
- client = Client::GlobalPage.new()
80
- content_json = client.find_failed_content(gid)
78
+ def get_failed_content(job_id, gid)
79
+ client = Client::JobPage.new()
80
+ content_json = client.find_failed_content(job_id, gid)
81
81
 
82
82
  if content_json['available']
83
83
  signed_url = content_json['signed_url']
@@ -1,18 +1,24 @@
1
1
  module Datahen
2
2
  module Scraper
3
3
  class Parser
4
- def self.exec_parser_page(filename, gid, job_id=nil, save=false, vars = {})
4
+ def self.exec_parser_page(filename, gid, job_id=nil, save=false, vars = {}, keep_outputs=false)
5
5
  extname = File.extname(filename)
6
6
  case extname
7
7
  when '.rb'
8
- executor = RubyParserExecutor.new(filename: filename, gid: gid, job_id: job_id, vars: vars)
8
+ executor = RubyParserExecutor.new(
9
+ filename: filename,
10
+ gid: gid,
11
+ job_id: job_id,
12
+ vars: vars,
13
+ keep_outputs: keep_outputs
14
+ )
9
15
  executor.exec_parser(save)
10
16
  else
11
17
  puts "Unable to find a parser executor for file type \"#{extname}\""
12
18
  end
13
19
  end
14
20
 
15
-
21
+
16
22
  end
17
23
  end
18
- end
24
+ end
@@ -15,6 +15,7 @@ module Datahen
15
15
  @gid = options.fetch(:gid) { raise "GID is required"}
16
16
  @job_id = options.fetch(:job_id)
17
17
  @page_vars = options.fetch(:vars) { {} }
18
+ @keep_outputs = !!(options.fetch(:keep_outputs) { false })
18
19
  end
19
20
 
20
21
  def self.exposed_methods
@@ -66,7 +67,9 @@ module Datahen
66
67
  response = parsing_update(
67
68
  job_id: job_id,
68
69
  gid: gid,
69
- parsing_status: :starting)
70
+ parsing_status: :starting,
71
+ keep_outputs: @keep_outputs
72
+ )
70
73
 
71
74
  if response.code == 200
72
75
  puts "Page Parsing Status Updated."
@@ -165,7 +168,7 @@ module Datahen
165
168
  handle_error(e) if save
166
169
  raise e
167
170
  end
168
-
171
+
169
172
  if refetch_self
170
173
  refetch_page gid
171
174
  elsif reparse_self
@@ -178,11 +181,11 @@ module Datahen
178
181
  end
179
182
 
180
183
  def content
181
- @content ||= get_content(gid)
184
+ @content ||= get_content(job_id, gid)
182
185
  end
183
186
 
184
187
  def failed_content
185
- @failed_content ||= get_failed_content(gid)
188
+ @failed_content ||= get_failed_content(job_id, gid)
186
189
  end
187
190
 
188
191
  def handle_error(e)
@@ -6,6 +6,7 @@ module Datahen
6
6
  def initialize(options={})
7
7
  @filename = options.fetch(:filename) { raise "Filename is required"}
8
8
  @job_id = options[:job_id]
9
+ @keep_outputs = !!(options.fetch(:keep_outputs) { false })
9
10
  end
10
11
 
11
12
  def self.exposed_methods
@@ -81,7 +82,9 @@ module Datahen
81
82
 
82
83
  response = seeding_update(
83
84
  job_id: job_id,
84
- seeding_status: :starting)
85
+ seeding_status: :starting,
86
+ keep_outputs: @keep_outputs
87
+ )
85
88
 
86
89
  if response.code == 200
87
90
  puts "Seeding Status Updated."
@@ -2,11 +2,15 @@ module Datahen
2
2
  module Scraper
3
3
  class Seeder
4
4
 
5
- def self.exec_seeder(filename, job_id=nil, save=false)
5
+ def self.exec_seeder(filename, job_id=nil, save=false, keep_outputs=false)
6
6
  extname = File.extname(filename)
7
7
  case extname
8
8
  when '.rb'
9
- executor = RubySeederExecutor.new(filename: filename, job_id: job_id)
9
+ executor = RubySeederExecutor.new(
10
+ filename: filename,
11
+ job_id: job_id,
12
+ keep_outputs: keep_outputs
13
+ )
10
14
  executor.exec_seeder(save)
11
15
  else
12
16
  puts "Unable to find a seeder executor for file type \"#{extname}\""
@@ -15,4 +19,4 @@ module Datahen
15
19
 
16
20
  end
17
21
  end
18
- end
22
+ end
@@ -1,3 +1,3 @@
1
1
  module Datahen
2
- VERSION = "0.11.1"
2
+ VERSION = "0.14.0"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: datahen
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.11.1
4
+ version: 0.14.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Parama Danoesubroto
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-01-17 00:00:00.000000000 Z
11
+ date: 2020-07-26 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: thor
@@ -215,10 +215,12 @@ files:
215
215
  - lib/datahen/client/global_page.rb
216
216
  - lib/datahen/client/job.rb
217
217
  - lib/datahen/client/job_export.rb
218
+ - lib/datahen/client/job_finisher.rb
218
219
  - lib/datahen/client/job_log.rb
219
220
  - lib/datahen/client/job_output.rb
220
221
  - lib/datahen/client/job_page.rb
221
222
  - lib/datahen/client/job_stat.rb
223
+ - lib/datahen/client/job_var.rb
222
224
  - lib/datahen/client/scraper.rb
223
225
  - lib/datahen/client/scraper_deployment.rb
224
226
  - lib/datahen/client/scraper_export.rb
@@ -262,7 +264,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
262
264
  - !ruby/object:Gem::Version
263
265
  version: '0'
264
266
  requirements: []
265
- rubygems_version: 3.0.3
267
+ rubygems_version: 3.1.2
266
268
  signing_key:
267
269
  specification_version: 4
268
270
  summary: DataHen toolbelt for developers