datahen 0.11.1 → 0.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 80934b5ed446c59f749866cd5f497ef66f90e5af5509f3cd459d295352e537aa
4
- data.tar.gz: 7d7c4ebae89f5cbcaa1d56950d6b4c36089c10c20d54ec2e8e11fee6cc81433e
3
+ metadata.gz: 3ff2ed2cd4772450c01e3e88248ae89441de709198fdd177d3e572bbc5f0e474
4
+ data.tar.gz: 5701717fcba8a05b6f3e027d9bce33a3830fa20dabe3413255779899478cb4ab
5
5
  SHA512:
6
- metadata.gz: f1c68e2c7bb7f40e5c40c960d0dc412da03b7bc9f82e4d3b60f08e49fa1a796b34246b547bfb9ae67677aaf15944ff9d00f97dd97fa898309db0ee37864cd2b1
7
- data.tar.gz: 65f4a182ecc8e9638c1c22c43d03b4487588c112f33e75d85f2ffbee1f29e2d55bf67f861c3b6b34a97ca1014c39a0570c93b8f704c19fe28e7b5266d5a1795a
6
+ metadata.gz: 949ad06a090a4ac8c2ef5b4e053ed4b7668c051be15b6959a2948614e771c25e18774d9ee97fe1f5c03c130986b671a8b26ac253f592a993fa4ad393bcad7673
7
+ data.tar.gz: b73cfc6c070314f97cbc7917d571de67031247aac42f3474b2e71d04e8b3d650fc380a0ce3ca65c1d8339bf8743d94b666ecccca4431f7b89df4e7485a03a382
@@ -12,28 +12,15 @@ module Datahen
12
12
  def content(gid)
13
13
  client = Client::GlobalPage.new(options)
14
14
  result = JSON.parse(client.find_content(gid).to_s)
15
-
15
+
16
16
  if result['available'] == true
17
17
  puts "Preview content url: \"#{result['preview_url']}\""
18
18
  `open "#{result['preview_url']}"`
19
19
  else
20
20
  puts "Content does not exist"
21
- end
21
+ end
22
22
  end
23
23
 
24
- desc "failedcontent <gid>", "Show failed content of a globalpage"
25
- def failedcontent(gid)
26
- client = Client::GlobalPage.new(options)
27
- result = JSON.parse(client.find_failed_content(gid).to_s)
28
-
29
- if result['available'] == true
30
- puts "Preview failed content url: \"#{result['preview_url']}\""
31
- `open "#{result['preview_url']}"`
32
- else
33
- puts "Failed Content does not exist"
34
- end
35
- end
36
-
37
24
  end
38
25
  end
39
26
  end
@@ -18,12 +18,24 @@ module Datahen
18
18
  puts "#{client.all()}"
19
19
  end
20
20
 
21
- desc "show <job_id>", "Show a job"
21
+ desc "show <job_id>", "Show a job (Defaults to showing data from cached job)"
22
+ option :live, type: :boolean, desc: 'Get data from the live job, not cached job.'
22
23
  def show(job_id)
23
24
  client = Client::Job.new(options)
24
- puts "#{client.find(job_id)}"
25
+ puts "#{client.find(job_id, options)}"
25
26
  end
26
27
 
28
+ desc "stats <job_id>", "Get the stat for a job (Defaults to showing data from cached stats)"
29
+ long_desc <<-LONGDESC
30
+ Get stats for a scraper's current job\n
31
+ LONGDESC
32
+ option :live, type: :boolean, desc: 'Get data from the live stats, not cached stats.'
33
+ def stats(job_id)
34
+ client = Client::JobStat.new(options)
35
+ puts "#{client.job_current_stats(job_id, options)}"
36
+ end
37
+
38
+
27
39
  end
28
40
  end
29
41
 
@@ -10,12 +10,13 @@ module Datahen
10
10
  option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
11
11
  option :global, :aliases => :g, type: :boolean, default: false, desc: 'Use globalpage instead of a job page'
12
12
  option :vars, :aliases => :v, type: :string, desc: 'Set user-defined page variables. Must be in json format. i.e: {"Foo":"bar"}'
13
+ option :"keep-outputs", :aliases => :ko, type: :boolean, default: false, desc: "Don't delete existing outputs"
13
14
  def try_parse(scraper_name, parser_file, gid)
14
- begin
15
-
15
+ begin
16
+
16
17
  if options[:job]
17
18
  job_id = options[:job]
18
- elsif options[:global]
19
+ elsif options[:global]
19
20
  job_id = nil
20
21
  else
21
22
  job = Client::ScraperJob.new(options).find(scraper_name)
@@ -24,7 +25,7 @@ module Datahen
24
25
 
25
26
 
26
27
  vars = JSON.parse(options[:vars]) if options[:vars]
27
- puts Datahen::Scraper::Parser.exec_parser_page(parser_file, gid, job_id, false, vars)
28
+ puts Datahen::Scraper::Parser.exec_parser_page(parser_file, gid, job_id, false, vars, options[:"keep-outputs"])
28
29
 
29
30
  rescue JSON::ParserError
30
31
  if options[:vars]
@@ -40,6 +41,8 @@ module Datahen
40
41
  <GID>: Global ID of the page.\x5
41
42
  LONGDESC
42
43
  option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
44
+ option :vars, :aliases => :v, type: :string, desc: 'Set user-defined page variables. Must be in json format. i.e: {"Foo":"bar"}'
45
+ option :"keep-outputs", :aliases => :ko, type: :boolean, default: false, desc: "Don't delete existing outputs"
43
46
  def exec_parse(scraper_name, parser_file, *gids)
44
47
  gids.each do |gid|
45
48
  begin
@@ -52,7 +55,8 @@ module Datahen
52
55
  job_id = job['id']
53
56
  end
54
57
 
55
- puts Datahen::Scraper::Parser.exec_parser_page(parser_file, gid, job_id, true)
58
+ vars = JSON.parse(options[:vars]) if options[:vars]
59
+ puts Datahen::Scraper::Parser.exec_parser_page(parser_file, gid, job_id, true, vars, options[:"keep-outputs"])
56
60
  rescue => e
57
61
  puts e
58
62
  end
@@ -60,7 +60,7 @@ module Datahen
60
60
  desc "show <scraper_name>", "Show a scraper"
61
61
  def show(scraper_name)
62
62
  client = Client::Scraper.new(options)
63
- puts "#{client.find(scraper_name)}"
63
+ puts "#{client.find(scraper_name, options)}"
64
64
  end
65
65
 
66
66
  desc "delete <scraper_name>", "Delete a scraper and related records"
@@ -102,6 +102,7 @@ module Datahen
102
102
  option :head, :aliases => :H, desc: 'Show the oldest log entries. If not set, newest entries is shown'
103
103
  option :parsing, :aliases => :p, type: :boolean, desc: 'Show only log entries related to parsing errors'
104
104
  option :seeding, :aliases => :s, type: :boolean, desc: 'Show only log entries related to seeding errors'
105
+ option :finisher, :aliases => :f, type: :boolean, desc: 'Show only log entries related to finisher errors'
105
106
  option :more, :aliases => :m, desc: 'Show next set of log entries. Enter the `More token`'
106
107
  option :per_page, :aliases => :P, type: :numeric, desc: 'Number of records per page. Max 5000 per page.'
107
108
  def log(scraper_name)
@@ -111,6 +112,7 @@ module Datahen
111
112
  query["order"] = options.delete(:head) if options[:head]
112
113
  query["job_type"] = "parsing" if options[:parsing]
113
114
  query["job_type"] = "seeding" if options[:seeding]
115
+ query["job_type"] = "finisher executing" if options[:finisher]
114
116
  query["page_token"] = options.delete(:more) if options[:more]
115
117
  query["per_page"] = options.delete(:per_page) if options[:per_page]
116
118
 
@@ -138,17 +140,18 @@ module Datahen
138
140
  end
139
141
  end
140
142
 
141
- desc "stats <scraper_name>", "Get the current stat for a job"
143
+ desc "stats <scraper_name>", "Get the stat for a current job (Defaults to showing data from cached stats)"
142
144
  long_desc <<-LONGDESC
143
145
  Get stats for a scraper's current job\n
144
146
  LONGDESC
145
147
  option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
148
+ option :live, type: :boolean, desc: 'Get data from the live stats, not cached stats.'
146
149
  def stats(scraper_name)
147
150
  client = Client::JobStat.new(options)
148
151
  if options[:job]
149
- puts "#{client.job_current_stats(options[:job])}"
152
+ puts "#{client.job_current_stats(options[:job], options)}"
150
153
  else
151
- puts "#{client.scraper_job_current_stats(scraper_name)}"
154
+ puts "#{client.scraper_job_current_stats(scraper_name, options)}"
152
155
  end
153
156
  end
154
157
 
@@ -12,7 +12,6 @@ module Datahen
12
12
  puts "#{client.find(export_id)}"
13
13
  end
14
14
 
15
-
16
15
  desc "list", "Gets a list of exports"
17
16
  long_desc <<-LONGDESC
18
17
  List exports.
@@ -34,13 +33,13 @@ module Datahen
34
33
  def download(export_id)
35
34
  client = Client::ScraperExport.new(options)
36
35
  result = JSON.parse(client.download(export_id).to_s)
37
-
36
+
38
37
  if result['signed_url']
39
38
  puts "Download url: \"#{result['signed_url']}\""
40
39
  `open "#{result['signed_url']}"`
41
40
  else
42
41
  puts "Exported file does not exist"
43
- end
42
+ end
44
43
  end
45
44
 
46
45
 
@@ -11,9 +11,15 @@ module Datahen
11
11
  long_desc <<-LONGDESC
12
12
  Reset finisher on a scraper's current job.\x5
13
13
  LONGDESC
14
+ option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
14
15
  def reset(scraper_name)
15
- client = Client::ScraperFinisher.new(options)
16
- puts "#{client.reset(scraper_name)}"
16
+ if options[:job]
17
+ client = Client::JobFinisher.new(options)
18
+ puts "#{client.reset(options[:job])}"
19
+ else
20
+ client = Client::ScraperFinisher.new(options)
21
+ puts "#{client.reset(scraper_name)}"
22
+ end
17
23
  end
18
24
  end
19
25
  end
@@ -6,10 +6,11 @@ module Datahen
6
6
  "#{basename} #{@package_name} #{command.usage}"
7
7
  end
8
8
 
9
- desc "show <scraper_name>", "Show a scraper's current job"
9
+ desc "show <scraper_name>", "Show a scraper's current job (Defaults to showing data from cached job)"
10
+ option :live, type: :boolean, desc: 'Get data from the live job, not cached job.'
10
11
  def show(scraper_name)
11
12
  client = Client::ScraperJob.new(options)
12
- puts "#{client.find(scraper_name)}"
13
+ puts "#{client.find(scraper_name, options)}"
13
14
  end
14
15
 
15
16
 
@@ -29,27 +30,45 @@ module Datahen
29
30
  long_desc <<-LONGDESC
30
31
  Cancels a scraper's current job
31
32
  LONGDESC
33
+ option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
32
34
  def cancel(scraper_name)
33
- client = Client::ScraperJob.new(options)
34
- puts "#{client.cancel(scraper_name)}"
35
+ if options[:job]
36
+ client = Client::Job.new(options)
37
+ puts "#{client.cancel(options[:job])}"
38
+ else
39
+ client = Client::ScraperJob.new(options)
40
+ puts "#{client.cancel(scraper_name)}"
41
+ end
35
42
  end
36
43
 
37
44
  desc "resume <scraper_name>", "resumes a scraper's current job"
38
45
  long_desc <<-LONGDESC
39
46
  Resumes a scraper's current job
40
47
  LONGDESC
48
+ option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
41
49
  def resume(scraper_name)
42
- client = Client::ScraperJob.new(options)
43
- puts "#{client.resume(scraper_name)}"
50
+ if options[:job]
51
+ client = Client::Job.new(options)
52
+ puts "#{client.resume(options[:job])}"
53
+ else
54
+ client = Client::ScraperJob.new(options)
55
+ puts "#{client.resume(scraper_name)}"
56
+ end
44
57
  end
45
58
 
46
59
  desc "pause <scraper_name>", "pauses a scraper's current job"
47
60
  long_desc <<-LONGDESC
48
61
  pauses a scraper's current job
49
62
  LONGDESC
63
+ option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
50
64
  def pause(scraper_name)
51
- client = Client::ScraperJob.new(options)
52
- puts "#{client.pause(scraper_name)}"
65
+ if options[:job]
66
+ client = Client::Job.new(options)
67
+ puts "#{client.pause(options[:job])}"
68
+ else
69
+ client = Client::ScraperJob.new(options)
70
+ puts "#{client.pause(scraper_name)}"
71
+ end
53
72
  end
54
73
 
55
74
 
@@ -60,9 +79,15 @@ module Datahen
60
79
  option :workers, :aliases => :w, type: :numeric, desc: 'Set how many standard workers to use. Scraper job must be restarted(paused then resumed, or cancelled then resumed) for it to take effect. Default: 1. '
61
80
  option :browsers, type: :numeric, desc: 'Set how many browser workers to use. Scraper job must be restarted(paused then resumed, or cancelled then resumed) for it to take effect. Default: 0. '
62
81
  option :proxy_type, desc: 'Set the Proxy type. Default: standard'
82
+ option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
63
83
  def update(scraper_name)
64
- client = Client::ScraperJob.new(options)
65
- puts "#{client.update(scraper_name, options)}"
84
+ if options[:job]
85
+ client = Client::Job.new(options)
86
+ puts "#{client.update(options[:job], options)}"
87
+ else
88
+ client = Client::ScraperJob.new(options)
89
+ puts "#{client.update(scraper_name, options)}"
90
+ end
66
91
  end
67
92
 
68
93
  desc "var SUBCOMMAND ...ARGS", "for managing scraper's job variables"
@@ -13,9 +13,15 @@ module Datahen
13
13
  LONGDESC
14
14
  option :page, :aliases => :p, type: :numeric, desc: 'Get the next set of records by page.'
15
15
  option :per_page, :aliases => :P, type: :numeric, desc: 'Number of records per page. Max 500 per page.'
16
+ option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
16
17
  def list(scraper_name)
17
- client = Client::ScraperJobVar.new(options)
18
- puts "#{client.all(scraper_name)}"
18
+ if options[:job]
19
+ client = Client::JobVar.new(options)
20
+ puts "#{client.all(options[:job])}"
21
+ else
22
+ client = Client::ScraperJobVar.new(options)
23
+ puts "#{client.all(scraper_name)}"
24
+ end
19
25
  end
20
26
 
21
27
  desc "set <scraper_name> <var_name> <value>", "Set an environment var on the scrape job"
@@ -24,23 +30,40 @@ module Datahen
24
30
  <var_name>: Var name can only consist of alphabets, numbers, underscores. Name must be unique to your scrape job, otherwise it will be overwritten.\x5
25
31
  <value>: Value of variable.\x5
26
32
  LONGDESC
27
- option :secret, type: :boolean, desc: 'Set true to make it decrypt the value. Default: false'
33
+ option :secret, type: :boolean, desc: 'Set true to make it encrypt the value. Default: false'
34
+ option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
28
35
  def set(scraper_name, var_name, value)
29
- # puts "options #{options}"
30
- client = Client::ScraperJobVar.new(options)
31
- puts "#{client.set(scraper_name, var_name, value, options)}"
36
+ if options[:job]
37
+ client = Client::JobVar.new(options)
38
+ puts "#{client.set(options[:job], var_name, value, options)}"
39
+ else
40
+ client = Client::ScraperJobVar.new(options)
41
+ puts "#{client.set(scraper_name, var_name, value, options)}"
42
+ end
32
43
  end
33
44
 
34
45
  desc "show <scraper_name> <var_name>", "Show an environment variable on the scrape job"
46
+ option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
35
47
  def show(scraper_name, var_name)
36
- client = Client::ScraperJobVar.new(options)
37
- puts "#{client.find(scraper_name, var_name)}"
48
+ if options[:job]
49
+ client = Client::JobVar.new(options)
50
+ puts "#{client.find(options[:job], var_name)}"
51
+ else
52
+ client = Client::ScraperJobVar.new(options)
53
+ puts "#{client.find(scraper_name, var_name)}"
54
+ end
38
55
  end
39
56
 
40
57
  desc "unset <scraper_name> <var_name>", "Deletes an environment variable on the scrape job"
58
+ option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
41
59
  def unset(scraper_name, var_name)
42
- client = Client::ScraperJobVar.new(options)
43
- puts "#{client.unset(scraper_name, var_name)}"
60
+ if options[:job]
61
+ client = Client::JobVar.new(options)
62
+ puts "#{client.unset(options[:job], var_name)}"
63
+ else
64
+ client = Client::ScraperJobVar.new(options)
65
+ puts "#{client.unset(scraper_name, var_name)}"
66
+ end
44
67
  end
45
68
  end
46
69
  end
@@ -17,6 +17,7 @@ module Datahen
17
17
  option :per_page, :aliases => :P, type: :numeric, desc: 'Number of records per page. Max 500 per page.'
18
18
  option :fetch_fail, type: :boolean, desc: 'Returns only pages that fails fetching.'
19
19
  option :parse_fail, type: :boolean, desc: 'Returns only pages that fails parsing.'
20
+ option :status, type: :string, desc: 'Returns only pages with specific status.'
20
21
  def list(scraper_name)
21
22
  if options[:job]
22
23
  client = Client::JobPage.new(options)
@@ -104,13 +105,19 @@ module Datahen
104
105
  option :fetch_fail, type: :boolean, desc: 'Refetches only pages that fails fetching.'
105
106
  option :parse_fail, type: :boolean, desc: 'Refetches only pages that fails parsing.'
106
107
  option :status, type: :string, desc: 'Refetches only pages with a specific status.'
108
+ option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
107
109
  def refetch(scraper_name)
108
110
  if !options.key?(:gid) && !options.key?(:fetch_fail) && !options.key?(:parse_fail) && !options.key?(:status)
109
111
  puts "Must specify either a --gid, --fetch-fail, --parse-fail or --status"
110
112
  return
111
113
  end
112
- client = Client::ScraperJobPage.new(options)
113
- puts "#{client.refetch(scraper_name)}"
114
+ if options[:job]
115
+ client = Client::JobPage.new(options)
116
+ puts "#{client.refetch(options[:job])}"
117
+ else
118
+ client = Client::ScraperJobPage.new(options)
119
+ puts "#{client.refetch(scraper_name)}"
120
+ end
114
121
  end
115
122
 
116
123
  desc "reparse <scraper_name>", "Reparse Pages on a scraper's current job"
@@ -120,6 +127,7 @@ module Datahen
120
127
  option :gid, :aliases => :g, type: :string, desc: 'Reparse a specific GID'
121
128
  option :parse_fail, type: :boolean, desc: 'Reparse only pages that fails parsing.'
122
129
  option :status, type: :string, desc: 'Reparse only pages with a specific status.'
130
+ option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
123
131
  def reparse(scraper_name)
124
132
  begin
125
133
  options[:vars] = JSON.parse(options[:vars]) if options[:vars]
@@ -129,8 +137,13 @@ module Datahen
129
137
  return
130
138
  end
131
139
 
132
- client = Client::ScraperJobPage.new(options)
133
- puts "#{client.reparse(scraper_name)}"
140
+ if options[:job]
141
+ client = Client::JobPage.new(options)
142
+ puts "#{client.reparse(options[:job])}"
143
+ else
144
+ client = Client::ScraperJobPage.new(options)
145
+ puts "#{client.reparse(scraper_name)}"
146
+ end
134
147
 
135
148
  rescue JSON::ParserError
136
149
  if options[:vars]
@@ -197,6 +210,46 @@ module Datahen
197
210
  end
198
211
  end
199
212
 
213
+ desc "content <scraper_name> <gid>", "Show a page's content in scraper's current job"
214
+ option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
215
+ def content(scraper_name, gid)
216
+ result = nil
217
+ if options[:job]
218
+ client = Client::JobPage.new(options)
219
+ result = JSON.parse(client.find_content(options[:job], gid).to_s)
220
+ else
221
+ client = Client::ScraperJobPage.new(options)
222
+ result = JSON.parse(client.find_content(scraper_name, gid).to_s)
223
+ end
224
+
225
+ if result['available'] == true
226
+ puts "Preview content url: \"#{result['preview_url']}\""
227
+ `open "#{result['preview_url']}"`
228
+ else
229
+ puts "Content does not exist"
230
+ end
231
+ end
232
+
233
+ desc "failedcontent <scraper_name> <gid>", "Show a page's failed content in scraper's current job"
234
+ option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
235
+ def failedcontent(scraper_name, gid)
236
+ result = nil
237
+ if options[:job]
238
+ client = Client::JobPage.new(options)
239
+ result = JSON.parse(client.find_failed_content(options[:job], gid).to_s)
240
+ else
241
+ client = Client::ScraperJobPage.new(options)
242
+ result = JSON.parse(client.find_failed_content(scraper_name, gid).to_s)
243
+ end
244
+
245
+ if result['available'] == true
246
+ puts "Preview failed content url: \"#{result['preview_url']}\""
247
+ `open "#{result['preview_url']}"`
248
+ else
249
+ puts "Failed Content does not exist"
250
+ end
251
+ end
252
+
200
253
  end
201
254
  end
202
255
 
@@ -7,6 +7,7 @@ module Datahen
7
7
  <seeder_file>: Seeder script file will be executed.\x5
8
8
  LONGDESC
9
9
  option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
10
+ option :"keep-outputs", :aliases => :ko, type: :boolean, default: false, desc: "Don't delete existing outputs"
10
11
  def try_seed(scraper_name, seeder_file)
11
12
  if options[:job]
12
13
  job_id = options[:job]
@@ -14,8 +15,8 @@ module Datahen
14
15
  job = Client::ScraperJob.new(options).find(scraper_name)
15
16
  job_id = job['id']
16
17
  end
17
-
18
- puts Datahen::Scraper::Seeder.exec_seeder(seeder_file, job_id, false)
18
+
19
+ puts Datahen::Scraper::Seeder.exec_seeder(seeder_file, job_id, false, options[:"keep-outputs"])
19
20
  end
20
21
 
21
22
  desc "exec <scraper_name> <seeder_file>", "Executes a seeder script onto a scraper's current job."
@@ -24,6 +25,7 @@ module Datahen
24
25
  <seeder_file>: Seeder script file that will be executed on the scraper's current job.\x5
25
26
  LONGDESC
26
27
  option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
28
+ option :"keep-outputs", :aliases => :ko, type: :boolean, default: false, desc: "Don't delete existing outputs"
27
29
  def exec_parse(scraper_name, seeder_file)
28
30
  if options[:job]
29
31
  job_id = options[:job]
@@ -20,7 +20,9 @@ require "datahen/client/job_stat"
20
20
  require "datahen/client/backblaze_content"
21
21
  require "datahen/client/env_var"
22
22
  require "datahen/client/scraper_var"
23
+ require "datahen/client/job_var"
23
24
  require "datahen/client/scraper_job_var"
25
+ require "datahen/client/job_finisher"
24
26
 
25
27
 
26
28
  module Datahen
@@ -51,10 +51,10 @@ module Datahen
51
51
  query[:status] = opts[:status] if opts[:status]
52
52
  query[:page_type] = opts[:page_type] if opts[:page_type]
53
53
  query[:gid] = opts[:gid] if opts[:gid]
54
- query[:"min-timestamp"] = opts[:"min-timestamp"]
55
- query[:"max-timestamp"] = opts[:"max-timestamp"]
56
- query[:limit] = opts[:limit]
57
- query[:order] = opts[:order]
54
+ query[:"min-timestamp"] = opts[:"min-timestamp"] if opts[:"min-timestamp"]
55
+ query[:"max-timestamp"] = opts[:"max-timestamp"] if opts[:"max-timestamp"]
56
+ query[:limit] = opts[:limit] if opts[:limit]
57
+ query[:order] = opts[:order] if opts[:order]
58
58
 
59
59
  if opts[:query]
60
60
  if opts[:query].is_a?(Hash)
@@ -8,11 +8,6 @@ module Datahen
8
8
  def find_content(gid)
9
9
  self.class.get("/global_pages/#{gid}/content", @options)
10
10
  end
11
-
12
- def find_failed_content(gid)
13
- self.class.get("/global_pages/#{gid}/failed_content", @options)
14
- end
15
11
  end
16
12
  end
17
13
  end
18
-
@@ -6,8 +6,12 @@ module Datahen
6
6
  self.class.get("/jobs", params)
7
7
  end
8
8
 
9
- def find(job_id)
10
- self.class.get("/jobs/#{job_id}", @options)
9
+ def find(job_id, opts={})
10
+ if opts[:live]
11
+ self.class.get("/jobs/#{job_id}", @options)
12
+ else
13
+ self.class.get("/cached/jobs/#{job_id}", @options)
14
+ end
11
15
  end
12
16
 
13
17
  def update(job_id, opts={})
@@ -15,6 +19,7 @@ module Datahen
15
19
  body[:status] = opts[:status] if opts[:status]
16
20
  body[:standard_worker_count] = opts[:workers] if opts[:workers]
17
21
  body[:browser_worker_count] = opts[:browsers] if opts[:browsers]
22
+ body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
18
23
  params = @options.merge({body: body.to_json})
19
24
 
20
25
  self.class.put("/jobs/#{job_id}", params)
@@ -41,6 +46,7 @@ module Datahen
41
46
  body[:pages] = opts.fetch(:pages) {[]}
42
47
  body[:seeding_status] = opts.fetch(:seeding_status){ nil }
43
48
  body[:log_error] = opts[:log_error] if opts[:log_error]
49
+ body[:keep_outputs] = !!opts[:keep_outputs] if opts.has_key?(:keep_outputs)
44
50
 
45
51
  params = @options.merge({body: body.to_json})
46
52
 
@@ -0,0 +1,16 @@
1
+ module Datahen
2
+ module Client
3
+ class JobFinisher < Datahen::Client::Base
4
+ # Reset finisher on a specific job.
5
+ #
6
+ # @param [Integer] job_id Job ID
7
+ # @param [Hash] opts ({}) API custom parameters.
8
+ #
9
+ # @return [HTTParty::Response]
10
+ def reset(job_id, opts={})
11
+ params = @options.merge(opts)
12
+ self.class.put("/jobs/#{job_id}/finisher/reset", params)
13
+ end
14
+ end
15
+ end
16
+ end
@@ -48,11 +48,30 @@ module Datahen
48
48
  body[:pages] = opts.fetch(:pages) {[]}
49
49
  body[:parsing_status] = opts.fetch(:parsing_status){ nil }
50
50
  body[:log_error] = opts[:log_error] if opts[:log_error]
51
+ body[:keep_outputs] = !!opts[:keep_outputs] if opts.has_key?(:keep_outputs)
51
52
 
52
53
  params = @options.merge({body: body.to_json})
53
54
 
54
55
  self.class.put("/jobs/#{job_id}/pages/#{gid}/parsing_update", params)
55
56
  end
57
+
58
+ def find_content(job_id, gid)
59
+ self.class.get("/jobs/#{job_id}/pages/#{gid}/content", @options)
60
+ end
61
+
62
+ def find_failed_content(job_id, gid)
63
+ self.class.get("/jobs/#{job_id}/pages/#{gid}/failed_content", @options)
64
+ end
65
+
66
+ def reparse(job_id, opts={})
67
+ params = @options.merge(opts)
68
+ self.class.put("/jobs/#{job_id}/pages/reparse", params)
69
+ end
70
+
71
+ def refetch(job_id, opts={})
72
+ params = @options.merge(opts)
73
+ self.class.put("/jobs/#{job_id}/pages/refetch", params)
74
+ end
56
75
  end
57
76
  end
58
77
  end
@@ -2,12 +2,20 @@ module Datahen
2
2
  module Client
3
3
  class JobStat < Datahen::Client::Base
4
4
 
5
- def job_current_stats(job_id)
6
- self.class.get("/jobs/#{job_id}/stats/current", @options)
5
+ def job_current_stats(job_id, opts={})
6
+ if opts[:live]
7
+ self.class.get("/jobs/#{job_id}/stats/current", @options)
8
+ else
9
+ self.class.get("/cached/jobs/#{job_id}/stats/current", @options)
10
+ end
7
11
  end
8
12
 
9
- def scraper_job_current_stats(scraper_name)
10
- self.class.get("/scrapers/#{scraper_name}/current_job/stats/current", @options)
13
+ def scraper_job_current_stats(scraper_name, opts={})
14
+ if opts[:live]
15
+ self.class.get("/scrapers/#{scraper_name}/current_job/stats/current", @options)
16
+ else
17
+ self.class.get("/cached/scrapers/#{scraper_name}/current_job/stats/current", @options)
18
+ end
11
19
  end
12
20
 
13
21
  def job_stats_history(job_id)
@@ -0,0 +1,28 @@
1
+ module Datahen
2
+ module Client
3
+ class JobVar < Datahen::Client::Base
4
+
5
+ def find(job_id, var_name)
6
+ self.class.get("/jobs/#{job_id}/vars/#{var_name}", @options)
7
+ end
8
+
9
+ def all(job_id, opts={})
10
+ params = @options.merge opts
11
+ self.class.get("/jobs/#{job_id}/vars", params)
12
+ end
13
+
14
+ def set(job_id, var_name, value, opts={})
15
+ body = {}
16
+ body[:value] = value
17
+ body[:secret] = opts[:secret] if opts[:secret]
18
+ params = @options.merge({body: body.to_json})
19
+ self.class.put("/jobs/#{job_id}/vars/#{var_name}", params)
20
+ end
21
+
22
+ def unset(job_id, var_name, opts={})
23
+ params = @options.merge(opts)
24
+ self.class.delete("/jobs/#{job_id}/vars/#{var_name}", params)
25
+ end
26
+ end
27
+ end
28
+ end
@@ -15,8 +15,12 @@ module Datahen
15
15
  self.class.post("/scrapers/#{scraper_name}/jobs", params)
16
16
  end
17
17
 
18
- def find(scraper_name)
19
- self.class.get("/scrapers/#{scraper_name}/current_job", @options)
18
+ def find(scraper_name, opts={})
19
+ if opts[:live]
20
+ self.class.get("/scrapers/#{scraper_name}/current_job", @options)
21
+ else
22
+ self.class.get("/cached/scrapers/#{scraper_name}/current_job", @options)
23
+ end
20
24
  end
21
25
 
22
26
  def update(scraper_name, opts={})
@@ -26,6 +26,9 @@ module Datahen
26
26
  self.class.put("/scrapers/#{scraper_name}/current_job/pages/refetch", params)
27
27
  end
28
28
 
29
+ # Deprecated, please use Datahen::Client::JobPage#refetch instead.
30
+ #
31
+ # @note This method will be removed at some point in the future.
29
32
  def refetch_by_job(job_id, opts={})
30
33
  params = @options.merge(opts)
31
34
  self.class.put("/jobs/#{job_id}/pages/refetch", params)
@@ -36,11 +39,6 @@ module Datahen
36
39
  self.class.put("/scrapers/#{scraper_name}/current_job/pages/reparse", params)
37
40
  end
38
41
 
39
- def reparse_by_job(job_id, opts={})
40
- params = @options.merge(opts)
41
- self.class.put("/jobs/#{job_id}/pages/reparse", params)
42
- end
43
-
44
42
  def enqueue(scraper_name, method, url, opts={})
45
43
  body = {}
46
44
  body[:method] = method != "" ? method : "GET"
@@ -62,6 +60,14 @@ module Datahen
62
60
  self.class.post("/scrapers/#{scraper_name}/current_job/pages", params)
63
61
  end
64
62
 
63
+ def find_content(scraper_name, gid)
64
+ self.class.get("/scrapers/#{scraper_name}/current_job/pages/#{gid}/content", @options)
65
+ end
66
+
67
+ def find_failed_content(scraper_name, gid)
68
+ self.class.get("/scrapers/#{scraper_name}/current_job/pages/#{gid}/failed_content", @options)
69
+ end
70
+
65
71
  end
66
72
  end
67
73
  end
@@ -63,9 +63,9 @@ module Datahen
63
63
  client.find(gid)
64
64
  end
65
65
 
66
- def get_content(gid)
67
- client = Client::GlobalPage.new()
68
- content_json = client.find_content(gid)
66
+ def get_content(job_id, gid)
67
+ client = Client::JobPage.new()
68
+ content_json = client.find_content(job_id, gid)
69
69
 
70
70
  if content_json['available']
71
71
  signed_url = content_json['signed_url']
@@ -75,9 +75,9 @@ module Datahen
75
75
  end
76
76
  end
77
77
 
78
- def get_failed_content(gid)
79
- client = Client::GlobalPage.new()
80
- content_json = client.find_failed_content(gid)
78
+ def get_failed_content(job_id, gid)
79
+ client = Client::JobPage.new()
80
+ content_json = client.find_failed_content(job_id, gid)
81
81
 
82
82
  if content_json['available']
83
83
  signed_url = content_json['signed_url']
@@ -1,18 +1,24 @@
1
1
  module Datahen
2
2
  module Scraper
3
3
  class Parser
4
- def self.exec_parser_page(filename, gid, job_id=nil, save=false, vars = {})
4
+ def self.exec_parser_page(filename, gid, job_id=nil, save=false, vars = {}, keep_outputs=false)
5
5
  extname = File.extname(filename)
6
6
  case extname
7
7
  when '.rb'
8
- executor = RubyParserExecutor.new(filename: filename, gid: gid, job_id: job_id, vars: vars)
8
+ executor = RubyParserExecutor.new(
9
+ filename: filename,
10
+ gid: gid,
11
+ job_id: job_id,
12
+ vars: vars,
13
+ keep_outputs: keep_outputs
14
+ )
9
15
  executor.exec_parser(save)
10
16
  else
11
17
  puts "Unable to find a parser executor for file type \"#{extname}\""
12
18
  end
13
19
  end
14
20
 
15
-
21
+
16
22
  end
17
23
  end
18
- end
24
+ end
@@ -15,6 +15,7 @@ module Datahen
15
15
  @gid = options.fetch(:gid) { raise "GID is required"}
16
16
  @job_id = options.fetch(:job_id)
17
17
  @page_vars = options.fetch(:vars) { {} }
18
+ @keep_outputs = !!(options.fetch(:keep_outputs) { false })
18
19
  end
19
20
 
20
21
  def self.exposed_methods
@@ -66,7 +67,9 @@ module Datahen
66
67
  response = parsing_update(
67
68
  job_id: job_id,
68
69
  gid: gid,
69
- parsing_status: :starting)
70
+ parsing_status: :starting,
71
+ keep_outputs: @keep_outputs
72
+ )
70
73
 
71
74
  if response.code == 200
72
75
  puts "Page Parsing Status Updated."
@@ -165,7 +168,7 @@ module Datahen
165
168
  handle_error(e) if save
166
169
  raise e
167
170
  end
168
-
171
+
169
172
  if refetch_self
170
173
  refetch_page gid
171
174
  elsif reparse_self
@@ -178,11 +181,11 @@ module Datahen
178
181
  end
179
182
 
180
183
  def content
181
- @content ||= get_content(gid)
184
+ @content ||= get_content(job_id, gid)
182
185
  end
183
186
 
184
187
  def failed_content
185
- @failed_content ||= get_failed_content(gid)
188
+ @failed_content ||= get_failed_content(job_id, gid)
186
189
  end
187
190
 
188
191
  def handle_error(e)
@@ -6,6 +6,7 @@ module Datahen
6
6
  def initialize(options={})
7
7
  @filename = options.fetch(:filename) { raise "Filename is required"}
8
8
  @job_id = options[:job_id]
9
+ @keep_outputs = !!(options.fetch(:keep_outputs) { false })
9
10
  end
10
11
 
11
12
  def self.exposed_methods
@@ -81,7 +82,9 @@ module Datahen
81
82
 
82
83
  response = seeding_update(
83
84
  job_id: job_id,
84
- seeding_status: :starting)
85
+ seeding_status: :starting,
86
+ keep_outputs: @keep_outputs
87
+ )
85
88
 
86
89
  if response.code == 200
87
90
  puts "Seeding Status Updated."
@@ -2,11 +2,15 @@ module Datahen
2
2
  module Scraper
3
3
  class Seeder
4
4
 
5
- def self.exec_seeder(filename, job_id=nil, save=false)
5
+ def self.exec_seeder(filename, job_id=nil, save=false, keep_outputs=false)
6
6
  extname = File.extname(filename)
7
7
  case extname
8
8
  when '.rb'
9
- executor = RubySeederExecutor.new(filename: filename, job_id: job_id)
9
+ executor = RubySeederExecutor.new(
10
+ filename: filename,
11
+ job_id: job_id,
12
+ keep_outputs: keep_outputs
13
+ )
10
14
  executor.exec_seeder(save)
11
15
  else
12
16
  puts "Unable to find a seeder executor for file type \"#{extname}\""
@@ -15,4 +19,4 @@ module Datahen
15
19
 
16
20
  end
17
21
  end
18
- end
22
+ end
@@ -1,3 +1,3 @@
1
1
  module Datahen
2
- VERSION = "0.11.1"
2
+ VERSION = "0.14.0"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: datahen
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.11.1
4
+ version: 0.14.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Parama Danoesubroto
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-01-17 00:00:00.000000000 Z
11
+ date: 2020-07-26 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: thor
@@ -215,10 +215,12 @@ files:
215
215
  - lib/datahen/client/global_page.rb
216
216
  - lib/datahen/client/job.rb
217
217
  - lib/datahen/client/job_export.rb
218
+ - lib/datahen/client/job_finisher.rb
218
219
  - lib/datahen/client/job_log.rb
219
220
  - lib/datahen/client/job_output.rb
220
221
  - lib/datahen/client/job_page.rb
221
222
  - lib/datahen/client/job_stat.rb
223
+ - lib/datahen/client/job_var.rb
222
224
  - lib/datahen/client/scraper.rb
223
225
  - lib/datahen/client/scraper_deployment.rb
224
226
  - lib/datahen/client/scraper_export.rb
@@ -262,7 +264,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
262
264
  - !ruby/object:Gem::Version
263
265
  version: '0'
264
266
  requirements: []
265
- rubygems_version: 3.0.3
267
+ rubygems_version: 3.1.2
266
268
  signing_key:
267
269
  specification_version: 4
268
270
  summary: DataHen toolbelt for developers