datahen 0.14.1 → 0.14.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/datahen/cli.rb +4 -0
- data/lib/datahen/cli/account.rb +17 -0
- data/lib/datahen/cli/env_var.rb +2 -6
- data/lib/datahen/cli/scraper.rb +15 -3
- data/lib/datahen/cli/scraper_job.rb +25 -3
- data/lib/datahen/cli/scraper_page.rb +1 -1
- data/lib/datahen/client.rb +1 -1
- data/lib/datahen/client/account.rb +13 -0
- data/lib/datahen/client/base.rb +3 -0
- data/lib/datahen/client/job.rb +7 -0
- data/lib/datahen/client/job_stat.rb +12 -4
- data/lib/datahen/client/scraper.rb +8 -0
- data/lib/datahen/client/scraper_job.rb +7 -0
- data/lib/datahen/scraper/executor.rb +9 -3
- data/lib/datahen/version.rb +1 -1
- metadata +5 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 63ae3d8aba6bb70a89033d592db3242905d70d7e0ea6a90455c370cf3b8a2dde
|
4
|
+
data.tar.gz: 3f4644be8702eb789f8d4c3e099750dade2e9d9f83ab6a359f222972d11938bc
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 20f50d8b4a52d360fe07f32aaa0350a190b80c30157d0d2c1e33ca745013ed8ab1a9a97d81acfc400fa6ff3250394ea44e69bfe894318d4798ec8d99567ab736
|
7
|
+
data.tar.gz: 2724495723be6e2d249e8697e28102674c3685ce4cb384bcefaeca03d606361bf2e0e24c098954ad5b872b7e067a0cc5d0c1ed84a10b7b703144ad3a2ba16b4f
|
data/lib/datahen/cli.rb
CHANGED
@@ -16,6 +16,7 @@ require 'datahen/cli/parser'
|
|
16
16
|
require 'datahen/cli/seeder'
|
17
17
|
require 'datahen/cli/finisher'
|
18
18
|
require 'datahen/cli/env_var'
|
19
|
+
require 'datahen/cli/account'
|
19
20
|
|
20
21
|
|
21
22
|
|
@@ -41,5 +42,8 @@ module Datahen
|
|
41
42
|
|
42
43
|
desc "var SUBCOMMAND ...ARGS", "for environment variable related activities"
|
43
44
|
subcommand "var", EnvVar
|
45
|
+
|
46
|
+
desc "account SUBCOMMAND ...ARGS", "for account related activities"
|
47
|
+
subcommand "account", Account
|
44
48
|
end
|
45
49
|
end
|
data/lib/datahen/cli/account.rb
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
module Datahen
|
2
|
+
class CLI < Thor
|
3
|
+
class Account < Thor
|
4
|
+
|
5
|
+
desc "profile", "displays the account applied profile"
|
6
|
+
long_desc <<-LONGDESC
|
7
|
+
Displays the account applied profile
|
8
|
+
LONGDESC
|
9
|
+
def profile()
|
10
|
+
client = Client::Account.new(options)
|
11
|
+
puts "#{client.profile()}"
|
12
|
+
end
|
13
|
+
|
14
|
+
end
|
15
|
+
end
|
16
|
+
|
17
|
+
end
|
data/lib/datahen/cli/env_var.rb
CHANGED
@@ -2,7 +2,7 @@ module Datahen
|
|
2
2
|
class CLI < Thor
|
3
3
|
class EnvVar < Thor
|
4
4
|
desc "list", "List environment variables on the account"
|
5
|
-
|
5
|
+
|
6
6
|
long_desc <<-LONGDESC
|
7
7
|
List all environment variables on the account.
|
8
8
|
LONGDESC
|
@@ -19,7 +19,7 @@ module Datahen
|
|
19
19
|
<name>: Var name can only consist of alphabets, numbers, underscores. Name must be unique to your account, otherwise it will be overwritten.\x5
|
20
20
|
<value>: Value of variable.\x5
|
21
21
|
LONGDESC
|
22
|
-
option :secret, type: :boolean, desc: 'Set true to make it decrypt the value. Default: false'
|
22
|
+
option :secret, type: :boolean, desc: 'Set true to make it decrypt the value. Default: false'
|
23
23
|
def set(name, value)
|
24
24
|
# puts "options #{options}"
|
25
25
|
client = Client::EnvVar.new(options)
|
@@ -38,10 +38,6 @@ module Datahen
|
|
38
38
|
puts "#{client.unset(name)}"
|
39
39
|
end
|
40
40
|
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
41
|
end
|
46
42
|
end
|
47
43
|
|
data/lib/datahen/cli/scraper.rb
CHANGED
@@ -1,8 +1,8 @@
|
|
1
1
|
module Datahen
|
2
2
|
class CLI < Thor
|
3
3
|
class Scraper < Thor
|
4
|
-
desc "list", "List scrapers"
|
5
4
|
|
5
|
+
desc "list", "List scrapers"
|
6
6
|
long_desc <<-LONGDESC
|
7
7
|
List all scrapers.
|
8
8
|
LONGDESC
|
@@ -29,6 +29,7 @@ module Datahen
|
|
29
29
|
option :cancel_current_job, type: :boolean, desc: 'Set true to cancel currently active job if scheduler starts. Default: false'
|
30
30
|
option :schedule, type: :string, desc: 'Set the schedule of the scraper to run. Must be in CRON format.'
|
31
31
|
option :timezone, type: :string, desc: "Set the scheduler's timezone. Must be in IANA Timezone format. Defaults to \"America/Toronto\""
|
32
|
+
option :profile, type: :string, desc: 'Set the profiles (comma separated) to apply to the job. Default: default'
|
32
33
|
def create(scraper_name, git_repository)
|
33
34
|
# puts "options #{options}"
|
34
35
|
client = Client::Scraper.new(options)
|
@@ -51,6 +52,7 @@ module Datahen
|
|
51
52
|
option :cancel_current_job, type: :boolean, desc: 'Set true to cancel currently active job if scheduler starts. Default: false'
|
52
53
|
option :schedule, type: :string, desc: 'Set the schedule of the scraper to run. Must be in CRON format.'
|
53
54
|
option :timezone, type: :string, desc: "Set the scheduler's timezone. Must be in IANA Timezone format. Defaults to \"America/Toronto\""
|
55
|
+
option :profile, type: :string, desc: 'Set the profiles (comma separated) to apply to the job. Default: default'
|
54
56
|
def update(scraper_name)
|
55
57
|
client = Client::Scraper.new(options)
|
56
58
|
puts "#{client.update(scraper_name, options)}"
|
@@ -164,15 +166,25 @@ module Datahen
|
|
164
166
|
option :"max-timestamp", type: :string, desc: 'Ending timestamp point in time to query historic stats (inclusive)'
|
165
167
|
option :"limit", type: :numeric, desc: 'Limit stats retrieved'
|
166
168
|
option :"order", type: :numeric, desc: 'Order stats by timestamp [DESC]'
|
169
|
+
option :live, type: :boolean, desc: 'Get data from the live stats history, not cached stats history.'
|
170
|
+
option :filter, type: :string, desc: 'Filter results on `day` or `hour`, if not specified will return all records.'
|
167
171
|
def history(scraper_name)
|
168
172
|
client = Client::JobStat.new(options)
|
169
173
|
if options[:job]
|
170
|
-
puts "#{client.job_stats_history(options[:job])}"
|
174
|
+
puts "#{client.job_stats_history(options[:job], options)}"
|
171
175
|
else
|
172
|
-
puts "#{client.scraper_job_stats_history(scraper_name)}"
|
176
|
+
puts "#{client.scraper_job_stats_history(scraper_name, options)}"
|
173
177
|
end
|
174
178
|
end
|
175
179
|
|
180
|
+
desc "profile <scraper_name>", "displays the scraper applied profile"
|
181
|
+
long_desc <<-LONGDESC
|
182
|
+
Displays the account applied profile
|
183
|
+
LONGDESC
|
184
|
+
def profile(scraper_name)
|
185
|
+
client = Client::Scraper.new(options)
|
186
|
+
puts "#{client.profile(scraper_name)}"
|
187
|
+
end
|
176
188
|
|
177
189
|
desc "job SUBCOMMAND ...ARGS", "manage scrapers jobs"
|
178
190
|
subcommand "job", ScraperJob
|
data/lib/datahen/cli/scraper_job.rb
CHANGED
@@ -7,10 +7,16 @@ module Datahen
|
|
7
7
|
end
|
8
8
|
|
9
9
|
desc "show <scraper_name>", "Show a scraper's current job (Defaults to showing data from cached job)"
|
10
|
+
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
10
11
|
option :live, type: :boolean, desc: 'Get data from the live job, not cached job.'
|
11
12
|
def show(scraper_name)
|
12
|
-
|
13
|
-
|
13
|
+
if options[:job]
|
14
|
+
client = Client::Job.new(options)
|
15
|
+
puts "#{client.find(options[:job], options)}"
|
16
|
+
else
|
17
|
+
client = Client::ScraperJob.new(options)
|
18
|
+
puts "#{client.find(scraper_name, options)}"
|
19
|
+
end
|
14
20
|
end
|
15
21
|
|
16
22
|
|
@@ -58,7 +64,7 @@ module Datahen
|
|
58
64
|
|
59
65
|
desc "pause <scraper_name>", "pauses a scraper's current job"
|
60
66
|
long_desc <<-LONGDESC
|
61
|
-
|
67
|
+
Pauses a scraper's current job
|
62
68
|
LONGDESC
|
63
69
|
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
64
70
|
def pause(scraper_name)
|
@@ -79,6 +85,7 @@ module Datahen
|
|
79
85
|
option :workers, :aliases => :w, type: :numeric, desc: 'Set how many standard workers to use. Scraper job must be restarted(paused then resumed, or cancelled then resumed) for it to take effect. Default: 1. '
|
80
86
|
option :browsers, type: :numeric, desc: 'Set how many browser workers to use. Scraper job must be restarted(paused then resumed, or cancelled then resumed) for it to take effect. Default: 0. '
|
81
87
|
option :proxy_type, desc: 'Set the Proxy type. Default: standard'
|
88
|
+
option :profile, type: :string, desc: 'Set the profiles (comma separated) to apply to the job. Default: default'
|
82
89
|
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
83
90
|
def update(scraper_name)
|
84
91
|
if options[:job]
|
@@ -90,6 +97,21 @@ module Datahen
|
|
90
97
|
end
|
91
98
|
end
|
92
99
|
|
100
|
+
desc "profile <scraper_name>", "displays a scraper's current job applied profile"
|
101
|
+
long_desc <<-LONGDESC
|
102
|
+
Displays a scraper's current job applied profile
|
103
|
+
LONGDESC
|
104
|
+
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
105
|
+
def profile(scraper_name)
|
106
|
+
if options[:job]
|
107
|
+
client = Client::Job.new(options)
|
108
|
+
puts "#{client.profile(options[:job])}"
|
109
|
+
else
|
110
|
+
client = Client::ScraperJob.new(options)
|
111
|
+
puts "#{client.profile(scraper_name)}"
|
112
|
+
end
|
113
|
+
end
|
114
|
+
|
93
115
|
desc "var SUBCOMMAND ...ARGS", "for managing scraper's job variables"
|
94
116
|
subcommand "var", ScraperJobVar
|
95
117
|
|
data/lib/datahen/cli/scraper_page.rb
CHANGED
@@ -230,7 +230,7 @@ module Datahen
|
|
230
230
|
end
|
231
231
|
end
|
232
232
|
|
233
|
-
desc "failedcontent <gid>", "Show a page's failed content in scraper's current job"
|
233
|
+
desc "failedcontent <scraper_name> <gid>", "Show a page's failed content in scraper's current job"
|
234
234
|
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
235
235
|
def failedcontent(scraper_name, gid)
|
236
236
|
result = nil
|
data/lib/datahen/client.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
require "datahen/client/base"
|
2
|
+
require "datahen/client/account"
|
2
3
|
require "datahen/client/auth_token"
|
3
4
|
require "datahen/client/deploy_key"
|
4
5
|
require 'datahen/client/export'
|
@@ -24,7 +25,6 @@ require "datahen/client/job_var"
|
|
24
25
|
require "datahen/client/scraper_job_var"
|
25
26
|
require "datahen/client/job_finisher"
|
26
27
|
|
27
|
-
|
28
28
|
module Datahen
|
29
29
|
module Client
|
30
30
|
end
|
data/lib/datahen/client/base.rb
CHANGED
@@ -5,6 +5,8 @@ module Datahen
|
|
5
5
|
class Base
|
6
6
|
include HTTParty
|
7
7
|
|
8
|
+
default_timeout 60
|
9
|
+
|
8
10
|
def self.env_auth_token
|
9
11
|
ENV['DATAHEN_TOKEN']
|
10
12
|
end
|
@@ -55,6 +57,7 @@ module Datahen
|
|
55
57
|
query[:"max-timestamp"] = opts[:"max-timestamp"] if opts[:"max-timestamp"]
|
56
58
|
query[:limit] = opts[:limit] if opts[:limit]
|
57
59
|
query[:order] = opts[:order] if opts[:order]
|
60
|
+
query[:filter] = opts[:filter] if opts[:filter]
|
58
61
|
|
59
62
|
if opts[:query]
|
60
63
|
if opts[:query].is_a?(Hash)
|
data/lib/datahen/client/job.rb
CHANGED
@@ -20,6 +20,7 @@ module Datahen
|
|
20
20
|
body[:standard_worker_count] = opts[:workers] if opts[:workers]
|
21
21
|
body[:browser_worker_count] = opts[:browsers] if opts[:browsers]
|
22
22
|
body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
|
23
|
+
body[:profile] = opts[:profile] if opts[:profile]
|
23
24
|
params = @options.merge({body: body.to_json})
|
24
25
|
|
25
26
|
self.class.put("/jobs/#{job_id}", params)
|
@@ -64,6 +65,12 @@ module Datahen
|
|
64
65
|
self.class.put("/jobs/#{job_id}/finisher_update", params)
|
65
66
|
end
|
66
67
|
|
68
|
+
def profile(job_id, opts={})
|
69
|
+
params = @options.merge(opts)
|
70
|
+
|
71
|
+
self.class.get("/jobs/#{job_id}/profile", params)
|
72
|
+
end
|
73
|
+
|
67
74
|
end
|
68
75
|
|
69
76
|
end
|
data/lib/datahen/client/job_stat.rb
CHANGED
@@ -18,12 +18,20 @@ module Datahen
|
|
18
18
|
end
|
19
19
|
end
|
20
20
|
|
21
|
-
def job_stats_history(job_id)
|
22
|
-
|
21
|
+
def job_stats_history(job_id, opts={})
|
22
|
+
if opts[:live]
|
23
|
+
self.class.get("/jobs/#{job_id}/stats/history", @options)
|
24
|
+
else
|
25
|
+
self.class.get("/cached/jobs/#{job_id}/stats/history", @options)
|
26
|
+
end
|
23
27
|
end
|
24
28
|
|
25
|
-
def scraper_job_stats_history(scraper_name)
|
26
|
-
|
29
|
+
def scraper_job_stats_history(scraper_name, opts={})
|
30
|
+
if opts[:live]
|
31
|
+
self.class.get("/scrapers/#{scraper_name}/current_job/stats/history", @options)
|
32
|
+
else
|
33
|
+
self.class.get("/cached/scrapers/#{scraper_name}/current_job/stats/history", @options)
|
34
|
+
end
|
27
35
|
end
|
28
36
|
|
29
37
|
end
|
data/lib/datahen/client/scraper.rb
CHANGED
@@ -25,6 +25,7 @@ module Datahen
|
|
25
25
|
body[:cancel_current_job] = opts[:cancel_current_job] if opts[:cancel_current_job]
|
26
26
|
body[:schedule] = opts[:schedule] if opts[:schedule]
|
27
27
|
body[:timezone] = opts[:timezone] if opts[:timezone]
|
28
|
+
body[:profile] = opts[:profile] if opts[:profile]
|
28
29
|
params = @options.merge({body: body.to_json})
|
29
30
|
self.class.post("/scrapers", params)
|
30
31
|
end
|
@@ -43,6 +44,7 @@ module Datahen
|
|
43
44
|
body[:cancel_current_job] = opts[:cancel_current_job] if opts.has_key?("cancel_current_job") || opts.has_key?(:cancel_current_job)
|
44
45
|
body[:schedule] = opts[:schedule] if opts[:schedule]
|
45
46
|
body[:timezone] = opts[:timezone] if opts[:timezone]
|
47
|
+
body[:profile] = opts[:profile] if opts[:profile]
|
46
48
|
params = @options.merge({body: body.to_json})
|
47
49
|
|
48
50
|
self.class.put("/scrapers/#{scraper_name}", params)
|
@@ -52,6 +54,12 @@ module Datahen
|
|
52
54
|
params = @options.merge(opts)
|
53
55
|
self.class.delete("/scrapers/#{scraper_name}", params)
|
54
56
|
end
|
57
|
+
|
58
|
+
def profile(scraper_name, opts={})
|
59
|
+
params = @options.merge(opts)
|
60
|
+
|
61
|
+
self.class.get("/scrapers/#{scraper_name}/profile", params)
|
62
|
+
end
|
55
63
|
end
|
56
64
|
end
|
57
65
|
end
|
data/lib/datahen/client/scraper_job.rb
CHANGED
@@ -29,6 +29,7 @@ module Datahen
|
|
29
29
|
body[:standard_worker_count] = opts[:workers] if opts[:workers]
|
30
30
|
body[:browser_worker_count] = opts[:browsers] if opts[:browsers]
|
31
31
|
body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
|
32
|
+
body[:profile] = opts[:profile] if opts[:profile]
|
32
33
|
params = @options.merge({body: body.to_json})
|
33
34
|
|
34
35
|
self.class.put("/scrapers/#{scraper_name}/current_job", params)
|
@@ -48,6 +49,12 @@ module Datahen
|
|
48
49
|
opts[:status] = 'paused'
|
49
50
|
update(scraper_name, opts)
|
50
51
|
end
|
52
|
+
|
53
|
+
def profile(scraper_name, opts={})
|
54
|
+
params = @options.merge(opts)
|
55
|
+
|
56
|
+
self.class.get("/scrapers/#{scraper_name}/current_job/profile", params)
|
57
|
+
end
|
51
58
|
end
|
52
59
|
end
|
53
60
|
end
|
data/lib/datahen/scraper/executor.rb
CHANGED
@@ -60,7 +60,12 @@ module Datahen
|
|
60
60
|
|
61
61
|
def init_global_page()
|
62
62
|
client = Client::GlobalPage.new()
|
63
|
-
client.find(gid)
|
63
|
+
global_page = client.find(gid)
|
64
|
+
unless global_page.code == 200
|
65
|
+
raise "GID #{gid} not found. Aborting execution!"
|
66
|
+
else
|
67
|
+
global_page
|
68
|
+
end
|
64
69
|
end
|
65
70
|
|
66
71
|
def get_content(job_id, gid)
|
@@ -287,11 +292,12 @@ module Datahen
|
|
287
292
|
end
|
288
293
|
|
289
294
|
# behave differently if it is a real save
|
295
|
+
save_status = status
|
290
296
|
if save
|
291
297
|
log_msg = "Saving #{log_msgs.join(' and ')}."
|
292
298
|
puts "#{log_msg}"
|
293
299
|
else
|
294
|
-
|
300
|
+
save_status = "#{status}_try"
|
295
301
|
end
|
296
302
|
|
297
303
|
# saving to server
|
@@ -300,7 +306,7 @@ module Datahen
|
|
300
306
|
gid: gid,
|
301
307
|
pages: pages_slice,
|
302
308
|
outputs: outputs_slice,
|
303
|
-
status: status)
|
309
|
+
status: save_status)
|
304
310
|
|
305
311
|
if response.code == 200
|
306
312
|
if save
|
data/lib/datahen/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: datahen
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.14.1
|
4
|
+
version: 0.14.9
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Parama Danoesubroto
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-
|
11
|
+
date: 2020-08-08 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: thor
|
@@ -189,6 +189,7 @@ files:
|
|
189
189
|
- exe/hen
|
190
190
|
- lib/datahen.rb
|
191
191
|
- lib/datahen/cli.rb
|
192
|
+
- lib/datahen/cli/account.rb
|
192
193
|
- lib/datahen/cli/env_var.rb
|
193
194
|
- lib/datahen/cli/finisher.rb
|
194
195
|
- lib/datahen/cli/global_page.rb
|
@@ -206,6 +207,7 @@ files:
|
|
206
207
|
- lib/datahen/cli/scraper_var.rb
|
207
208
|
- lib/datahen/cli/seeder.rb
|
208
209
|
- lib/datahen/client.rb
|
210
|
+
- lib/datahen/client/account.rb
|
209
211
|
- lib/datahen/client/auth_token.rb
|
210
212
|
- lib/datahen/client/backblaze_content.rb
|
211
213
|
- lib/datahen/client/base.rb
|
@@ -264,7 +266,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
264
266
|
- !ruby/object:Gem::Version
|
265
267
|
version: '0'
|
266
268
|
requirements: []
|
267
|
-
rubygems_version: 3.
|
269
|
+
rubygems_version: 3.0.3
|
268
270
|
signing_key:
|
269
271
|
specification_version: 4
|
270
272
|
summary: DataHen toolbelt for developers
|