datahen 0.14.1 → 0.14.9
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/datahen/cli.rb +4 -0
- data/lib/datahen/cli/account.rb +17 -0
- data/lib/datahen/cli/env_var.rb +2 -6
- data/lib/datahen/cli/scraper.rb +15 -3
- data/lib/datahen/cli/scraper_job.rb +25 -3
- data/lib/datahen/cli/scraper_page.rb +1 -1
- data/lib/datahen/client.rb +1 -1
- data/lib/datahen/client/account.rb +13 -0
- data/lib/datahen/client/base.rb +3 -0
- data/lib/datahen/client/job.rb +7 -0
- data/lib/datahen/client/job_stat.rb +12 -4
- data/lib/datahen/client/scraper.rb +8 -0
- data/lib/datahen/client/scraper_job.rb +7 -0
- data/lib/datahen/scraper/executor.rb +9 -3
- data/lib/datahen/version.rb +1 -1
- metadata +5 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 63ae3d8aba6bb70a89033d592db3242905d70d7e0ea6a90455c370cf3b8a2dde
|
4
|
+
data.tar.gz: 3f4644be8702eb789f8d4c3e099750dade2e9d9f83ab6a359f222972d11938bc
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 20f50d8b4a52d360fe07f32aaa0350a190b80c30157d0d2c1e33ca745013ed8ab1a9a97d81acfc400fa6ff3250394ea44e69bfe894318d4798ec8d99567ab736
|
7
|
+
data.tar.gz: 2724495723be6e2d249e8697e28102674c3685ce4cb384bcefaeca03d606361bf2e0e24c098954ad5b872b7e067a0cc5d0c1ed84a10b7b703144ad3a2ba16b4f
|
data/lib/datahen/cli.rb
CHANGED
@@ -16,6 +16,7 @@ require 'datahen/cli/parser'
|
|
16
16
|
require 'datahen/cli/seeder'
|
17
17
|
require 'datahen/cli/finisher'
|
18
18
|
require 'datahen/cli/env_var'
|
19
|
+
require 'datahen/cli/account'
|
19
20
|
|
20
21
|
|
21
22
|
|
@@ -41,5 +42,8 @@ module Datahen
|
|
41
42
|
|
42
43
|
desc "var SUBCOMMAND ...ARGS", "for environment variable related activities"
|
43
44
|
subcommand "var", EnvVar
|
45
|
+
|
46
|
+
desc "account SUBCOMMAND ...ARGS", "for account related activities"
|
47
|
+
subcommand "account", Account
|
44
48
|
end
|
45
49
|
end
|
data/lib/datahen/cli/account.rb
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
module Datahen
|
2
|
+
class CLI < Thor
|
3
|
+
class Account < Thor
|
4
|
+
|
5
|
+
desc "profile", "displays the account applied profile"
|
6
|
+
long_desc <<-LONGDESC
|
7
|
+
Displays the account applied profile
|
8
|
+
LONGDESC
|
9
|
+
def profile()
|
10
|
+
client = Client::Account.new(options)
|
11
|
+
puts "#{client.profile()}"
|
12
|
+
end
|
13
|
+
|
14
|
+
end
|
15
|
+
end
|
16
|
+
|
17
|
+
end
|
data/lib/datahen/cli/env_var.rb
CHANGED
@@ -2,7 +2,7 @@ module Datahen
|
|
2
2
|
class CLI < Thor
|
3
3
|
class EnvVar < Thor
|
4
4
|
desc "list", "List environment variables on the account"
|
5
|
-
|
5
|
+
|
6
6
|
long_desc <<-LONGDESC
|
7
7
|
List all environment variables on the account.
|
8
8
|
LONGDESC
|
@@ -19,7 +19,7 @@ module Datahen
|
|
19
19
|
<name>: Var name can only consist of alphabets, numbers, underscores. Name must be unique to your account, otherwise it will be overwritten.\x5
|
20
20
|
<value>: Value of variable.\x5
|
21
21
|
LONGDESC
|
22
|
-
option :secret, type: :boolean, desc: 'Set true to make it decrypt the value. Default: false'
|
22
|
+
option :secret, type: :boolean, desc: 'Set true to make it decrypt the value. Default: false'
|
23
23
|
def set(name, value)
|
24
24
|
# puts "options #{options}"
|
25
25
|
client = Client::EnvVar.new(options)
|
@@ -38,10 +38,6 @@ module Datahen
|
|
38
38
|
puts "#{client.unset(name)}"
|
39
39
|
end
|
40
40
|
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
41
|
end
|
46
42
|
end
|
47
43
|
|
data/lib/datahen/cli/scraper.rb
CHANGED
@@ -1,8 +1,8 @@
|
|
1
1
|
module Datahen
|
2
2
|
class CLI < Thor
|
3
3
|
class Scraper < Thor
|
4
|
-
desc "list", "List scrapers"
|
5
4
|
|
5
|
+
desc "list", "List scrapers"
|
6
6
|
long_desc <<-LONGDESC
|
7
7
|
List all scrapers.
|
8
8
|
LONGDESC
|
@@ -29,6 +29,7 @@ module Datahen
|
|
29
29
|
option :cancel_current_job, type: :boolean, desc: 'Set true to cancel currently active job if scheduler starts. Default: false'
|
30
30
|
option :schedule, type: :string, desc: 'Set the schedule of the scraper to run. Must be in CRON format.'
|
31
31
|
option :timezone, type: :string, desc: "Set the scheduler's timezone. Must be in IANA Timezone format. Defaults to \"America/Toronto\""
|
32
|
+
option :profile, type: :string, desc: 'Set the profiles (comma separated) to apply to the job. Default: default'
|
32
33
|
def create(scraper_name, git_repository)
|
33
34
|
# puts "options #{options}"
|
34
35
|
client = Client::Scraper.new(options)
|
@@ -51,6 +52,7 @@ module Datahen
|
|
51
52
|
option :cancel_current_job, type: :boolean, desc: 'Set true to cancel currently active job if scheduler starts. Default: false'
|
52
53
|
option :schedule, type: :string, desc: 'Set the schedule of the scraper to run. Must be in CRON format.'
|
53
54
|
option :timezone, type: :string, desc: "Set the scheduler's timezone. Must be in IANA Timezone format. Defaults to \"America/Toronto\""
|
55
|
+
option :profile, type: :string, desc: 'Set the profiles (comma separated) to apply to the job. Default: default'
|
54
56
|
def update(scraper_name)
|
55
57
|
client = Client::Scraper.new(options)
|
56
58
|
puts "#{client.update(scraper_name, options)}"
|
@@ -164,15 +166,25 @@ module Datahen
|
|
164
166
|
option :"max-timestamp", type: :string, desc: 'Ending timestamp point in time to query historic stats (inclusive)'
|
165
167
|
option :"limit", type: :numeric, desc: 'Limit stats retrieved'
|
166
168
|
option :"order", type: :numeric, desc: 'Order stats by timestamp [DESC]'
|
169
|
+
option :live, type: :boolean, desc: 'Get data from the live stats history, not cached stats history.'
|
170
|
+
option :filter, type: :string, desc: 'Filter results on `day` or `hour`, if not specified will return all records.'
|
167
171
|
def history(scraper_name)
|
168
172
|
client = Client::JobStat.new(options)
|
169
173
|
if options[:job]
|
170
|
-
puts "#{client.job_stats_history(options[:job])}"
|
174
|
+
puts "#{client.job_stats_history(options[:job], options)}"
|
171
175
|
else
|
172
|
-
puts "#{client.scraper_job_stats_history(scraper_name)}"
|
176
|
+
puts "#{client.scraper_job_stats_history(scraper_name, options)}"
|
173
177
|
end
|
174
178
|
end
|
175
179
|
|
180
|
+
desc "profile <scraper_name>", "displays the scraper applied profile"
|
181
|
+
long_desc <<-LONGDESC
|
182
|
+
Displays the account applied profile
|
183
|
+
LONGDESC
|
184
|
+
def profile(scraper_name)
|
185
|
+
client = Client::Scraper.new(options)
|
186
|
+
puts "#{client.profile(scraper_name)}"
|
187
|
+
end
|
176
188
|
|
177
189
|
desc "job SUBCOMMAND ...ARGS", "manage scrapers jobs"
|
178
190
|
subcommand "job", ScraperJob
|
data/lib/datahen/cli/scraper_job.rb
CHANGED
@@ -7,10 +7,16 @@ module Datahen
|
|
7
7
|
end
|
8
8
|
|
9
9
|
desc "show <scraper_name>", "Show a scraper's current job (Defaults to showing data from cached job)"
|
10
|
+
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
10
11
|
option :live, type: :boolean, desc: 'Get data from the live job, not cached job.'
|
11
12
|
def show(scraper_name)
|
12
|
-
|
13
|
-
|
13
|
+
if options[:job]
|
14
|
+
client = Client::Job.new(options)
|
15
|
+
puts "#{client.find(options[:job], options)}"
|
16
|
+
else
|
17
|
+
client = Client::ScraperJob.new(options)
|
18
|
+
puts "#{client.find(scraper_name, options)}"
|
19
|
+
end
|
14
20
|
end
|
15
21
|
|
16
22
|
|
@@ -58,7 +64,7 @@ module Datahen
|
|
58
64
|
|
59
65
|
desc "pause <scraper_name>", "pauses a scraper's current job"
|
60
66
|
long_desc <<-LONGDESC
|
61
|
-
|
67
|
+
Pauses a scraper's current job
|
62
68
|
LONGDESC
|
63
69
|
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
64
70
|
def pause(scraper_name)
|
@@ -79,6 +85,7 @@ module Datahen
|
|
79
85
|
option :workers, :aliases => :w, type: :numeric, desc: 'Set how many standard workers to use. Scraper job must be restarted(paused then resumed, or cancelled then resumed) for it to take effect. Default: 1. '
|
80
86
|
option :browsers, type: :numeric, desc: 'Set how many browser workers to use. Scraper job must be restarted(paused then resumed, or cancelled then resumed) for it to take effect. Default: 0. '
|
81
87
|
option :proxy_type, desc: 'Set the Proxy type. Default: standard'
|
88
|
+
option :profile, type: :string, desc: 'Set the profiles (comma separated) to apply to the job. Default: default'
|
82
89
|
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
83
90
|
def update(scraper_name)
|
84
91
|
if options[:job]
|
@@ -90,6 +97,21 @@ module Datahen
|
|
90
97
|
end
|
91
98
|
end
|
92
99
|
|
100
|
+
desc "profile <scraper_name>", "displays a scraper's current job applied profile"
|
101
|
+
long_desc <<-LONGDESC
|
102
|
+
Displays a scraper's current job applied profile
|
103
|
+
LONGDESC
|
104
|
+
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
105
|
+
def profile(scraper_name)
|
106
|
+
if options[:job]
|
107
|
+
client = Client::Job.new(options)
|
108
|
+
puts "#{client.profile(options[:job])}"
|
109
|
+
else
|
110
|
+
client = Client::ScraperJob.new(options)
|
111
|
+
puts "#{client.profile(scraper_name)}"
|
112
|
+
end
|
113
|
+
end
|
114
|
+
|
93
115
|
desc "var SUBCOMMAND ...ARGS", "for managing scraper's job variables"
|
94
116
|
subcommand "var", ScraperJobVar
|
95
117
|
|
data/lib/datahen/cli/scraper_page.rb
CHANGED
@@ -230,7 +230,7 @@ module Datahen
|
|
230
230
|
end
|
231
231
|
end
|
232
232
|
|
233
|
-
desc "failedcontent <gid>", "Show a page's failed content in scraper's current job"
|
233
|
+
desc "failedcontent <scraper_name> <gid>", "Show a page's failed content in scraper's current job"
|
234
234
|
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
235
235
|
def failedcontent(scraper_name, gid)
|
236
236
|
result = nil
|
data/lib/datahen/client.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
require "datahen/client/base"
|
2
|
+
require "datahen/client/account"
|
2
3
|
require "datahen/client/auth_token"
|
3
4
|
require "datahen/client/deploy_key"
|
4
5
|
require 'datahen/client/export'
|
@@ -24,7 +25,6 @@ require "datahen/client/job_var"
|
|
24
25
|
require "datahen/client/scraper_job_var"
|
25
26
|
require "datahen/client/job_finisher"
|
26
27
|
|
27
|
-
|
28
28
|
module Datahen
|
29
29
|
module Client
|
30
30
|
end
|
data/lib/datahen/client/base.rb
CHANGED
@@ -5,6 +5,8 @@ module Datahen
|
|
5
5
|
class Base
|
6
6
|
include HTTParty
|
7
7
|
|
8
|
+
default_timeout 60
|
9
|
+
|
8
10
|
def self.env_auth_token
|
9
11
|
ENV['DATAHEN_TOKEN']
|
10
12
|
end
|
@@ -55,6 +57,7 @@ module Datahen
|
|
55
57
|
query[:"max-timestamp"] = opts[:"max-timestamp"] if opts[:"max-timestamp"]
|
56
58
|
query[:limit] = opts[:limit] if opts[:limit]
|
57
59
|
query[:order] = opts[:order] if opts[:order]
|
60
|
+
query[:filter] = opts[:filter] if opts[:filter]
|
58
61
|
|
59
62
|
if opts[:query]
|
60
63
|
if opts[:query].is_a?(Hash)
|
data/lib/datahen/client/job.rb
CHANGED
@@ -20,6 +20,7 @@ module Datahen
|
|
20
20
|
body[:standard_worker_count] = opts[:workers] if opts[:workers]
|
21
21
|
body[:browser_worker_count] = opts[:browsers] if opts[:browsers]
|
22
22
|
body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
|
23
|
+
body[:profile] = opts[:profile] if opts[:profile]
|
23
24
|
params = @options.merge({body: body.to_json})
|
24
25
|
|
25
26
|
self.class.put("/jobs/#{job_id}", params)
|
@@ -64,6 +65,12 @@ module Datahen
|
|
64
65
|
self.class.put("/jobs/#{job_id}/finisher_update", params)
|
65
66
|
end
|
66
67
|
|
68
|
+
def profile(job_id, opts={})
|
69
|
+
params = @options.merge(opts)
|
70
|
+
|
71
|
+
self.class.get("/jobs/#{job_id}/profile", params)
|
72
|
+
end
|
73
|
+
|
67
74
|
end
|
68
75
|
|
69
76
|
end
|
data/lib/datahen/client/job_stat.rb
CHANGED
@@ -18,12 +18,20 @@ module Datahen
|
|
18
18
|
end
|
19
19
|
end
|
20
20
|
|
21
|
-
def job_stats_history(job_id)
|
22
|
-
|
21
|
+
def job_stats_history(job_id, opts={})
|
22
|
+
if opts[:live]
|
23
|
+
self.class.get("/jobs/#{job_id}/stats/history", @options)
|
24
|
+
else
|
25
|
+
self.class.get("/cached/jobs/#{job_id}/stats/history", @options)
|
26
|
+
end
|
23
27
|
end
|
24
28
|
|
25
|
-
def scraper_job_stats_history(scraper_name)
|
26
|
-
|
29
|
+
def scraper_job_stats_history(scraper_name, opts={})
|
30
|
+
if opts[:live]
|
31
|
+
self.class.get("/scrapers/#{scraper_name}/current_job/stats/history", @options)
|
32
|
+
else
|
33
|
+
self.class.get("/cached/scrapers/#{scraper_name}/current_job/stats/history", @options)
|
34
|
+
end
|
27
35
|
end
|
28
36
|
|
29
37
|
end
|
data/lib/datahen/client/scraper.rb
CHANGED
@@ -25,6 +25,7 @@ module Datahen
|
|
25
25
|
body[:cancel_current_job] = opts[:cancel_current_job] if opts[:cancel_current_job]
|
26
26
|
body[:schedule] = opts[:schedule] if opts[:schedule]
|
27
27
|
body[:timezone] = opts[:timezone] if opts[:timezone]
|
28
|
+
body[:profile] = opts[:profile] if opts[:profile]
|
28
29
|
params = @options.merge({body: body.to_json})
|
29
30
|
self.class.post("/scrapers", params)
|
30
31
|
end
|
@@ -43,6 +44,7 @@ module Datahen
|
|
43
44
|
body[:cancel_current_job] = opts[:cancel_current_job] if opts.has_key?("cancel_current_job") || opts.has_key?(:cancel_current_job)
|
44
45
|
body[:schedule] = opts[:schedule] if opts[:schedule]
|
45
46
|
body[:timezone] = opts[:timezone] if opts[:timezone]
|
47
|
+
body[:profile] = opts[:profile] if opts[:profile]
|
46
48
|
params = @options.merge({body: body.to_json})
|
47
49
|
|
48
50
|
self.class.put("/scrapers/#{scraper_name}", params)
|
@@ -52,6 +54,12 @@ module Datahen
|
|
52
54
|
params = @options.merge(opts)
|
53
55
|
self.class.delete("/scrapers/#{scraper_name}", params)
|
54
56
|
end
|
57
|
+
|
58
|
+
def profile(scraper_name, opts={})
|
59
|
+
params = @options.merge(opts)
|
60
|
+
|
61
|
+
self.class.get("/scrapers/#{scraper_name}/profile", params)
|
62
|
+
end
|
55
63
|
end
|
56
64
|
end
|
57
65
|
end
|
data/lib/datahen/client/scraper_job.rb
CHANGED
@@ -29,6 +29,7 @@ module Datahen
|
|
29
29
|
body[:standard_worker_count] = opts[:workers] if opts[:workers]
|
30
30
|
body[:browser_worker_count] = opts[:browsers] if opts[:browsers]
|
31
31
|
body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
|
32
|
+
body[:profile] = opts[:profile] if opts[:profile]
|
32
33
|
params = @options.merge({body: body.to_json})
|
33
34
|
|
34
35
|
self.class.put("/scrapers/#{scraper_name}/current_job", params)
|
@@ -48,6 +49,12 @@ module Datahen
|
|
48
49
|
opts[:status] = 'paused'
|
49
50
|
update(scraper_name, opts)
|
50
51
|
end
|
52
|
+
|
53
|
+
def profile(scraper_name, opts={})
|
54
|
+
params = @options.merge(opts)
|
55
|
+
|
56
|
+
self.class.get("/scrapers/#{scraper_name}/current_job/profile", params)
|
57
|
+
end
|
51
58
|
end
|
52
59
|
end
|
53
60
|
end
|
data/lib/datahen/scraper/executor.rb
CHANGED
@@ -60,7 +60,12 @@ module Datahen
|
|
60
60
|
|
61
61
|
def init_global_page()
|
62
62
|
client = Client::GlobalPage.new()
|
63
|
-
client.find(gid)
|
63
|
+
global_page = client.find(gid)
|
64
|
+
unless global_page.code == 200
|
65
|
+
raise "GID #{gid} not found. Aborting execution!"
|
66
|
+
else
|
67
|
+
global_page
|
68
|
+
end
|
64
69
|
end
|
65
70
|
|
66
71
|
def get_content(job_id, gid)
|
@@ -287,11 +292,12 @@ module Datahen
|
|
287
292
|
end
|
288
293
|
|
289
294
|
# behave differently if it is a real save
|
295
|
+
save_status = status
|
290
296
|
if save
|
291
297
|
log_msg = "Saving #{log_msgs.join(' and ')}."
|
292
298
|
puts "#{log_msg}"
|
293
299
|
else
|
294
|
-
|
300
|
+
save_status = "#{status}_try"
|
295
301
|
end
|
296
302
|
|
297
303
|
# saving to server
|
@@ -300,7 +306,7 @@ module Datahen
|
|
300
306
|
gid: gid,
|
301
307
|
pages: pages_slice,
|
302
308
|
outputs: outputs_slice,
|
303
|
-
status: status)
|
309
|
+
status: save_status)
|
304
310
|
|
305
311
|
if response.code == 200
|
306
312
|
if save
|
data/lib/datahen/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: datahen
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.14.1
|
4
|
+
version: 0.14.9
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Parama Danoesubroto
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-
|
11
|
+
date: 2020-08-08 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: thor
|
@@ -189,6 +189,7 @@ files:
|
|
189
189
|
- exe/hen
|
190
190
|
- lib/datahen.rb
|
191
191
|
- lib/datahen/cli.rb
|
192
|
+
- lib/datahen/cli/account.rb
|
192
193
|
- lib/datahen/cli/env_var.rb
|
193
194
|
- lib/datahen/cli/finisher.rb
|
194
195
|
- lib/datahen/cli/global_page.rb
|
@@ -206,6 +207,7 @@ files:
|
|
206
207
|
- lib/datahen/cli/scraper_var.rb
|
207
208
|
- lib/datahen/cli/seeder.rb
|
208
209
|
- lib/datahen/client.rb
|
210
|
+
- lib/datahen/client/account.rb
|
209
211
|
- lib/datahen/client/auth_token.rb
|
210
212
|
- lib/datahen/client/backblaze_content.rb
|
211
213
|
- lib/datahen/client/base.rb
|
@@ -264,7 +266,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
264
266
|
- !ruby/object:Gem::Version
|
265
267
|
version: '0'
|
266
268
|
requirements: []
|
267
|
-
rubygems_version: 3.
|
269
|
+
rubygems_version: 3.0.3
|
268
270
|
signing_key:
|
269
271
|
specification_version: 4
|
270
272
|
summary: DataHen toolbelt for developers
|