datahen 0.11.1 → 0.14.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/datahen/cli/global_page.rb +2 -15
- data/lib/datahen/cli/job.rb +14 -2
- data/lib/datahen/cli/parser.rb +9 -5
- data/lib/datahen/cli/scraper.rb +7 -4
- data/lib/datahen/cli/scraper_export.rb +2 -3
- data/lib/datahen/cli/scraper_finisher.rb +8 -2
- data/lib/datahen/cli/scraper_job.rb +35 -10
- data/lib/datahen/cli/scraper_job_var.rb +33 -10
- data/lib/datahen/cli/scraper_page.rb +57 -4
- data/lib/datahen/cli/seeder.rb +4 -2
- data/lib/datahen/client.rb +2 -0
- data/lib/datahen/client/base.rb +4 -4
- data/lib/datahen/client/global_page.rb +0 -5
- data/lib/datahen/client/job.rb +8 -2
- data/lib/datahen/client/job_finisher.rb +16 -0
- data/lib/datahen/client/job_page.rb +19 -0
- data/lib/datahen/client/job_stat.rb +12 -4
- data/lib/datahen/client/job_var.rb +28 -0
- data/lib/datahen/client/scraper_job.rb +6 -2
- data/lib/datahen/client/scraper_job_page.rb +11 -5
- data/lib/datahen/scraper/executor.rb +6 -6
- data/lib/datahen/scraper/parser.rb +10 -4
- data/lib/datahen/scraper/ruby_parser_executor.rb +7 -4
- data/lib/datahen/scraper/ruby_seeder_executor.rb +4 -1
- data/lib/datahen/scraper/seeder.rb +7 -3
- data/lib/datahen/version.rb +1 -1
- metadata +5 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 3ff2ed2cd4772450c01e3e88248ae89441de709198fdd177d3e572bbc5f0e474
|
4
|
+
data.tar.gz: 5701717fcba8a05b6f3e027d9bce33a3830fa20dabe3413255779899478cb4ab
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 949ad06a090a4ac8c2ef5b4e053ed4b7668c051be15b6959a2948614e771c25e18774d9ee97fe1f5c03c130986b671a8b26ac253f592a993fa4ad393bcad7673
|
7
|
+
data.tar.gz: b73cfc6c070314f97cbc7917d571de67031247aac42f3474b2e71d04e8b3d650fc380a0ce3ca65c1d8339bf8743d94b666ecccca4431f7b89df4e7485a03a382
|
data/lib/datahen/cli/global_page.rb
CHANGED
@@ -12,28 +12,15 @@ module Datahen
|
|
12
12
|
def content(gid)
|
13
13
|
client = Client::GlobalPage.new(options)
|
14
14
|
result = JSON.parse(client.find_content(gid).to_s)
|
15
|
-
|
15
|
+
|
16
16
|
if result['available'] == true
|
17
17
|
puts "Preview content url: \"#{result['preview_url']}\""
|
18
18
|
`open "#{result['preview_url']}"`
|
19
19
|
else
|
20
20
|
puts "Content does not exist"
|
21
|
-
end
|
21
|
+
end
|
22
22
|
end
|
23
23
|
|
24
|
-
desc "failedcontent <gid>", "Show failed content of a globalpage"
|
25
|
-
def failedcontent(gid)
|
26
|
-
client = Client::GlobalPage.new(options)
|
27
|
-
result = JSON.parse(client.find_failed_content(gid).to_s)
|
28
|
-
|
29
|
-
if result['available'] == true
|
30
|
-
puts "Preview failed content url: \"#{result['preview_url']}\""
|
31
|
-
`open "#{result['preview_url']}"`
|
32
|
-
else
|
33
|
-
puts "Failed Content does not exist"
|
34
|
-
end
|
35
|
-
end
|
36
|
-
|
37
24
|
end
|
38
25
|
end
|
39
26
|
end
|
data/lib/datahen/cli/job.rb
CHANGED
@@ -18,12 +18,24 @@ module Datahen
|
|
18
18
|
puts "#{client.all()}"
|
19
19
|
end
|
20
20
|
|
21
|
-
desc "show <job_id>", "Show a job"
|
21
|
+
desc "show <job_id>", "Show a job (Defaults to showing data from cached job)"
|
22
|
+
option :live, type: :boolean, desc: 'Get data from the live job, not cached job.'
|
22
23
|
def show(job_id)
|
23
24
|
client = Client::Job.new(options)
|
24
|
-
puts "#{client.find(job_id)}"
|
25
|
+
puts "#{client.find(job_id, options)}"
|
25
26
|
end
|
26
27
|
|
28
|
+
desc "stats <job_id>", "Get the stat for a job (Defaults to showing data from cached stats)"
|
29
|
+
long_desc <<-LONGDESC
|
30
|
+
Get stats for a scraper's current job\n
|
31
|
+
LONGDESC
|
32
|
+
option :live, type: :boolean, desc: 'Get data from the live stats, not cached stats.'
|
33
|
+
def stats(job_id)
|
34
|
+
client = Client::JobStat.new(options)
|
35
|
+
puts "#{client.job_current_stats(job_id, options)}"
|
36
|
+
end
|
37
|
+
|
38
|
+
|
27
39
|
end
|
28
40
|
end
|
29
41
|
|
data/lib/datahen/cli/parser.rb
CHANGED
@@ -10,12 +10,13 @@ module Datahen
|
|
10
10
|
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
11
11
|
option :global, :aliases => :g, type: :boolean, default: false, desc: 'Use globalpage instead of a job page'
|
12
12
|
option :vars, :aliases => :v, type: :string, desc: 'Set user-defined page variables. Must be in json format. i.e: {"Foo":"bar"}'
|
13
|
+
option :"keep-outputs", :aliases => :ko, type: :boolean, default: false, desc: "Don't delete existing outputs"
|
13
14
|
def try_parse(scraper_name, parser_file, gid)
|
14
|
-
begin
|
15
|
-
|
15
|
+
begin
|
16
|
+
|
16
17
|
if options[:job]
|
17
18
|
job_id = options[:job]
|
18
|
-
elsif options[:global]
|
19
|
+
elsif options[:global]
|
19
20
|
job_id = nil
|
20
21
|
else
|
21
22
|
job = Client::ScraperJob.new(options).find(scraper_name)
|
@@ -24,7 +25,7 @@ module Datahen
|
|
24
25
|
|
25
26
|
|
26
27
|
vars = JSON.parse(options[:vars]) if options[:vars]
|
27
|
-
puts Datahen::Scraper::Parser.exec_parser_page(parser_file, gid, job_id, false, vars)
|
28
|
+
puts Datahen::Scraper::Parser.exec_parser_page(parser_file, gid, job_id, false, vars, options[:"keep-outputs"])
|
28
29
|
|
29
30
|
rescue JSON::ParserError
|
30
31
|
if options[:vars]
|
@@ -40,6 +41,8 @@ module Datahen
|
|
40
41
|
<GID>: Global ID of the page.\x5
|
41
42
|
LONGDESC
|
42
43
|
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
44
|
+
option :vars, :aliases => :v, type: :string, desc: 'Set user-defined page variables. Must be in json format. i.e: {"Foo":"bar"}'
|
45
|
+
option :"keep-outputs", :aliases => :ko, type: :boolean, default: false, desc: "Don't delete existing outputs"
|
43
46
|
def exec_parse(scraper_name, parser_file, *gids)
|
44
47
|
gids.each do |gid|
|
45
48
|
begin
|
@@ -52,7 +55,8 @@ module Datahen
|
|
52
55
|
job_id = job['id']
|
53
56
|
end
|
54
57
|
|
55
|
-
|
58
|
+
vars = JSON.parse(options[:vars]) if options[:vars]
|
59
|
+
puts Datahen::Scraper::Parser.exec_parser_page(parser_file, gid, job_id, true, vars, options[:"keep-outputs"])
|
56
60
|
rescue => e
|
57
61
|
puts e
|
58
62
|
end
|
data/lib/datahen/cli/scraper.rb
CHANGED
@@ -60,7 +60,7 @@ module Datahen
|
|
60
60
|
desc "show <scraper_name>", "Show a scraper"
|
61
61
|
def show(scraper_name)
|
62
62
|
client = Client::Scraper.new(options)
|
63
|
-
puts "#{client.find(scraper_name)}"
|
63
|
+
puts "#{client.find(scraper_name, options)}"
|
64
64
|
end
|
65
65
|
|
66
66
|
desc "delete <scraper_name>", "Delete a scraper and related records"
|
@@ -102,6 +102,7 @@ module Datahen
|
|
102
102
|
option :head, :aliases => :H, desc: 'Show the oldest log entries. If not set, newest entries is shown'
|
103
103
|
option :parsing, :aliases => :p, type: :boolean, desc: 'Show only log entries related to parsing errors'
|
104
104
|
option :seeding, :aliases => :s, type: :boolean, desc: 'Show only log entries related to seeding errors'
|
105
|
+
option :finisher, :aliases => :f, type: :boolean, desc: 'Show only log entries related to finisher errors'
|
105
106
|
option :more, :aliases => :m, desc: 'Show next set of log entries. Enter the `More token`'
|
106
107
|
option :per_page, :aliases => :P, type: :numeric, desc: 'Number of records per page. Max 5000 per page.'
|
107
108
|
def log(scraper_name)
|
@@ -111,6 +112,7 @@ module Datahen
|
|
111
112
|
query["order"] = options.delete(:head) if options[:head]
|
112
113
|
query["job_type"] = "parsing" if options[:parsing]
|
113
114
|
query["job_type"] = "seeding" if options[:seeding]
|
115
|
+
query["job_type"] = "finisher executing" if options[:finisher]
|
114
116
|
query["page_token"] = options.delete(:more) if options[:more]
|
115
117
|
query["per_page"] = options.delete(:per_page) if options[:per_page]
|
116
118
|
|
@@ -138,17 +140,18 @@ module Datahen
|
|
138
140
|
end
|
139
141
|
end
|
140
142
|
|
141
|
-
desc "stats <scraper_name>", "Get the
|
143
|
+
desc "stats <scraper_name>", "Get the stat for a current job (Defaults to showing data from cached stats)"
|
142
144
|
long_desc <<-LONGDESC
|
143
145
|
Get stats for a scraper's current job\n
|
144
146
|
LONGDESC
|
145
147
|
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
148
|
+
option :live, type: :boolean, desc: 'Get data from the live stats, not cached stats.'
|
146
149
|
def stats(scraper_name)
|
147
150
|
client = Client::JobStat.new(options)
|
148
151
|
if options[:job]
|
149
|
-
puts "#{client.job_current_stats(options[:job])}"
|
152
|
+
puts "#{client.job_current_stats(options[:job], options)}"
|
150
153
|
else
|
151
|
-
puts "#{client.scraper_job_current_stats(scraper_name)}"
|
154
|
+
puts "#{client.scraper_job_current_stats(scraper_name, options)}"
|
152
155
|
end
|
153
156
|
end
|
154
157
|
|
data/lib/datahen/cli/scraper_export.rb
CHANGED
@@ -12,7 +12,6 @@ module Datahen
|
|
12
12
|
puts "#{client.find(export_id)}"
|
13
13
|
end
|
14
14
|
|
15
|
-
|
16
15
|
desc "list", "Gets a list of exports"
|
17
16
|
long_desc <<-LONGDESC
|
18
17
|
List exports.
|
@@ -34,13 +33,13 @@ module Datahen
|
|
34
33
|
def download(export_id)
|
35
34
|
client = Client::ScraperExport.new(options)
|
36
35
|
result = JSON.parse(client.download(export_id).to_s)
|
37
|
-
|
36
|
+
|
38
37
|
if result['signed_url']
|
39
38
|
puts "Download url: \"#{result['signed_url']}\""
|
40
39
|
`open "#{result['signed_url']}"`
|
41
40
|
else
|
42
41
|
puts "Exported file does not exist"
|
43
|
-
end
|
42
|
+
end
|
44
43
|
end
|
45
44
|
|
46
45
|
|
data/lib/datahen/cli/scraper_finisher.rb
CHANGED
@@ -11,9 +11,15 @@ module Datahen
|
|
11
11
|
long_desc <<-LONGDESC
|
12
12
|
Reset finisher on a scraper's current job.\x5
|
13
13
|
LONGDESC
|
14
|
+
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
14
15
|
def reset(scraper_name)
|
15
|
-
|
16
|
-
|
16
|
+
if options[:job]
|
17
|
+
client = Client::JobFinisher.new(options)
|
18
|
+
puts "#{client.reset(options[:job])}"
|
19
|
+
else
|
20
|
+
client = Client::ScraperFinisher.new(options)
|
21
|
+
puts "#{client.reset(scraper_name)}"
|
22
|
+
end
|
17
23
|
end
|
18
24
|
end
|
19
25
|
end
|
data/lib/datahen/cli/scraper_job.rb
CHANGED
@@ -6,10 +6,11 @@ module Datahen
|
|
6
6
|
"#{basename} #{@package_name} #{command.usage}"
|
7
7
|
end
|
8
8
|
|
9
|
-
desc "show <scraper_name>", "Show a scraper's current job"
|
9
|
+
desc "show <scraper_name>", "Show a scraper's current job (Defaults to showing data from cached job)"
|
10
|
+
option :live, type: :boolean, desc: 'Get data from the live job, not cached job.'
|
10
11
|
def show(scraper_name)
|
11
12
|
client = Client::ScraperJob.new(options)
|
12
|
-
puts "#{client.find(scraper_name)}"
|
13
|
+
puts "#{client.find(scraper_name, options)}"
|
13
14
|
end
|
14
15
|
|
15
16
|
|
@@ -29,27 +30,45 @@ module Datahen
|
|
29
30
|
long_desc <<-LONGDESC
|
30
31
|
Cancels a scraper's current job
|
31
32
|
LONGDESC
|
33
|
+
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
32
34
|
def cancel(scraper_name)
|
33
|
-
|
34
|
-
|
35
|
+
if options[:job]
|
36
|
+
client = Client::Job.new(options)
|
37
|
+
puts "#{client.cancel(options[:job])}"
|
38
|
+
else
|
39
|
+
client = Client::ScraperJob.new(options)
|
40
|
+
puts "#{client.cancel(scraper_name)}"
|
41
|
+
end
|
35
42
|
end
|
36
43
|
|
37
44
|
desc "resume <scraper_name>", "resumes a scraper's current job"
|
38
45
|
long_desc <<-LONGDESC
|
39
46
|
Resumes a scraper's current job
|
40
47
|
LONGDESC
|
48
|
+
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
41
49
|
def resume(scraper_name)
|
42
|
-
|
43
|
-
|
50
|
+
if options[:job]
|
51
|
+
client = Client::Job.new(options)
|
52
|
+
puts "#{client.resume(options[:job])}"
|
53
|
+
else
|
54
|
+
client = Client::ScraperJob.new(options)
|
55
|
+
puts "#{client.resume(scraper_name)}"
|
56
|
+
end
|
44
57
|
end
|
45
58
|
|
46
59
|
desc "pause <scraper_name>", "pauses a scraper's current job"
|
47
60
|
long_desc <<-LONGDESC
|
48
61
|
pauses a scraper's current job
|
49
62
|
LONGDESC
|
63
|
+
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
50
64
|
def pause(scraper_name)
|
51
|
-
|
52
|
-
|
65
|
+
if options[:job]
|
66
|
+
client = Client::Job.new(options)
|
67
|
+
puts "#{client.pause(options[:job])}"
|
68
|
+
else
|
69
|
+
client = Client::ScraperJob.new(options)
|
70
|
+
puts "#{client.pause(scraper_name)}"
|
71
|
+
end
|
53
72
|
end
|
54
73
|
|
55
74
|
|
@@ -60,9 +79,15 @@ module Datahen
|
|
60
79
|
option :workers, :aliases => :w, type: :numeric, desc: 'Set how many standard workers to use. Scraper job must be restarted(paused then resumed, or cancelled then resumed) for it to take effect. Default: 1. '
|
61
80
|
option :browsers, type: :numeric, desc: 'Set how many browser workers to use. Scraper job must be restarted(paused then resumed, or cancelled then resumed) for it to take effect. Default: 0. '
|
62
81
|
option :proxy_type, desc: 'Set the Proxy type. Default: standard'
|
82
|
+
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
63
83
|
def update(scraper_name)
|
64
|
-
|
65
|
-
|
84
|
+
if options[:job]
|
85
|
+
client = Client::Job.new(options)
|
86
|
+
puts "#{client.update(options[:job], options)}"
|
87
|
+
else
|
88
|
+
client = Client::ScraperJob.new(options)
|
89
|
+
puts "#{client.update(scraper_name, options)}"
|
90
|
+
end
|
66
91
|
end
|
67
92
|
|
68
93
|
desc "var SUBCOMMAND ...ARGS", "for managing scraper's job variables"
|
data/lib/datahen/cli/scraper_job_var.rb
CHANGED
@@ -13,9 +13,15 @@ module Datahen
|
|
13
13
|
LONGDESC
|
14
14
|
option :page, :aliases => :p, type: :numeric, desc: 'Get the next set of records by page.'
|
15
15
|
option :per_page, :aliases => :P, type: :numeric, desc: 'Number of records per page. Max 500 per page.'
|
16
|
+
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
16
17
|
def list(scraper_name)
|
17
|
-
|
18
|
-
|
18
|
+
if options[:job]
|
19
|
+
client = Client::JobVar.new(options)
|
20
|
+
puts "#{client.all(options[:job])}"
|
21
|
+
else
|
22
|
+
client = Client::ScraperJobVar.new(options)
|
23
|
+
puts "#{client.all(scraper_name)}"
|
24
|
+
end
|
19
25
|
end
|
20
26
|
|
21
27
|
desc "set <scraper_name> <var_name> <value>", "Set an environment var on the scrape job"
|
@@ -24,23 +30,40 @@ module Datahen
|
|
24
30
|
<var_name>: Var name can only consist of alphabets, numbers, underscores. Name must be unique to your scrape job, otherwise it will be overwritten.\x5
|
25
31
|
<value>: Value of variable.\x5
|
26
32
|
LONGDESC
|
27
|
-
option :secret, type: :boolean, desc: 'Set true to make it decrypt the value. Default: false'
|
33
|
+
option :secret, type: :boolean, desc: 'Set true to make it decrypt the value. Default: false'
|
34
|
+
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
28
35
|
def set(scraper_name, var_name, value)
|
29
|
-
|
30
|
-
|
31
|
-
|
36
|
+
if options[:job]
|
37
|
+
client = Client::JobVar.new(options)
|
38
|
+
puts "#{client.set(options[:job], var_name, value, options)}"
|
39
|
+
else
|
40
|
+
client = Client::ScraperJobVar.new(options)
|
41
|
+
puts "#{client.set(scraper_name, var_name, value, options)}"
|
42
|
+
end
|
32
43
|
end
|
33
44
|
|
34
45
|
desc "show <scraper_name> <var_name>", "Show an environment variable on the scrape job"
|
46
|
+
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
35
47
|
def show(scraper_name, var_name)
|
36
|
-
|
37
|
-
|
48
|
+
if options[:job]
|
49
|
+
client = Client::JobVar.new(options)
|
50
|
+
puts "#{client.find(options[:job], var_name)}"
|
51
|
+
else
|
52
|
+
client = Client::ScraperJobVar.new(options)
|
53
|
+
puts "#{client.find(scraper_name, var_name)}"
|
54
|
+
end
|
38
55
|
end
|
39
56
|
|
40
57
|
desc "unset <scraper_name> <var_name>", "Deletes an environment variable on the scrape job"
|
58
|
+
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
41
59
|
def unset(scraper_name, var_name)
|
42
|
-
|
43
|
-
|
60
|
+
if options[:job]
|
61
|
+
client = Client::JobVar.new(options)
|
62
|
+
puts "#{client.unset(options[:job], var_name)}"
|
63
|
+
else
|
64
|
+
client = Client::ScraperJobVar.new(options)
|
65
|
+
puts "#{client.unset(scraper_name, var_name)}"
|
66
|
+
end
|
44
67
|
end
|
45
68
|
end
|
46
69
|
end
|
data/lib/datahen/cli/scraper_page.rb
CHANGED
@@ -17,6 +17,7 @@ module Datahen
|
|
17
17
|
option :per_page, :aliases => :P, type: :numeric, desc: 'Number of records per page. Max 500 per page.'
|
18
18
|
option :fetch_fail, type: :boolean, desc: 'Returns only pages that fails fetching.'
|
19
19
|
option :parse_fail, type: :boolean, desc: 'Returns only pages that fails parsing.'
|
20
|
+
option :status, type: :string, desc: 'Returns only pages with specific status.'
|
20
21
|
def list(scraper_name)
|
21
22
|
if options[:job]
|
22
23
|
client = Client::JobPage.new(options)
|
@@ -104,13 +105,19 @@ module Datahen
|
|
104
105
|
option :fetch_fail, type: :boolean, desc: 'Refetches only pages that fails fetching.'
|
105
106
|
option :parse_fail, type: :boolean, desc: 'Refetches only pages that fails parsing.'
|
106
107
|
option :status, type: :string, desc: 'Refetches only pages with a specific status.'
|
108
|
+
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
107
109
|
def refetch(scraper_name)
|
108
110
|
if !options.key?(:gid) && !options.key?(:fetch_fail) && !options.key?(:parse_fail) && !options.key?(:status)
|
109
111
|
puts "Must specify either a --gid, --fetch-fail, --parse-fail or --status"
|
110
112
|
return
|
111
113
|
end
|
112
|
-
|
113
|
-
|
114
|
+
if options[:job]
|
115
|
+
client = Client::JobPage.new(options)
|
116
|
+
puts "#{client.refetch(options[:job])}"
|
117
|
+
else
|
118
|
+
client = Client::ScraperJobPage.new(options)
|
119
|
+
puts "#{client.refetch(scraper_name)}"
|
120
|
+
end
|
114
121
|
end
|
115
122
|
|
116
123
|
desc "reparse <scraper_name>", "Reparse Pages on a scraper's current job"
|
@@ -120,6 +127,7 @@ module Datahen
|
|
120
127
|
option :gid, :aliases => :g, type: :string, desc: 'Reparse a specific GID'
|
121
128
|
option :parse_fail, type: :boolean, desc: 'Reparse only pages that fails parsing.'
|
122
129
|
option :status, type: :string, desc: 'Reparse only pages with a specific status.'
|
130
|
+
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
123
131
|
def reparse(scraper_name)
|
124
132
|
begin
|
125
133
|
options[:vars] = JSON.parse(options[:vars]) if options[:vars]
|
@@ -129,8 +137,13 @@ module Datahen
|
|
129
137
|
return
|
130
138
|
end
|
131
139
|
|
132
|
-
|
133
|
-
|
140
|
+
if options[:job]
|
141
|
+
client = Client::JobPage.new(options)
|
142
|
+
puts "#{client.reparse(options[:job])}"
|
143
|
+
else
|
144
|
+
client = Client::ScraperJobPage.new(options)
|
145
|
+
puts "#{client.reparse(scraper_name)}"
|
146
|
+
end
|
134
147
|
|
135
148
|
rescue JSON::ParserError
|
136
149
|
if options[:vars]
|
@@ -197,6 +210,46 @@ module Datahen
|
|
197
210
|
end
|
198
211
|
end
|
199
212
|
|
213
|
+
desc "content <scraper_name> <gid>", "Show a page's content in scraper's current job"
|
214
|
+
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
215
|
+
def content(scraper_name, gid)
|
216
|
+
result = nil
|
217
|
+
if options[:job]
|
218
|
+
client = Client::JobPage.new(options)
|
219
|
+
result = JSON.parse(client.find_content(options[:job], gid).to_s)
|
220
|
+
else
|
221
|
+
client = Client::ScraperJobPage.new(options)
|
222
|
+
result = JSON.parse(client.find_content(scraper_name, gid).to_s)
|
223
|
+
end
|
224
|
+
|
225
|
+
if result['available'] == true
|
226
|
+
puts "Preview content url: \"#{result['preview_url']}\""
|
227
|
+
`open "#{result['preview_url']}"`
|
228
|
+
else
|
229
|
+
puts "Content does not exist"
|
230
|
+
end
|
231
|
+
end
|
232
|
+
|
233
|
+
desc "failedcontent <gid>", "Show a page's failed content in scraper's current job"
|
234
|
+
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
235
|
+
def failedcontent(scraper_name, gid)
|
236
|
+
result = nil
|
237
|
+
if options[:job]
|
238
|
+
client = Client::JobPage.new(options)
|
239
|
+
result = JSON.parse(client.find_failed_content(options[:job], gid).to_s)
|
240
|
+
else
|
241
|
+
client = Client::ScraperJobPage.new(options)
|
242
|
+
result = JSON.parse(client.find_failed_content(scraper_name, gid).to_s)
|
243
|
+
end
|
244
|
+
|
245
|
+
if result['available'] == true
|
246
|
+
puts "Preview failed content url: \"#{result['preview_url']}\""
|
247
|
+
`open "#{result['preview_url']}"`
|
248
|
+
else
|
249
|
+
puts "Failed Content does not exist"
|
250
|
+
end
|
251
|
+
end
|
252
|
+
|
200
253
|
end
|
201
254
|
end
|
202
255
|
|
data/lib/datahen/cli/seeder.rb
CHANGED
@@ -7,6 +7,7 @@ module Datahen
|
|
7
7
|
<seeder_file>: Seeder script file will be executed.\x5
|
8
8
|
LONGDESC
|
9
9
|
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
10
|
+
option :"keep-outputs", :aliases => :ko, type: :boolean, default: false, desc: "Don't delete existing outputs"
|
10
11
|
def try_seed(scraper_name, seeder_file)
|
11
12
|
if options[:job]
|
12
13
|
job_id = options[:job]
|
@@ -14,8 +15,8 @@ module Datahen
|
|
14
15
|
job = Client::ScraperJob.new(options).find(scraper_name)
|
15
16
|
job_id = job['id']
|
16
17
|
end
|
17
|
-
|
18
|
-
puts Datahen::Scraper::Seeder.exec_seeder(seeder_file, job_id, false)
|
18
|
+
|
19
|
+
puts Datahen::Scraper::Seeder.exec_seeder(seeder_file, job_id, false, options[:"keep-outputs"])
|
19
20
|
end
|
20
21
|
|
21
22
|
desc "exec <scraper_name> <seeder_file>", "Executes a seeder script onto a scraper's current job."
|
@@ -24,6 +25,7 @@ module Datahen
|
|
24
25
|
<seeder_file>: Seeder script file that will be executed on the scraper's current job.\x5
|
25
26
|
LONGDESC
|
26
27
|
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
28
|
+
option :"keep-outputs", :aliases => :ko, type: :boolean, default: false, desc: "Don't delete existing outputs"
|
27
29
|
def exec_parse(scraper_name, seeder_file)
|
28
30
|
if options[:job]
|
29
31
|
job_id = options[:job]
|
data/lib/datahen/client.rb
CHANGED
@@ -20,7 +20,9 @@ require "datahen/client/job_stat"
|
|
20
20
|
require "datahen/client/backblaze_content"
|
21
21
|
require "datahen/client/env_var"
|
22
22
|
require "datahen/client/scraper_var"
|
23
|
+
require "datahen/client/job_var"
|
23
24
|
require "datahen/client/scraper_job_var"
|
25
|
+
require "datahen/client/job_finisher"
|
24
26
|
|
25
27
|
|
26
28
|
module Datahen
|
data/lib/datahen/client/base.rb
CHANGED
@@ -51,10 +51,10 @@ module Datahen
|
|
51
51
|
query[:status] = opts[:status] if opts[:status]
|
52
52
|
query[:page_type] = opts[:page_type] if opts[:page_type]
|
53
53
|
query[:gid] = opts[:gid] if opts[:gid]
|
54
|
-
query[:"min-timestamp"] = opts[:"min-timestamp"]
|
55
|
-
query[:"max-timestamp"] = opts[:"max-timestamp"]
|
56
|
-
query[:limit] = opts[:limit]
|
57
|
-
query[:order] = opts[:order]
|
54
|
+
query[:"min-timestamp"] = opts[:"min-timestamp"] if opts[:"min-timestamp"]
|
55
|
+
query[:"max-timestamp"] = opts[:"max-timestamp"] if opts[:"max-timestamp"]
|
56
|
+
query[:limit] = opts[:limit] if opts[:limit]
|
57
|
+
query[:order] = opts[:order] if opts[:order]
|
58
58
|
|
59
59
|
if opts[:query]
|
60
60
|
if opts[:query].is_a?(Hash)
|
data/lib/datahen/client/job.rb
CHANGED
@@ -6,8 +6,12 @@ module Datahen
|
|
6
6
|
self.class.get("/jobs", params)
|
7
7
|
end
|
8
8
|
|
9
|
-
def find(job_id)
|
10
|
-
|
9
|
+
def find(job_id, opts={})
|
10
|
+
if opts[:live]
|
11
|
+
self.class.get("/jobs/#{job_id}", @options)
|
12
|
+
else
|
13
|
+
self.class.get("/cached/jobs/#{job_id}", @options)
|
14
|
+
end
|
11
15
|
end
|
12
16
|
|
13
17
|
def update(job_id, opts={})
|
@@ -15,6 +19,7 @@ module Datahen
|
|
15
19
|
body[:status] = opts[:status] if opts[:status]
|
16
20
|
body[:standard_worker_count] = opts[:workers] if opts[:workers]
|
17
21
|
body[:browser_worker_count] = opts[:browsers] if opts[:browsers]
|
22
|
+
body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
|
18
23
|
params = @options.merge({body: body.to_json})
|
19
24
|
|
20
25
|
self.class.put("/jobs/#{job_id}", params)
|
@@ -41,6 +46,7 @@ module Datahen
|
|
41
46
|
body[:pages] = opts.fetch(:pages) {[]}
|
42
47
|
body[:seeding_status] = opts.fetch(:seeding_status){ nil }
|
43
48
|
body[:log_error] = opts[:log_error] if opts[:log_error]
|
49
|
+
body[:keep_outputs] = !!opts[:keep_outputs] if opts.has_key?(:keep_outputs)
|
44
50
|
|
45
51
|
params = @options.merge({body: body.to_json})
|
46
52
|
|
data/lib/datahen/client/job_finisher.rb
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
module Datahen
|
2
|
+
module Client
|
3
|
+
class JobFinisher < Datahen::Client::Base
|
4
|
+
# Reset finisher on a scraper's current job.
|
5
|
+
#
|
6
|
+
# @param [Integer] job_id Job ID
|
7
|
+
# @param [Hash] opts ({}) API custom parameters.
|
8
|
+
#
|
9
|
+
# @return [HTTParty::Response]
|
10
|
+
def reset(job_id, opts={})
|
11
|
+
params = @options.merge(opts)
|
12
|
+
self.class.put("/jobs/#{job_id}/finisher/reset", params)
|
13
|
+
end
|
14
|
+
end
|
15
|
+
end
|
16
|
+
end
|
data/lib/datahen/client/job_page.rb
CHANGED
@@ -48,11 +48,30 @@ module Datahen
|
|
48
48
|
body[:pages] = opts.fetch(:pages) {[]}
|
49
49
|
body[:parsing_status] = opts.fetch(:parsing_status){ nil }
|
50
50
|
body[:log_error] = opts[:log_error] if opts[:log_error]
|
51
|
+
body[:keep_outputs] = !!opts[:keep_outputs] if opts.has_key?(:keep_outputs)
|
51
52
|
|
52
53
|
params = @options.merge({body: body.to_json})
|
53
54
|
|
54
55
|
self.class.put("/jobs/#{job_id}/pages/#{gid}/parsing_update", params)
|
55
56
|
end
|
57
|
+
|
58
|
+
def find_content(job_id, gid)
|
59
|
+
self.class.get("/jobs/#{job_id}/pages/#{gid}/content", @options)
|
60
|
+
end
|
61
|
+
|
62
|
+
def find_failed_content(job_id, gid)
|
63
|
+
self.class.get("/jobs/#{job_id}/pages/#{gid}/failed_content", @options)
|
64
|
+
end
|
65
|
+
|
66
|
+
def reparse(job_id, opts={})
|
67
|
+
params = @options.merge(opts)
|
68
|
+
self.class.put("/jobs/#{job_id}/pages/reparse", params)
|
69
|
+
end
|
70
|
+
|
71
|
+
def refetch(job_id, opts={})
|
72
|
+
params = @options.merge(opts)
|
73
|
+
self.class.put("/jobs/#{job_id}/pages/refetch", params)
|
74
|
+
end
|
56
75
|
end
|
57
76
|
end
|
58
77
|
end
|
data/lib/datahen/client/job_stat.rb
CHANGED
@@ -2,12 +2,20 @@ module Datahen
|
|
2
2
|
module Client
|
3
3
|
class JobStat < Datahen::Client::Base
|
4
4
|
|
5
|
-
def job_current_stats(job_id)
|
6
|
-
|
5
|
+
def job_current_stats(job_id, opts={})
|
6
|
+
if opts[:live]
|
7
|
+
self.class.get("/jobs/#{job_id}/stats/current", @options)
|
8
|
+
else
|
9
|
+
self.class.get("/cached/jobs/#{job_id}/stats/current", @options)
|
10
|
+
end
|
7
11
|
end
|
8
12
|
|
9
|
-
def scraper_job_current_stats(scraper_name)
|
10
|
-
|
13
|
+
def scraper_job_current_stats(scraper_name, opts={})
|
14
|
+
if opts[:live]
|
15
|
+
self.class.get("/scrapers/#{scraper_name}/current_job/stats/current", @options)
|
16
|
+
else
|
17
|
+
self.class.get("/cached/scrapers/#{scraper_name}/current_job/stats/current", @options)
|
18
|
+
end
|
11
19
|
end
|
12
20
|
|
13
21
|
def job_stats_history(job_id)
|
data/lib/datahen/client/job_var.rb
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
module Datahen
|
2
|
+
module Client
|
3
|
+
class JobVar < Datahen::Client::Base
|
4
|
+
|
5
|
+
def find(job_id, var_name)
|
6
|
+
self.class.get("/jobs/#{job_id}/vars/#{var_name}", @options)
|
7
|
+
end
|
8
|
+
|
9
|
+
def all(job_id, opts={})
|
10
|
+
params = @options.merge opts
|
11
|
+
self.class.get("/jobs/#{job_id}/vars", params)
|
12
|
+
end
|
13
|
+
|
14
|
+
def set(job_id, var_name, value, opts={})
|
15
|
+
body = {}
|
16
|
+
body[:value] = value
|
17
|
+
body[:secret] = opts[:secret] if opts[:secret]
|
18
|
+
params = @options.merge({body: body.to_json})
|
19
|
+
self.class.put("/jobs/#{job_id}/vars/#{var_name}", params)
|
20
|
+
end
|
21
|
+
|
22
|
+
def unset(job_id, var_name, opts={})
|
23
|
+
params = @options.merge(opts)
|
24
|
+
self.class.delete("/jobs/#{job_id}/vars/#{var_name}", params)
|
25
|
+
end
|
26
|
+
end
|
27
|
+
end
|
28
|
+
end
|
data/lib/datahen/client/scraper_job.rb
CHANGED
@@ -15,8 +15,12 @@ module Datahen
|
|
15
15
|
self.class.post("/scrapers/#{scraper_name}/jobs", params)
|
16
16
|
end
|
17
17
|
|
18
|
-
def find(scraper_name)
|
19
|
-
|
18
|
+
def find(scraper_name, opts={})
|
19
|
+
if opts[:live]
|
20
|
+
self.class.get("/scrapers/#{scraper_name}/current_job", @options)
|
21
|
+
else
|
22
|
+
self.class.get("/cached/scrapers/#{scraper_name}/current_job", @options)
|
23
|
+
end
|
20
24
|
end
|
21
25
|
|
22
26
|
def update(scraper_name, opts={})
|
data/lib/datahen/client/scraper_job_page.rb
CHANGED
@@ -26,6 +26,9 @@ module Datahen
|
|
26
26
|
self.class.put("/scrapers/#{scraper_name}/current_job/pages/refetch", params)
|
27
27
|
end
|
28
28
|
|
29
|
+
# Deprecated, please use Datahen::Client::JobPage#refetch instead.
|
30
|
+
#
|
31
|
+
# @note This method will be removed at some point in the future.
|
29
32
|
def refetch_by_job(job_id, opts={})
|
30
33
|
params = @options.merge(opts)
|
31
34
|
self.class.put("/jobs/#{job_id}/pages/refetch", params)
|
@@ -36,11 +39,6 @@ module Datahen
|
|
36
39
|
self.class.put("/scrapers/#{scraper_name}/current_job/pages/reparse", params)
|
37
40
|
end
|
38
41
|
|
39
|
-
def reparse_by_job(job_id, opts={})
|
40
|
-
params = @options.merge(opts)
|
41
|
-
self.class.put("/jobs/#{job_id}/pages/reparse", params)
|
42
|
-
end
|
43
|
-
|
44
42
|
def enqueue(scraper_name, method, url, opts={})
|
45
43
|
body = {}
|
46
44
|
body[:method] = method != "" ? method : "GET"
|
@@ -62,6 +60,14 @@ module Datahen
|
|
62
60
|
self.class.post("/scrapers/#{scraper_name}/current_job/pages", params)
|
63
61
|
end
|
64
62
|
|
63
|
+
def find_content(scraper_name, gid)
|
64
|
+
self.class.get("/scrapers/#{scraper_name}/current_job/pages/#{gid}/content", @options)
|
65
|
+
end
|
66
|
+
|
67
|
+
def find_failed_content(scraper_name, gid)
|
68
|
+
self.class.get("/scrapers/#{scraper_name}/current_job/pages/#{gid}/failed_content", @options)
|
69
|
+
end
|
70
|
+
|
65
71
|
end
|
66
72
|
end
|
67
73
|
end
|
data/lib/datahen/scraper/executor.rb
CHANGED
@@ -63,9 +63,9 @@ module Datahen
|
|
63
63
|
client.find(gid)
|
64
64
|
end
|
65
65
|
|
66
|
-
def get_content(gid)
|
67
|
-
client = Client::GlobalPage.new()
|
68
|
-
content_json = client.find_content(gid)
|
66
|
+
def get_content(job_id, gid)
|
67
|
+
client = Client::JobPage.new()
|
68
|
+
content_json = client.find_content(job_id, gid)
|
69
69
|
|
70
70
|
if content_json['available']
|
71
71
|
signed_url = content_json['signed_url']
|
@@ -75,9 +75,9 @@ module Datahen
|
|
75
75
|
end
|
76
76
|
end
|
77
77
|
|
78
|
-
def get_failed_content(gid)
|
79
|
-
client = Client::GlobalPage.new()
|
80
|
-
content_json = client.find_failed_content(gid)
|
78
|
+
def get_failed_content(job_id, gid)
|
79
|
+
client = Client::JobPage.new()
|
80
|
+
content_json = client.find_failed_content(job_id, gid)
|
81
81
|
|
82
82
|
if content_json['available']
|
83
83
|
signed_url = content_json['signed_url']
|
data/lib/datahen/scraper/parser.rb
CHANGED
@@ -1,18 +1,24 @@
|
|
1
1
|
module Datahen
|
2
2
|
module Scraper
|
3
3
|
class Parser
|
4
|
-
def self.exec_parser_page(filename, gid, job_id=nil, save=false, vars = {})
|
4
|
+
def self.exec_parser_page(filename, gid, job_id=nil, save=false, vars = {}, keep_outputs=false)
|
5
5
|
extname = File.extname(filename)
|
6
6
|
case extname
|
7
7
|
when '.rb'
|
8
|
-
executor = RubyParserExecutor.new(
|
8
|
+
executor = RubyParserExecutor.new(
|
9
|
+
filename: filename,
|
10
|
+
gid: gid,
|
11
|
+
job_id: job_id,
|
12
|
+
vars: vars,
|
13
|
+
keep_outputs: keep_outputs
|
14
|
+
)
|
9
15
|
executor.exec_parser(save)
|
10
16
|
else
|
11
17
|
puts "Unable to find a parser executor for file type \"#{extname}\""
|
12
18
|
end
|
13
19
|
end
|
14
20
|
|
15
|
-
|
21
|
+
|
16
22
|
end
|
17
23
|
end
|
18
|
-
end
|
24
|
+
end
|
data/lib/datahen/scraper/ruby_parser_executor.rb
CHANGED
@@ -15,6 +15,7 @@ module Datahen
|
|
15
15
|
@gid = options.fetch(:gid) { raise "GID is required"}
|
16
16
|
@job_id = options.fetch(:job_id)
|
17
17
|
@page_vars = options.fetch(:vars) { {} }
|
18
|
+
@keep_outputs = !!(options.fetch(:keep_outputs) { false })
|
18
19
|
end
|
19
20
|
|
20
21
|
def self.exposed_methods
|
@@ -66,7 +67,9 @@ module Datahen
|
|
66
67
|
response = parsing_update(
|
67
68
|
job_id: job_id,
|
68
69
|
gid: gid,
|
69
|
-
parsing_status: :starting
|
70
|
+
parsing_status: :starting,
|
71
|
+
keep_outputs: @keep_outputs
|
72
|
+
)
|
70
73
|
|
71
74
|
if response.code == 200
|
72
75
|
puts "Page Parsing Status Updated."
|
@@ -165,7 +168,7 @@ module Datahen
|
|
165
168
|
handle_error(e) if save
|
166
169
|
raise e
|
167
170
|
end
|
168
|
-
|
171
|
+
|
169
172
|
if refetch_self
|
170
173
|
refetch_page gid
|
171
174
|
elsif reparse_self
|
@@ -178,11 +181,11 @@ module Datahen
|
|
178
181
|
end
|
179
182
|
|
180
183
|
def content
|
181
|
-
@content ||= get_content(gid)
|
184
|
+
@content ||= get_content(job_id, gid)
|
182
185
|
end
|
183
186
|
|
184
187
|
def failed_content
|
185
|
-
@failed_content ||= get_failed_content(gid)
|
188
|
+
@failed_content ||= get_failed_content(job_id, gid)
|
186
189
|
end
|
187
190
|
|
188
191
|
def handle_error(e)
|
data/lib/datahen/scraper/ruby_seeder_executor.rb
CHANGED
@@ -6,6 +6,7 @@ module Datahen
|
|
6
6
|
def initialize(options={})
|
7
7
|
@filename = options.fetch(:filename) { raise "Filename is required"}
|
8
8
|
@job_id = options[:job_id]
|
9
|
+
@keep_outputs = !!(options.fetch(:keep_outputs) { false })
|
9
10
|
end
|
10
11
|
|
11
12
|
def self.exposed_methods
|
@@ -81,7 +82,9 @@ module Datahen
|
|
81
82
|
|
82
83
|
response = seeding_update(
|
83
84
|
job_id: job_id,
|
84
|
-
seeding_status: :starting
|
85
|
+
seeding_status: :starting,
|
86
|
+
keep_outputs: @keep_outputs
|
87
|
+
)
|
85
88
|
|
86
89
|
if response.code == 200
|
87
90
|
puts "Seeding Status Updated."
|
data/lib/datahen/scraper/seeder.rb
CHANGED
@@ -2,11 +2,15 @@ module Datahen
|
|
2
2
|
module Scraper
|
3
3
|
class Seeder
|
4
4
|
|
5
|
-
def self.exec_seeder(filename, job_id=nil, save=false)
|
5
|
+
def self.exec_seeder(filename, job_id=nil, save=false, keep_outputs=false)
|
6
6
|
extname = File.extname(filename)
|
7
7
|
case extname
|
8
8
|
when '.rb'
|
9
|
-
executor = RubySeederExecutor.new(
|
9
|
+
executor = RubySeederExecutor.new(
|
10
|
+
filename: filename,
|
11
|
+
job_id: job_id,
|
12
|
+
keep_outputs: keep_outputs
|
13
|
+
)
|
10
14
|
executor.exec_seeder(save)
|
11
15
|
else
|
12
16
|
puts "Unable to find a seeder executor for file type \"#{extname}\""
|
@@ -15,4 +19,4 @@ module Datahen
|
|
15
19
|
|
16
20
|
end
|
17
21
|
end
|
18
|
-
end
|
22
|
+
end
|
data/lib/datahen/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: datahen
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.11.1
|
4
|
+
version: 0.14.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Parama Danoesubroto
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-
|
11
|
+
date: 2020-07-26 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: thor
|
@@ -215,10 +215,12 @@ files:
|
|
215
215
|
- lib/datahen/client/global_page.rb
|
216
216
|
- lib/datahen/client/job.rb
|
217
217
|
- lib/datahen/client/job_export.rb
|
218
|
+
- lib/datahen/client/job_finisher.rb
|
218
219
|
- lib/datahen/client/job_log.rb
|
219
220
|
- lib/datahen/client/job_output.rb
|
220
221
|
- lib/datahen/client/job_page.rb
|
221
222
|
- lib/datahen/client/job_stat.rb
|
223
|
+
- lib/datahen/client/job_var.rb
|
222
224
|
- lib/datahen/client/scraper.rb
|
223
225
|
- lib/datahen/client/scraper_deployment.rb
|
224
226
|
- lib/datahen/client/scraper_export.rb
|
@@ -262,7 +264,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
262
264
|
- !ruby/object:Gem::Version
|
263
265
|
version: '0'
|
264
266
|
requirements: []
|
265
|
-
rubygems_version: 3.
|
267
|
+
rubygems_version: 3.1.2
|
266
268
|
signing_key:
|
267
269
|
specification_version: 4
|
268
270
|
summary: DataHen toolbelt for developers
|