datahen 0.11.1 → 0.14.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/datahen/cli/global_page.rb +2 -15
- data/lib/datahen/cli/job.rb +14 -2
- data/lib/datahen/cli/parser.rb +9 -5
- data/lib/datahen/cli/scraper.rb +7 -4
- data/lib/datahen/cli/scraper_export.rb +2 -3
- data/lib/datahen/cli/scraper_finisher.rb +8 -2
- data/lib/datahen/cli/scraper_job.rb +35 -10
- data/lib/datahen/cli/scraper_job_var.rb +33 -10
- data/lib/datahen/cli/scraper_page.rb +57 -4
- data/lib/datahen/cli/seeder.rb +4 -2
- data/lib/datahen/client.rb +2 -0
- data/lib/datahen/client/base.rb +4 -4
- data/lib/datahen/client/global_page.rb +0 -5
- data/lib/datahen/client/job.rb +8 -2
- data/lib/datahen/client/job_finisher.rb +16 -0
- data/lib/datahen/client/job_page.rb +19 -0
- data/lib/datahen/client/job_stat.rb +12 -4
- data/lib/datahen/client/job_var.rb +28 -0
- data/lib/datahen/client/scraper_job.rb +6 -2
- data/lib/datahen/client/scraper_job_page.rb +11 -5
- data/lib/datahen/scraper/executor.rb +6 -6
- data/lib/datahen/scraper/parser.rb +10 -4
- data/lib/datahen/scraper/ruby_parser_executor.rb +7 -4
- data/lib/datahen/scraper/ruby_seeder_executor.rb +4 -1
- data/lib/datahen/scraper/seeder.rb +7 -3
- data/lib/datahen/version.rb +1 -1
- metadata +5 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 3ff2ed2cd4772450c01e3e88248ae89441de709198fdd177d3e572bbc5f0e474
|
4
|
+
data.tar.gz: 5701717fcba8a05b6f3e027d9bce33a3830fa20dabe3413255779899478cb4ab
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 949ad06a090a4ac8c2ef5b4e053ed4b7668c051be15b6959a2948614e771c25e18774d9ee97fe1f5c03c130986b671a8b26ac253f592a993fa4ad393bcad7673
|
7
|
+
data.tar.gz: b73cfc6c070314f97cbc7917d571de67031247aac42f3474b2e71d04e8b3d650fc380a0ce3ca65c1d8339bf8743d94b666ecccca4431f7b89df4e7485a03a382
|
@@ -12,28 +12,15 @@ module Datahen
|
|
12
12
|
def content(gid)
|
13
13
|
client = Client::GlobalPage.new(options)
|
14
14
|
result = JSON.parse(client.find_content(gid).to_s)
|
15
|
-
|
15
|
+
|
16
16
|
if result['available'] == true
|
17
17
|
puts "Preview content url: \"#{result['preview_url']}\""
|
18
18
|
`open "#{result['preview_url']}"`
|
19
19
|
else
|
20
20
|
puts "Content does not exist"
|
21
|
-
end
|
21
|
+
end
|
22
22
|
end
|
23
23
|
|
24
|
-
desc "failedcontent <gid>", "Show failed content of a globalpage"
|
25
|
-
def failedcontent(gid)
|
26
|
-
client = Client::GlobalPage.new(options)
|
27
|
-
result = JSON.parse(client.find_failed_content(gid).to_s)
|
28
|
-
|
29
|
-
if result['available'] == true
|
30
|
-
puts "Preview failed content url: \"#{result['preview_url']}\""
|
31
|
-
`open "#{result['preview_url']}"`
|
32
|
-
else
|
33
|
-
puts "Failed Content does not exist"
|
34
|
-
end
|
35
|
-
end
|
36
|
-
|
37
24
|
end
|
38
25
|
end
|
39
26
|
end
|
data/lib/datahen/cli/job.rb
CHANGED
@@ -18,12 +18,24 @@ module Datahen
|
|
18
18
|
puts "#{client.all()}"
|
19
19
|
end
|
20
20
|
|
21
|
-
desc "show <job_id>", "Show a job"
|
21
|
+
desc "show <job_id>", "Show a job (Defaults to showing data from cached job)"
|
22
|
+
option :live, type: :boolean, desc: 'Get data from the live job, not cached job.'
|
22
23
|
def show(job_id)
|
23
24
|
client = Client::Job.new(options)
|
24
|
-
puts "#{client.find(job_id)}"
|
25
|
+
puts "#{client.find(job_id, options)}"
|
25
26
|
end
|
26
27
|
|
28
|
+
desc "stats <job_id>", "Get the stat for a job (Defaults to showing data from cached stats)"
|
29
|
+
long_desc <<-LONGDESC
|
30
|
+
Get stats for a scraper's current job\n
|
31
|
+
LONGDESC
|
32
|
+
option :live, type: :boolean, desc: 'Get data from the live stats, not cached stats.'
|
33
|
+
def stats(job_id)
|
34
|
+
client = Client::JobStat.new(options)
|
35
|
+
puts "#{client.job_current_stats(job_id, options)}"
|
36
|
+
end
|
37
|
+
|
38
|
+
|
27
39
|
end
|
28
40
|
end
|
29
41
|
|
data/lib/datahen/cli/parser.rb
CHANGED
@@ -10,12 +10,13 @@ module Datahen
|
|
10
10
|
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
11
11
|
option :global, :aliases => :g, type: :boolean, default: false, desc: 'Use globalpage instead of a job page'
|
12
12
|
option :vars, :aliases => :v, type: :string, desc: 'Set user-defined page variables. Must be in json format. i.e: {"Foo":"bar"}'
|
13
|
+
option :"keep-outputs", :aliases => :ko, type: :boolean, default: false, desc: "Don't delete existing outputs"
|
13
14
|
def try_parse(scraper_name, parser_file, gid)
|
14
|
-
begin
|
15
|
-
|
15
|
+
begin
|
16
|
+
|
16
17
|
if options[:job]
|
17
18
|
job_id = options[:job]
|
18
|
-
elsif options[:global]
|
19
|
+
elsif options[:global]
|
19
20
|
job_id = nil
|
20
21
|
else
|
21
22
|
job = Client::ScraperJob.new(options).find(scraper_name)
|
@@ -24,7 +25,7 @@ module Datahen
|
|
24
25
|
|
25
26
|
|
26
27
|
vars = JSON.parse(options[:vars]) if options[:vars]
|
27
|
-
puts Datahen::Scraper::Parser.exec_parser_page(parser_file, gid, job_id, false, vars)
|
28
|
+
puts Datahen::Scraper::Parser.exec_parser_page(parser_file, gid, job_id, false, vars, options[:"keep-outputs"])
|
28
29
|
|
29
30
|
rescue JSON::ParserError
|
30
31
|
if options[:vars]
|
@@ -40,6 +41,8 @@ module Datahen
|
|
40
41
|
<GID>: Global ID of the page.\x5
|
41
42
|
LONGDESC
|
42
43
|
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
44
|
+
option :vars, :aliases => :v, type: :string, desc: 'Set user-defined page variables. Must be in json format. i.e: {"Foo":"bar"}'
|
45
|
+
option :"keep-outputs", :aliases => :ko, type: :boolean, default: false, desc: "Don't delete existing outputs"
|
43
46
|
def exec_parse(scraper_name, parser_file, *gids)
|
44
47
|
gids.each do |gid|
|
45
48
|
begin
|
@@ -52,7 +55,8 @@ module Datahen
|
|
52
55
|
job_id = job['id']
|
53
56
|
end
|
54
57
|
|
55
|
-
|
58
|
+
vars = JSON.parse(options[:vars]) if options[:vars]
|
59
|
+
puts Datahen::Scraper::Parser.exec_parser_page(parser_file, gid, job_id, true, vars, options[:"keep-outputs"])
|
56
60
|
rescue => e
|
57
61
|
puts e
|
58
62
|
end
|
data/lib/datahen/cli/scraper.rb
CHANGED
@@ -60,7 +60,7 @@ module Datahen
|
|
60
60
|
desc "show <scraper_name>", "Show a scraper"
|
61
61
|
def show(scraper_name)
|
62
62
|
client = Client::Scraper.new(options)
|
63
|
-
puts "#{client.find(scraper_name)}"
|
63
|
+
puts "#{client.find(scraper_name, options)}"
|
64
64
|
end
|
65
65
|
|
66
66
|
desc "delete <scraper_name>", "Delete a scraper and related records"
|
@@ -102,6 +102,7 @@ module Datahen
|
|
102
102
|
option :head, :aliases => :H, desc: 'Show the oldest log entries. If not set, newest entries is shown'
|
103
103
|
option :parsing, :aliases => :p, type: :boolean, desc: 'Show only log entries related to parsing errors'
|
104
104
|
option :seeding, :aliases => :s, type: :boolean, desc: 'Show only log entries related to seeding errors'
|
105
|
+
option :finisher, :aliases => :f, type: :boolean, desc: 'Show only log entries related to finisher errors'
|
105
106
|
option :more, :aliases => :m, desc: 'Show next set of log entries. Enter the `More token`'
|
106
107
|
option :per_page, :aliases => :P, type: :numeric, desc: 'Number of records per page. Max 5000 per page.'
|
107
108
|
def log(scraper_name)
|
@@ -111,6 +112,7 @@ module Datahen
|
|
111
112
|
query["order"] = options.delete(:head) if options[:head]
|
112
113
|
query["job_type"] = "parsing" if options[:parsing]
|
113
114
|
query["job_type"] = "seeding" if options[:seeding]
|
115
|
+
query["job_type"] = "finisher executing" if options[:finisher]
|
114
116
|
query["page_token"] = options.delete(:more) if options[:more]
|
115
117
|
query["per_page"] = options.delete(:per_page) if options[:per_page]
|
116
118
|
|
@@ -138,17 +140,18 @@ module Datahen
|
|
138
140
|
end
|
139
141
|
end
|
140
142
|
|
141
|
-
desc "stats <scraper_name>", "Get the
|
143
|
+
desc "stats <scraper_name>", "Get the stat for a current job (Defaults to showing data from cached stats)"
|
142
144
|
long_desc <<-LONGDESC
|
143
145
|
Get stats for a scraper's current job\n
|
144
146
|
LONGDESC
|
145
147
|
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
148
|
+
option :live, type: :boolean, desc: 'Get data from the live stats, not cached stats.'
|
146
149
|
def stats(scraper_name)
|
147
150
|
client = Client::JobStat.new(options)
|
148
151
|
if options[:job]
|
149
|
-
puts "#{client.job_current_stats(options[:job])}"
|
152
|
+
puts "#{client.job_current_stats(options[:job], options)}"
|
150
153
|
else
|
151
|
-
puts "#{client.scraper_job_current_stats(scraper_name)}"
|
154
|
+
puts "#{client.scraper_job_current_stats(scraper_name, options)}"
|
152
155
|
end
|
153
156
|
end
|
154
157
|
|
@@ -12,7 +12,6 @@ module Datahen
|
|
12
12
|
puts "#{client.find(export_id)}"
|
13
13
|
end
|
14
14
|
|
15
|
-
|
16
15
|
desc "list", "Gets a list of exports"
|
17
16
|
long_desc <<-LONGDESC
|
18
17
|
List exports.
|
@@ -34,13 +33,13 @@ module Datahen
|
|
34
33
|
def download(export_id)
|
35
34
|
client = Client::ScraperExport.new(options)
|
36
35
|
result = JSON.parse(client.download(export_id).to_s)
|
37
|
-
|
36
|
+
|
38
37
|
if result['signed_url']
|
39
38
|
puts "Download url: \"#{result['signed_url']}\""
|
40
39
|
`open "#{result['signed_url']}"`
|
41
40
|
else
|
42
41
|
puts "Exported file does not exist"
|
43
|
-
end
|
42
|
+
end
|
44
43
|
end
|
45
44
|
|
46
45
|
|
@@ -11,9 +11,15 @@ module Datahen
|
|
11
11
|
long_desc <<-LONGDESC
|
12
12
|
Reset finisher on a scraper's current job.\x5
|
13
13
|
LONGDESC
|
14
|
+
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
14
15
|
def reset(scraper_name)
|
15
|
-
|
16
|
-
|
16
|
+
if options[:job]
|
17
|
+
client = Client::JobFinisher.new(options)
|
18
|
+
puts "#{client.reset(options[:job])}"
|
19
|
+
else
|
20
|
+
client = Client::ScraperFinisher.new(options)
|
21
|
+
puts "#{client.reset(scraper_name)}"
|
22
|
+
end
|
17
23
|
end
|
18
24
|
end
|
19
25
|
end
|
@@ -6,10 +6,11 @@ module Datahen
|
|
6
6
|
"#{basename} #{@package_name} #{command.usage}"
|
7
7
|
end
|
8
8
|
|
9
|
-
desc "show <scraper_name>", "Show a scraper's current job"
|
9
|
+
desc "show <scraper_name>", "Show a scraper's current job (Defaults to showing data from cached job)"
|
10
|
+
option :live, type: :boolean, desc: 'Get data from the live job, not cached job.'
|
10
11
|
def show(scraper_name)
|
11
12
|
client = Client::ScraperJob.new(options)
|
12
|
-
puts "#{client.find(scraper_name)}"
|
13
|
+
puts "#{client.find(scraper_name, options)}"
|
13
14
|
end
|
14
15
|
|
15
16
|
|
@@ -29,27 +30,45 @@ module Datahen
|
|
29
30
|
long_desc <<-LONGDESC
|
30
31
|
Cancels a scraper's current job
|
31
32
|
LONGDESC
|
33
|
+
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
32
34
|
def cancel(scraper_name)
|
33
|
-
|
34
|
-
|
35
|
+
if options[:job]
|
36
|
+
client = Client::Job.new(options)
|
37
|
+
puts "#{client.cancel(options[:job])}"
|
38
|
+
else
|
39
|
+
client = Client::ScraperJob.new(options)
|
40
|
+
puts "#{client.cancel(scraper_name)}"
|
41
|
+
end
|
35
42
|
end
|
36
43
|
|
37
44
|
desc "resume <scraper_name>", "resumes a scraper's current job"
|
38
45
|
long_desc <<-LONGDESC
|
39
46
|
Resumes a scraper's current job
|
40
47
|
LONGDESC
|
48
|
+
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
41
49
|
def resume(scraper_name)
|
42
|
-
|
43
|
-
|
50
|
+
if options[:job]
|
51
|
+
client = Client::Job.new(options)
|
52
|
+
puts "#{client.resume(options[:job])}"
|
53
|
+
else
|
54
|
+
client = Client::ScraperJob.new(options)
|
55
|
+
puts "#{client.resume(scraper_name)}"
|
56
|
+
end
|
44
57
|
end
|
45
58
|
|
46
59
|
desc "pause <scraper_name>", "pauses a scraper's current job"
|
47
60
|
long_desc <<-LONGDESC
|
48
61
|
pauses a scraper's current job
|
49
62
|
LONGDESC
|
63
|
+
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
50
64
|
def pause(scraper_name)
|
51
|
-
|
52
|
-
|
65
|
+
if options[:job]
|
66
|
+
client = Client::Job.new(options)
|
67
|
+
puts "#{client.pause(options[:job])}"
|
68
|
+
else
|
69
|
+
client = Client::ScraperJob.new(options)
|
70
|
+
puts "#{client.pause(scraper_name)}"
|
71
|
+
end
|
53
72
|
end
|
54
73
|
|
55
74
|
|
@@ -60,9 +79,15 @@ module Datahen
|
|
60
79
|
option :workers, :aliases => :w, type: :numeric, desc: 'Set how many standard workers to use. Scraper job must be restarted(paused then resumed, or cancelled then resumed) for it to take effect. Default: 1. '
|
61
80
|
option :browsers, type: :numeric, desc: 'Set how many browser workers to use. Scraper job must be restarted(paused then resumed, or cancelled then resumed) for it to take effect. Default: 0. '
|
62
81
|
option :proxy_type, desc: 'Set the Proxy type. Default: standard'
|
82
|
+
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
63
83
|
def update(scraper_name)
|
64
|
-
|
65
|
-
|
84
|
+
if options[:job]
|
85
|
+
client = Client::Job.new(options)
|
86
|
+
puts "#{client.update(options[:job], options)}"
|
87
|
+
else
|
88
|
+
client = Client::ScraperJob.new(options)
|
89
|
+
puts "#{client.update(scraper_name, options)}"
|
90
|
+
end
|
66
91
|
end
|
67
92
|
|
68
93
|
desc "var SUBCOMMAND ...ARGS", "for managing scraper's job variables"
|
@@ -13,9 +13,15 @@ module Datahen
|
|
13
13
|
LONGDESC
|
14
14
|
option :page, :aliases => :p, type: :numeric, desc: 'Get the next set of records by page.'
|
15
15
|
option :per_page, :aliases => :P, type: :numeric, desc: 'Number of records per page. Max 500 per page.'
|
16
|
+
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
16
17
|
def list(scraper_name)
|
17
|
-
|
18
|
-
|
18
|
+
if options[:job]
|
19
|
+
client = Client::JobVar.new(options)
|
20
|
+
puts "#{client.all(options[:job])}"
|
21
|
+
else
|
22
|
+
client = Client::ScraperJobVar.new(options)
|
23
|
+
puts "#{client.all(scraper_name)}"
|
24
|
+
end
|
19
25
|
end
|
20
26
|
|
21
27
|
desc "set <scraper_name> <var_name> <value>", "Set an environment var on the scrape job"
|
@@ -24,23 +30,40 @@ module Datahen
|
|
24
30
|
<var_name>: Var name can only consist of alphabets, numbers, underscores. Name must be unique to your scrape job, otherwise it will be overwritten.\x5
|
25
31
|
<value>: Value of variable.\x5
|
26
32
|
LONGDESC
|
27
|
-
option :secret, type: :boolean, desc: 'Set true to make it decrypt the value. Default: false'
|
33
|
+
option :secret, type: :boolean, desc: 'Set true to make it decrypt the value. Default: false'
|
34
|
+
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
28
35
|
def set(scraper_name, var_name, value)
|
29
|
-
|
30
|
-
|
31
|
-
|
36
|
+
if options[:job]
|
37
|
+
client = Client::JobVar.new(options)
|
38
|
+
puts "#{client.set(options[:job], var_name, value, options)}"
|
39
|
+
else
|
40
|
+
client = Client::ScraperJobVar.new(options)
|
41
|
+
puts "#{client.set(scraper_name, var_name, value, options)}"
|
42
|
+
end
|
32
43
|
end
|
33
44
|
|
34
45
|
desc "show <scraper_name> <var_name>", "Show an environment variable on the scrape job"
|
46
|
+
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
35
47
|
def show(scraper_name, var_name)
|
36
|
-
|
37
|
-
|
48
|
+
if options[:job]
|
49
|
+
client = Client::JobVar.new(options)
|
50
|
+
puts "#{client.find(options[:job], var_name)}"
|
51
|
+
else
|
52
|
+
client = Client::ScraperJobVar.new(options)
|
53
|
+
puts "#{client.find(scraper_name, var_name)}"
|
54
|
+
end
|
38
55
|
end
|
39
56
|
|
40
57
|
desc "unset <scraper_name> <var_name>", "Deletes an environment variable on the scrape job"
|
58
|
+
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
41
59
|
def unset(scraper_name, var_name)
|
42
|
-
|
43
|
-
|
60
|
+
if options[:job]
|
61
|
+
client = Client::JobVar.new(options)
|
62
|
+
puts "#{client.unset(options[:job], var_name)}"
|
63
|
+
else
|
64
|
+
client = Client::ScraperJobVar.new(options)
|
65
|
+
puts "#{client.unset(scraper_name, var_name)}"
|
66
|
+
end
|
44
67
|
end
|
45
68
|
end
|
46
69
|
end
|
@@ -17,6 +17,7 @@ module Datahen
|
|
17
17
|
option :per_page, :aliases => :P, type: :numeric, desc: 'Number of records per page. Max 500 per page.'
|
18
18
|
option :fetch_fail, type: :boolean, desc: 'Returns only pages that fails fetching.'
|
19
19
|
option :parse_fail, type: :boolean, desc: 'Returns only pages that fails parsing.'
|
20
|
+
option :status, type: :string, desc: 'Returns only pages with specific status.'
|
20
21
|
def list(scraper_name)
|
21
22
|
if options[:job]
|
22
23
|
client = Client::JobPage.new(options)
|
@@ -104,13 +105,19 @@ module Datahen
|
|
104
105
|
option :fetch_fail, type: :boolean, desc: 'Refetches only pages that fails fetching.'
|
105
106
|
option :parse_fail, type: :boolean, desc: 'Refetches only pages that fails parsing.'
|
106
107
|
option :status, type: :string, desc: 'Refetches only pages with a specific status.'
|
108
|
+
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
107
109
|
def refetch(scraper_name)
|
108
110
|
if !options.key?(:gid) && !options.key?(:fetch_fail) && !options.key?(:parse_fail) && !options.key?(:status)
|
109
111
|
puts "Must specify either a --gid, --fetch-fail, --parse-fail or --status"
|
110
112
|
return
|
111
113
|
end
|
112
|
-
|
113
|
-
|
114
|
+
if options[:job]
|
115
|
+
client = Client::JobPage.new(options)
|
116
|
+
puts "#{client.refetch(options[:job])}"
|
117
|
+
else
|
118
|
+
client = Client::ScraperJobPage.new(options)
|
119
|
+
puts "#{client.refetch(scraper_name)}"
|
120
|
+
end
|
114
121
|
end
|
115
122
|
|
116
123
|
desc "reparse <scraper_name>", "Reparse Pages on a scraper's current job"
|
@@ -120,6 +127,7 @@ module Datahen
|
|
120
127
|
option :gid, :aliases => :g, type: :string, desc: 'Reparse a specific GID'
|
121
128
|
option :parse_fail, type: :boolean, desc: 'Reparse only pages that fails parsing.'
|
122
129
|
option :status, type: :string, desc: 'Reparse only pages with a specific status.'
|
130
|
+
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
123
131
|
def reparse(scraper_name)
|
124
132
|
begin
|
125
133
|
options[:vars] = JSON.parse(options[:vars]) if options[:vars]
|
@@ -129,8 +137,13 @@ module Datahen
|
|
129
137
|
return
|
130
138
|
end
|
131
139
|
|
132
|
-
|
133
|
-
|
140
|
+
if options[:job]
|
141
|
+
client = Client::JobPage.new(options)
|
142
|
+
puts "#{client.reparse(options[:job])}"
|
143
|
+
else
|
144
|
+
client = Client::ScraperJobPage.new(options)
|
145
|
+
puts "#{client.reparse(scraper_name)}"
|
146
|
+
end
|
134
147
|
|
135
148
|
rescue JSON::ParserError
|
136
149
|
if options[:vars]
|
@@ -197,6 +210,46 @@ module Datahen
|
|
197
210
|
end
|
198
211
|
end
|
199
212
|
|
213
|
+
desc "content <scraper_name> <gid>", "Show a page's content in scraper's current job"
|
214
|
+
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
215
|
+
def content(scraper_name, gid)
|
216
|
+
result = nil
|
217
|
+
if options[:job]
|
218
|
+
client = Client::JobPage.new(options)
|
219
|
+
result = JSON.parse(client.find_content(options[:job], gid).to_s)
|
220
|
+
else
|
221
|
+
client = Client::ScraperJobPage.new(options)
|
222
|
+
result = JSON.parse(client.find_content(scraper_name, gid).to_s)
|
223
|
+
end
|
224
|
+
|
225
|
+
if result['available'] == true
|
226
|
+
puts "Preview content url: \"#{result['preview_url']}\""
|
227
|
+
`open "#{result['preview_url']}"`
|
228
|
+
else
|
229
|
+
puts "Content does not exist"
|
230
|
+
end
|
231
|
+
end
|
232
|
+
|
233
|
+
desc "failedcontent <gid>", "Show a page's failed content in scraper's current job"
|
234
|
+
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
235
|
+
def failedcontent(scraper_name, gid)
|
236
|
+
result = nil
|
237
|
+
if options[:job]
|
238
|
+
client = Client::JobPage.new(options)
|
239
|
+
result = JSON.parse(client.find_failed_content(options[:job], gid).to_s)
|
240
|
+
else
|
241
|
+
client = Client::ScraperJobPage.new(options)
|
242
|
+
result = JSON.parse(client.find_failed_content(scraper_name, gid).to_s)
|
243
|
+
end
|
244
|
+
|
245
|
+
if result['available'] == true
|
246
|
+
puts "Preview failed content url: \"#{result['preview_url']}\""
|
247
|
+
`open "#{result['preview_url']}"`
|
248
|
+
else
|
249
|
+
puts "Failed Content does not exist"
|
250
|
+
end
|
251
|
+
end
|
252
|
+
|
200
253
|
end
|
201
254
|
end
|
202
255
|
|
data/lib/datahen/cli/seeder.rb
CHANGED
@@ -7,6 +7,7 @@ module Datahen
|
|
7
7
|
<seeder_file>: Seeder script file will be executed.\x5
|
8
8
|
LONGDESC
|
9
9
|
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
10
|
+
option :"keep-outputs", :aliases => :ko, type: :boolean, default: false, desc: "Don't delete existing outputs"
|
10
11
|
def try_seed(scraper_name, seeder_file)
|
11
12
|
if options[:job]
|
12
13
|
job_id = options[:job]
|
@@ -14,8 +15,8 @@ module Datahen
|
|
14
15
|
job = Client::ScraperJob.new(options).find(scraper_name)
|
15
16
|
job_id = job['id']
|
16
17
|
end
|
17
|
-
|
18
|
-
puts Datahen::Scraper::Seeder.exec_seeder(seeder_file, job_id, false)
|
18
|
+
|
19
|
+
puts Datahen::Scraper::Seeder.exec_seeder(seeder_file, job_id, false, options[:"keep-outputs"])
|
19
20
|
end
|
20
21
|
|
21
22
|
desc "exec <scraper_name> <seeder_file>", "Executes a seeder script onto a scraper's current job."
|
@@ -24,6 +25,7 @@ module Datahen
|
|
24
25
|
<seeder_file>: Seeder script file that will be executed on the scraper's current job.\x5
|
25
26
|
LONGDESC
|
26
27
|
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
28
|
+
option :"keep-outputs", :aliases => :ko, type: :boolean, default: false, desc: "Don't delete existing outputs"
|
27
29
|
def exec_parse(scraper_name, seeder_file)
|
28
30
|
if options[:job]
|
29
31
|
job_id = options[:job]
|
data/lib/datahen/client.rb
CHANGED
@@ -20,7 +20,9 @@ require "datahen/client/job_stat"
|
|
20
20
|
require "datahen/client/backblaze_content"
|
21
21
|
require "datahen/client/env_var"
|
22
22
|
require "datahen/client/scraper_var"
|
23
|
+
require "datahen/client/job_var"
|
23
24
|
require "datahen/client/scraper_job_var"
|
25
|
+
require "datahen/client/job_finisher"
|
24
26
|
|
25
27
|
|
26
28
|
module Datahen
|
data/lib/datahen/client/base.rb
CHANGED
@@ -51,10 +51,10 @@ module Datahen
|
|
51
51
|
query[:status] = opts[:status] if opts[:status]
|
52
52
|
query[:page_type] = opts[:page_type] if opts[:page_type]
|
53
53
|
query[:gid] = opts[:gid] if opts[:gid]
|
54
|
-
query[:"min-timestamp"] = opts[:"min-timestamp"]
|
55
|
-
query[:"max-timestamp"] = opts[:"max-timestamp"]
|
56
|
-
query[:limit] = opts[:limit]
|
57
|
-
query[:order] = opts[:order]
|
54
|
+
query[:"min-timestamp"] = opts[:"min-timestamp"] if opts[:"min-timestamp"]
|
55
|
+
query[:"max-timestamp"] = opts[:"max-timestamp"] if opts[:"max-timestamp"]
|
56
|
+
query[:limit] = opts[:limit] if opts[:limit]
|
57
|
+
query[:order] = opts[:order] if opts[:order]
|
58
58
|
|
59
59
|
if opts[:query]
|
60
60
|
if opts[:query].is_a?(Hash)
|
data/lib/datahen/client/job.rb
CHANGED
@@ -6,8 +6,12 @@ module Datahen
|
|
6
6
|
self.class.get("/jobs", params)
|
7
7
|
end
|
8
8
|
|
9
|
-
def find(job_id)
|
10
|
-
|
9
|
+
def find(job_id, opts={})
|
10
|
+
if opts[:live]
|
11
|
+
self.class.get("/jobs/#{job_id}", @options)
|
12
|
+
else
|
13
|
+
self.class.get("/cached/jobs/#{job_id}", @options)
|
14
|
+
end
|
11
15
|
end
|
12
16
|
|
13
17
|
def update(job_id, opts={})
|
@@ -15,6 +19,7 @@ module Datahen
|
|
15
19
|
body[:status] = opts[:status] if opts[:status]
|
16
20
|
body[:standard_worker_count] = opts[:workers] if opts[:workers]
|
17
21
|
body[:browser_worker_count] = opts[:browsers] if opts[:browsers]
|
22
|
+
body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
|
18
23
|
params = @options.merge({body: body.to_json})
|
19
24
|
|
20
25
|
self.class.put("/jobs/#{job_id}", params)
|
@@ -41,6 +46,7 @@ module Datahen
|
|
41
46
|
body[:pages] = opts.fetch(:pages) {[]}
|
42
47
|
body[:seeding_status] = opts.fetch(:seeding_status){ nil }
|
43
48
|
body[:log_error] = opts[:log_error] if opts[:log_error]
|
49
|
+
body[:keep_outputs] = !!opts[:keep_outputs] if opts.has_key?(:keep_outputs)
|
44
50
|
|
45
51
|
params = @options.merge({body: body.to_json})
|
46
52
|
|
@@ -0,0 +1,16 @@
|
|
1
|
+
module Datahen
|
2
|
+
module Client
|
3
|
+
class JobFinisher < Datahen::Client::Base
|
4
|
+
# Reset the finisher on a specific job, identified by its job ID.
|
5
|
+
#
|
6
|
+
# @param [Integer] job_id Job ID
|
7
|
+
# @param [Hash] opts ({}) API custom parameters.
|
8
|
+
#
|
9
|
+
# @return [HTTParty::Response]
|
10
|
+
def reset(job_id, opts={})
|
11
|
+
params = @options.merge(opts)
|
12
|
+
self.class.put("/jobs/#{job_id}/finisher/reset", params)
|
13
|
+
end
|
14
|
+
end
|
15
|
+
end
|
16
|
+
end
|
@@ -48,11 +48,30 @@ module Datahen
|
|
48
48
|
body[:pages] = opts.fetch(:pages) {[]}
|
49
49
|
body[:parsing_status] = opts.fetch(:parsing_status){ nil }
|
50
50
|
body[:log_error] = opts[:log_error] if opts[:log_error]
|
51
|
+
body[:keep_outputs] = !!opts[:keep_outputs] if opts.has_key?(:keep_outputs)
|
51
52
|
|
52
53
|
params = @options.merge({body: body.to_json})
|
53
54
|
|
54
55
|
self.class.put("/jobs/#{job_id}/pages/#{gid}/parsing_update", params)
|
55
56
|
end
|
57
|
+
|
58
|
+
def find_content(job_id, gid)
|
59
|
+
self.class.get("/jobs/#{job_id}/pages/#{gid}/content", @options)
|
60
|
+
end
|
61
|
+
|
62
|
+
def find_failed_content(job_id, gid)
|
63
|
+
self.class.get("/jobs/#{job_id}/pages/#{gid}/failed_content", @options)
|
64
|
+
end
|
65
|
+
|
66
|
+
def reparse(job_id, opts={})
|
67
|
+
params = @options.merge(opts)
|
68
|
+
self.class.put("/jobs/#{job_id}/pages/reparse", params)
|
69
|
+
end
|
70
|
+
|
71
|
+
def refetch(job_id, opts={})
|
72
|
+
params = @options.merge(opts)
|
73
|
+
self.class.put("/jobs/#{job_id}/pages/refetch", params)
|
74
|
+
end
|
56
75
|
end
|
57
76
|
end
|
58
77
|
end
|
@@ -2,12 +2,20 @@ module Datahen
|
|
2
2
|
module Client
|
3
3
|
class JobStat < Datahen::Client::Base
|
4
4
|
|
5
|
-
def job_current_stats(job_id)
|
6
|
-
|
5
|
+
def job_current_stats(job_id, opts={})
|
6
|
+
if opts[:live]
|
7
|
+
self.class.get("/jobs/#{job_id}/stats/current", @options)
|
8
|
+
else
|
9
|
+
self.class.get("/cached/jobs/#{job_id}/stats/current", @options)
|
10
|
+
end
|
7
11
|
end
|
8
12
|
|
9
|
-
def scraper_job_current_stats(scraper_name)
|
10
|
-
|
13
|
+
def scraper_job_current_stats(scraper_name, opts={})
|
14
|
+
if opts[:live]
|
15
|
+
self.class.get("/scrapers/#{scraper_name}/current_job/stats/current", @options)
|
16
|
+
else
|
17
|
+
self.class.get("/cached/scrapers/#{scraper_name}/current_job/stats/current", @options)
|
18
|
+
end
|
11
19
|
end
|
12
20
|
|
13
21
|
def job_stats_history(job_id)
|
@@ -0,0 +1,28 @@
|
|
1
|
+
module Datahen
|
2
|
+
module Client
|
3
|
+
class JobVar < Datahen::Client::Base
|
4
|
+
|
5
|
+
def find(job_id, var_name)
|
6
|
+
self.class.get("/jobs/#{job_id}/vars/#{var_name}", @options)
|
7
|
+
end
|
8
|
+
|
9
|
+
def all(job_id, opts={})
|
10
|
+
params = @options.merge opts
|
11
|
+
self.class.get("/jobs/#{job_id}/vars", params)
|
12
|
+
end
|
13
|
+
|
14
|
+
def set(job_id, var_name, value, opts={})
|
15
|
+
body = {}
|
16
|
+
body[:value] = value
|
17
|
+
body[:secret] = opts[:secret] if opts[:secret]
|
18
|
+
params = @options.merge({body: body.to_json})
|
19
|
+
self.class.put("/jobs/#{job_id}/vars/#{var_name}", params)
|
20
|
+
end
|
21
|
+
|
22
|
+
def unset(job_id, var_name, opts={})
|
23
|
+
params = @options.merge(opts)
|
24
|
+
self.class.delete("/jobs/#{job_id}/vars/#{var_name}", params)
|
25
|
+
end
|
26
|
+
end
|
27
|
+
end
|
28
|
+
end
|
@@ -15,8 +15,12 @@ module Datahen
|
|
15
15
|
self.class.post("/scrapers/#{scraper_name}/jobs", params)
|
16
16
|
end
|
17
17
|
|
18
|
-
def find(scraper_name)
|
19
|
-
|
18
|
+
def find(scraper_name, opts={})
|
19
|
+
if opts[:live]
|
20
|
+
self.class.get("/scrapers/#{scraper_name}/current_job", @options)
|
21
|
+
else
|
22
|
+
self.class.get("/cached/scrapers/#{scraper_name}/current_job", @options)
|
23
|
+
end
|
20
24
|
end
|
21
25
|
|
22
26
|
def update(scraper_name, opts={})
|
@@ -26,6 +26,9 @@ module Datahen
|
|
26
26
|
self.class.put("/scrapers/#{scraper_name}/current_job/pages/refetch", params)
|
27
27
|
end
|
28
28
|
|
29
|
+
# Deprecated, please use Datahen::Client::JobPage#refetch instead.
|
30
|
+
#
|
31
|
+
# @note This method will be removed at some point in the future.
|
29
32
|
def refetch_by_job(job_id, opts={})
|
30
33
|
params = @options.merge(opts)
|
31
34
|
self.class.put("/jobs/#{job_id}/pages/refetch", params)
|
@@ -36,11 +39,6 @@ module Datahen
|
|
36
39
|
self.class.put("/scrapers/#{scraper_name}/current_job/pages/reparse", params)
|
37
40
|
end
|
38
41
|
|
39
|
-
def reparse_by_job(job_id, opts={})
|
40
|
-
params = @options.merge(opts)
|
41
|
-
self.class.put("/jobs/#{job_id}/pages/reparse", params)
|
42
|
-
end
|
43
|
-
|
44
42
|
def enqueue(scraper_name, method, url, opts={})
|
45
43
|
body = {}
|
46
44
|
body[:method] = method != "" ? method : "GET"
|
@@ -62,6 +60,14 @@ module Datahen
|
|
62
60
|
self.class.post("/scrapers/#{scraper_name}/current_job/pages", params)
|
63
61
|
end
|
64
62
|
|
63
|
+
def find_content(scraper_name, gid)
|
64
|
+
self.class.get("/scrapers/#{scraper_name}/current_job/pages/#{gid}/content", @options)
|
65
|
+
end
|
66
|
+
|
67
|
+
def find_failed_content(scraper_name, gid)
|
68
|
+
self.class.get("/scrapers/#{scraper_name}/current_job/pages/#{gid}/failed_content", @options)
|
69
|
+
end
|
70
|
+
|
65
71
|
end
|
66
72
|
end
|
67
73
|
end
|
@@ -63,9 +63,9 @@ module Datahen
|
|
63
63
|
client.find(gid)
|
64
64
|
end
|
65
65
|
|
66
|
-
def get_content(gid)
|
67
|
-
client = Client::
|
68
|
-
content_json = client.find_content(gid)
|
66
|
+
def get_content(job_id, gid)
|
67
|
+
client = Client::JobPage.new()
|
68
|
+
content_json = client.find_content(job_id, gid)
|
69
69
|
|
70
70
|
if content_json['available']
|
71
71
|
signed_url = content_json['signed_url']
|
@@ -75,9 +75,9 @@ module Datahen
|
|
75
75
|
end
|
76
76
|
end
|
77
77
|
|
78
|
-
def get_failed_content(gid)
|
79
|
-
client = Client::
|
80
|
-
content_json = client.find_failed_content(gid)
|
78
|
+
def get_failed_content(job_id, gid)
|
79
|
+
client = Client::JobPage.new()
|
80
|
+
content_json = client.find_failed_content(job_id, gid)
|
81
81
|
|
82
82
|
if content_json['available']
|
83
83
|
signed_url = content_json['signed_url']
|
@@ -1,18 +1,24 @@
|
|
1
1
|
module Datahen
|
2
2
|
module Scraper
|
3
3
|
class Parser
|
4
|
-
def self.exec_parser_page(filename, gid, job_id=nil, save=false, vars = {})
|
4
|
+
def self.exec_parser_page(filename, gid, job_id=nil, save=false, vars = {}, keep_outputs=false)
|
5
5
|
extname = File.extname(filename)
|
6
6
|
case extname
|
7
7
|
when '.rb'
|
8
|
-
executor = RubyParserExecutor.new(
|
8
|
+
executor = RubyParserExecutor.new(
|
9
|
+
filename: filename,
|
10
|
+
gid: gid,
|
11
|
+
job_id: job_id,
|
12
|
+
vars: vars,
|
13
|
+
keep_outputs: keep_outputs
|
14
|
+
)
|
9
15
|
executor.exec_parser(save)
|
10
16
|
else
|
11
17
|
puts "Unable to find a parser executor for file type \"#{extname}\""
|
12
18
|
end
|
13
19
|
end
|
14
20
|
|
15
|
-
|
21
|
+
|
16
22
|
end
|
17
23
|
end
|
18
|
-
end
|
24
|
+
end
|
@@ -15,6 +15,7 @@ module Datahen
|
|
15
15
|
@gid = options.fetch(:gid) { raise "GID is required"}
|
16
16
|
@job_id = options.fetch(:job_id)
|
17
17
|
@page_vars = options.fetch(:vars) { {} }
|
18
|
+
@keep_outputs = !!(options.fetch(:keep_outputs) { false })
|
18
19
|
end
|
19
20
|
|
20
21
|
def self.exposed_methods
|
@@ -66,7 +67,9 @@ module Datahen
|
|
66
67
|
response = parsing_update(
|
67
68
|
job_id: job_id,
|
68
69
|
gid: gid,
|
69
|
-
parsing_status: :starting
|
70
|
+
parsing_status: :starting,
|
71
|
+
keep_outputs: @keep_outputs
|
72
|
+
)
|
70
73
|
|
71
74
|
if response.code == 200
|
72
75
|
puts "Page Parsing Status Updated."
|
@@ -165,7 +168,7 @@ module Datahen
|
|
165
168
|
handle_error(e) if save
|
166
169
|
raise e
|
167
170
|
end
|
168
|
-
|
171
|
+
|
169
172
|
if refetch_self
|
170
173
|
refetch_page gid
|
171
174
|
elsif reparse_self
|
@@ -178,11 +181,11 @@ module Datahen
|
|
178
181
|
end
|
179
182
|
|
180
183
|
def content
|
181
|
-
@content ||= get_content(gid)
|
184
|
+
@content ||= get_content(job_id, gid)
|
182
185
|
end
|
183
186
|
|
184
187
|
def failed_content
|
185
|
-
@failed_content ||= get_failed_content(gid)
|
188
|
+
@failed_content ||= get_failed_content(job_id, gid)
|
186
189
|
end
|
187
190
|
|
188
191
|
def handle_error(e)
|
@@ -6,6 +6,7 @@ module Datahen
|
|
6
6
|
def initialize(options={})
|
7
7
|
@filename = options.fetch(:filename) { raise "Filename is required"}
|
8
8
|
@job_id = options[:job_id]
|
9
|
+
@keep_outputs = !!(options.fetch(:keep_outputs) { false })
|
9
10
|
end
|
10
11
|
|
11
12
|
def self.exposed_methods
|
@@ -81,7 +82,9 @@ module Datahen
|
|
81
82
|
|
82
83
|
response = seeding_update(
|
83
84
|
job_id: job_id,
|
84
|
-
seeding_status: :starting
|
85
|
+
seeding_status: :starting,
|
86
|
+
keep_outputs: @keep_outputs
|
87
|
+
)
|
85
88
|
|
86
89
|
if response.code == 200
|
87
90
|
puts "Seeding Status Updated."
|
@@ -2,11 +2,15 @@ module Datahen
|
|
2
2
|
module Scraper
|
3
3
|
class Seeder
|
4
4
|
|
5
|
-
def self.exec_seeder(filename, job_id=nil, save=false)
|
5
|
+
def self.exec_seeder(filename, job_id=nil, save=false, keep_outputs=false)
|
6
6
|
extname = File.extname(filename)
|
7
7
|
case extname
|
8
8
|
when '.rb'
|
9
|
-
executor = RubySeederExecutor.new(
|
9
|
+
executor = RubySeederExecutor.new(
|
10
|
+
filename: filename,
|
11
|
+
job_id: job_id,
|
12
|
+
keep_outputs: keep_outputs
|
13
|
+
)
|
10
14
|
executor.exec_seeder(save)
|
11
15
|
else
|
12
16
|
puts "Unable to find a seeder executor for file type \"#{extname}\""
|
@@ -15,4 +19,4 @@ module Datahen
|
|
15
19
|
|
16
20
|
end
|
17
21
|
end
|
18
|
-
end
|
22
|
+
end
|
data/lib/datahen/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: datahen
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.11.1
|
4
|
+
version: 0.14.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Parama Danoesubroto
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-
|
11
|
+
date: 2020-07-26 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: thor
|
@@ -215,10 +215,12 @@ files:
|
|
215
215
|
- lib/datahen/client/global_page.rb
|
216
216
|
- lib/datahen/client/job.rb
|
217
217
|
- lib/datahen/client/job_export.rb
|
218
|
+
- lib/datahen/client/job_finisher.rb
|
218
219
|
- lib/datahen/client/job_log.rb
|
219
220
|
- lib/datahen/client/job_output.rb
|
220
221
|
- lib/datahen/client/job_page.rb
|
221
222
|
- lib/datahen/client/job_stat.rb
|
223
|
+
- lib/datahen/client/job_var.rb
|
222
224
|
- lib/datahen/client/scraper.rb
|
223
225
|
- lib/datahen/client/scraper_deployment.rb
|
224
226
|
- lib/datahen/client/scraper_export.rb
|
@@ -262,7 +264,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
262
264
|
- !ruby/object:Gem::Version
|
263
265
|
version: '0'
|
264
266
|
requirements: []
|
265
|
-
rubygems_version: 3.
|
267
|
+
rubygems_version: 3.1.2
|
266
268
|
signing_key:
|
267
269
|
specification_version: 4
|
268
270
|
summary: DataHen toolbelt for developers
|