datahen 0.14.15 → 0.14.20
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/datahen/cli/global_page.rb +4 -1
- data/lib/datahen/cli/scraper.rb +1 -0
- data/lib/datahen/cli/scraper_export.rb +4 -1
- data/lib/datahen/cli/scraper_page.rb +45 -23
- data/lib/datahen/client/job_page.rb +5 -0
- data/lib/datahen/client/scraper_job.rb +7 -0
- data/lib/datahen/client/scraper_job_page.rb +6 -1
- data/lib/datahen/error.rb +6 -0
- data/lib/datahen/error/safe_terminate_error.rb +6 -0
- data/lib/datahen/scraper.rb +1 -0
- data/lib/datahen/scraper/ruby_parser_executor.rb +5 -3
- data/lib/datahen/version.rb +1 -1
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 4a2f75a84728b7e8c228c578e4a4af6253dcd19db445f509ba44f9866847936c
|
4
|
+
data.tar.gz: c37744322cc3e3035a31dd69bca340397ac906d27a3497604e3d98321187d972
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: fcf774827b35bf23048b47241da29b1a17e2b1d691a360d2afcb5b34c206c26b7ae316c457e26b946ae3144b4d9075dda5db90a2fd0fb7e56b9f53b0cd612d13
|
7
|
+
data.tar.gz: 56fe24f850c0b695b87c61629ab6939f856ecf8f0ea6b6557d59646214b6ffd87557a6ade6bf59e03b993626663cc1af3071ca10858802272d75d47cf34de56a
|
data/lib/datahen/cli/global_page.rb
CHANGED
@@ -15,7 +15,10 @@ module Datahen
|
|
15
15
|
|
16
16
|
if result['available'] == true
|
17
17
|
puts "Preview content url: \"#{result['preview_url']}\""
|
18
|
-
|
18
|
+
begin
|
19
|
+
`open "#{result['preview_url']}"`
|
20
|
+
rescue
|
21
|
+
end
|
19
22
|
else
|
20
23
|
puts "Content does not exist"
|
21
24
|
end
|
data/lib/datahen/cli/scraper.rb
CHANGED
@@ -93,6 +93,7 @@ module Datahen
|
|
93
93
|
option :workers, :aliases => :w, type: :numeric, desc: 'Set how many standard workers to use. Default: 1'
|
94
94
|
option :browsers, type: :numeric, desc: 'Set how many browser workers to use. Default: 0'
|
95
95
|
option :proxy_type, desc: 'Set the Proxy type. Default: standard'
|
96
|
+
option :vars, type: :string, banner: :JSON, desc: 'Set input vars. Must be in json format. i.e: {"name":"foo", "value":"bar", "secret":false} '
|
96
97
|
def start(scraper_name)
|
97
98
|
client = Client::ScraperJob.new(options)
|
98
99
|
puts "Starting a scrape job..."
|
data/lib/datahen/cli/scraper_page.rb
CHANGED
@@ -99,18 +99,20 @@ module Datahen
|
|
99
99
|
|
100
100
|
desc "refetch <scraper_name>", "Refetch Pages on a scraper's current job"
|
101
101
|
long_desc <<-LONGDESC
|
102
|
-
Refetch pages in a scraper's current job. You need to specify either a --gid or --fetch-fail or --parse-fail or --status.\x5
|
102
|
+
Refetch pages in a scraper's current job. You need to specify either a --gid or --fetch-fail or --parse-fail or --status or --page-type.\x5
|
103
103
|
LONGDESC
|
104
104
|
option :gid, :aliases => :g, type: :string, desc: 'Refetch a specific GID'
|
105
105
|
option :fetch_fail, type: :boolean, desc: 'Refetches only pages that fails fetching.'
|
106
106
|
option :parse_fail, type: :boolean, desc: 'Refetches only pages that fails parsing.'
|
107
107
|
option :status, type: :string, desc: 'Refetches only pages with a specific status.'
|
108
|
+
option :page_type, type: :string, desc: 'Refetches only pages with a specific page type.'
|
108
109
|
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
109
110
|
def refetch(scraper_name)
|
110
|
-
if !options.key?(:gid) && !options.key?(:fetch_fail) && !options.key?(:parse_fail) && !options.key?(:status)
|
111
|
-
puts "Must specify either a --gid, --fetch-fail, --parse-fail or --status"
|
111
|
+
if !options.key?(:gid) && !options.key?(:fetch_fail) && !options.key?(:parse_fail) && !options.key?(:status) && !options.key?(:page_type)
|
112
|
+
puts "Must specify either a --gid, --fetch-fail, --parse-fail, --status or --page-type"
|
112
113
|
return
|
113
114
|
end
|
115
|
+
|
114
116
|
if options[:job]
|
115
117
|
client = Client::JobPage.new(options)
|
116
118
|
puts "#{client.refetch(options[:job])}"
|
@@ -122,33 +124,47 @@ module Datahen
|
|
122
124
|
|
123
125
|
desc "reparse <scraper_name>", "Reparse Pages on a scraper's current job"
|
124
126
|
long_desc <<-LONGDESC
|
125
|
-
Reparse pages in a scraper's current job. You need to specify either a --gid or --parse-fail or --status.\x5
|
127
|
+
Reparse pages in a scraper's current job. You need to specify either a --gid or --parse-fail or --status or --page-type.\x5
|
126
128
|
LONGDESC
|
127
129
|
option :gid, :aliases => :g, type: :string, desc: 'Reparse a specific GID'
|
128
130
|
option :parse_fail, type: :boolean, desc: 'Reparse only pages that fails parsing.'
|
129
131
|
option :status, type: :string, desc: 'Reparse only pages with a specific status.'
|
132
|
+
option :page_type, type: :string, desc: 'Refetches only pages with a specific page type.'
|
130
133
|
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
131
134
|
def reparse(scraper_name)
|
132
|
-
|
133
|
-
|
135
|
+
if !options.key?(:gid) && !options.key?(:parse_fail) && !options.key?(:status) && !options.key?(:page_type)
|
136
|
+
puts "Must specify either a --gid, --parse-fail, --status or --page-type"
|
137
|
+
return
|
138
|
+
end
|
134
139
|
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
140
|
+
if options[:job]
|
141
|
+
client = Client::JobPage.new(options)
|
142
|
+
puts "#{client.reparse(options[:job])}"
|
143
|
+
else
|
144
|
+
client = Client::ScraperJobPage.new(options)
|
145
|
+
puts "#{client.reparse(scraper_name)}"
|
146
|
+
end
|
147
|
+
end
|
139
148
|
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
149
|
+
desc "limbo <scraper_name>", "Move pages on a scraper's current job to limbo"
|
150
|
+
long_desc <<-LONGDESC
|
151
|
+
Move pages in a scraper's current job to limbo. You need to specify either a --gid or --status.\x5
|
152
|
+
LONGDESC
|
153
|
+
option :gid, :aliases => :g, type: :string, desc: 'Move a specific GID to limbo'
|
154
|
+
option :status, type: :string, desc: 'Move pages with a specific status to limbo.'
|
155
|
+
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
156
|
+
def limbo(scraper_name)
|
157
|
+
if !options.key?(:gid) && !options.key?(:status)
|
158
|
+
puts "Must specify either a --gid or --status"
|
159
|
+
return
|
160
|
+
end
|
147
161
|
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
162
|
+
if options[:job]
|
163
|
+
client = Client::JobPage.new(options)
|
164
|
+
puts "#{client.limbo(options[:job])}"
|
165
|
+
else
|
166
|
+
client = Client::ScraperJobPage.new(options)
|
167
|
+
puts "#{client.limbo(scraper_name)}"
|
152
168
|
end
|
153
169
|
end
|
154
170
|
|
@@ -224,7 +240,10 @@ module Datahen
|
|
224
240
|
|
225
241
|
if result['available'] == true
|
226
242
|
puts "Preview content url: \"#{result['preview_url']}\""
|
227
|
-
|
243
|
+
begin
|
244
|
+
`open "#{result['preview_url']}"`
|
245
|
+
rescue
|
246
|
+
end
|
228
247
|
else
|
229
248
|
puts "Content does not exist"
|
230
249
|
end
|
@@ -244,7 +263,10 @@ module Datahen
|
|
244
263
|
|
245
264
|
if result['available'] == true
|
246
265
|
puts "Preview failed content url: \"#{result['preview_url']}\""
|
247
|
-
|
266
|
+
begin
|
267
|
+
`open "#{result['preview_url']}"`
|
268
|
+
rescue
|
269
|
+
end
|
248
270
|
else
|
249
271
|
puts "Failed Content does not exist"
|
250
272
|
end
|
data/lib/datahen/client/job_page.rb
CHANGED
@@ -72,6 +72,11 @@ module Datahen
|
|
72
72
|
params = @options.merge(opts)
|
73
73
|
self.class.put("/jobs/#{job_id}/pages/refetch", params)
|
74
74
|
end
|
75
|
+
|
76
|
+
def limbo(job_id, opts={})
|
77
|
+
params = @options.merge(opts)
|
78
|
+
self.class.put("/jobs/#{job_id}/pages/limbo", params)
|
79
|
+
end
|
75
80
|
end
|
76
81
|
end
|
77
82
|
end
|
data/lib/datahen/client/scraper_job.rb
CHANGED
@@ -11,6 +11,13 @@ module Datahen
|
|
11
11
|
body[:standard_worker_count] = opts[:workers] if opts[:workers]
|
12
12
|
body[:browser_worker_count] = opts[:browsers] if opts[:browsers]
|
13
13
|
body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
|
14
|
+
if opts[:vars]
|
15
|
+
if opts[:vars].is_a?(Array)
|
16
|
+
body[:vars] = opts[:vars]
|
17
|
+
elsif opts[:vars].is_a?(String)
|
18
|
+
body[:vars] = JSON.parse(opts[:vars])
|
19
|
+
end
|
20
|
+
end
|
14
21
|
params = @options.merge({body: body.to_json})
|
15
22
|
self.class.post("/scrapers/#{scraper_name}/jobs", params)
|
16
23
|
end
|
data/lib/datahen/client/scraper_job_page.rb
CHANGED
@@ -26,7 +26,7 @@ module Datahen
|
|
26
26
|
self.class.put("/scrapers/#{scraper_name}/current_job/pages/refetch", params)
|
27
27
|
end
|
28
28
|
|
29
|
-
# Deprecated, please use Datahen::Client::
|
29
|
+
# Deprecated, please use Datahen::Client::JobPage#refetch instead.
|
30
30
|
#
|
31
31
|
# @note This method will be removed at some point in the future.
|
32
32
|
def refetch_by_job(job_id, opts={})
|
@@ -39,6 +39,11 @@ module Datahen
|
|
39
39
|
self.class.put("/scrapers/#{scraper_name}/current_job/pages/reparse", params)
|
40
40
|
end
|
41
41
|
|
42
|
+
def limbo(scraper_name, opts={})
|
43
|
+
params = @options.merge(opts)
|
44
|
+
self.class.put("/scrapers/#{scraper_name}/current_job/pages/limbo", params)
|
45
|
+
end
|
46
|
+
|
42
47
|
def enqueue(scraper_name, method, url, opts={})
|
43
48
|
body = {}
|
44
49
|
body[:method] = method != "" ? method : "GET"
|
data/lib/datahen/scraper.rb
CHANGED
data/lib/datahen/scraper/ruby_parser_executor.rb
CHANGED
@@ -112,7 +112,7 @@ module Datahen
|
|
112
112
|
raise ArgumentError.new("page_gid needs to be a String.") unless page_gid.is_a?(String)
|
113
113
|
if page_gid == gid
|
114
114
|
self.refetch_self = true
|
115
|
-
|
115
|
+
raise Error::SafeTerminateError
|
116
116
|
end
|
117
117
|
refetch_page page_gid
|
118
118
|
end
|
@@ -130,7 +130,7 @@ module Datahen
|
|
130
130
|
raise ArgumentError.new("page_gid needs to be a String.") unless page_gid.is_a?(String)
|
131
131
|
if page_gid == gid
|
132
132
|
self.reparse_self = true
|
133
|
-
|
133
|
+
raise Error::SafeTerminateError
|
134
134
|
end
|
135
135
|
reparse_page page_gid
|
136
136
|
end
|
@@ -153,6 +153,8 @@ module Datahen
|
|
153
153
|
page: page
|
154
154
|
})
|
155
155
|
eval_with_context filename, context
|
156
|
+
rescue Error::SafeTerminateError => e
|
157
|
+
# do nothing, this is fine
|
156
158
|
rescue SyntaxError => e
|
157
159
|
handle_error(e) if save
|
158
160
|
raise e
|
@@ -163,7 +165,7 @@ module Datahen
|
|
163
165
|
|
164
166
|
puts "=========== Parsing Executed ==========="
|
165
167
|
begin
|
166
|
-
save_pages_and_outputs(pages, outputs, :parsing)
|
168
|
+
save_pages_and_outputs(pages, outputs, :parsing) unless refetch_self
|
167
169
|
rescue => e
|
168
170
|
handle_error(e) if save
|
169
171
|
raise e
|
data/lib/datahen/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: datahen
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.14.15
|
4
|
+
version: 0.14.20
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Parama Danoesubroto
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-
|
11
|
+
date: 2020-12-14 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: thor
|
@@ -233,6 +233,8 @@ files:
|
|
233
233
|
- lib/datahen/client/scraper_job_page.rb
|
234
234
|
- lib/datahen/client/scraper_job_var.rb
|
235
235
|
- lib/datahen/client/scraper_var.rb
|
236
|
+
- lib/datahen/error.rb
|
237
|
+
- lib/datahen/error/safe_terminate_error.rb
|
236
238
|
- lib/datahen/plugin.rb
|
237
239
|
- lib/datahen/plugin/context_exposer.rb
|
238
240
|
- lib/datahen/scraper.rb
|