datahen 0.14.15 → 0.14.20
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/datahen/cli/global_page.rb +4 -1
- data/lib/datahen/cli/scraper.rb +1 -0
- data/lib/datahen/cli/scraper_export.rb +4 -1
- data/lib/datahen/cli/scraper_page.rb +45 -23
- data/lib/datahen/client/job_page.rb +5 -0
- data/lib/datahen/client/scraper_job.rb +7 -0
- data/lib/datahen/client/scraper_job_page.rb +6 -1
- data/lib/datahen/error.rb +6 -0
- data/lib/datahen/error/safe_terminate_error.rb +6 -0
- data/lib/datahen/scraper.rb +1 -0
- data/lib/datahen/scraper/ruby_parser_executor.rb +5 -3
- data/lib/datahen/version.rb +1 -1
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 4a2f75a84728b7e8c228c578e4a4af6253dcd19db445f509ba44f9866847936c
|
4
|
+
data.tar.gz: c37744322cc3e3035a31dd69bca340397ac906d27a3497604e3d98321187d972
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: fcf774827b35bf23048b47241da29b1a17e2b1d691a360d2afcb5b34c206c26b7ae316c457e26b946ae3144b4d9075dda5db90a2fd0fb7e56b9f53b0cd612d13
|
7
|
+
data.tar.gz: 56fe24f850c0b695b87c61629ab6939f856ecf8f0ea6b6557d59646214b6ffd87557a6ade6bf59e03b993626663cc1af3071ca10858802272d75d47cf34de56a
|
data/lib/datahen/cli/global_page.rb
CHANGED
@@ -15,7 +15,10 @@ module Datahen
|
|
15
15
|
|
16
16
|
if result['available'] == true
|
17
17
|
puts "Preview content url: \"#{result['preview_url']}\""
|
18
|
-
|
18
|
+
begin
|
19
|
+
`open "#{result['preview_url']}"`
|
20
|
+
rescue
|
21
|
+
end
|
19
22
|
else
|
20
23
|
puts "Content does not exist"
|
21
24
|
end
|
data/lib/datahen/cli/scraper.rb
CHANGED
@@ -93,6 +93,7 @@ module Datahen
|
|
93
93
|
option :workers, :aliases => :w, type: :numeric, desc: 'Set how many standard workers to use. Default: 1'
|
94
94
|
option :browsers, type: :numeric, desc: 'Set how many browser workers to use. Default: 0'
|
95
95
|
option :proxy_type, desc: 'Set the Proxy type. Default: standard'
|
96
|
+
option :vars, type: :string, banner: :JSON, desc: 'Set input vars. Must be in json format. i.e: {"name":"foo", "value":"bar", "secret":false} '
|
96
97
|
def start(scraper_name)
|
97
98
|
client = Client::ScraperJob.new(options)
|
98
99
|
puts "Starting a scrape job..."
|
data/lib/datahen/cli/scraper_page.rb
CHANGED
@@ -99,18 +99,20 @@ module Datahen
|
|
99
99
|
|
100
100
|
desc "refetch <scraper_name>", "Refetch Pages on a scraper's current job"
|
101
101
|
long_desc <<-LONGDESC
|
102
|
-
Refetch pages in a scraper's current job. You need to specify either a --gid or --fetch-fail or --parse-fail or --status.\x5
|
102
|
+
Refetch pages in a scraper's current job. You need to specify either a --gid or --fetch-fail or --parse-fail or --status or --page-type.\x5
|
103
103
|
LONGDESC
|
104
104
|
option :gid, :aliases => :g, type: :string, desc: 'Refetch a specific GID'
|
105
105
|
option :fetch_fail, type: :boolean, desc: 'Refetches only pages that fails fetching.'
|
106
106
|
option :parse_fail, type: :boolean, desc: 'Refetches only pages that fails parsing.'
|
107
107
|
option :status, type: :string, desc: 'Refetches only pages with a specific status.'
|
108
|
+
option :page_type, type: :string, desc: 'Refetches only pages with a specific page type.'
|
108
109
|
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
109
110
|
def refetch(scraper_name)
|
110
|
-
if !options.key?(:gid) && !options.key?(:fetch_fail) && !options.key?(:parse_fail) && !options.key?(:status)
|
111
|
-
puts "Must specify either a --gid, --fetch-fail, --parse-fail or --status"
|
111
|
+
if !options.key?(:gid) && !options.key?(:fetch_fail) && !options.key?(:parse_fail) && !options.key?(:status) && !options.key?(:page_type)
|
112
|
+
puts "Must specify either a --gid, --fetch-fail, --parse-fail, --status or --page-type"
|
112
113
|
return
|
113
114
|
end
|
115
|
+
|
114
116
|
if options[:job]
|
115
117
|
client = Client::JobPage.new(options)
|
116
118
|
puts "#{client.refetch(options[:job])}"
|
@@ -122,33 +124,47 @@ module Datahen
|
|
122
124
|
|
123
125
|
desc "reparse <scraper_name>", "Reparse Pages on a scraper's current job"
|
124
126
|
long_desc <<-LONGDESC
|
125
|
-
Reparse pages in a scraper's current job. You need to specify either a --gid or --parse-fail or --status.\x5
|
127
|
+
Reparse pages in a scraper's current job. You need to specify either a --gid or --parse-fail or --status or --page-type.\x5
|
126
128
|
LONGDESC
|
127
129
|
option :gid, :aliases => :g, type: :string, desc: 'Reparse a specific GID'
|
128
130
|
option :parse_fail, type: :boolean, desc: 'Reparse only pages that fails parsing.'
|
129
131
|
option :status, type: :string, desc: 'Reparse only pages with a specific status.'
|
132
|
+
option :page_type, type: :string, desc: 'Refetches only pages with a specific page type.'
|
130
133
|
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
131
134
|
def reparse(scraper_name)
|
132
|
-
|
133
|
-
|
135
|
+
if !options.key?(:gid) && !options.key?(:parse_fail) && !options.key?(:status) && !options.key?(:page_type)
|
136
|
+
puts "Must specify either a --gid, --parse-fail, --status or --page-type"
|
137
|
+
return
|
138
|
+
end
|
134
139
|
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
140
|
+
if options[:job]
|
141
|
+
client = Client::JobPage.new(options)
|
142
|
+
puts "#{client.reparse(options[:job])}"
|
143
|
+
else
|
144
|
+
client = Client::ScraperJobPage.new(options)
|
145
|
+
puts "#{client.reparse(scraper_name)}"
|
146
|
+
end
|
147
|
+
end
|
139
148
|
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
149
|
+
desc "limbo <scraper_name>", "Move pages on a scraper's current job to limbo"
|
150
|
+
long_desc <<-LONGDESC
|
151
|
+
Move pages in a scraper's current job to limbo. You need to specify either a --gid or --status.\x5
|
152
|
+
LONGDESC
|
153
|
+
option :gid, :aliases => :g, type: :string, desc: 'Move a specific GID to limbo'
|
154
|
+
option :status, type: :string, desc: 'Move pages with a specific status to limbo.'
|
155
|
+
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
156
|
+
def limbo(scraper_name)
|
157
|
+
if !options.key?(:gid) && !options.key?(:status)
|
158
|
+
puts "Must specify either a --gid or --status"
|
159
|
+
return
|
160
|
+
end
|
147
161
|
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
162
|
+
if options[:job]
|
163
|
+
client = Client::JobPage.new(options)
|
164
|
+
puts "#{client.limbo(options[:job])}"
|
165
|
+
else
|
166
|
+
client = Client::ScraperJobPage.new(options)
|
167
|
+
puts "#{client.limbo(scraper_name)}"
|
152
168
|
end
|
153
169
|
end
|
154
170
|
|
@@ -224,7 +240,10 @@ module Datahen
|
|
224
240
|
|
225
241
|
if result['available'] == true
|
226
242
|
puts "Preview content url: \"#{result['preview_url']}\""
|
227
|
-
|
243
|
+
begin
|
244
|
+
`open "#{result['preview_url']}"`
|
245
|
+
rescue
|
246
|
+
end
|
228
247
|
else
|
229
248
|
puts "Content does not exist"
|
230
249
|
end
|
@@ -244,7 +263,10 @@ module Datahen
|
|
244
263
|
|
245
264
|
if result['available'] == true
|
246
265
|
puts "Preview failed content url: \"#{result['preview_url']}\""
|
247
|
-
|
266
|
+
begin
|
267
|
+
`open "#{result['preview_url']}"`
|
268
|
+
rescue
|
269
|
+
end
|
248
270
|
else
|
249
271
|
puts "Failed Content does not exist"
|
250
272
|
end
|
data/lib/datahen/client/job_page.rb
CHANGED
@@ -72,6 +72,11 @@ module Datahen
|
|
72
72
|
params = @options.merge(opts)
|
73
73
|
self.class.put("/jobs/#{job_id}/pages/refetch", params)
|
74
74
|
end
|
75
|
+
|
76
|
+
def limbo(job_id, opts={})
|
77
|
+
params = @options.merge(opts)
|
78
|
+
self.class.put("/jobs/#{job_id}/pages/limbo", params)
|
79
|
+
end
|
75
80
|
end
|
76
81
|
end
|
77
82
|
end
|
data/lib/datahen/client/scraper_job.rb
CHANGED
@@ -11,6 +11,13 @@ module Datahen
|
|
11
11
|
body[:standard_worker_count] = opts[:workers] if opts[:workers]
|
12
12
|
body[:browser_worker_count] = opts[:browsers] if opts[:browsers]
|
13
13
|
body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
|
14
|
+
if opts[:vars]
|
15
|
+
if opts[:vars].is_a?(Array)
|
16
|
+
body[:vars] = opts[:vars]
|
17
|
+
elsif opts[:vars].is_a?(String)
|
18
|
+
body[:vars] = JSON.parse(opts[:vars])
|
19
|
+
end
|
20
|
+
end
|
14
21
|
params = @options.merge({body: body.to_json})
|
15
22
|
self.class.post("/scrapers/#{scraper_name}/jobs", params)
|
16
23
|
end
|
data/lib/datahen/client/scraper_job_page.rb
CHANGED
@@ -26,7 +26,7 @@ module Datahen
|
|
26
26
|
self.class.put("/scrapers/#{scraper_name}/current_job/pages/refetch", params)
|
27
27
|
end
|
28
28
|
|
29
|
-
# Deprecated, please use Datahen::Client::
|
29
|
+
# Deprecated, please use Datahen::Client::JobPage#refetch instead.
|
30
30
|
#
|
31
31
|
# @note This method will be removed at some point in the future.
|
32
32
|
def refetch_by_job(job_id, opts={})
|
@@ -39,6 +39,11 @@ module Datahen
|
|
39
39
|
self.class.put("/scrapers/#{scraper_name}/current_job/pages/reparse", params)
|
40
40
|
end
|
41
41
|
|
42
|
+
def limbo(scraper_name, opts={})
|
43
|
+
params = @options.merge(opts)
|
44
|
+
self.class.put("/scrapers/#{scraper_name}/current_job/pages/limbo", params)
|
45
|
+
end
|
46
|
+
|
42
47
|
def enqueue(scraper_name, method, url, opts={})
|
43
48
|
body = {}
|
44
49
|
body[:method] = method != "" ? method : "GET"
|
data/lib/datahen/scraper.rb
CHANGED
data/lib/datahen/scraper/ruby_parser_executor.rb
CHANGED
@@ -112,7 +112,7 @@ module Datahen
|
|
112
112
|
raise ArgumentError.new("page_gid needs to be a String.") unless page_gid.is_a?(String)
|
113
113
|
if page_gid == gid
|
114
114
|
self.refetch_self = true
|
115
|
-
|
115
|
+
raise Error::SafeTerminateError
|
116
116
|
end
|
117
117
|
refetch_page page_gid
|
118
118
|
end
|
@@ -130,7 +130,7 @@ module Datahen
|
|
130
130
|
raise ArgumentError.new("page_gid needs to be a String.") unless page_gid.is_a?(String)
|
131
131
|
if page_gid == gid
|
132
132
|
self.reparse_self = true
|
133
|
-
|
133
|
+
raise Error::SafeTerminateError
|
134
134
|
end
|
135
135
|
reparse_page page_gid
|
136
136
|
end
|
@@ -153,6 +153,8 @@ module Datahen
|
|
153
153
|
page: page
|
154
154
|
})
|
155
155
|
eval_with_context filename, context
|
156
|
+
rescue Error::SafeTerminateError => e
|
157
|
+
# do nothing, this is fine
|
156
158
|
rescue SyntaxError => e
|
157
159
|
handle_error(e) if save
|
158
160
|
raise e
|
@@ -163,7 +165,7 @@ module Datahen
|
|
163
165
|
|
164
166
|
puts "=========== Parsing Executed ==========="
|
165
167
|
begin
|
166
|
-
save_pages_and_outputs(pages, outputs, :parsing)
|
168
|
+
save_pages_and_outputs(pages, outputs, :parsing) unless refetch_self
|
167
169
|
rescue => e
|
168
170
|
handle_error(e) if save
|
169
171
|
raise e
|
data/lib/datahen/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: datahen
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.14.15
|
4
|
+
version: 0.14.20
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Parama Danoesubroto
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-
|
11
|
+
date: 2020-12-14 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: thor
|
@@ -233,6 +233,8 @@ files:
|
|
233
233
|
- lib/datahen/client/scraper_job_page.rb
|
234
234
|
- lib/datahen/client/scraper_job_var.rb
|
235
235
|
- lib/datahen/client/scraper_var.rb
|
236
|
+
- lib/datahen/error.rb
|
237
|
+
- lib/datahen/error/safe_terminate_error.rb
|
236
238
|
- lib/datahen/plugin.rb
|
237
239
|
- lib/datahen/plugin/context_exposer.rb
|
238
240
|
- lib/datahen/scraper.rb
|