datahen 0.14.17 → 0.14.22
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/datahen/cli/scraper.rb +1 -0
- data/lib/datahen/cli/scraper_page.rb +8 -6
- data/lib/datahen/client/scraper_job.rb +7 -0
- data/lib/datahen/error.rb +6 -0
- data/lib/datahen/error/safe_terminate_error.rb +6 -0
- data/lib/datahen/scraper.rb +1 -0
- data/lib/datahen/scraper/ruby_parser_executor.rb +5 -3
- data/lib/datahen/version.rb +1 -1
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 25eb02598ac32462a53995a4b9e72b3bc466b54c2d74be02516f8d04f178a7b8
|
4
|
+
data.tar.gz: 5f6fcedfa7f4a477e18fc1a0ee80126b1a646a3ecefdd8258d1982bf7d7fe06f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 69a71d740f9078a5a4c2a77211587c0099a4064cabda690cd6fb4803c153975e3e91f1c05f98278f2852a0bacf8cb444bba8f29f56c3cfbd0fba12cece39b9cd
|
7
|
+
data.tar.gz: df131c11592d2b6192fa74d26fc0e8d823b99f8073b907c82c8e9f04622c7d28aa5e1145419ac0377a99f6efcd3f46ff8fcef88fc436e802d51afc014fd4383a
|
data/lib/datahen/cli/scraper.rb
CHANGED
@@ -93,6 +93,7 @@ module Datahen
|
|
93
93
|
option :workers, :aliases => :w, type: :numeric, desc: 'Set how many standard workers to use. Default: 1'
|
94
94
|
option :browsers, type: :numeric, desc: 'Set how many browser workers to use. Default: 0'
|
95
95
|
option :proxy_type, desc: 'Set the Proxy type. Default: standard'
|
96
|
+
option :vars, type: :string, banner: :JSON, desc: 'Set input vars. Must be in json format. i.e: [{"name":"foo", "value":"bar", "secret":false}] '
|
96
97
|
def start(scraper_name)
|
97
98
|
client = Client::ScraperJob.new(options)
|
98
99
|
puts "Starting a scrape job..."
|
@@ -99,16 +99,17 @@ module Datahen
|
|
99
99
|
|
100
100
|
desc "refetch <scraper_name>", "Refetch Pages on a scraper's current job"
|
101
101
|
long_desc <<-LONGDESC
|
102
|
-
Refetch pages in a scraper's current job. You need to specify either a --gid or --fetch-fail or --parse-fail or --status.\x5
|
102
|
+
Refetch pages in a scraper's current job. You need to specify either a --gid or --fetch-fail or --parse-fail or --status or --page-type.\x5
|
103
103
|
LONGDESC
|
104
104
|
option :gid, :aliases => :g, type: :string, desc: 'Refetch a specific GID'
|
105
105
|
option :fetch_fail, type: :boolean, desc: 'Refetches only pages that fails fetching.'
|
106
106
|
option :parse_fail, type: :boolean, desc: 'Refetches only pages that fails parsing.'
|
107
107
|
option :status, type: :string, desc: 'Refetches only pages with a specific status.'
|
108
|
+
option :page_type, type: :string, desc: 'Refetches only pages with a specific page type.'
|
108
109
|
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
109
110
|
def refetch(scraper_name)
|
110
|
-
if !options.key?(:gid) && !options.key?(:fetch_fail) && !options.key?(:parse_fail) && !options.key?(:status)
|
111
|
-
puts "Must specify either a --gid, --fetch-fail, --parse-fail or --status"
|
111
|
+
if !options.key?(:gid) && !options.key?(:fetch_fail) && !options.key?(:parse_fail) && !options.key?(:status) && !options.key?(:page_type)
|
112
|
+
puts "Must specify either a --gid, --fetch-fail, --parse-fail, --status or --page-type"
|
112
113
|
return
|
113
114
|
end
|
114
115
|
|
@@ -123,15 +124,16 @@ module Datahen
|
|
123
124
|
|
124
125
|
desc "reparse <scraper_name>", "Reparse Pages on a scraper's current job"
|
125
126
|
long_desc <<-LONGDESC
|
126
|
-
Reparse pages in a scraper's current job. You need to specify either a --gid or --parse-fail or --status.\x5
|
127
|
+
Reparse pages in a scraper's current job. You need to specify either a --gid or --parse-fail or --status or --page-type.\x5
|
127
128
|
LONGDESC
|
128
129
|
option :gid, :aliases => :g, type: :string, desc: 'Reparse a specific GID'
|
129
130
|
option :parse_fail, type: :boolean, desc: 'Reparse only pages that fails parsing.'
|
130
131
|
option :status, type: :string, desc: 'Reparse only pages with a specific status.'
|
132
|
+
option :page_type, type: :string, desc: 'Refetches only pages with a specific page type.'
|
131
133
|
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
132
134
|
def reparse(scraper_name)
|
133
|
-
if !options.key?(:gid) && !options.key?(:parse_fail) && !options.key?(:status)
|
134
|
-
puts "Must specify either a --gid, --parse-fail or --status"
|
135
|
+
if !options.key?(:gid) && !options.key?(:parse_fail) && !options.key?(:status) && !options.key?(:page_type)
|
136
|
+
puts "Must specify either a --gid, --parse-fail, --status or --page-type"
|
135
137
|
return
|
136
138
|
end
|
137
139
|
|
@@ -11,6 +11,13 @@ module Datahen
|
|
11
11
|
body[:standard_worker_count] = opts[:workers] if opts[:workers]
|
12
12
|
body[:browser_worker_count] = opts[:browsers] if opts[:browsers]
|
13
13
|
body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
|
14
|
+
if opts[:vars]
|
15
|
+
if opts[:vars].is_a?(Array)
|
16
|
+
body[:vars] = opts[:vars]
|
17
|
+
elsif opts[:vars].is_a?(String)
|
18
|
+
body[:vars] = JSON.parse(opts[:vars])
|
19
|
+
end
|
20
|
+
end
|
14
21
|
params = @options.merge({body: body.to_json})
|
15
22
|
self.class.post("/scrapers/#{scraper_name}/jobs", params)
|
16
23
|
end
|
data/lib/datahen/scraper.rb
CHANGED
@@ -112,7 +112,7 @@ module Datahen
|
|
112
112
|
raise ArgumentError.new("page_gid needs to be a String.") unless page_gid.is_a?(String)
|
113
113
|
if page_gid == gid
|
114
114
|
self.refetch_self = true
|
115
|
-
|
115
|
+
raise Error::SafeTerminateError
|
116
116
|
end
|
117
117
|
refetch_page page_gid
|
118
118
|
end
|
@@ -130,7 +130,7 @@ module Datahen
|
|
130
130
|
raise ArgumentError.new("page_gid needs to be a String.") unless page_gid.is_a?(String)
|
131
131
|
if page_gid == gid
|
132
132
|
self.reparse_self = true
|
133
|
-
|
133
|
+
raise Error::SafeTerminateError
|
134
134
|
end
|
135
135
|
reparse_page page_gid
|
136
136
|
end
|
@@ -153,6 +153,8 @@ module Datahen
|
|
153
153
|
page: page
|
154
154
|
})
|
155
155
|
eval_with_context filename, context
|
156
|
+
rescue Error::SafeTerminateError => e
|
157
|
+
# do nothing, this is fine
|
156
158
|
rescue SyntaxError => e
|
157
159
|
handle_error(e) if save
|
158
160
|
raise e
|
@@ -163,7 +165,7 @@ module Datahen
|
|
163
165
|
|
164
166
|
puts "=========== Parsing Executed ==========="
|
165
167
|
begin
|
166
|
-
save_pages_and_outputs(pages, outputs, :parsing)
|
168
|
+
save_pages_and_outputs(pages, outputs, :parsing) unless refetch_self
|
167
169
|
rescue => e
|
168
170
|
handle_error(e) if save
|
169
171
|
raise e
|
data/lib/datahen/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: datahen
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.14.17
|
4
|
+
version: 0.14.22
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Parama Danoesubroto
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-
|
11
|
+
date: 2020-12-14 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: thor
|
@@ -233,6 +233,8 @@ files:
|
|
233
233
|
- lib/datahen/client/scraper_job_page.rb
|
234
234
|
- lib/datahen/client/scraper_job_var.rb
|
235
235
|
- lib/datahen/client/scraper_var.rb
|
236
|
+
- lib/datahen/error.rb
|
237
|
+
- lib/datahen/error/safe_terminate_error.rb
|
236
238
|
- lib/datahen/plugin.rb
|
237
239
|
- lib/datahen/plugin/context_exposer.rb
|
238
240
|
- lib/datahen/scraper.rb
|