datahen 0.16.2 → 0.17.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/datahen/cli/scraper.rb +2 -0
- data/lib/datahen/cli/scraper_job.rb +1 -0
- data/lib/datahen/client/job.rb +1 -0
- data/lib/datahen/client/job_page.rb +2 -0
- data/lib/datahen/client/scraper.rb +2 -0
- data/lib/datahen/client/scraper_job.rb +2 -0
- data/lib/datahen/client/scraper_job_page.rb +2 -0
- data/lib/datahen/scraper/ruby_parser_executor.rb +29 -3
- data/lib/datahen/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 82f77763f1977307312821247750463f03c0a7dcb52d78d1ae8ca344265b6e56
|
4
|
+
data.tar.gz: 5f04de4cc0df0b7edcddc2514ef90bb1627e141d8a7b3fcc3ccbc2272b5a629a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: cabad1dc36b89878f2a361ab4d4f0c62af932ab3d69b0ca6f929c24809be74d3e82b00fa577215f9019cd243fac286f684cacdcedf8f4cb221400b7c7014c5e3
|
7
|
+
data.tar.gz: b86c0f71a4c555b2e8dce19509d93b888c8dacd72ea0e57db623b3ad1ddb9f3a0ac5bafaac4fdcaf28673c3a77e2c2da4a4aaf0d00fa04465dda641ada86eb52
|
data/lib/datahen/cli/scraper.rb
CHANGED
@@ -33,6 +33,7 @@ module Datahen
|
|
33
33
|
option :multiple_jobs, type: :boolean, desc: 'Set true to enable multiple jobs. Default: false'
|
34
34
|
option :max_job_count, type: :numeric, desc: 'Set a value to set max number of jobs available. Set -1 for unlimited. Default: 3'
|
35
35
|
option :max_page_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
|
36
|
+
option :enable_global_cache, type: :boolean, desc: 'Set true to enable page cache. Default: false'
|
36
37
|
def create(scraper_name, git_repository)
|
37
38
|
# puts "options #{options}"
|
38
39
|
client = Client::Scraper.new(options)
|
@@ -59,6 +60,7 @@ module Datahen
|
|
59
60
|
option :multiple_jobs, type: :boolean, desc: 'Set true to enable multiple jobs. Default: false'
|
60
61
|
option :max_job_count, type: :numeric, desc: 'Set a value to set max number of jobs available. Set -1 for unlimited. Default: 3'
|
61
62
|
option :max_page_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
|
63
|
+
option :enable_global_cache, type: :boolean, desc: 'Set true to enable page cache. Default: false'
|
62
64
|
def update(scraper_name)
|
63
65
|
client = Client::Scraper.new(options)
|
64
66
|
puts "#{client.update(scraper_name, options)}"
|
@@ -105,6 +105,7 @@ module Datahen
|
|
105
105
|
option :profile, type: :string, desc: 'Set the profiles (comma separated) to apply to the job. Default: default'
|
106
106
|
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
107
107
|
option :max_page_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
|
108
|
+
option :enable_global_cache, type: :boolean, desc: 'Set true to enable page cache. Default: false'
|
108
109
|
def update(scraper_name)
|
109
110
|
if options[:job]
|
110
111
|
client = Client::Job.new(options)
|
data/lib/datahen/client/job.rb
CHANGED
@@ -22,6 +22,7 @@ module Datahen
|
|
22
22
|
body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
|
23
23
|
body[:profile] = opts[:profile] if opts[:profile]
|
24
24
|
body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
|
25
|
+
body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
|
25
26
|
params = @options.merge({body: body.to_json})
|
26
27
|
|
27
28
|
self.class.put("/jobs/#{job_id}", params)
|
@@ -16,6 +16,7 @@ module Datahen
|
|
16
16
|
body[:priority] = opts[:priority] if opts[:priority]
|
17
17
|
body[:vars] = opts[:vars] if opts[:vars]
|
18
18
|
body[:max_size] = opts[:max_size] if opts[:max_size]
|
19
|
+
body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
|
19
20
|
|
20
21
|
params = @options.merge({body: body.to_json})
|
21
22
|
|
@@ -38,6 +39,7 @@ module Datahen
|
|
38
39
|
body[:no_redirect] = opts[:no_redirect] if opts[:no_redirect]
|
39
40
|
body[:cookie] = opts[:cookie] if opts[:cookie]
|
40
41
|
body[:max_size] = opts[:max_size] if opts[:max_size]
|
42
|
+
body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
|
41
43
|
|
42
44
|
params = @options.merge({body: body.to_json})
|
43
45
|
|
@@ -29,6 +29,7 @@ module Datahen
|
|
29
29
|
body[:multiple_jobs] = opts[:multiple_jobs] if opts[:multiple_jobs]
|
30
30
|
body[:max_job_count] = opts[:max_job_count] if opts[:max_job_count]
|
31
31
|
body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
|
32
|
+
body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
|
32
33
|
params = @options.merge({body: body.to_json})
|
33
34
|
self.class.post("/scrapers", params)
|
34
35
|
end
|
@@ -51,6 +52,7 @@ module Datahen
|
|
51
52
|
body[:multiple_jobs] = opts[:multiple_jobs] if opts.has_key?("multiple_jobs") || opts.has_key?(:multiple_jobs)
|
52
53
|
body[:max_job_count] = opts[:max_job_count] if opts.has_key?("max_job_count") || opts.has_key?(:max_job_count)
|
53
54
|
body[:max_page_size] = opts[:max_page_size] if opts.has_key?("max_page_size") || opts.has_key?(:max_page_size)
|
55
|
+
body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
|
54
56
|
params = @options.merge({body: body.to_json})
|
55
57
|
|
56
58
|
self.class.put("/scrapers/#{scraper_name}", params)
|
@@ -12,6 +12,7 @@ module Datahen
|
|
12
12
|
body[:browser_worker_count] = opts[:browsers] if opts[:browsers]
|
13
13
|
body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
|
14
14
|
body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
|
15
|
+
body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
|
15
16
|
if opts[:vars]
|
16
17
|
if opts[:vars].is_a?(Array)
|
17
18
|
body[:vars] = opts[:vars]
|
@@ -39,6 +40,7 @@ module Datahen
|
|
39
40
|
body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
|
40
41
|
body[:profile] = opts[:profile] if opts[:profile]
|
41
42
|
body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
|
43
|
+
body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
|
42
44
|
params = @options.merge({body: body.to_json})
|
43
45
|
|
44
46
|
self.class.put("/scrapers/#{scraper_name}/current_job", params)
|
@@ -16,6 +16,7 @@ module Datahen
|
|
16
16
|
body[:priority] = opts[:priority] if opts[:priority]
|
17
17
|
body[:vars] = opts[:vars] if opts[:vars]
|
18
18
|
body[:max_size] = opts[:max_size] if opts[:max_size]
|
19
|
+
body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
|
19
20
|
|
20
21
|
params = @options.merge({body: body.to_json})
|
21
22
|
|
@@ -61,6 +62,7 @@ module Datahen
|
|
61
62
|
body[:no_redirect] = opts[:no_redirect] if opts[:no_redirect]
|
62
63
|
body[:cookie] = opts[:cookie] if opts[:cookie]
|
63
64
|
body[:max_size] = opts[:max_size] if opts[:max_size]
|
65
|
+
body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
|
64
66
|
|
65
67
|
params = @options.merge({body: body.to_json})
|
66
68
|
|
@@ -8,7 +8,11 @@ module Datahen
|
|
8
8
|
attr_accessor :refetch_self
|
9
9
|
# Reparse self page flag.
|
10
10
|
# @return [Boollean]
|
11
|
+
# @note It is stronger than #limbo_self flag.
|
11
12
|
attr_accessor :reparse_self
|
13
|
+
# Limbo self page flag.
|
14
|
+
# @return [Boollean]
|
15
|
+
attr_accessor :limbo_self
|
12
16
|
|
13
17
|
def initialize(options={})
|
14
18
|
@filename = options.fetch(:filename) { raise "Filename is required"}
|
@@ -31,7 +35,8 @@ module Datahen
|
|
31
35
|
:find_output,
|
32
36
|
:find_outputs,
|
33
37
|
:refetch,
|
34
|
-
:reparse
|
38
|
+
:reparse,
|
39
|
+
:limbo
|
35
40
|
].freeze
|
36
41
|
end
|
37
42
|
|
@@ -104,7 +109,7 @@ module Datahen
|
|
104
109
|
|
105
110
|
def refetch_page gid
|
106
111
|
if save
|
107
|
-
Client::
|
112
|
+
Client::JobPage.new({gid: gid}).refetch(self.job_id)
|
108
113
|
puts "Refetch page #{gid}"
|
109
114
|
else
|
110
115
|
puts "Would have refetch page #{gid}"
|
@@ -122,7 +127,7 @@ module Datahen
|
|
122
127
|
|
123
128
|
def reparse_page gid
|
124
129
|
if save
|
125
|
-
Client::
|
130
|
+
Client::JobPage.new({gid: gid}).reparse(self.job_id)
|
126
131
|
puts "Reparse page #{gid}"
|
127
132
|
else
|
128
133
|
puts "Would have reparse page #{gid}"
|
@@ -138,6 +143,24 @@ module Datahen
|
|
138
143
|
reparse_page page_gid
|
139
144
|
end
|
140
145
|
|
146
|
+
def limbo_page gid
|
147
|
+
if save
|
148
|
+
Client::JobPage.new({gid: gid}).limbo(self.job_id)
|
149
|
+
puts "Limbo page #{gid}"
|
150
|
+
else
|
151
|
+
puts "Would have limbo page #{gid}"
|
152
|
+
end
|
153
|
+
end
|
154
|
+
|
155
|
+
def limbo page_gid
|
156
|
+
raise ArgumentError.new("page_gid needs to be a String.") unless page_gid.is_a?(String)
|
157
|
+
if page_gid == gid
|
158
|
+
self.limbo_self = true
|
159
|
+
raise Error::SafeTerminateError
|
160
|
+
end
|
161
|
+
limbo_page page_gid
|
162
|
+
end
|
163
|
+
|
141
164
|
def eval_parser_script(save=false)
|
142
165
|
update_parsing_starting_status
|
143
166
|
|
@@ -148,6 +171,7 @@ module Datahen
|
|
148
171
|
page = init_page_vars(page)
|
149
172
|
self.refetch_self = false
|
150
173
|
self.reparse_self = false
|
174
|
+
self.limbo_self = false
|
151
175
|
|
152
176
|
begin
|
153
177
|
context = isolated_binding({
|
@@ -178,6 +202,8 @@ module Datahen
|
|
178
202
|
refetch_page gid
|
179
203
|
elsif reparse_self
|
180
204
|
reparse_page gid
|
205
|
+
elsif limbo_self
|
206
|
+
limbo_page gid
|
181
207
|
else
|
182
208
|
update_parsing_done_status
|
183
209
|
end
|
data/lib/datahen/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: datahen
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.16.2
|
4
|
+
version: 0.17.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Parama Danoesubroto
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2021-08-
|
11
|
+
date: 2021-08-20 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: thor
|