datahen 0.15.11 → 0.17.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: f77e31da8e2a7ff08086c4aa9d174608a9c3f186679d456b22310b48384d3572
4
- data.tar.gz: 0bf53ae0886b16bf6fe08b0db07b1a631b69f31d8e3a6868a4d483549049e4ed
3
+ metadata.gz: 82f77763f1977307312821247750463f03c0a7dcb52d78d1ae8ca344265b6e56
4
+ data.tar.gz: 5f04de4cc0df0b7edcddc2514ef90bb1627e141d8a7b3fcc3ccbc2272b5a629a
5
5
  SHA512:
6
- metadata.gz: a491874347ed6ac97c0a0e4f0d2c5830140b9367c1e01b4c95e7a447b071df643d12793ed7d6e5a0224b8876905cf74bc13b987e6e3e03e937d1f821557b8ec3
7
- data.tar.gz: c553a372790654726f2921b6d9d582ca90b314b3fe8d78625cf2442e01cf2ce96ae556e78812d5bd6a03f350beb32aa3158981554179410c12926d480c911887
6
+ metadata.gz: cabad1dc36b89878f2a361ab4d4f0c62af932ab3d69b0ca6f929c24809be74d3e82b00fa577215f9019cd243fac286f684cacdcedf8f4cb221400b7c7014c5e3
7
+ data.tar.gz: b86c0f71a4c555b2e8dce19509d93b888c8dacd72ea0e57db623b3ad1ddb9f3a0ac5bafaac4fdcaf28673c3a77e2c2da4a4aaf0d00fa04465dda641ada86eb52
@@ -32,6 +32,8 @@ module Datahen
32
32
  option :profile, type: :string, desc: 'Set the profiles (comma separated) to apply to the job. Default: default'
33
33
  option :multiple_jobs, type: :boolean, desc: 'Set true to enable multiple jobs. Default: false'
34
34
  option :max_job_count, type: :numeric, desc: 'Set a value to set max number of jobs available. Set -1 for unlimited. Default: 3'
35
+ option :max_page_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value greater than 0 to set it as limit, 0 means any size. Default: 0'
36
+ option :enable_global_cache, type: :boolean, desc: 'Set true to enable page cache. Default: false'
35
37
  def create(scraper_name, git_repository)
36
38
  # puts "options #{options}"
37
39
  client = Client::Scraper.new(options)
@@ -57,6 +59,8 @@ module Datahen
57
59
  option :profile, type: :string, desc: 'Set the profiles (comma separated) to apply to the job. Default: default'
58
60
  option :multiple_jobs, type: :boolean, desc: 'Set true to enable multiple jobs. Default: false'
59
61
  option :max_job_count, type: :numeric, desc: 'Set a value to set max number of jobs available. Set -1 for unlimited. Default: 3'
62
+ option :max_page_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value greater than 0 to set it as limit, 0 means any size. Default: 0'
63
+ option :enable_global_cache, type: :boolean, desc: 'Set true to enable page cache. Default: false'
60
64
  def update(scraper_name)
61
65
  client = Client::Scraper.new(options)
62
66
  puts "#{client.update(scraper_name, options)}"
@@ -94,6 +98,7 @@ module Datahen
94
98
  option :browsers, type: :numeric, desc: 'Set how many browser workers to use. Default: 0'
95
99
  option :proxy_type, desc: 'Set the Proxy type. Default: standard'
96
100
  option :vars, type: :string, banner: :JSON, desc: 'Set input vars. Must be in json format. i.e: [{"name":"foo", "value":"bar", "secret":false}] '
101
+ option :max_page_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value greater than 0 to set it as limit, 0 means any size. Default: 0'
97
102
  def start(scraper_name)
98
103
  client = Client::ScraperJob.new(options)
99
104
  puts "Starting a scrape job..."
@@ -104,6 +104,8 @@ module Datahen
104
104
  option :proxy_type, desc: 'Set the Proxy type. Default: standard'
105
105
  option :profile, type: :string, desc: 'Set the profiles (comma separated) to apply to the job. Default: default'
106
106
  option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
107
+ option :max_page_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value greater than 0 to set it as limit, 0 means any size. Default: 0'
108
+ option :enable_global_cache, type: :boolean, desc: 'Set true to enable page cache. Default: false'
107
109
  def update(scraper_name)
108
110
  if options[:job]
109
111
  client = Client::Job.new(options)
@@ -45,6 +45,7 @@ module Datahen
45
45
  option :freshness, :aliases => :s, desc: 'Set how fresh the page cache is. Accepts timestamp format.'
46
46
  option :ua_type, :aliases => :u, desc: 'Set user agent type. Default: desktop'
47
47
  option :no_redirect, :aliases => :n, type: :boolean, desc: 'Set true to not follow redirect. Default: false'
48
+ option :max_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value greater than 0 to set it as limit, 0 means any size. Default: 0'
48
49
  def add(scraper_name, url)
49
50
  begin
50
51
  options[:headers] = JSON.parse(options[:headers]) if options[:headers]
@@ -78,6 +79,7 @@ module Datahen
78
79
  option :page_type, :aliases => :t, desc: 'Set page type'
79
80
  option :priority, type: :numeric, desc: 'Set fetch priority. The higher the value, the sooner the page gets fetched. Default: 0'
80
81
  option :vars, :aliases => :v, type: :string, desc: 'Set user-defined page variables. Must be in json format. i.e: {"Foo":"bar"}'
82
+ option :max_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value greater than 0 to set it as limit, 0 means any size. Default: 0'
81
83
  def update(scraper_name, gid)
82
84
  begin
83
85
  options[:vars] = JSON.parse(options[:vars]) if options[:vars]
@@ -21,6 +21,8 @@ module Datahen
21
21
  body[:browser_worker_count] = opts[:browsers] if opts[:browsers]
22
22
  body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
23
23
  body[:profile] = opts[:profile] if opts[:profile]
24
+ body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
25
+ body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
24
26
  params = @options.merge({body: body.to_json})
25
27
 
26
28
  self.class.put("/jobs/#{job_id}", params)
@@ -15,6 +15,8 @@ module Datahen
15
15
  body[:page_type] = opts[:page_type] if opts[:page_type]
16
16
  body[:priority] = opts[:priority] if opts[:priority]
17
17
  body[:vars] = opts[:vars] if opts[:vars]
18
+ body[:max_size] = opts[:max_size] if opts[:max_size]
19
+ body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
18
20
 
19
21
  params = @options.merge({body: body.to_json})
20
22
 
@@ -36,6 +38,8 @@ module Datahen
36
38
  body[:ua_type] = opts[:ua_type] if opts[:ua_type]
37
39
  body[:no_redirect] = opts[:no_redirect] if opts[:no_redirect]
38
40
  body[:cookie] = opts[:cookie] if opts[:cookie]
41
+ body[:max_size] = opts[:max_size] if opts[:max_size]
42
+ body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
39
43
 
40
44
  params = @options.merge({body: body.to_json})
41
45
 
@@ -28,6 +28,8 @@ module Datahen
28
28
  body[:profile] = opts[:profile] if opts[:profile]
29
29
  body[:multiple_jobs] = opts[:multiple_jobs] if opts[:multiple_jobs]
30
30
  body[:max_job_count] = opts[:max_job_count] if opts[:max_job_count]
31
+ body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
32
+ body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
31
33
  params = @options.merge({body: body.to_json})
32
34
  self.class.post("/scrapers", params)
33
35
  end
@@ -49,6 +51,8 @@ module Datahen
49
51
  body[:profile] = opts[:profile] if opts[:profile]
50
52
  body[:multiple_jobs] = opts[:multiple_jobs] if opts.has_key?("multiple_jobs") || opts.has_key?(:multiple_jobs)
51
53
  body[:max_job_count] = opts[:max_job_count] if opts.has_key?("max_job_count") || opts.has_key?(:max_job_count)
54
+ body[:max_page_size] = opts[:max_page_size] if opts.has_key?("max_page_size") || opts.has_key?(:max_page_size)
55
+ body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
52
56
  params = @options.merge({body: body.to_json})
53
57
 
54
58
  self.class.put("/scrapers/#{scraper_name}", params)
@@ -11,6 +11,8 @@ module Datahen
11
11
  body[:standard_worker_count] = opts[:workers] if opts[:workers]
12
12
  body[:browser_worker_count] = opts[:browsers] if opts[:browsers]
13
13
  body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
14
+ body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
15
+ body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
14
16
  if opts[:vars]
15
17
  if opts[:vars].is_a?(Array)
16
18
  body[:vars] = opts[:vars]
@@ -37,6 +39,8 @@ module Datahen
37
39
  body[:browser_worker_count] = opts[:browsers] if opts[:browsers]
38
40
  body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
39
41
  body[:profile] = opts[:profile] if opts[:profile]
42
+ body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
43
+ body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
40
44
  params = @options.merge({body: body.to_json})
41
45
 
42
46
  self.class.put("/scrapers/#{scraper_name}/current_job", params)
@@ -15,6 +15,8 @@ module Datahen
15
15
  body[:page_type] = opts[:page_type] if opts[:page_type]
16
16
  body[:priority] = opts[:priority] if opts[:priority]
17
17
  body[:vars] = opts[:vars] if opts[:vars]
18
+ body[:max_size] = opts[:max_size] if opts[:max_size]
19
+ body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
18
20
 
19
21
  params = @options.merge({body: body.to_json})
20
22
 
@@ -59,6 +61,8 @@ module Datahen
59
61
  body[:ua_type] = opts[:ua_type] if opts[:ua_type]
60
62
  body[:no_redirect] = opts[:no_redirect] if opts[:no_redirect]
61
63
  body[:cookie] = opts[:cookie] if opts[:cookie]
64
+ body[:max_size] = opts[:max_size] if opts[:max_size]
65
+ body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
62
66
 
63
67
  params = @options.merge({body: body.to_json})
64
68
 
@@ -334,6 +334,7 @@ module Datahen
334
334
  Parallel.each(dequeue, in_threads: (worker_count)) do |page|
335
335
  parser_file = self.parsers[page['page_type']]
336
336
  begin
337
+ self.repeat_puts("Parsing page with GID #{page['gid']}")
337
338
  puts Datahen::Scraper::Parser.exec_parser_by_page(
338
339
  parser_file,
339
340
  page,
@@ -342,6 +343,7 @@ module Datahen
342
343
  nil,
343
344
  keep_outputs
344
345
  )
346
+ self.repeat_puts("Finish parsing page with GID #{page['gid']}")
345
347
  rescue Parallel::Kill => e
346
348
  puts "[Worker #{Parallel.worker_number}]: Someone tried to kill Parallel!!!"
347
349
  rescue Parallel::Break => e
@@ -8,7 +8,11 @@ module Datahen
8
8
  attr_accessor :refetch_self
9
9
  # Reparse self page flag.
10
10
  # @return [Boolean]
11
+ # @note It is stronger than #limbo_self flag.
11
12
  attr_accessor :reparse_self
13
+ # Limbo self page flag.
14
+ # @return [Boolean]
15
+ attr_accessor :limbo_self
12
16
 
13
17
  def initialize(options={})
14
18
  @filename = options.fetch(:filename) { raise "Filename is required"}
@@ -31,7 +35,8 @@ module Datahen
31
35
  :find_output,
32
36
  :find_outputs,
33
37
  :refetch,
34
- :reparse
38
+ :reparse,
39
+ :limbo
35
40
  ].freeze
36
41
  end
37
42
 
@@ -104,7 +109,7 @@ module Datahen
104
109
 
105
110
  def refetch_page gid
106
111
  if save
107
- Client::ScraperJobPage.new({gid: gid}).refetch_by_job(self.job_id)
112
+ Client::JobPage.new({gid: gid}).refetch(self.job_id)
108
113
  puts "Refetch page #{gid}"
109
114
  else
110
115
  puts "Would have refetch page #{gid}"
@@ -122,7 +127,7 @@ module Datahen
122
127
 
123
128
  def reparse_page gid
124
129
  if save
125
- Client::ScraperJobPage.new({gid: gid}).reparse_by_job(self.job_id)
130
+ Client::JobPage.new({gid: gid}).reparse(self.job_id)
126
131
  puts "Reparse page #{gid}"
127
132
  else
128
133
  puts "Would have reparse page #{gid}"
@@ -138,6 +143,24 @@ module Datahen
138
143
  reparse_page page_gid
139
144
  end
140
145
 
146
+ def limbo_page gid
147
+ if save
148
+ Client::JobPage.new({gid: gid}).limbo(self.job_id)
149
+ puts "Limbo page #{gid}"
150
+ else
151
+ puts "Would have limbo page #{gid}"
152
+ end
153
+ end
154
+
155
+ def limbo page_gid
156
+ raise ArgumentError.new("page_gid needs to be a String.") unless page_gid.is_a?(String)
157
+ if page_gid == gid
158
+ self.limbo_self = true
159
+ raise Error::SafeTerminateError
160
+ end
161
+ limbo_page page_gid
162
+ end
163
+
141
164
  def eval_parser_script(save=false)
142
165
  update_parsing_starting_status
143
166
 
@@ -148,6 +171,7 @@ module Datahen
148
171
  page = init_page_vars(page)
149
172
  self.refetch_self = false
150
173
  self.reparse_self = false
174
+ self.limbo_self = false
151
175
 
152
176
  begin
153
177
  context = isolated_binding({
@@ -178,6 +202,8 @@ module Datahen
178
202
  refetch_page gid
179
203
  elsif reparse_self
180
204
  reparse_page gid
205
+ elsif limbo_self
206
+ limbo_page gid
181
207
  else
182
208
  update_parsing_done_status
183
209
  end
@@ -1,3 +1,3 @@
1
1
  module Datahen
2
- VERSION = "0.15.11"
2
+ VERSION = "0.17.0"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: datahen
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.15.11
4
+ version: 0.17.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Parama Danoesubroto
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2021-05-29 00:00:00.000000000 Z
11
+ date: 2021-08-20 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: thor