datahen 0.16.0 → 0.18.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 39397d5cb4e60a6d24cdec5bd979f543a23019b7c9b9dffe6140a204d330465c
4
- data.tar.gz: 1db7c2b448179c2bc4b56e99428dfb4303cbb1451df032ec43cb5264f58935ec
3
+ metadata.gz: 0e1fcf7422236924fd818a1527337a6089cd444b1f35510b72fe140facbed7b0
4
+ data.tar.gz: 05be57d3e058ee9969d210ded0b1d043b388390d5f2ac834ece490691683f39d
5
5
  SHA512:
6
- metadata.gz: 7058506211d537c8ea3c9a521625fd339b255f41188a70341cc04683ca1abc1fa7f19ed796026b5e07679bb2fd7e57f096d319fbe8a75ae6fb7fd59a704a9824
7
- data.tar.gz: b8b60607cd27acbd654afe0816b0b7738871ca61c370fbafa747985d3723fec64f9b07c6078b34c1192db1d7b160682522a4901a7047c3d067860bc2b745b0b0
6
+ metadata.gz: 4e076509fa8a0fa7fa78406916530bfe2c1b6075ac1007baab43a447911d1c7d8e90bddd8a1438c339a4fadef4e05e629e4907f1f1bae3f4c1f283dba63c25c9
7
+ data.tar.gz: e7ceb1208c87cd75fa7202f55549c6b2f2ce24980f7642827aab6f721107ca8ddb59829b93742126e465d6930c6c5574de2d045d3a968a7f3a826bf099ee3c4b
@@ -33,6 +33,8 @@ module Datahen
33
33
  option :multiple_jobs, type: :boolean, desc: 'Set true to enable multiple jobs. Default: false'
34
34
  option :max_job_count, type: :numeric, desc: 'Set a value to set max number of jobs available. Set -1 for unlimited. Default: 3'
35
35
  option :max_page_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
36
+ option :enable_global_cache, type: :boolean, desc: 'Set true to enable page cache. Default: false'
37
+ option :retry_interval, type: :numeric, desc: 'Set a value to set retry time interval on seconds when refetching a page. Set a value grather than 0 to set it as new time to refetch, 0 means default time. Default: 0'
36
38
  def create(scraper_name, git_repository)
37
39
  # puts "options #{options}"
38
40
  client = Client::Scraper.new(options)
@@ -59,6 +61,8 @@ module Datahen
59
61
  option :multiple_jobs, type: :boolean, desc: 'Set true to enable multiple jobs. Default: false'
60
62
  option :max_job_count, type: :numeric, desc: 'Set a value to set max number of jobs available. Set -1 for unlimited. Default: 3'
61
63
  option :max_page_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
64
+ option :enable_global_cache, type: :boolean, desc: 'Set true to enable page cache. Default: false'
65
+ option :retry_interval, type: :numeric, desc: 'Set a value to set retry time interval on seconds when refetching a page. Set a value grather than 0 to set it as new time to refetch, 0 means default time. Default: 0'
62
66
  def update(scraper_name)
63
67
  client = Client::Scraper.new(options)
64
68
  puts "#{client.update(scraper_name, options)}"
@@ -97,6 +101,7 @@ module Datahen
97
101
  option :proxy_type, desc: 'Set the Proxy type. Default: standard'
98
102
  option :vars, type: :string, banner: :JSON, desc: 'Set input vars. Must be in json format. i.e: [{"name":"foo", "value":"bar", "secret":false}] '
99
103
  option :max_page_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
104
+ option :retry_interval, type: :numeric, desc: 'Set a value to set retry time interval on seconds when refetching a page. Set a value grather than 0 to set it as new time to refetch, 0 means default time. Default: 0'
100
105
  def start(scraper_name)
101
106
  client = Client::ScraperJob.new(options)
102
107
  puts "Starting a scrape job..."
@@ -105,6 +105,8 @@ module Datahen
105
105
  option :profile, type: :string, desc: 'Set the profiles (comma separated) to apply to the job. Default: default'
106
106
  option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
107
107
  option :max_page_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
108
+ option :enable_global_cache, type: :boolean, desc: 'Set true to enable page cache. Default: false'
109
+ option :retry_interval, type: :numeric, desc: 'Set a value to set retry time interval on seconds when refetching a page. Set a value grather than 0 to set it as new time to refetch, 0 means default time. Default: 0'
108
110
  def update(scraper_name)
109
111
  if options[:job]
110
112
  client = Client::Job.new(options)
@@ -46,6 +46,7 @@ module Datahen
46
46
  option :ua_type, :aliases => :u, desc: 'Set user agent type. Default: desktop'
47
47
  option :no_redirect, :aliases => :n, type: :boolean, desc: 'Set true to not follow redirect. Default: false'
48
48
  option :max_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
49
+ option :retry_interval, type: :numeric, desc: 'Set a value to set retry time interval on seconds when refetching a page. Set a value grather than 0 to set it as new time to refetch, 0 means default time. Default: 0'
49
50
  def add(scraper_name, url)
50
51
  begin
51
52
  options[:headers] = JSON.parse(options[:headers]) if options[:headers]
@@ -80,6 +81,7 @@ module Datahen
80
81
  option :priority, type: :numeric, desc: 'Set fetch priority. The higher the value, the sooner the page gets fetched. Default: 0'
81
82
  option :vars, :aliases => :v, type: :string, desc: 'Set user-defined page variables. Must be in json format. i.e: {"Foo":"bar"}'
82
83
  option :max_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
84
+ option :retry_interval, type: :numeric, desc: 'Set a value to set retry time interval on seconds when refetching a page. Set a value grather than 0 to set it as new time to refetch, 0 means default time. Default: 0'
83
85
  def update(scraper_name, gid)
84
86
  begin
85
87
  options[:vars] = JSON.parse(options[:vars]) if options[:vars]
@@ -22,6 +22,8 @@ module Datahen
22
22
  body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
23
23
  body[:profile] = opts[:profile] if opts[:profile]
24
24
  body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
25
+ body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
26
+ body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
25
27
  params = @options.merge({body: body.to_json})
26
28
 
27
29
  self.class.put("/jobs/#{job_id}", params)
@@ -16,6 +16,8 @@ module Datahen
16
16
  body[:priority] = opts[:priority] if opts[:priority]
17
17
  body[:vars] = opts[:vars] if opts[:vars]
18
18
  body[:max_size] = opts[:max_size] if opts[:max_size]
19
+ body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
20
+ body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
19
21
 
20
22
  params = @options.merge({body: body.to_json})
21
23
 
@@ -38,6 +40,8 @@ module Datahen
38
40
  body[:no_redirect] = opts[:no_redirect] if opts[:no_redirect]
39
41
  body[:cookie] = opts[:cookie] if opts[:cookie]
40
42
  body[:max_size] = opts[:max_size] if opts[:max_size]
43
+ body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
44
+ body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
41
45
 
42
46
  params = @options.merge({body: body.to_json})
43
47
 
@@ -29,6 +29,8 @@ module Datahen
29
29
  body[:multiple_jobs] = opts[:multiple_jobs] if opts[:multiple_jobs]
30
30
  body[:max_job_count] = opts[:max_job_count] if opts[:max_job_count]
31
31
  body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
32
+ body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
33
+ body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
32
34
  params = @options.merge({body: body.to_json})
33
35
  self.class.post("/scrapers", params)
34
36
  end
@@ -51,6 +53,8 @@ module Datahen
51
53
  body[:multiple_jobs] = opts[:multiple_jobs] if opts.has_key?("multiple_jobs") || opts.has_key?(:multiple_jobs)
52
54
  body[:max_job_count] = opts[:max_job_count] if opts.has_key?("max_job_count") || opts.has_key?(:max_job_count)
53
55
  body[:max_page_size] = opts[:max_page_size] if opts.has_key?("max_page_size") || opts.has_key?(:max_page_size)
56
+ body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
57
+ body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
54
58
  params = @options.merge({body: body.to_json})
55
59
 
56
60
  self.class.put("/scrapers/#{scraper_name}", params)
@@ -12,6 +12,8 @@ module Datahen
12
12
  body[:browser_worker_count] = opts[:browsers] if opts[:browsers]
13
13
  body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
14
14
  body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
15
+ body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
16
+ body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
15
17
  if opts[:vars]
16
18
  if opts[:vars].is_a?(Array)
17
19
  body[:vars] = opts[:vars]
@@ -39,6 +41,8 @@ module Datahen
39
41
  body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
40
42
  body[:profile] = opts[:profile] if opts[:profile]
41
43
  body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
44
+ body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
45
+ body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
42
46
  params = @options.merge({body: body.to_json})
43
47
 
44
48
  self.class.put("/scrapers/#{scraper_name}/current_job", params)
@@ -15,6 +15,9 @@ module Datahen
15
15
  body[:page_type] = opts[:page_type] if opts[:page_type]
16
16
  body[:priority] = opts[:priority] if opts[:priority]
17
17
  body[:vars] = opts[:vars] if opts[:vars]
18
+ body[:max_size] = opts[:max_size] if opts[:max_size]
19
+ body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
20
+ body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
18
21
 
19
22
  params = @options.merge({body: body.to_json})
20
23
 
@@ -59,6 +62,9 @@ module Datahen
59
62
  body[:ua_type] = opts[:ua_type] if opts[:ua_type]
60
63
  body[:no_redirect] = opts[:no_redirect] if opts[:no_redirect]
61
64
  body[:cookie] = opts[:cookie] if opts[:cookie]
65
+ body[:max_size] = opts[:max_size] if opts[:max_size]
66
+ body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
67
+ body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
62
68
 
63
69
  params = @options.merge({body: body.to_json})
64
70
 
@@ -334,6 +334,7 @@ module Datahen
334
334
  Parallel.each(dequeue, in_threads: (worker_count)) do |page|
335
335
  parser_file = self.parsers[page['page_type']]
336
336
  begin
337
+ self.repeat_puts("Parsing page with GID #{page['gid']}")
337
338
  puts Datahen::Scraper::Parser.exec_parser_by_page(
338
339
  parser_file,
339
340
  page,
@@ -342,6 +343,7 @@ module Datahen
342
343
  nil,
343
344
  keep_outputs
344
345
  )
346
+ self.repeat_puts("Finish parsing page with GID #{page['gid']}")
345
347
  rescue Parallel::Kill => e
346
348
  puts "[Worker #{Parallel.worker_number}]: Someone tried to kill Parallel!!!"
347
349
  rescue Parallel::Break => e
@@ -8,7 +8,11 @@ module Datahen
8
8
  attr_accessor :refetch_self
9
9
  # Reparse self page flag.
10
10
  # @return [Boollean]
11
+ # @note It is stronger than #limbo_self flag.
11
12
  attr_accessor :reparse_self
13
+ # Limbo self page flag.
14
+ # @return [Boollean]
15
+ attr_accessor :limbo_self
12
16
 
13
17
  def initialize(options={})
14
18
  @filename = options.fetch(:filename) { raise "Filename is required"}
@@ -31,7 +35,8 @@ module Datahen
31
35
  :find_output,
32
36
  :find_outputs,
33
37
  :refetch,
34
- :reparse
38
+ :reparse,
39
+ :limbo
35
40
  ].freeze
36
41
  end
37
42
 
@@ -104,7 +109,7 @@ module Datahen
104
109
 
105
110
  def refetch_page gid
106
111
  if save
107
- Client::ScraperJobPage.new({gid: gid}).refetch_by_job(self.job_id)
112
+ Client::JobPage.new({gid: gid}).refetch(self.job_id)
108
113
  puts "Refetch page #{gid}"
109
114
  else
110
115
  puts "Would have refetch page #{gid}"
@@ -122,7 +127,7 @@ module Datahen
122
127
 
123
128
  def reparse_page gid
124
129
  if save
125
- Client::ScraperJobPage.new({gid: gid}).reparse_by_job(self.job_id)
130
+ Client::JobPage.new({gid: gid}).reparse(self.job_id)
126
131
  puts "Reparse page #{gid}"
127
132
  else
128
133
  puts "Would have reparse page #{gid}"
@@ -138,6 +143,24 @@ module Datahen
138
143
  reparse_page page_gid
139
144
  end
140
145
 
146
+ def limbo_page gid
147
+ if save
148
+ Client::JobPage.new({gid: gid}).limbo(self.job_id)
149
+ puts "Limbo page #{gid}"
150
+ else
151
+ puts "Would have limbo page #{gid}"
152
+ end
153
+ end
154
+
155
+ def limbo page_gid
156
+ raise ArgumentError.new("page_gid needs to be a String.") unless page_gid.is_a?(String)
157
+ if page_gid == gid
158
+ self.limbo_self = true
159
+ raise Error::SafeTerminateError
160
+ end
161
+ limbo_page page_gid
162
+ end
163
+
141
164
  def eval_parser_script(save=false)
142
165
  update_parsing_starting_status
143
166
 
@@ -148,6 +171,7 @@ module Datahen
148
171
  page = init_page_vars(page)
149
172
  self.refetch_self = false
150
173
  self.reparse_self = false
174
+ self.limbo_self = false
151
175
 
152
176
  begin
153
177
  context = isolated_binding({
@@ -178,6 +202,8 @@ module Datahen
178
202
  refetch_page gid
179
203
  elsif reparse_self
180
204
  reparse_page gid
205
+ elsif limbo_self
206
+ limbo_page gid
181
207
  else
182
208
  update_parsing_done_status
183
209
  end
@@ -1,3 +1,3 @@
1
1
  module Datahen
2
- VERSION = "0.16.0"
2
+ VERSION = "0.18.0"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: datahen
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.16.0
4
+ version: 0.18.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Parama Danoesubroto
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2021-07-22 00:00:00.000000000 Z
11
+ date: 2021-09-09 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: thor