datahen 1.3.1 → 1.4.0

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between these versions as they appear in their public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: af6aefb0106af924ba636a91de29b27953fbf403ede51a779b9787a3f135d1b9
- data.tar.gz: ce6f5dedbdff08f2034cb631e103c905ee82642b86349fdbbecc878281764c6a
+ metadata.gz: ae63999d11bc052d81e3b1de67a0741702dd980e719dd544b5e689f0383e7a34
+ data.tar.gz: faf53b662afa26409bff83c3007127211863ce33ff45b8c60aab56491fdcafe7
  SHA512:
- metadata.gz: ccaa0107c92694865efbbf8c1b8e8d5d100d60145c3f33439abe5561ea6f220efa8f499764b152fdc90f0173433e380b697b4bd3eaee68d7b0681815c789bcc5
- data.tar.gz: c4859147460d3600e76d903755eef91cb114c65b7abb17c0ab093fa9f23e7e298dd02a42fc1da8a25c6d50edee06c17b8175dca4d61b62ec012c947f1904d2f4
+ metadata.gz: b0e7a0ddc975202df66785211cee796e1aef61de921f99ba0481645f59fb65963c03517e1d4f5b471d2ed108087011f20d6b693d6d199e2e96c860412b675415
+ data.tar.gz: 70a13268ba6f3df8f560a1b4d65b261ec4744bbd71eb22b04c25dc5809e70af3541cc30d9dad0e4a4cc22c3a353b90bc24c4a43cfc70e7f82fc2080001596d38
@@ -28,46 +28,48 @@ module Datahen
  end
  end

- desc "add <scraper_name> <url>", "Enqueues a page to a scraper's current job"
+ desc "add <scraper_name> <page_json>", "Enqueues a page to a scraper's current job"
  long_desc <<-LONGDESC
  Enqueues a page to a scraper's current job\x5
  LONGDESC
  option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
- option :method, :aliases => :m, type: :string, desc: 'Set request method. Default: GET'
- option :headers, :aliases => :H, type: :string, banner: :JSON, desc: 'Set request headers. Must be in json format. i.e: {"Foo":"bar"} '
- option :cookie, :aliases => :c, type: :string, desc: 'Set request cookie.'
- option :vars, :aliases => :v, type: :string, banner: :JSON, desc: 'Set user-defined page variables. Must be in json format. i.e: {"Foo":"bar"}'
- option :page_type, :aliases => :t, desc: 'Set page type'
- option :priority, type: :numeric, desc: 'Set fetch priority. The higher the value, the sooner the page gets fetched. Default: 0'
- option :fetch_type, :aliases => :F, desc: 'Set fetch type. Default: http'
- option :body, :aliases => :b, desc: 'Set request body'
- option :force_fetch, :aliases => :f, type: :boolean, desc: 'Set true to force fetch page that is not within freshness criteria. Default: false'
- option :freshness, :aliases => :s, desc: 'Set how fresh the page cache is. Accepts timestap format.'
- option :ua_type, :aliases => :u, desc: 'Set user agent type. Default: desktop'
- option :no_redirect, :aliases => :n, type: :boolean, desc: 'Set true to not follow redirect. Default: false'
- option :max_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
- option :retry_interval, type: :numeric, desc: 'Set a value to set retry time interval on seconds when refetching a page. Set a value grather than 0 to set it as new time to refetch, 0 means default time. Default: 0'
- def add(scraper_name, url)
+ def add(scraper_name, page_json)
  begin
- options[:headers] = JSON.parse(options[:headers]) if options[:headers]
- options[:vars] = JSON.parse(options[:vars]) if options[:vars]
- method = options[:method]
+ page = JSON.parse(page_json)

  if options[:job]
  client = Client::JobPage.new(options)
- puts "#{client.enqueue(options[:job], method, url, options)}"
+ puts "#{client.enqueue(options[:job], page, options)}"
  else
  client = Client::ScraperJobPage.new(options)
- puts "#{client.enqueue(scraper_name, method, url, options)}"
+ puts "#{client.enqueue(scraper_name, page, options)}"
  end

  rescue JSON::ParserError
- if options[:headers]
- puts "Error: #{options[:headers]} on headers is not a valid JSON"
- end
- if options[:vars]
- puts "Error: #{options[:vars]} on vars is not a valid JSON"
+ puts "Error: Invalid JSON"
+ end
+ end
+
+
+ desc "getgid <scraper_name> <page_json>", "Get the generated GID for a scraper's current job"
+ long_desc <<-LONGDESC
+ Get the generated GID for a scraper's current job.\x5
+ LONGDESC
+ option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
+ def getgid(scraper_name, page_json)
+ begin
+ page = JSON.parse(page_json)
+
+ if options[:job]
+ client = Client::JobPage.new(options)
+ puts "#{client.get_gid(options[:job], page, options)}"
+ else
+ client = Client::ScraperJobPage.new(options)
+ puts "#{client.get_gid(scraper_name, page, options)}"
  end
+
+ rescue JSON::ParserError
+ puts "Error: Invalid JSON"
  end
  end

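The net effect on the CLI: `add` now takes the whole page as one JSON document instead of a URL plus one flag per field, and the new `getgid` command accepts the same document. A hypothetical invocation under the new interface (the scraper name and all field values are placeholders, and this assumes the commands stay mounted under `scraper page` as in earlier releases):

    # enqueue a page described entirely by JSON
    datahen scraper page add my-scraper '{"url":"https://example.com","method":"GET","vars":{"category":"books"}}'
    # ask the server which GID that page would generate
    datahen scraper page getgid my-scraper '{"url":"https://example.com"}'

Because the document is parsed once and forwarded as the request body, any page field the server accepts can be set this way without a dedicated CLI flag.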
@@ -24,28 +24,18 @@ module Datahen
  self.class.put("/jobs/#{job_id}/pages/#{gid}", params)
  end

- def enqueue(job_id, method, url, opts={})
- body = {}
- body[:method] = method != "" ? method : "GET"
- body[:url] = url
- body[:page_type] = opts[:page_type] if opts[:page_type]
- body[:priority] = opts[:priority] if opts[:priority]
- body[:fetch_type] = opts[:fetch_type] if opts[:fetch_type]
- body[:body] = opts[:body] if opts[:body]
- body[:headers] = opts[:headers] if opts[:headers]
- body[:vars] = opts[:vars] if opts[:vars]
- body[:force_fetch] = opts[:force_fetch] if opts[:force_fetch]
- body[:freshness] = opts[:freshness] if opts[:freshness]
- body[:ua_type] = opts[:ua_type] if opts[:ua_type]
- body[:no_redirect] = opts[:no_redirect] if opts[:no_redirect]
- body[:cookie] = opts[:cookie] if opts[:cookie]
- body[:max_size] = opts[:max_size] if opts[:max_size]
- body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
- body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
-
- params = @options.merge({body: body.to_json})
+ def enqueue(job_id, page, opts={})
+ params = @options.merge(opts).merge({body: page.to_json})

  self.class.post("/jobs/#{job_id}/pages", params)
+
+ end
+
+ def get_gid(job_id, page, opts={})
+
+ params = @options.merge(opts).merge({body: page.to_json})
+
+ self.class.post("/jobs/#{job_id}/generate_gid", params)
  end

  def dequeue(job_id, limit, page_types, parse_fetching_failed, opts = {})
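`JobPage#enqueue` no longer rebuilds the request body field by field: the caller's page hash is serialized wholesale, any extra `opts` are merged into the request parameters, and the new `get_gid` posts the same payload to `generate_gid` instead of `pages`. A minimal Ruby sketch of the new call shape (the job ID and page fields are illustrative, and it assumes a client configured with valid API credentials):

    require 'datahen'

    client = Datahen::Client::JobPage.new
    page = { "url" => "https://example.com", "method" => "GET", "page_type" => "listing" }
    puts client.enqueue(123, page)  # POST /jobs/123/pages
    puts client.get_gid(123, page)  # POST /jobs/123/generate_gid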
@@ -47,30 +47,19 @@ module Datahen
  self.class.put("/scrapers/#{scraper_name}/current_job/pages/limbo", params)
  end

- def enqueue(scraper_name, method, url, opts={})
- body = {}
- body[:method] = method != "" ? method : "GET"
- body[:url] = url
- body[:page_type] = opts[:page_type] if opts[:page_type]
- body[:priority] = opts[:priority] if opts[:priority]
- body[:fetch_type] = opts[:fetch_type] if opts[:fetch_type]
- body[:body] = opts[:body] if opts[:body]
- body[:headers] = opts[:headers] if opts[:headers]
- body[:vars] = opts[:vars] if opts[:vars]
- body[:force_fetch] = opts[:force_fetch] if opts[:force_fetch]
- body[:freshness] = opts[:freshness] if opts[:freshness]
- body[:ua_type] = opts[:ua_type] if opts[:ua_type]
- body[:no_redirect] = opts[:no_redirect] if opts[:no_redirect]
- body[:cookie] = opts[:cookie] if opts[:cookie]
- body[:max_size] = opts[:max_size] if opts[:max_size]
- body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
- body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
-
- params = @options.merge({body: body.to_json})
+ def enqueue(scraper_name, page, opts={})
+ params = @options.merge(opts).merge({body: page.to_json})

  self.class.post("/scrapers/#{scraper_name}/current_job/pages", params)
  end

+ def get_gid(scraper_name, page, opts={})
+
+ params = @options.merge(opts).merge({body: page.to_json})
+
+ self.class.post("/scrapers/#{scraper_name}/current_job/generate_gid", params)
+ end
+
  def find_content(scraper_name, gid)
  self.class.get("/scrapers/#{scraper_name}/current_job/pages/#{gid}/content", @options)
  end
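`ScraperJobPage` gets the same treatment, keyed by scraper name rather than job ID. One consequence of the new shape: fields the old signature mapped flag by flag (page_type, priority, freshness, enable_global_cache, and so on) now travel inside the page document itself, so a new server-side page field needs no client change. A sketch of a page hash roughly equivalent to the old flag-based form (all values are placeholders):

    require 'datahen'

    page = {
      "url"         => "https://example.com/item/1",
      "method"      => "POST",
      "body"        => "a=1",
      "page_type"   => "detail",
      "priority"    => 5,
      "force_fetch" => true
    }
    Datahen::Client::ScraperJobPage.new.enqueue("my-scraper", page)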
@@ -219,7 +219,7 @@ module Datahen
  self.dequeuer_is_alive!

  # ensure a valid response or try again
- if response.nil? || response.response.code.to_i != 200
+ if response.body.nil? || response.body.empty? || response.response.code.to_i != 200
  self.repeat_puts(response.nil? ? 'null' : response.body)
  self.recollect_garbage
  return 0
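The dequeuer's guard previously accepted any non-nil response; it now also treats a 200 response with a missing or empty body as a failure, logging it and returning 0 so the batch is retried on the next pass. Restated outside the diff (a paraphrase of the condition, not new behavior):

    # a response only counts when it has a body and the status is 200
    valid = !response.body.nil? && !response.body.empty? && response.response.code.to_i == 200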
@@ -1,3 +1,3 @@
  module Datahen
- VERSION = "1.3.1"
+ VERSION = "1.4.0"
  end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: datahen
  version: !ruby/object:Gem::Version
- version: 1.3.1
+ version: 1.4.0
  platform: ruby
  authors:
  - Parama Danoesubroto
- autorequire:
+ autorequire:
  bindir: exe
  cert_chain: []
- date: 2023-06-05 00:00:00.000000000 Z
+ date: 2023-11-01 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: thor
@@ -278,7 +278,7 @@ metadata:
  allowed_push_host: https://rubygems.org
  homepage_uri: https://datahen.com
  source_code_uri: https://github.com/DataHenOfficial/datahen-ruby
- post_install_message:
+ post_install_message:
  rdoc_options: []
  require_paths:
  - lib
@@ -293,8 +293,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
  - !ruby/object:Gem::Version
  version: '0'
  requirements: []
- rubygems_version: 3.0.3
- signing_key:
+ rubygems_version: 3.1.4
+ signing_key:
  specification_version: 4
  summary: DataHen toolbelt for developers
  test_files: []