datahen 1.3.1 → 1.4.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: af6aefb0106af924ba636a91de29b27953fbf403ede51a779b9787a3f135d1b9
4
- data.tar.gz: ce6f5dedbdff08f2034cb631e103c905ee82642b86349fdbbecc878281764c6a
3
+ metadata.gz: ae63999d11bc052d81e3b1de67a0741702dd980e719dd544b5e689f0383e7a34
4
+ data.tar.gz: faf53b662afa26409bff83c3007127211863ce33ff45b8c60aab56491fdcafe7
5
5
  SHA512:
6
- metadata.gz: ccaa0107c92694865efbbf8c1b8e8d5d100d60145c3f33439abe5561ea6f220efa8f499764b152fdc90f0173433e380b697b4bd3eaee68d7b0681815c789bcc5
7
- data.tar.gz: c4859147460d3600e76d903755eef91cb114c65b7abb17c0ab093fa9f23e7e298dd02a42fc1da8a25c6d50edee06c17b8175dca4d61b62ec012c947f1904d2f4
6
+ metadata.gz: b0e7a0ddc975202df66785211cee796e1aef61de921f99ba0481645f59fb65963c03517e1d4f5b471d2ed108087011f20d6b693d6d199e2e96c860412b675415
7
+ data.tar.gz: 70a13268ba6f3df8f560a1b4d65b261ec4744bbd71eb22b04c25dc5809e70af3541cc30d9dad0e4a4cc22c3a353b90bc24c4a43cfc70e7f82fc2080001596d38
@@ -28,46 +28,48 @@ module Datahen
28
28
  end
29
29
  end
30
30
 
31
- desc "add <scraper_name> <url>", "Enqueues a page to a scraper's current job"
31
+ desc "add <scraper_name> <page_json>", "Enqueues a page to a scraper's current job"
32
32
  long_desc <<-LONGDESC
33
33
  Enqueues a page to a scraper's current job\x5
34
34
  LONGDESC
35
35
  option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
36
- option :method, :aliases => :m, type: :string, desc: 'Set request method. Default: GET'
37
- option :headers, :aliases => :H, type: :string, banner: :JSON, desc: 'Set request headers. Must be in json format. i.e: {"Foo":"bar"} '
38
- option :cookie, :aliases => :c, type: :string, desc: 'Set request cookie.'
39
- option :vars, :aliases => :v, type: :string, banner: :JSON, desc: 'Set user-defined page variables. Must be in json format. i.e: {"Foo":"bar"}'
40
- option :page_type, :aliases => :t, desc: 'Set page type'
41
- option :priority, type: :numeric, desc: 'Set fetch priority. The higher the value, the sooner the page gets fetched. Default: 0'
42
- option :fetch_type, :aliases => :F, desc: 'Set fetch type. Default: http'
43
- option :body, :aliases => :b, desc: 'Set request body'
44
- option :force_fetch, :aliases => :f, type: :boolean, desc: 'Set true to force fetch page that is not within freshness criteria. Default: false'
45
- option :freshness, :aliases => :s, desc: 'Set how fresh the page cache is. Accepts timestamp format.'
46
- option :ua_type, :aliases => :u, desc: 'Set user agent type. Default: desktop'
47
- option :no_redirect, :aliases => :n, type: :boolean, desc: 'Set true to not follow redirect. Default: false'
48
- option :max_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value greater than 0 to set it as limit, 0 means any size. Default: 0'
49
- option :retry_interval, type: :numeric, desc: 'Set a value to set retry time interval on seconds when refetching a page. Set a value greater than 0 to set it as new time to refetch, 0 means default time. Default: 0'
50
- def add(scraper_name, url)
36
+ def add(scraper_name, page_json)
51
37
  begin
52
- options[:headers] = JSON.parse(options[:headers]) if options[:headers]
53
- options[:vars] = JSON.parse(options[:vars]) if options[:vars]
54
- method = options[:method]
38
+ page = JSON.parse(page_json)
55
39
 
56
40
  if options[:job]
57
41
  client = Client::JobPage.new(options)
58
- puts "#{client.enqueue(options[:job], method, url, options)}"
42
+ puts "#{client.enqueue(options[:job], page, options)}"
59
43
  else
60
44
  client = Client::ScraperJobPage.new(options)
61
- puts "#{client.enqueue(scraper_name, method, url, options)}"
45
+ puts "#{client.enqueue(scraper_name, page, options)}"
62
46
  end
63
47
 
64
48
  rescue JSON::ParserError
65
- if options[:headers]
66
- puts "Error: #{options[:headers]} on headers is not a valid JSON"
67
- end
68
- if options[:vars]
69
- puts "Error: #{options[:vars]} on vars is not a valid JSON"
49
+ puts "Error: Invalid JSON"
50
+ end
51
+ end
52
+
53
+
54
+ desc "getgid <scraper_name> <page_json>", "Get the generated GID for a scraper's current job"
55
+ long_desc <<-LONGDESC
56
+ Get the generated GID for a scraper's current job.\x5
57
+ LONGDESC
58
+ option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
59
+ def getgid(scraper_name, page_json)
60
+ begin
61
+ page = JSON.parse(page_json)
62
+
63
+ if options[:job]
64
+ client = Client::JobPage.new(options)
65
+ puts "#{client.get_gid(options[:job], page, options)}"
66
+ else
67
+ client = Client::ScraperJobPage.new(options)
68
+ puts "#{client.get_gid(scraper_name, page, options)}"
70
69
  end
70
+
71
+ rescue JSON::ParserError
72
+ puts "Error: Invalid JSON"
71
73
  end
72
74
  end
73
75
 
@@ -24,28 +24,18 @@ module Datahen
24
24
  self.class.put("/jobs/#{job_id}/pages/#{gid}", params)
25
25
  end
26
26
 
27
- def enqueue(job_id, method, url, opts={})
28
- body = {}
29
- body[:method] = method != "" ? method : "GET"
30
- body[:url] = url
31
- body[:page_type] = opts[:page_type] if opts[:page_type]
32
- body[:priority] = opts[:priority] if opts[:priority]
33
- body[:fetch_type] = opts[:fetch_type] if opts[:fetch_type]
34
- body[:body] = opts[:body] if opts[:body]
35
- body[:headers] = opts[:headers] if opts[:headers]
36
- body[:vars] = opts[:vars] if opts[:vars]
37
- body[:force_fetch] = opts[:force_fetch] if opts[:force_fetch]
38
- body[:freshness] = opts[:freshness] if opts[:freshness]
39
- body[:ua_type] = opts[:ua_type] if opts[:ua_type]
40
- body[:no_redirect] = opts[:no_redirect] if opts[:no_redirect]
41
- body[:cookie] = opts[:cookie] if opts[:cookie]
42
- body[:max_size] = opts[:max_size] if opts[:max_size]
43
- body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
44
- body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
45
-
46
- params = @options.merge({body: body.to_json})
27
+ def enqueue(job_id, page, opts={})
28
+ params = @options.merge(opts).merge({body: page.to_json})
47
29
 
48
30
  self.class.post("/jobs/#{job_id}/pages", params)
31
+
32
+ end
33
+
34
+ def get_gid(job_id, page, opts={})
35
+
36
+ params = @options.merge(opts).merge({body: page.to_json})
37
+
38
+ self.class.post("/jobs/#{job_id}/generate_gid", params)
49
39
  end
50
40
 
51
41
  def dequeue(job_id, limit, page_types, parse_fetching_failed, opts = {})
@@ -47,30 +47,19 @@ module Datahen
47
47
  self.class.put("/scrapers/#{scraper_name}/current_job/pages/limbo", params)
48
48
  end
49
49
 
50
- def enqueue(scraper_name, method, url, opts={})
51
- body = {}
52
- body[:method] = method != "" ? method : "GET"
53
- body[:url] = url
54
- body[:page_type] = opts[:page_type] if opts[:page_type]
55
- body[:priority] = opts[:priority] if opts[:priority]
56
- body[:fetch_type] = opts[:fetch_type] if opts[:fetch_type]
57
- body[:body] = opts[:body] if opts[:body]
58
- body[:headers] = opts[:headers] if opts[:headers]
59
- body[:vars] = opts[:vars] if opts[:vars]
60
- body[:force_fetch] = opts[:force_fetch] if opts[:force_fetch]
61
- body[:freshness] = opts[:freshness] if opts[:freshness]
62
- body[:ua_type] = opts[:ua_type] if opts[:ua_type]
63
- body[:no_redirect] = opts[:no_redirect] if opts[:no_redirect]
64
- body[:cookie] = opts[:cookie] if opts[:cookie]
65
- body[:max_size] = opts[:max_size] if opts[:max_size]
66
- body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
67
- body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
68
-
69
- params = @options.merge({body: body.to_json})
50
+ def enqueue(scraper_name, page, opts={})
51
+ params = @options.merge(opts).merge({body: page.to_json})
70
52
 
71
53
  self.class.post("/scrapers/#{scraper_name}/current_job/pages", params)
72
54
  end
73
55
 
56
+ def get_gid(scraper_name, page, opts={})
57
+
58
+ params = @options.merge(opts).merge({body: page.to_json})
59
+
60
+ self.class.post("/scrapers/#{scraper_name}/current_job/generate_gid", params)
61
+ end
62
+
74
63
  def find_content(scraper_name, gid)
75
64
  self.class.get("/scrapers/#{scraper_name}/current_job/pages/#{gid}/content", @options)
76
65
  end
@@ -219,7 +219,7 @@ module Datahen
219
219
  self.dequeuer_is_alive!
220
220
 
221
221
  # ensure a valid response or try again
222
- if response.nil? || response.response.code.to_i != 200
222
+ if response.body.nil? || response.body.empty? || response.response.code.to_i != 200
223
223
  self.repeat_puts(response.nil? ? 'null' : response.body)
224
224
  self.recollect_garbage
225
225
  return 0
@@ -1,3 +1,3 @@
1
1
  module Datahen
2
- VERSION = "1.3.1"
2
+ VERSION = "1.4.0"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: datahen
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.3.1
4
+ version: 1.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Parama Danoesubroto
8
- autorequire:
8
+ autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2023-06-05 00:00:00.000000000 Z
11
+ date: 2023-11-01 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: thor
@@ -278,7 +278,7 @@ metadata:
278
278
  allowed_push_host: https://rubygems.org
279
279
  homepage_uri: https://datahen.com
280
280
  source_code_uri: https://github.com/DataHenOfficial/datahen-ruby
281
- post_install_message:
281
+ post_install_message:
282
282
  rdoc_options: []
283
283
  require_paths:
284
284
  - lib
@@ -293,8 +293,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
293
293
  - !ruby/object:Gem::Version
294
294
  version: '0'
295
295
  requirements: []
296
- rubygems_version: 3.0.3
297
- signing_key:
296
+ rubygems_version: 3.1.4
297
+ signing_key:
298
298
  specification_version: 4
299
299
  summary: DataHen toolbelt for developers
300
300
  test_files: []