datahen 1.3.1 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/datahen/cli/scraper_page.rb +28 -26
- data/lib/datahen/client/job_page.rb +10 -20
- data/lib/datahen/client/scraper_job_page.rb +9 -20
- data/lib/datahen/scraper/batch_parser.rb +1 -1
- data/lib/datahen/version.rb +1 -1
- metadata +6 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ae63999d11bc052d81e3b1de67a0741702dd980e719dd544b5e689f0383e7a34
|
4
|
+
data.tar.gz: faf53b662afa26409bff83c3007127211863ce33ff45b8c60aab56491fdcafe7
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b0e7a0ddc975202df66785211cee796e1aef61de921f99ba0481645f59fb65963c03517e1d4f5b471d2ed108087011f20d6b693d6d199e2e96c860412b675415
|
7
|
+
data.tar.gz: 70a13268ba6f3df8f560a1b4d65b261ec4744bbd71eb22b04c25dc5809e70af3541cc30d9dad0e4a4cc22c3a353b90bc24c4a43cfc70e7f82fc2080001596d38
|
@@ -28,46 +28,48 @@ module Datahen
|
|
28
28
|
end
|
29
29
|
end
|
30
30
|
|
31
|
-
desc "add <scraper_name> <url>", "Enqueues a page to a scraper's current job"
|
31
|
+
desc "add <scraper_name> <page_json>", "Enqueues a page to a scraper's current job"
|
32
32
|
long_desc <<-LONGDESC
|
33
33
|
Enqueues a page to a scraper's current job\x5
|
34
34
|
LONGDESC
|
35
35
|
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
36
|
-
|
37
|
-
option :headers, :aliases => :H, type: :string, banner: :JSON, desc: 'Set request headers. Must be in json format. i.e: {"Foo":"bar"} '
|
38
|
-
option :cookie, :aliases => :c, type: :string, desc: 'Set request cookie.'
|
39
|
-
option :vars, :aliases => :v, type: :string, banner: :JSON, desc: 'Set user-defined page variables. Must be in json format. i.e: {"Foo":"bar"}'
|
40
|
-
option :page_type, :aliases => :t, desc: 'Set page type'
|
41
|
-
option :priority, type: :numeric, desc: 'Set fetch priority. The higher the value, the sooner the page gets fetched. Default: 0'
|
42
|
-
option :fetch_type, :aliases => :F, desc: 'Set fetch type. Default: http'
|
43
|
-
option :body, :aliases => :b, desc: 'Set request body'
|
44
|
-
option :force_fetch, :aliases => :f, type: :boolean, desc: 'Set true to force fetch page that is not within freshness criteria. Default: false'
|
45
|
-
option :freshness, :aliases => :s, desc: 'Set how fresh the page cache is. Accepts timestap format.'
|
46
|
-
option :ua_type, :aliases => :u, desc: 'Set user agent type. Default: desktop'
|
47
|
-
option :no_redirect, :aliases => :n, type: :boolean, desc: 'Set true to not follow redirect. Default: false'
|
48
|
-
option :max_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
|
49
|
-
option :retry_interval, type: :numeric, desc: 'Set a value to set retry time interval on seconds when refetching a page. Set a value grather than 0 to set it as new time to refetch, 0 means default time. Default: 0'
|
50
|
-
def add(scraper_name, url)
|
36
|
+
def add(scraper_name, page_json)
|
51
37
|
begin
|
52
|
-
|
53
|
-
options[:vars] = JSON.parse(options[:vars]) if options[:vars]
|
54
|
-
method = options[:method]
|
38
|
+
page = JSON.parse(page_json)
|
55
39
|
|
56
40
|
if options[:job]
|
57
41
|
client = Client::JobPage.new(options)
|
58
|
-
puts "#{client.enqueue(options[:job],
|
42
|
+
puts "#{client.enqueue(options[:job], page, options)}"
|
59
43
|
else
|
60
44
|
client = Client::ScraperJobPage.new(options)
|
61
|
-
puts "#{client.enqueue(scraper_name,
|
45
|
+
puts "#{client.enqueue(scraper_name, page, options)}"
|
62
46
|
end
|
63
47
|
|
64
48
|
rescue JSON::ParserError
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
49
|
+
puts "Error: Invalid JSON"
|
50
|
+
end
|
51
|
+
end
|
52
|
+
|
53
|
+
|
54
|
+
desc "getgid <scraper_name> <page_json>", "Get the generated GID for a scraper's current job"
|
55
|
+
long_desc <<-LONGDESC
|
56
|
+
Get the generated GID for a scraper's current job.\x5
|
57
|
+
LONGDESC
|
58
|
+
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
59
|
+
def getgid(scraper_name, page_json)
|
60
|
+
begin
|
61
|
+
page = JSON.parse(page_json)
|
62
|
+
|
63
|
+
if options[:job]
|
64
|
+
client = Client::JobPage.new(options)
|
65
|
+
puts "#{client.get_gid(options[:job], page, options)}"
|
66
|
+
else
|
67
|
+
client = Client::ScraperJobPage.new(options)
|
68
|
+
puts "#{client.get_gid(scraper_name, page, options)}"
|
70
69
|
end
|
70
|
+
|
71
|
+
rescue JSON::ParserError
|
72
|
+
puts "Error: Invalid JSON"
|
71
73
|
end
|
72
74
|
end
|
73
75
|
|
@@ -24,28 +24,18 @@ module Datahen
|
|
24
24
|
self.class.put("/jobs/#{job_id}/pages/#{gid}", params)
|
25
25
|
end
|
26
26
|
|
27
|
-
def enqueue(job_id,
|
28
|
-
|
29
|
-
body[:method] = method != "" ? method : "GET"
|
30
|
-
body[:url] = url
|
31
|
-
body[:page_type] = opts[:page_type] if opts[:page_type]
|
32
|
-
body[:priority] = opts[:priority] if opts[:priority]
|
33
|
-
body[:fetch_type] = opts[:fetch_type] if opts[:fetch_type]
|
34
|
-
body[:body] = opts[:body] if opts[:body]
|
35
|
-
body[:headers] = opts[:headers] if opts[:headers]
|
36
|
-
body[:vars] = opts[:vars] if opts[:vars]
|
37
|
-
body[:force_fetch] = opts[:force_fetch] if opts[:force_fetch]
|
38
|
-
body[:freshness] = opts[:freshness] if opts[:freshness]
|
39
|
-
body[:ua_type] = opts[:ua_type] if opts[:ua_type]
|
40
|
-
body[:no_redirect] = opts[:no_redirect] if opts[:no_redirect]
|
41
|
-
body[:cookie] = opts[:cookie] if opts[:cookie]
|
42
|
-
body[:max_size] = opts[:max_size] if opts[:max_size]
|
43
|
-
body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
|
44
|
-
body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
|
45
|
-
|
46
|
-
params = @options.merge({body: body.to_json})
|
27
|
+
def enqueue(job_id, page, opts={})
|
28
|
+
params = @options.merge(opts).merge({body: page.to_json})
|
47
29
|
|
48
30
|
self.class.post("/jobs/#{job_id}/pages", params)
|
31
|
+
|
32
|
+
end
|
33
|
+
|
34
|
+
def get_gid(job_id, page, opts={})
|
35
|
+
|
36
|
+
params = @options.merge(opts).merge({body: page.to_json})
|
37
|
+
|
38
|
+
self.class.post("/jobs/#{job_id}/generate_gid", params)
|
49
39
|
end
|
50
40
|
|
51
41
|
def dequeue(job_id, limit, page_types, parse_fetching_failed, opts = {})
|
@@ -47,30 +47,19 @@ module Datahen
|
|
47
47
|
self.class.put("/scrapers/#{scraper_name}/current_job/pages/limbo", params)
|
48
48
|
end
|
49
49
|
|
50
|
-
def enqueue(scraper_name,
|
51
|
-
|
52
|
-
body[:method] = method != "" ? method : "GET"
|
53
|
-
body[:url] = url
|
54
|
-
body[:page_type] = opts[:page_type] if opts[:page_type]
|
55
|
-
body[:priority] = opts[:priority] if opts[:priority]
|
56
|
-
body[:fetch_type] = opts[:fetch_type] if opts[:fetch_type]
|
57
|
-
body[:body] = opts[:body] if opts[:body]
|
58
|
-
body[:headers] = opts[:headers] if opts[:headers]
|
59
|
-
body[:vars] = opts[:vars] if opts[:vars]
|
60
|
-
body[:force_fetch] = opts[:force_fetch] if opts[:force_fetch]
|
61
|
-
body[:freshness] = opts[:freshness] if opts[:freshness]
|
62
|
-
body[:ua_type] = opts[:ua_type] if opts[:ua_type]
|
63
|
-
body[:no_redirect] = opts[:no_redirect] if opts[:no_redirect]
|
64
|
-
body[:cookie] = opts[:cookie] if opts[:cookie]
|
65
|
-
body[:max_size] = opts[:max_size] if opts[:max_size]
|
66
|
-
body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
|
67
|
-
body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
|
68
|
-
|
69
|
-
params = @options.merge({body: body.to_json})
|
50
|
+
def enqueue(scraper_name, page, opts={})
|
51
|
+
params = @options.merge(opts).merge({body: page.to_json})
|
70
52
|
|
71
53
|
self.class.post("/scrapers/#{scraper_name}/current_job/pages", params)
|
72
54
|
end
|
73
55
|
|
56
|
+
def get_gid(scraper_name, page, opts={})
|
57
|
+
|
58
|
+
params = @options.merge(opts).merge({body: page.to_json})
|
59
|
+
|
60
|
+
self.class.post("/scrapers/#{scraper_name}/current_job/generate_gid", params)
|
61
|
+
end
|
62
|
+
|
74
63
|
def find_content(scraper_name, gid)
|
75
64
|
self.class.get("/scrapers/#{scraper_name}/current_job/pages/#{gid}/content", @options)
|
76
65
|
end
|
@@ -219,7 +219,7 @@ module Datahen
|
|
219
219
|
self.dequeuer_is_alive!
|
220
220
|
|
221
221
|
# ensure a valid response or try again
|
222
|
-
if response.nil? || response.response.code.to_i != 200
|
222
|
+
if response.body.nil? || response.body.empty? || response.response.code.to_i != 200
|
223
223
|
self.repeat_puts(response.nil? ? 'null' : response.body)
|
224
224
|
self.recollect_garbage
|
225
225
|
return 0
|
data/lib/datahen/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: datahen
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.3.1
|
4
|
+
version: 1.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Parama Danoesubroto
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-
|
11
|
+
date: 2023-11-01 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: thor
|
@@ -278,7 +278,7 @@ metadata:
|
|
278
278
|
allowed_push_host: https://rubygems.org
|
279
279
|
homepage_uri: https://datahen.com
|
280
280
|
source_code_uri: https://github.com/DataHenOfficial/datahen-ruby
|
281
|
-
post_install_message:
|
281
|
+
post_install_message:
|
282
282
|
rdoc_options: []
|
283
283
|
require_paths:
|
284
284
|
- lib
|
@@ -293,8 +293,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
293
293
|
- !ruby/object:Gem::Version
|
294
294
|
version: '0'
|
295
295
|
requirements: []
|
296
|
-
rubygems_version: 3.
|
297
|
-
signing_key:
|
296
|
+
rubygems_version: 3.1.4
|
297
|
+
signing_key:
|
298
298
|
specification_version: 4
|
299
299
|
summary: DataHen toolbelt for developers
|
300
300
|
test_files: []
|