datahen 1.3.1 → 1.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/datahen/cli/scraper_page.rb +28 -26
- data/lib/datahen/client/job_page.rb +10 -20
- data/lib/datahen/client/scraper_job_page.rb +9 -20
- data/lib/datahen/scraper/batch_parser.rb +1 -1
- data/lib/datahen/version.rb +1 -1
- metadata +6 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ae63999d11bc052d81e3b1de67a0741702dd980e719dd544b5e689f0383e7a34
|
4
|
+
data.tar.gz: faf53b662afa26409bff83c3007127211863ce33ff45b8c60aab56491fdcafe7
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b0e7a0ddc975202df66785211cee796e1aef61de921f99ba0481645f59fb65963c03517e1d4f5b471d2ed108087011f20d6b693d6d199e2e96c860412b675415
|
7
|
+
data.tar.gz: 70a13268ba6f3df8f560a1b4d65b261ec4744bbd71eb22b04c25dc5809e70af3541cc30d9dad0e4a4cc22c3a353b90bc24c4a43cfc70e7f82fc2080001596d38
|
@@ -28,46 +28,48 @@ module Datahen
|
|
28
28
|
end
|
29
29
|
end
|
30
30
|
|
31
|
-
desc "add <scraper_name> <url>", "Enqueues a page to a scraper's current job"
|
31
|
+
desc "add <scraper_name> <page_json>", "Enqueues a page to a scraper's current job"
|
32
32
|
long_desc <<-LONGDESC
|
33
33
|
Enqueues a page to a scraper's current job\x5
|
34
34
|
LONGDESC
|
35
35
|
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
36
|
-
|
37
|
-
option :headers, :aliases => :H, type: :string, banner: :JSON, desc: 'Set request headers. Must be in json format. i.e: {"Foo":"bar"} '
|
38
|
-
option :cookie, :aliases => :c, type: :string, desc: 'Set request cookie.'
|
39
|
-
option :vars, :aliases => :v, type: :string, banner: :JSON, desc: 'Set user-defined page variables. Must be in json format. i.e: {"Foo":"bar"}'
|
40
|
-
option :page_type, :aliases => :t, desc: 'Set page type'
|
41
|
-
option :priority, type: :numeric, desc: 'Set fetch priority. The higher the value, the sooner the page gets fetched. Default: 0'
|
42
|
-
option :fetch_type, :aliases => :F, desc: 'Set fetch type. Default: http'
|
43
|
-
option :body, :aliases => :b, desc: 'Set request body'
|
44
|
-
option :force_fetch, :aliases => :f, type: :boolean, desc: 'Set true to force fetch page that is not within freshness criteria. Default: false'
|
45
|
-
option :freshness, :aliases => :s, desc: 'Set how fresh the page cache is. Accepts timestap format.'
|
46
|
-
option :ua_type, :aliases => :u, desc: 'Set user agent type. Default: desktop'
|
47
|
-
option :no_redirect, :aliases => :n, type: :boolean, desc: 'Set true to not follow redirect. Default: false'
|
48
|
-
option :max_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
|
49
|
-
option :retry_interval, type: :numeric, desc: 'Set a value to set retry time interval on seconds when refetching a page. Set a value grather than 0 to set it as new time to refetch, 0 means default time. Default: 0'
|
50
|
-
def add(scraper_name, url)
|
36
|
+
def add(scraper_name, page_json)
|
51
37
|
begin
|
52
|
-
|
53
|
-
options[:vars] = JSON.parse(options[:vars]) if options[:vars]
|
54
|
-
method = options[:method]
|
38
|
+
page = JSON.parse(page_json)
|
55
39
|
|
56
40
|
if options[:job]
|
57
41
|
client = Client::JobPage.new(options)
|
58
|
-
puts "#{client.enqueue(options[:job], method, url, options)}"
|
42
|
+
puts "#{client.enqueue(options[:job], page, options)}"
|
59
43
|
else
|
60
44
|
client = Client::ScraperJobPage.new(options)
|
61
|
-
puts "#{client.enqueue(scraper_name, method, url, options)}"
|
45
|
+
puts "#{client.enqueue(scraper_name, page, options)}"
|
62
46
|
end
|
63
47
|
|
64
48
|
rescue JSON::ParserError
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
49
|
+
puts "Error: Invalid JSON"
|
50
|
+
end
|
51
|
+
end
|
52
|
+
|
53
|
+
|
54
|
+
desc "getgid <scraper_name> <page_json>", "Get the generated GID for a scraper's current job"
|
55
|
+
long_desc <<-LONGDESC
|
56
|
+
Get the generated GID for a scraper's current job.\x5
|
57
|
+
LONGDESC
|
58
|
+
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
59
|
+
def getgid(scraper_name, page_json)
|
60
|
+
begin
|
61
|
+
page = JSON.parse(page_json)
|
62
|
+
|
63
|
+
if options[:job]
|
64
|
+
client = Client::JobPage.new(options)
|
65
|
+
puts "#{client.get_gid(options[:job], page, options)}"
|
66
|
+
else
|
67
|
+
client = Client::ScraperJobPage.new(options)
|
68
|
+
puts "#{client.get_gid(scraper_name, page, options)}"
|
70
69
|
end
|
70
|
+
|
71
|
+
rescue JSON::ParserError
|
72
|
+
puts "Error: Invalid JSON"
|
71
73
|
end
|
72
74
|
end
|
73
75
|
|
@@ -24,28 +24,18 @@ module Datahen
|
|
24
24
|
self.class.put("/jobs/#{job_id}/pages/#{gid}", params)
|
25
25
|
end
|
26
26
|
|
27
|
-
def enqueue(job_id, method, url, opts={})
|
28
|
-
|
29
|
-
body[:method] = method != "" ? method : "GET"
|
30
|
-
body[:url] = url
|
31
|
-
body[:page_type] = opts[:page_type] if opts[:page_type]
|
32
|
-
body[:priority] = opts[:priority] if opts[:priority]
|
33
|
-
body[:fetch_type] = opts[:fetch_type] if opts[:fetch_type]
|
34
|
-
body[:body] = opts[:body] if opts[:body]
|
35
|
-
body[:headers] = opts[:headers] if opts[:headers]
|
36
|
-
body[:vars] = opts[:vars] if opts[:vars]
|
37
|
-
body[:force_fetch] = opts[:force_fetch] if opts[:force_fetch]
|
38
|
-
body[:freshness] = opts[:freshness] if opts[:freshness]
|
39
|
-
body[:ua_type] = opts[:ua_type] if opts[:ua_type]
|
40
|
-
body[:no_redirect] = opts[:no_redirect] if opts[:no_redirect]
|
41
|
-
body[:cookie] = opts[:cookie] if opts[:cookie]
|
42
|
-
body[:max_size] = opts[:max_size] if opts[:max_size]
|
43
|
-
body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
|
44
|
-
body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
|
45
|
-
|
46
|
-
params = @options.merge({body: body.to_json})
|
27
|
+
def enqueue(job_id, page, opts={})
|
28
|
+
params = @options.merge(opts).merge({body: page.to_json})
|
47
29
|
|
48
30
|
self.class.post("/jobs/#{job_id}/pages", params)
|
31
|
+
|
32
|
+
end
|
33
|
+
|
34
|
+
def get_gid(job_id, page, opts={})
|
35
|
+
|
36
|
+
params = @options.merge(opts).merge({body: page.to_json})
|
37
|
+
|
38
|
+
self.class.post("/jobs/#{job_id}/generate_gid", params)
|
49
39
|
end
|
50
40
|
|
51
41
|
def dequeue(job_id, limit, page_types, parse_fetching_failed, opts = {})
|
@@ -47,30 +47,19 @@ module Datahen
|
|
47
47
|
self.class.put("/scrapers/#{scraper_name}/current_job/pages/limbo", params)
|
48
48
|
end
|
49
49
|
|
50
|
-
def enqueue(scraper_name, method, url, opts={})
|
51
|
-
|
52
|
-
body[:method] = method != "" ? method : "GET"
|
53
|
-
body[:url] = url
|
54
|
-
body[:page_type] = opts[:page_type] if opts[:page_type]
|
55
|
-
body[:priority] = opts[:priority] if opts[:priority]
|
56
|
-
body[:fetch_type] = opts[:fetch_type] if opts[:fetch_type]
|
57
|
-
body[:body] = opts[:body] if opts[:body]
|
58
|
-
body[:headers] = opts[:headers] if opts[:headers]
|
59
|
-
body[:vars] = opts[:vars] if opts[:vars]
|
60
|
-
body[:force_fetch] = opts[:force_fetch] if opts[:force_fetch]
|
61
|
-
body[:freshness] = opts[:freshness] if opts[:freshness]
|
62
|
-
body[:ua_type] = opts[:ua_type] if opts[:ua_type]
|
63
|
-
body[:no_redirect] = opts[:no_redirect] if opts[:no_redirect]
|
64
|
-
body[:cookie] = opts[:cookie] if opts[:cookie]
|
65
|
-
body[:max_size] = opts[:max_size] if opts[:max_size]
|
66
|
-
body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
|
67
|
-
body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
|
68
|
-
|
69
|
-
params = @options.merge({body: body.to_json})
|
50
|
+
def enqueue(scraper_name, page, opts={})
|
51
|
+
params = @options.merge(opts).merge({body: page.to_json})
|
70
52
|
|
71
53
|
self.class.post("/scrapers/#{scraper_name}/current_job/pages", params)
|
72
54
|
end
|
73
55
|
|
56
|
+
def get_gid(scraper_name, page, opts={})
|
57
|
+
|
58
|
+
params = @options.merge(opts).merge({body: page.to_json})
|
59
|
+
|
60
|
+
self.class.post("/scrapers/#{scraper_name}/current_job/generate_gid", params)
|
61
|
+
end
|
62
|
+
|
74
63
|
def find_content(scraper_name, gid)
|
75
64
|
self.class.get("/scrapers/#{scraper_name}/current_job/pages/#{gid}/content", @options)
|
76
65
|
end
|
@@ -219,7 +219,7 @@ module Datahen
|
|
219
219
|
self.dequeuer_is_alive!
|
220
220
|
|
221
221
|
# ensure a valid response or try again
|
222
|
-
if response.nil? || response.response.code.to_i != 200
|
222
|
+
if response.body.nil? || response.body.empty? || response.response.code.to_i != 200
|
223
223
|
self.repeat_puts(response.nil? ? 'null' : response.body)
|
224
224
|
self.recollect_garbage
|
225
225
|
return 0
|
data/lib/datahen/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: datahen
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.3.1
|
4
|
+
version: 1.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Parama Danoesubroto
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-
|
11
|
+
date: 2023-11-01 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: thor
|
@@ -278,7 +278,7 @@ metadata:
|
|
278
278
|
allowed_push_host: https://rubygems.org
|
279
279
|
homepage_uri: https://datahen.com
|
280
280
|
source_code_uri: https://github.com/DataHenOfficial/datahen-ruby
|
281
|
-
post_install_message:
|
281
|
+
post_install_message:
|
282
282
|
rdoc_options: []
|
283
283
|
require_paths:
|
284
284
|
- lib
|
@@ -293,8 +293,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
293
293
|
- !ruby/object:Gem::Version
|
294
294
|
version: '0'
|
295
295
|
requirements: []
|
296
|
-
rubygems_version: 3.
|
297
|
-
signing_key:
|
296
|
+
rubygems_version: 3.1.4
|
297
|
+
signing_key:
|
298
298
|
specification_version: 4
|
299
299
|
summary: DataHen toolbelt for developers
|
300
300
|
test_files: []
|