datahen 0.17.0 → 0.18.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/datahen/cli/scraper.rb +3 -0
- data/lib/datahen/cli/scraper_job.rb +1 -0
- data/lib/datahen/cli/scraper_page.rb +2 -0
- data/lib/datahen/client/job.rb +1 -0
- data/lib/datahen/client/job_page.rb +2 -0
- data/lib/datahen/client/scraper.rb +2 -0
- data/lib/datahen/client/scraper_job.rb +2 -0
- data/lib/datahen/client/scraper_job_page.rb +2 -0
- data/lib/datahen/version.rb +1 -1
- metadata +5 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0e1fcf7422236924fd818a1527337a6089cd444b1f35510b72fe140facbed7b0
|
4
|
+
data.tar.gz: 05be57d3e058ee9969d210ded0b1d043b388390d5f2ac834ece490691683f39d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4e076509fa8a0fa7fa78406916530bfe2c1b6075ac1007baab43a447911d1c7d8e90bddd8a1438c339a4fadef4e05e629e4907f1f1bae3f4c1f283dba63c25c9
|
7
|
+
data.tar.gz: e7ceb1208c87cd75fa7202f55549c6b2f2ce24980f7642827aab6f721107ca8ddb59829b93742126e465d6930c6c5574de2d045d3a968a7f3a826bf099ee3c4b
|
data/lib/datahen/cli/scraper.rb
CHANGED
@@ -34,6 +34,7 @@ module Datahen
|
|
34
34
|
option :max_job_count, type: :numeric, desc: 'Set a value to set max number of jobs available. Set -1 for unlimited. Default: 3'
|
35
35
|
option :max_page_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
|
36
36
|
option :enable_global_cache, type: :boolean, desc: 'Set true to enable page cache. Default: false'
|
37
|
+
option :retry_interval, type: :numeric, desc: 'Set a value to set retry time interval on seconds when refetching a page. Set a value grather than 0 to set it as new time to refetch, 0 means default time. Default: 0'
|
37
38
|
def create(scraper_name, git_repository)
|
38
39
|
# puts "options #{options}"
|
39
40
|
client = Client::Scraper.new(options)
|
@@ -61,6 +62,7 @@ module Datahen
|
|
61
62
|
option :max_job_count, type: :numeric, desc: 'Set a value to set max number of jobs available. Set -1 for unlimited. Default: 3'
|
62
63
|
option :max_page_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
|
63
64
|
option :enable_global_cache, type: :boolean, desc: 'Set true to enable page cache. Default: false'
|
65
|
+
option :retry_interval, type: :numeric, desc: 'Set a value to set retry time interval on seconds when refetching a page. Set a value grather than 0 to set it as new time to refetch, 0 means default time. Default: 0'
|
64
66
|
def update(scraper_name)
|
65
67
|
client = Client::Scraper.new(options)
|
66
68
|
puts "#{client.update(scraper_name, options)}"
|
@@ -99,6 +101,7 @@ module Datahen
|
|
99
101
|
option :proxy_type, desc: 'Set the Proxy type. Default: standard'
|
100
102
|
option :vars, type: :string, banner: :JSON, desc: 'Set input vars. Must be in json format. i.e: [{"name":"foo", "value":"bar", "secret":false}] '
|
101
103
|
option :max_page_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
|
104
|
+
option :retry_interval, type: :numeric, desc: 'Set a value to set retry time interval on seconds when refetching a page. Set a value grather than 0 to set it as new time to refetch, 0 means default time. Default: 0'
|
102
105
|
def start(scraper_name)
|
103
106
|
client = Client::ScraperJob.new(options)
|
104
107
|
puts "Starting a scrape job..."
|
data/lib/datahen/cli/scraper_job.rb
CHANGED
@@ -106,6 +106,7 @@ module Datahen
|
|
106
106
|
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
107
107
|
option :max_page_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
|
108
108
|
option :enable_global_cache, type: :boolean, desc: 'Set true to enable page cache. Default: false'
|
109
|
+
option :retry_interval, type: :numeric, desc: 'Set a value to set retry time interval on seconds when refetching a page. Set a value grather than 0 to set it as new time to refetch, 0 means default time. Default: 0'
|
109
110
|
def update(scraper_name)
|
110
111
|
if options[:job]
|
111
112
|
client = Client::Job.new(options)
|
data/lib/datahen/cli/scraper_page.rb
CHANGED
@@ -46,6 +46,7 @@ module Datahen
|
|
46
46
|
option :ua_type, :aliases => :u, desc: 'Set user agent type. Default: desktop'
|
47
47
|
option :no_redirect, :aliases => :n, type: :boolean, desc: 'Set true to not follow redirect. Default: false'
|
48
48
|
option :max_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
|
49
|
+
option :retry_interval, type: :numeric, desc: 'Set a value to set retry time interval on seconds when refetching a page. Set a value grather than 0 to set it as new time to refetch, 0 means default time. Default: 0'
|
49
50
|
def add(scraper_name, url)
|
50
51
|
begin
|
51
52
|
options[:headers] = JSON.parse(options[:headers]) if options[:headers]
|
@@ -80,6 +81,7 @@ module Datahen
|
|
80
81
|
option :priority, type: :numeric, desc: 'Set fetch priority. The higher the value, the sooner the page gets fetched. Default: 0'
|
81
82
|
option :vars, :aliases => :v, type: :string, desc: 'Set user-defined page variables. Must be in json format. i.e: {"Foo":"bar"}'
|
82
83
|
option :max_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
|
84
|
+
option :retry_interval, type: :numeric, desc: 'Set a value to set retry time interval on seconds when refetching a page. Set a value grather than 0 to set it as new time to refetch, 0 means default time. Default: 0'
|
83
85
|
def update(scraper_name, gid)
|
84
86
|
begin
|
85
87
|
options[:vars] = JSON.parse(options[:vars]) if options[:vars]
|
data/lib/datahen/client/job.rb
CHANGED
@@ -23,6 +23,7 @@ module Datahen
|
|
23
23
|
body[:profile] = opts[:profile] if opts[:profile]
|
24
24
|
body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
|
25
25
|
body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
|
26
|
+
body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
|
26
27
|
params = @options.merge({body: body.to_json})
|
27
28
|
|
28
29
|
self.class.put("/jobs/#{job_id}", params)
|
data/lib/datahen/client/job_page.rb
CHANGED
@@ -17,6 +17,7 @@ module Datahen
|
|
17
17
|
body[:vars] = opts[:vars] if opts[:vars]
|
18
18
|
body[:max_size] = opts[:max_size] if opts[:max_size]
|
19
19
|
body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
|
20
|
+
body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
|
20
21
|
|
21
22
|
params = @options.merge({body: body.to_json})
|
22
23
|
|
@@ -40,6 +41,7 @@ module Datahen
|
|
40
41
|
body[:cookie] = opts[:cookie] if opts[:cookie]
|
41
42
|
body[:max_size] = opts[:max_size] if opts[:max_size]
|
42
43
|
body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
|
44
|
+
body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
|
43
45
|
|
44
46
|
params = @options.merge({body: body.to_json})
|
45
47
|
|
data/lib/datahen/client/scraper.rb
CHANGED
@@ -30,6 +30,7 @@ module Datahen
|
|
30
30
|
body[:max_job_count] = opts[:max_job_count] if opts[:max_job_count]
|
31
31
|
body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
|
32
32
|
body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
|
33
|
+
body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
|
33
34
|
params = @options.merge({body: body.to_json})
|
34
35
|
self.class.post("/scrapers", params)
|
35
36
|
end
|
@@ -53,6 +54,7 @@ module Datahen
|
|
53
54
|
body[:max_job_count] = opts[:max_job_count] if opts.has_key?("max_job_count") || opts.has_key?(:max_job_count)
|
54
55
|
body[:max_page_size] = opts[:max_page_size] if opts.has_key?("max_page_size") || opts.has_key?(:max_page_size)
|
55
56
|
body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
|
57
|
+
body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
|
56
58
|
params = @options.merge({body: body.to_json})
|
57
59
|
|
58
60
|
self.class.put("/scrapers/#{scraper_name}", params)
|
data/lib/datahen/client/scraper_job.rb
CHANGED
@@ -13,6 +13,7 @@ module Datahen
|
|
13
13
|
body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
|
14
14
|
body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
|
15
15
|
body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
|
16
|
+
body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
|
16
17
|
if opts[:vars]
|
17
18
|
if opts[:vars].is_a?(Array)
|
18
19
|
body[:vars] = opts[:vars]
|
@@ -41,6 +42,7 @@ module Datahen
|
|
41
42
|
body[:profile] = opts[:profile] if opts[:profile]
|
42
43
|
body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
|
43
44
|
body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
|
45
|
+
body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
|
44
46
|
params = @options.merge({body: body.to_json})
|
45
47
|
|
46
48
|
self.class.put("/scrapers/#{scraper_name}/current_job", params)
|
data/lib/datahen/client/scraper_job_page.rb
CHANGED
@@ -17,6 +17,7 @@ module Datahen
|
|
17
17
|
body[:vars] = opts[:vars] if opts[:vars]
|
18
18
|
body[:max_size] = opts[:max_size] if opts[:max_size]
|
19
19
|
body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
|
20
|
+
body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
|
20
21
|
|
21
22
|
params = @options.merge({body: body.to_json})
|
22
23
|
|
@@ -63,6 +64,7 @@ module Datahen
|
|
63
64
|
body[:cookie] = opts[:cookie] if opts[:cookie]
|
64
65
|
body[:max_size] = opts[:max_size] if opts[:max_size]
|
65
66
|
body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
|
67
|
+
body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
|
66
68
|
|
67
69
|
params = @options.merge({body: body.to_json})
|
68
70
|
|
data/lib/datahen/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: datahen
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.17.0
|
4
|
+
version: 0.18.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Parama Danoesubroto
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2021-
|
11
|
+
date: 2021-09-09 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: thor
|
@@ -276,7 +276,7 @@ metadata:
|
|
276
276
|
allowed_push_host: https://rubygems.org
|
277
277
|
homepage_uri: https://datahen.com
|
278
278
|
source_code_uri: https://github.com/DataHenOfficial/datahen-ruby
|
279
|
-
post_install_message:
|
279
|
+
post_install_message:
|
280
280
|
rdoc_options: []
|
281
281
|
require_paths:
|
282
282
|
- lib
|
@@ -292,7 +292,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
292
292
|
version: '0'
|
293
293
|
requirements: []
|
294
294
|
rubygems_version: 3.0.3
|
295
|
-
signing_key:
|
295
|
+
signing_key:
|
296
296
|
specification_version: 4
|
297
297
|
summary: DataHen toolbelt for developers
|
298
298
|
test_files: []
|