datahen 0.10.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +12 -0
- data/.travis.yml +7 -0
- data/CODE_OF_CONDUCT.md +74 -0
- data/Gemfile +6 -0
- data/LICENSE.txt +21 -0
- data/README.md +29 -0
- data/Rakefile +22 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/datahen.gemspec +47 -0
- data/examples/fetchtest/libraries/hello.rb +9 -0
- data/examples/fetchtest/libraries/hello_fail.rb +10 -0
- data/examples/fetchtest/parsers/failed.rb +2 -0
- data/examples/fetchtest/parsers/find_outputs.rb +18 -0
- data/examples/fetchtest/parsers/home.rb +50 -0
- data/examples/fetchtest/parsers/nested_fail.rb +3 -0
- data/examples/fetchtest/parsers/simple.rb +14 -0
- data/examples/fetchtest/seeders/csv_seeder.rb +12 -0
- data/examples/fetchtest/seeders/failed.rb +1 -0
- data/examples/fetchtest/seeders/list_of_urls.csv +5 -0
- data/examples/fetchtest/seeders/seed.rb +28 -0
- data/examples/fetchtest/seeders/test_reset_page.rb +4 -0
- data/exe/hen +3 -0
- data/lib/datahen.rb +5 -0
- data/lib/datahen/cli.rb +45 -0
- data/lib/datahen/cli/env_var.rb +48 -0
- data/lib/datahen/cli/finisher.rb +40 -0
- data/lib/datahen/cli/global_page.rb +39 -0
- data/lib/datahen/cli/job.rb +30 -0
- data/lib/datahen/cli/job_output.rb +69 -0
- data/lib/datahen/cli/parser.rb +64 -0
- data/lib/datahen/cli/scraper.rb +185 -0
- data/lib/datahen/cli/scraper_deployment.rb +24 -0
- data/lib/datahen/cli/scraper_export.rb +51 -0
- data/lib/datahen/cli/scraper_exporter.rb +40 -0
- data/lib/datahen/cli/scraper_finisher.rb +20 -0
- data/lib/datahen/cli/scraper_job.rb +75 -0
- data/lib/datahen/cli/scraper_job_var.rb +48 -0
- data/lib/datahen/cli/scraper_page.rb +203 -0
- data/lib/datahen/cli/scraper_var.rb +48 -0
- data/lib/datahen/cli/seeder.rb +40 -0
- data/lib/datahen/client.rb +29 -0
- data/lib/datahen/client/auth_token.rb +50 -0
- data/lib/datahen/client/backblaze_content.rb +45 -0
- data/lib/datahen/client/base.rb +69 -0
- data/lib/datahen/client/deploy_key.rb +21 -0
- data/lib/datahen/client/env_var.rb +28 -0
- data/lib/datahen/client/export.rb +10 -0
- data/lib/datahen/client/global_page.rb +18 -0
- data/lib/datahen/client/job.rb +64 -0
- data/lib/datahen/client/job_export.rb +10 -0
- data/lib/datahen/client/job_log.rb +26 -0
- data/lib/datahen/client/job_output.rb +19 -0
- data/lib/datahen/client/job_page.rb +58 -0
- data/lib/datahen/client/job_stat.rb +16 -0
- data/lib/datahen/client/scraper.rb +57 -0
- data/lib/datahen/client/scraper_deployment.rb +18 -0
- data/lib/datahen/client/scraper_export.rb +22 -0
- data/lib/datahen/client/scraper_exporter.rb +14 -0
- data/lib/datahen/client/scraper_finisher.rb +16 -0
- data/lib/datahen/client/scraper_job.rb +49 -0
- data/lib/datahen/client/scraper_job_output.rb +19 -0
- data/lib/datahen/client/scraper_job_page.rb +67 -0
- data/lib/datahen/client/scraper_job_var.rb +28 -0
- data/lib/datahen/client/scraper_var.rb +28 -0
- data/lib/datahen/plugin.rb +6 -0
- data/lib/datahen/plugin/context_exposer.rb +55 -0
- data/lib/datahen/scraper.rb +18 -0
- data/lib/datahen/scraper/executor.rb +373 -0
- data/lib/datahen/scraper/finisher.rb +18 -0
- data/lib/datahen/scraper/parser.rb +18 -0
- data/lib/datahen/scraper/ruby_finisher_executor.rb +116 -0
- data/lib/datahen/scraper/ruby_parser_executor.rb +200 -0
- data/lib/datahen/scraper/ruby_seeder_executor.rb +120 -0
- data/lib/datahen/scraper/seeder.rb +18 -0
- data/lib/datahen/version.rb +3 -0
- metadata +270 -0
@@ -0,0 +1,48 @@
|
|
1
|
+
module Datahen
  class CLI < Thor
    # Thor command group ("scraper var") that lists, sets, shows and removes
    # environment variables on a scraper. Every command builds a
    # Client::ScraperVar from the parsed CLI options and prints the result of
    # the matching client call.
    class ScraperVar < Thor

      package_name "scraper var"

      # Usage banner for a command: executable basename, the package name set
      # above, and the command's own usage string, separated by single spaces.
      def self.banner(command, namespace = nil, subcommand = false)
        [basename, @package_name, command.usage].join(" ")
      end

      desc "list <scraper_name>", "List environment variables on the scraper"
      long_desc <<-LONGDESC
        List all environment variables on the scraper.
      LONGDESC
      option :page, aliases: :p, type: :numeric, desc: 'Get the next set of records by page.'
      option :per_page, aliases: :P, type: :numeric, desc: 'Number of records per page. Max 500 per page.'
      # Prints the client's listing of variables for +scraper_name+.
      def list(scraper_name)
        api = Client::ScraperVar.new(options)
        puts api.all(scraper_name).to_s
      end

      desc "set <scraper_name> <var_name> <value>", "Set an environment var on the scraper"
      long_desc <<-LONGDESC
        Creates an environment variable\x5
        <var_name>: Var name can only consist of alphabets, numbers, underscores. Name must be unique to your scraper, otherwise it will be overwritten.\x5
        <value>: Value of variable.\x5
      LONGDESC
      option :secret, type: :boolean, desc: 'Set true to make it decrypt the value. Default: false'
      # Sets +var_name+ to +value+ on the scraper and prints the response.
      # The parsed CLI options are forwarded to the client call as well.
      def set(scraper_name, var_name, value)
        api = Client::ScraperVar.new(options)
        puts api.set(scraper_name, var_name, value, options).to_s
      end

      desc "show <scraper_name> <var_name>", "Show an environment variable on the scraper"
      # Prints the client's lookup result for a single variable.
      def show(scraper_name, var_name)
        api = Client::ScraperVar.new(options)
        puts api.find(scraper_name, var_name).to_s
      end

      desc "unset <scraper_name> <var_name>", "Deletes an environment variable on the scraper"
      # Removes +var_name+ from the scraper and prints the response.
      def unset(scraper_name, var_name)
        api = Client::ScraperVar.new(options)
        puts api.unset(scraper_name, var_name).to_s
      end
    end
  end

end
|
@@ -0,0 +1,40 @@
|
|
1
|
+
module Datahen
  class CLI < Thor
    # Thor commands that run a seeder script against a scraper job. Both
    # commands use the job given with --job, or otherwise the job that
    # Client::ScraperJob#find returns for the scraper.
    class Seeder < Thor
      desc "try <scraper_name> <seeder_file>", "Tries a seeder file"
      long_desc <<-LONGDESC
        Takes a seeder script and tries to execute it without saving anything.\x5
        <seeder_file>: Seeder script file will be executed.\x5
      LONGDESC
      option :job, aliases: :j, type: :numeric, desc: 'Set a specific job ID'
      # Runs the seeder with the final exec_seeder argument set to false and
      # prints whatever exec_seeder returns.
      def try_seed(scraper_name, seeder_file)
        job_id = options[:job] || Client::ScraperJob.new(options).find(scraper_name)['id']

        puts Datahen::Scraper::Seeder.exec_seeder(seeder_file, job_id, false)
      end

      desc "exec <scraper_name> <seeder_file>", "Executes a seeder script onto a scraper's current job."
      long_desc <<-LONGDESC
        Takes a seeder script and execute it against a job and enqueues the pages into the scraper's current job\x5
        <seeder_file>: Seeder script file that will be executed on the scraper's current job.\x5
      LONGDESC
      option :job, aliases: :j, type: :numeric, desc: 'Set a specific job ID'
      # Runs the seeder with the final exec_seeder argument set to true and
      # prints whatever exec_seeder returns.
      def exec_parse(scraper_name, seeder_file)
        job_id = options[:job] || Client::ScraperJob.new(options).find(scraper_name)['id']

        puts Datahen::Scraper::Seeder.exec_seeder(seeder_file, job_id, true)
      end
    end
  end

end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
require "datahen/client/base"
|
2
|
+
require "datahen/client/auth_token"
|
3
|
+
require "datahen/client/deploy_key"
|
4
|
+
require 'datahen/client/export'
|
5
|
+
require "datahen/client/scraper"
|
6
|
+
require "datahen/client/scraper_deployment"
|
7
|
+
require "datahen/client/scraper_job_output"
|
8
|
+
require "datahen/client/scraper_job_page"
|
9
|
+
require "datahen/client/scraper_exporter"
|
10
|
+
require "datahen/client/scraper_export"
|
11
|
+
require "datahen/client/scraper_job"
|
12
|
+
require "datahen/client/scraper_finisher"
|
13
|
+
require 'datahen/client/job_export'
|
14
|
+
require "datahen/client/job"
|
15
|
+
require "datahen/client/job_log"
|
16
|
+
require "datahen/client/global_page"
|
17
|
+
require "datahen/client/job_page"
|
18
|
+
require "datahen/client/job_output"
|
19
|
+
require "datahen/client/job_stat"
|
20
|
+
require "datahen/client/backblaze_content"
|
21
|
+
require "datahen/client/env_var"
|
22
|
+
require "datahen/client/scraper_var"
|
23
|
+
require "datahen/client/scraper_job_var"
|
24
|
+
|
25
|
+
|
26
|
+
# Root namespace of the gem.
module Datahen
  # Namespace for the API client classes pulled in by the requires above.
  module Client; end
end
|
@@ -0,0 +1,50 @@
|
|
1
|
+
module Datahen
|
2
|
+
module Client
|
3
|
+
class AuthToken < Datahen::Client::Base
|
4
|
+
|
5
|
+
# Issues GET /auth_tokens/<token> with the client's default request options
# and returns the response.
def find(token)
  path = "/auth_tokens/#{token}"
  self.class.get(path, @options)
end
|
8
|
+
|
9
|
+
# Issues GET /auth_tokens. +opts+ is merged over the client's default
# request options, so its keys win on conflict.
def all(opts={})
  request_options = @options.merge(opts)
  self.class.get("/auth_tokens", request_options)
end
|
13
|
+
|
14
|
+
# Issues POST /auth_tokens with a JSON body holding +role+ and
# +description+. +opts+ is accepted but not used by this method.
def create(role, description, opts={})
  payload = { role: role, description: description }.to_json
  request_options = @options.merge(body: payload)
  self.class.post("/auth_tokens", request_options)
end
|
22
|
+
|
23
|
+
# Issues POST /accounts/<account_id>/auth_tokens with a JSON body holding
# +role+ and +description+.
def create_on_account(account_id, role, description)
  payload = { role: role, description: description }.to_json
  request_options = @options.merge(body: payload)
  self.class.post("/accounts/#{account_id}/auth_tokens", request_options)
end
|
31
|
+
|
32
|
+
# Issues PUT /auth_tokens/<token> with a JSON body.
#
# The body always carries +role+. +description+ is included only when it is
# not blank (nil, empty or whitespace-only values are left out), so an
# omitted description is not sent at all.
#
# The blank check is plain Ruby on purpose: the previous
# `description.present?` is an ActiveSupport extension and raises
# NoMethodError when ActiveSupport is not loaded.
#
# +opts+ is accepted but not used by this method.
# Returns whatever the class-level `put` returns.
def update(token, role, description="", opts={})
  body = { role: role }
  body[:description] = description unless description.to_s.strip.empty?
  params = @options.merge({body: body.to_json})

  self.class.put("/auth_tokens/#{token}", params)
end
|
41
|
+
|
42
|
+
# Issues DELETE /auth_tokens/<token>, sending an empty JSON object as the
# request body. +opts+ is accepted but not used by this method.
def delete(token, opts={})
  request_options = @options.merge(body: {}.to_json)
  self.class.delete("/auth_tokens/#{token}", request_options)
end
|
48
|
+
end
|
49
|
+
end
|
50
|
+
end
|
@@ -0,0 +1,45 @@
|
|
1
|
+
require 'zlib'
|
2
|
+
require 'httparty'
|
3
|
+
|
4
|
+
module Datahen
|
5
|
+
module Client
|
6
|
+
class BackblazeContent
|
7
|
+
include HTTParty
|
8
|
+
|
9
|
+
# Issues a GET for +url+ with the response format forced to :plain and
# returns the response.
def get_content(url)
  request_options = { format: :plain }
  self.class.get(url, request_options)
end
|
12
|
+
|
13
|
+
# Downloads +url+ via get_content and returns the payload after passing it
# through gunzip.
def get_gunzipped_content(url)
  compressed = get_content(url)
  gunzip(compressed)
end
|
17
|
+
|
18
|
+
# Inflates a gzip-compressed +string+ and returns the decompressed bytes as
# a binary (ASCII-8BIT) string.
#
# A stream that ends early ("unexpected end of file") is handled on a
# best-effort basis: whatever the reader can still hand out is drained
# character by character and returned, and the error that ends the drain
# is reported on stdout and otherwise ignored. Any other error is
# re-raised unchanged.
#
# Compared with the previous version, the salvage path appends into one
# binary buffer with `<<` instead of allocating a new string per character
# with `+=`. This also makes the salvaged result binary in every case,
# matching the normal path.
def gunzip(string)
  gz = Zlib::GzipReader.new(StringIO.new(string), encoding: Encoding::ASCII_8BIT)
  begin
    gz.read
  rescue => e
    # Only a truncated stream is tolerated; everything else propagates.
    raise unless e.to_s == 'unexpected end of file'

    salvaged = String.new # binary buffer, same encoding as the reader
    begin
      salvaged << gz.readchar until gz.eof?
    rescue => drain_error
      puts "Ignored Zlib error: #{drain_error.to_s}"
    end
    salvaged
  end
ensure
  # gz is nil when GzipReader.new itself raised, hence the respond_to? guard.
  gz.close if gz.respond_to?(:close)
end
|
43
|
+
end
|
44
|
+
end
|
45
|
+
end
|
@@ -0,0 +1,69 @@
|
|
1
|
+
require 'httparty'
|
2
|
+
|
3
|
+
module Datahen
  module Client
    # Shared base for the API client classes. It mixes in HTTParty, reads
    # its defaults from environment variables and builds the default request
    # options (@options) that subclasses pass to their HTTP calls.
    class Base
      include HTTParty

      # Value of the DATAHEN_TOKEN environment variable (nil when unset).
      def self.env_auth_token
        ENV['DATAHEN_TOKEN']
      end

      # True only when DATAHEN_IGNORE_SSL, stripped of whitespace, is "1".
      def self.env_ignore_ssl
        ENV['DATAHEN_IGNORE_SSL'].to_s.strip == '1'
      end

      # API base URL: DATAHEN_API_URL when set, the hosted default otherwise.
      def env_api_url
        ENV.fetch('DATAHEN_API_URL', 'https://app.datahen.com/api/v1')
      end

      # The ignore-SSL flag. Falls back to the environment setting (and
      # remembers it) when no value was supplied to the constructor.
      def ignore_ssl
        @ignore_ssl = self.class.env_ignore_ssl if @ignore_ssl.nil?
        @ignore_ssl
      end

      # Auth token in use; lazily defaults to the environment token.
      def auth_token
        @auth_token ||= self.class.env_auth_token
      end

      attr_writer :auth_token

      # Builds the default request options.
      #
      # Recognised +opts+ keys: :ignore_ssl, :auth_token, the query filters
      # :page, :per_page, :fetch_fail, :parse_fail, :status, :page_type and
      # :gid, and :query (a Hash, or a String containing JSON), which is sent
      # as the JSON-encoded "q" parameter.
      def initialize(opts={})
        @ignore_ssl = opts[:ignore_ssl]
        self.class.base_uri(env_api_url)
        self.auth_token = opts[:auth_token] unless opts[:auth_token].nil?

        request_headers = {
          "Authorization" => "Bearer #{auth_token}",
          "Content-Type" => "application/json",
        }
        @options = { headers: request_headers, verify: !ignore_ssl }

        # Query parameter name => option key it is copied from (truthy values only).
        query = {}
        {
          p: :page,
          pp: :per_page,
          fetchfail: :fetch_fail,
          parsefail: :parse_fail,
          status: :status,
          page_type: :page_type,
          gid: :gid
        }.each do |param, opt_key|
          query[param] = opts[opt_key] if opts[opt_key]
        end

        case opts[:query]
        when Hash
          query[:q] = opts[:query].to_json
        when String
          # Parse and re-encode the caller-supplied JSON text.
          query[:q] = JSON.parse(opts[:query]).to_json
        end

        @options.merge!(query: query) unless query.empty?
      end
    end
  end
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
module Datahen
  module Client
    # Client for the /deploy_key endpoint. Each method merges the given
    # +opts+ over the client's default request options before the call.
    class DeployKey < Datahen::Client::Base

      # GET /deploy_key
      def find(opts={})
        self.class.get("/deploy_key", @options.merge(opts))
      end

      # POST /deploy_key
      def create(opts={})
        self.class.post("/deploy_key", @options.merge(opts))
      end

      # DELETE /deploy_key
      def delete(opts={})
        self.class.delete("/deploy_key", @options.merge(opts))
      end
    end
  end
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
module Datahen
  module Client
    # Client for the environment variables exposed under /env_vars.
    class EnvVar < Datahen::Client::Base

      # GET /env_vars/<name> with the default request options.
      def find(name)
        path = "/env_vars/#{name}"
        self.class.get(path, @options)
      end

      # GET /env_vars with +opts+ merged over the default request options.
      def all(opts={})
        self.class.get("/env_vars", @options.merge(opts))
      end

      # PUT /env_vars/<name> with a JSON body holding the value and, when
      # opts[:secret] is truthy, the secret flag.
      def set(name, value, opts={})
        payload = { value: value }
        payload[:secret] = opts[:secret] if opts[:secret]
        request_options = @options.merge(body: payload.to_json)
        self.class.put("/env_vars/#{name}", request_options)
      end

      # DELETE /env_vars/<name> with +opts+ merged over the default options.
      def unset(name, opts={})
        self.class.delete("/env_vars/#{name}", @options.merge(opts))
      end
    end
  end
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
module Datahen
  module Client
    # Client for the /global_pages endpoints, addressed by a page's gid.
    # All calls use the client's default request options.
    class GlobalPage < Datahen::Client::Base
      # GET /global_pages/<gid>
      def find(gid)
        path = "/global_pages/#{gid}"
        self.class.get(path, @options)
      end

      # GET /global_pages/<gid>/content
      def find_content(gid)
        path = "/global_pages/#{gid}/content"
        self.class.get(path, @options)
      end

      # GET /global_pages/<gid>/failed_content
      def find_failed_content(gid)
        path = "/global_pages/#{gid}/failed_content"
        self.class.get(path, @options)
      end
    end
  end
end
|
18
|
+
|
@@ -0,0 +1,64 @@
|
|
1
|
+
module Datahen
|
2
|
+
module Client
|
3
|
+
class Job < Datahen::Client::Base
|
4
|
+
# Issues GET /jobs with +opts+ merged over the client's default request
# options.
def all(opts={})
  request_options = @options.merge(opts)
  self.class.get("/jobs", request_options)
end
|
8
|
+
|
9
|
+
# Issues GET /jobs/<job_id> with the client's default request options.
def find(job_id)
  path = "/jobs/#{job_id}"
  self.class.get(path, @options)
end
|
12
|
+
|
13
|
+
# Issues PUT /jobs/<job_id> with a JSON body built from +opts+.
#
# Only truthy options are sent: :status as "status", :workers as
# "standard_worker_count" and :browsers as "browser_worker_count".
def update(job_id, opts={})
  payload = {
    status: opts[:status],
    standard_worker_count: opts[:workers],
    browser_worker_count: opts[:browsers]
  }.select { |_field, value| value }

  request_options = @options.merge(body: payload.to_json)
  self.class.put("/jobs/#{job_id}", request_options)
end
|
22
|
+
|
23
|
+
# Updates the job with its status set to 'cancelled'.
#
# +opts+ is forwarded to #update together with the status. The status is
# added to a merged copy, so the caller's hash is never modified and a
# frozen hash is accepted (the previous in-place assignment raised on one).
# Returns whatever #update returns.
def cancel(job_id, opts={})
  update(job_id, opts.merge(status: 'cancelled'))
end
|
27
|
+
|
28
|
+
# Updates the job with its status set to 'active'.
#
# +opts+ is forwarded to #update together with the status. The status is
# added to a merged copy, so the caller's hash is never modified and a
# frozen hash is accepted (the previous in-place assignment raised on one).
# Returns whatever #update returns.
def resume(job_id, opts={})
  update(job_id, opts.merge(status: 'active'))
end
|
32
|
+
|
33
|
+
# Updates the job with its status set to 'paused'.
#
# +opts+ is forwarded to #update together with the status. The status is
# added to a merged copy, so the caller's hash is never modified and a
# frozen hash is accepted (the previous in-place assignment raised on one).
# Returns whatever #update returns.
def pause(job_id, opts={})
  update(job_id, opts.merge(status: 'paused'))
end
|
37
|
+
|
38
|
+
# Issues PUT /jobs/<job_id>/seeding_update with a JSON body.
#
# The body always carries "outputs" and "pages" (each an empty list when
# the option is absent) and "seeding_status" (nil when absent).
# "log_error" is added only when opts[:log_error] is truthy.
def seeding_update(job_id, opts={})
  payload = {
    outputs: opts.fetch(:outputs, []),
    pages: opts.fetch(:pages, []),
    seeding_status: opts.fetch(:seeding_status, nil)
  }
  payload[:log_error] = opts[:log_error] if opts[:log_error]

  request_options = @options.merge(body: payload.to_json)
  self.class.put("/jobs/#{job_id}/seeding_update", request_options)
end
|
49
|
+
|
50
|
+
# Issues PUT /jobs/<job_id>/finisher_update with a JSON body.
#
# The body always carries "outputs" (an empty list when the option is
# absent) and "finisher_status" (nil when absent). "log_error" is added
# only when opts[:log_error] is truthy.
def finisher_update(job_id, opts={})
  payload = {
    outputs: opts.fetch(:outputs, []),
    finisher_status: opts.fetch(:finisher_status, nil)
  }
  payload[:log_error] = opts[:log_error] if opts[:log_error]

  request_options = @options.merge(body: payload.to_json)
  self.class.put("/jobs/#{job_id}/finisher_update", request_options)
end
|
60
|
+
|
61
|
+
end
|
62
|
+
|
63
|
+
end
|
64
|
+
end
|