datahen 0.10.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +12 -0
- data/.travis.yml +7 -0
- data/CODE_OF_CONDUCT.md +74 -0
- data/Gemfile +6 -0
- data/LICENSE.txt +21 -0
- data/README.md +29 -0
- data/Rakefile +22 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/datahen.gemspec +47 -0
- data/examples/fetchtest/libraries/hello.rb +9 -0
- data/examples/fetchtest/libraries/hello_fail.rb +10 -0
- data/examples/fetchtest/parsers/failed.rb +2 -0
- data/examples/fetchtest/parsers/find_outputs.rb +18 -0
- data/examples/fetchtest/parsers/home.rb +50 -0
- data/examples/fetchtest/parsers/nested_fail.rb +3 -0
- data/examples/fetchtest/parsers/simple.rb +14 -0
- data/examples/fetchtest/seeders/csv_seeder.rb +12 -0
- data/examples/fetchtest/seeders/failed.rb +1 -0
- data/examples/fetchtest/seeders/list_of_urls.csv +5 -0
- data/examples/fetchtest/seeders/seed.rb +28 -0
- data/examples/fetchtest/seeders/test_reset_page.rb +4 -0
- data/exe/hen +3 -0
- data/lib/datahen.rb +5 -0
- data/lib/datahen/cli.rb +45 -0
- data/lib/datahen/cli/env_var.rb +48 -0
- data/lib/datahen/cli/finisher.rb +40 -0
- data/lib/datahen/cli/global_page.rb +39 -0
- data/lib/datahen/cli/job.rb +30 -0
- data/lib/datahen/cli/job_output.rb +69 -0
- data/lib/datahen/cli/parser.rb +64 -0
- data/lib/datahen/cli/scraper.rb +185 -0
- data/lib/datahen/cli/scraper_deployment.rb +24 -0
- data/lib/datahen/cli/scraper_export.rb +51 -0
- data/lib/datahen/cli/scraper_exporter.rb +40 -0
- data/lib/datahen/cli/scraper_finisher.rb +20 -0
- data/lib/datahen/cli/scraper_job.rb +75 -0
- data/lib/datahen/cli/scraper_job_var.rb +48 -0
- data/lib/datahen/cli/scraper_page.rb +203 -0
- data/lib/datahen/cli/scraper_var.rb +48 -0
- data/lib/datahen/cli/seeder.rb +40 -0
- data/lib/datahen/client.rb +29 -0
- data/lib/datahen/client/auth_token.rb +50 -0
- data/lib/datahen/client/backblaze_content.rb +45 -0
- data/lib/datahen/client/base.rb +69 -0
- data/lib/datahen/client/deploy_key.rb +21 -0
- data/lib/datahen/client/env_var.rb +28 -0
- data/lib/datahen/client/export.rb +10 -0
- data/lib/datahen/client/global_page.rb +18 -0
- data/lib/datahen/client/job.rb +64 -0
- data/lib/datahen/client/job_export.rb +10 -0
- data/lib/datahen/client/job_log.rb +26 -0
- data/lib/datahen/client/job_output.rb +19 -0
- data/lib/datahen/client/job_page.rb +58 -0
- data/lib/datahen/client/job_stat.rb +16 -0
- data/lib/datahen/client/scraper.rb +57 -0
- data/lib/datahen/client/scraper_deployment.rb +18 -0
- data/lib/datahen/client/scraper_export.rb +22 -0
- data/lib/datahen/client/scraper_exporter.rb +14 -0
- data/lib/datahen/client/scraper_finisher.rb +16 -0
- data/lib/datahen/client/scraper_job.rb +49 -0
- data/lib/datahen/client/scraper_job_output.rb +19 -0
- data/lib/datahen/client/scraper_job_page.rb +67 -0
- data/lib/datahen/client/scraper_job_var.rb +28 -0
- data/lib/datahen/client/scraper_var.rb +28 -0
- data/lib/datahen/plugin.rb +6 -0
- data/lib/datahen/plugin/context_exposer.rb +55 -0
- data/lib/datahen/scraper.rb +18 -0
- data/lib/datahen/scraper/executor.rb +373 -0
- data/lib/datahen/scraper/finisher.rb +18 -0
- data/lib/datahen/scraper/parser.rb +18 -0
- data/lib/datahen/scraper/ruby_finisher_executor.rb +116 -0
- data/lib/datahen/scraper/ruby_parser_executor.rb +200 -0
- data/lib/datahen/scraper/ruby_seeder_executor.rb +120 -0
- data/lib/datahen/scraper/seeder.rb +18 -0
- data/lib/datahen/version.rb +3 -0
- metadata +270 -0
@@ -0,0 +1,48 @@
|
|
1
|
+
module Datahen
  class CLI < Thor
    # Thor command set for the environment variables of a single scraper:
    # list, set, show and unset. Every command builds a Client::ScraperVar
    # from the CLI options and prints the client's response as a string.
    class ScraperVar < Thor

      package_name "scraper var"

      # Usage banner shown in help output:
      # "<basename> <package name> <command usage>".
      def self.banner(command, namespace = nil, subcommand = false)
        "#{basename} #{@package_name} #{command.usage}"
      end

      desc "list <scraper_name>", "List environment variables on the scraper"
      long_desc <<-LONGDESC
        List all environment variables on the scraper.
      LONGDESC
      option :page, aliases: :p, type: :numeric, desc: 'Get the next set of records by page.'
      option :per_page, aliases: :P, type: :numeric, desc: 'Number of records per page. Max 500 per page.'
      # Prints the variables of +scraper_name+ (paging comes from the options).
      def list(scraper_name)
        response = Client::ScraperVar.new(options).all(scraper_name)
        puts response.to_s
      end

      desc "set <scraper_name> <var_name> <value>", "Set an environment var on the scraper"
      long_desc <<-LONGDESC
        Creates an environment variable\x5
        <var_name>: Var name can only consist of alphabets, numbers, underscores. Name must be unique to your scraper, otherwise it will be overwritten.\x5
        <value>: Value of variable.\x5
      LONGDESC
      option :secret, type: :boolean, desc: 'Set true to make it decrypt the value. Default: false'
      # Sets +var_name+ to +value+ on +scraper_name+; the CLI options
      # (including --secret) are forwarded to the client as-is.
      def set(scraper_name, var_name, value)
        response = Client::ScraperVar.new(options).set(scraper_name, var_name, value, options)
        puts response.to_s
      end

      desc "show <scraper_name> <var_name>", "Show an environment variable on the scraper"
      # Prints the single variable +var_name+ of +scraper_name+.
      def show(scraper_name, var_name)
        response = Client::ScraperVar.new(options).find(scraper_name, var_name)
        puts response.to_s
      end

      desc "unset <scraper_name> <var_name>", "Deletes an environment variable on the scraper"
      # Removes +var_name+ from +scraper_name+ and prints the response.
      def unset(scraper_name, var_name)
        response = Client::ScraperVar.new(options).unset(scraper_name, var_name)
        puts response.to_s
      end
    end
  end

end
|
@@ -0,0 +1,40 @@
|
|
1
|
+
module Datahen
  class CLI < Thor
    # Thor commands that run a seeder script against a scraper job, either
    # as a trial (try_seed) or for real (exec_parse). Both commands resolve
    # the target job the same way, see #resolve_job_id.
    class Seeder < Thor
      desc "try <scraper_name> <seeder_file>", "Tries a seeder file"
      long_desc <<-LONGDESC
        Takes a seeder script and tries to execute it without saving anything.\x5
        <seeder_file>: Seeder script file will be executed.\x5
      LONGDESC
      option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
      # Runs +seeder_file+ with the third exec_seeder argument set to false
      # (per the long description above, nothing is saved) and prints the result.
      def try_seed(scraper_name, seeder_file)
        job_id = resolve_job_id(scraper_name)

        puts Datahen::Scraper::Seeder.exec_seeder(seeder_file, job_id, false)
      end

      desc "exec <scraper_name> <seeder_file>", "Executes a seeder script onto a scraper's current job."
      long_desc <<-LONGDESC
        Takes a seeder script and execute it against a job and enqueues the pages into the scraper's current job\x5
        <seeder_file>: Seeder script file that will be executed on the scraper's current job.\x5
      LONGDESC
      option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
      # Runs +seeder_file+ with the third exec_seeder argument set to true
      # and prints the result.
      def exec_parse(scraper_name, seeder_file)
        job_id = resolve_job_id(scraper_name)

        puts Datahen::Scraper::Seeder.exec_seeder(seeder_file, job_id, true)
      end

      private

      # Returns the job ID to seed: the --job option when given, otherwise
      # the 'id' of the job the API returns for +scraper_name+.
      # Kept private so Thor does not register it as a CLI command.
      def resolve_job_id(scraper_name)
        return options[:job] if options[:job]

        job = Client::ScraperJob.new(options).find(scraper_name)
        job['id']
      end
    end
  end

end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
require "datahen/client/base"
|
2
|
+
require "datahen/client/auth_token"
|
3
|
+
require "datahen/client/deploy_key"
|
4
|
+
require 'datahen/client/export'
|
5
|
+
require "datahen/client/scraper"
|
6
|
+
require "datahen/client/scraper_deployment"
|
7
|
+
require "datahen/client/scraper_job_output"
|
8
|
+
require "datahen/client/scraper_job_page"
|
9
|
+
require "datahen/client/scraper_exporter"
|
10
|
+
require "datahen/client/scraper_export"
|
11
|
+
require "datahen/client/scraper_job"
|
12
|
+
require "datahen/client/scraper_finisher"
|
13
|
+
require 'datahen/client/job_export'
|
14
|
+
require "datahen/client/job"
|
15
|
+
require "datahen/client/job_log"
|
16
|
+
require "datahen/client/global_page"
|
17
|
+
require "datahen/client/job_page"
|
18
|
+
require "datahen/client/job_output"
|
19
|
+
require "datahen/client/job_stat"
|
20
|
+
require "datahen/client/backblaze_content"
|
21
|
+
require "datahen/client/env_var"
|
22
|
+
require "datahen/client/scraper_var"
|
23
|
+
require "datahen/client/scraper_job_var"
|
24
|
+
|
25
|
+
|
26
|
+
# Namespace that holds the API client classes required at the top of this file.
module Datahen
  module Client; end
end
|
@@ -0,0 +1,50 @@
|
|
1
|
+
module Datahen
  module Client
    # API client for the /auth_tokens endpoints. Request defaults (headers,
    # SSL verification, query) come from @options, built by the Base class.
    class AuthToken < Datahen::Client::Base

      # GETs the auth token record identified by +token+.
      def find(token)
        self.class.get("/auth_tokens/#{token}", @options)
      end

      # GETs the list of auth tokens; +opts+ is merged over the default
      # request options.
      def all(opts={})
        params = @options.merge(opts)
        self.class.get("/auth_tokens", params)
      end

      # POSTs a new auth token with the given +role+ and +description+.
      # +opts+ is accepted for signature compatibility but is not used.
      def create(role, description, opts={})
        body = {
          role: role,
          description: description}

        params = @options.merge({body: body.to_json})
        self.class.post("/auth_tokens", params)
      end

      # POSTs a new auth token with the given +role+ and +description+
      # under the account identified by +account_id+.
      def create_on_account(account_id, role, description)
        body = {
          role: role,
          description: description}

        params = @options.merge({body: body.to_json})
        self.class.post("/accounts/#{account_id}/auth_tokens", params)
      end

      # PUTs a new +role+ for +token+, plus +description+ when it is not
      # blank. +opts+ is accepted for signature compatibility but is not used.
      def update(token, role, description="", opts={})
        body = {}

        body[:role] = role
        # Plain-Ruby blank check. The previous `description.present?` is an
        # ActiveSupport extension that nothing in this file loads, so it
        # could raise NoMethodError on an ordinary String.
        body[:description] = description unless description.to_s.strip.empty?
        params = @options.merge({body: body.to_json})

        self.class.put("/auth_tokens/#{token}", params)
      end

      # Sends DELETE for +token+ with an empty JSON object as the body.
      # +opts+ is accepted for signature compatibility but is not used.
      def delete(token, opts={})
        body = {}
        params = @options.merge({body: body.to_json})

        self.class.delete("/auth_tokens/#{token}", params)
      end
    end
  end
end
|
@@ -0,0 +1,45 @@
|
|
1
|
+
require 'zlib'
|
2
|
+
require 'httparty'
|
3
|
+
|
4
|
+
module Datahen
  module Client
    # Fetches content by URL through HTTParty and optionally gunzips it.
    class BackblazeContent
      include HTTParty

      # GETs +url+, asking HTTParty for the :plain format.
      def get_content(url)
        self.class.get(url, format: :plain)
      end

      # GETs +url+ and returns the gunzipped content.
      def get_gunzipped_content(url)
        gunzip(get_content(url))
      end

      # Gunzips +string+ and returns the inflated content.
      #
      # When the full read fails with the message 'unexpected end of file',
      # the reader is drained one character at a time until it reports EOF
      # or raises again; that second error is printed and ignored, and
      # whatever was collected is returned. Any other error is re-raised.
      # The reader is closed on every exit path.
      def gunzip(string)
        reader = Zlib::GzipReader.new(StringIO.new(string), encoding: Encoding::ASCII_8BIT)
        inflated = ""
        begin
          inflated = reader.read
        rescue => read_error
          # Only the truncated-stream case is recoverable.
          raise read_error unless read_error.to_s == 'unexpected end of file'

          begin
            inflated += reader.readchar until reader.eof?
          rescue => drain_error
            puts "Ignored Zlib error: #{drain_error.to_s}"
          end
        end

        inflated
      ensure
        reader.close if reader.respond_to?(:close)
      end
    end
  end
end
|
@@ -0,0 +1,69 @@
|
|
1
|
+
require 'httparty'
|
2
|
+
|
3
|
+
module Datahen
  module Client
    # Common base for the API clients. Reads configuration from the
    # environment, sets the HTTParty base URI and builds @options, the
    # default request options that subclasses pass to get/post/put/delete.
    class Base
      include HTTParty

      # Assigns the token that #auth_token returns.
      attr_writer :auth_token

      # Auth token taken from the DATAHEN_TOKEN environment variable.
      def self.env_auth_token
        ENV['DATAHEN_TOKEN']
      end

      # True when DATAHEN_IGNORE_SSL, stripped of whitespace, equals '1'.
      def self.env_ignore_ssl
        ENV['DATAHEN_IGNORE_SSL'].to_s.strip == '1'
      end

      # API base URL: DATAHEN_API_URL when set, otherwise the default URL.
      def env_api_url
        ENV.fetch('DATAHEN_API_URL', 'https://app.datahen.com/api/v1')
      end

      # Whether SSL verification is skipped. Falls back to the environment
      # setting when no explicit value was given to the constructor.
      def ignore_ssl
        @ignore_ssl = self.class.env_ignore_ssl if @ignore_ssl.nil?
        @ignore_ssl
      end

      # Current auth token, defaulting to the one from the environment.
      def auth_token
        @auth_token ||= self.class.env_auth_token
      end

      # Builds the default request options from +opts+.
      #
      # Recognised keys: :ignore_ssl, :auth_token, the paging/filter keys
      # listed in the table below, and :query (a Hash, or a String holding
      # JSON), which is sent as the JSON-encoded :q query parameter.
      def initialize(opts={})
        @ignore_ssl = opts[:ignore_ssl]
        self.class.base_uri(env_api_url)
        self.auth_token = opts[:auth_token] unless opts[:auth_token].nil?

        request_headers = {
          "Authorization" => "Bearer #{auth_token}",
          "Content-Type" => "application/json",
        }
        @options = { headers: request_headers, verify: !ignore_ssl }

        # Query parameter name => option key, in the order they are added.
        param_sources = [
          [:p, :page],
          [:pp, :per_page],
          [:fetchfail, :fetch_fail],
          [:parsefail, :parse_fail],
          [:status, :status],
          [:page_type, :page_type],
          [:gid, :gid],
        ]

        query = {}
        param_sources.each do |param, source|
          query[param] = opts[source] if opts[source]
        end

        case opts[:query]
        when Hash
          query[:q] = opts[:query].to_json
        when String
          # Round-trip through the parser so invalid JSON raises here.
          query[:q] = JSON.parse(opts[:query]).to_json
        end

        @options.merge!(query: query) unless query.empty?
      end
    end
  end
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
module Datahen
  module Client
    # API client for the /deploy_key endpoint. In every method +opts+ is
    # merged over the default request options before the call is made.
    class DeployKey < Datahen::Client::Base

      # GETs the deploy key.
      def find(opts={})
        self.class.get("/deploy_key", @options.merge(opts))
      end

      # POSTs to the deploy key endpoint.
      def create(opts={})
        self.class.post("/deploy_key", @options.merge(opts))
      end

      # Sends DELETE to the deploy key endpoint.
      def delete(opts={})
        self.class.delete("/deploy_key", @options.merge(opts))
      end
    end
  end
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
module Datahen
  module Client
    # API client for the /env_vars endpoints.
    class EnvVar < Datahen::Client::Base

      # GETs the environment variable called +name+.
      def find(name)
        path = "/env_vars/#{name}"
        self.class.get(path, @options)
      end

      # GETs the list of environment variables; +opts+ is merged over the
      # default request options.
      def all(opts={})
        self.class.get("/env_vars", @options.merge(opts))
      end

      # PUTs +value+ for the variable +name+. The :secret entry of +opts+
      # is included in the JSON body only when it is truthy.
      def set(name, value, opts={})
        payload = { value: value }
        payload[:secret] = opts[:secret] if opts[:secret]

        self.class.put("/env_vars/#{name}", @options.merge(body: payload.to_json))
      end

      # Sends DELETE for the variable +name+; +opts+ is merged over the
      # default request options.
      def unset(name, opts={})
        self.class.delete("/env_vars/#{name}", @options.merge(opts))
      end
    end
  end
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
module Datahen
  module Client
    # Read-only API client for the /global_pages endpoints. Every call is a
    # GET made with the default request options.
    class GlobalPage < Datahen::Client::Base
      # GETs the global page identified by +gid+.
      def find(gid)
        path = "/global_pages/#{gid}"
        self.class.get(path, @options)
      end

      # GETs the content endpoint of the global page +gid+.
      def find_content(gid)
        path = "/global_pages/#{gid}/content"
        self.class.get(path, @options)
      end

      # GETs the failed_content endpoint of the global page +gid+.
      def find_failed_content(gid)
        path = "/global_pages/#{gid}/failed_content"
        self.class.get(path, @options)
      end
    end
  end
end
|
18
|
+
|
@@ -0,0 +1,64 @@
|
|
1
|
+
module Datahen
  module Client
    # API client for the /jobs endpoints: listing, lookup, status and
    # worker-count updates, and the seeding/finisher update calls.
    class Job < Datahen::Client::Base
      # GETs the list of jobs; +opts+ is merged over the default request
      # options.
      def all(opts={})
        params = @options.merge(opts)
        self.class.get("/jobs", params)
      end

      # GETs the job identified by +job_id+.
      def find(job_id)
        self.class.get("/jobs/#{job_id}", @options)
      end

      # PUTs changes to the job +job_id+. Only the truthy entries of +opts+
      # are sent: :status, :workers (as standard_worker_count) and
      # :browsers (as browser_worker_count).
      def update(job_id, opts={})
        body = {}
        body[:status] = opts[:status] if opts[:status]
        body[:standard_worker_count] = opts[:workers] if opts[:workers]
        body[:browser_worker_count] = opts[:browsers] if opts[:browsers]
        params = @options.merge({body: body.to_json})

        self.class.put("/jobs/#{job_id}", params)
      end

      # Updates the job with status 'cancelled'. Works on a merged copy so
      # the caller's +opts+ is never mutated (and may be a frozen hash).
      def cancel(job_id, opts={})
        update(job_id, opts.merge(status: 'cancelled'))
      end

      # Updates the job with status 'active'. Works on a merged copy so
      # the caller's +opts+ is never mutated (and may be a frozen hash).
      def resume(job_id, opts={})
        update(job_id, opts.merge(status: 'active'))
      end

      # Updates the job with status 'paused'. Works on a merged copy so
      # the caller's +opts+ is never mutated (and may be a frozen hash).
      def pause(job_id, opts={})
        update(job_id, opts.merge(status: 'paused'))
      end

      # PUTs a seeding update for +job_id+. :outputs and :pages default to
      # empty arrays, :seeding_status defaults to nil, and :log_error is
      # sent only when truthy.
      def seeding_update(job_id, opts={})
        body = {}
        body[:outputs] = opts.fetch(:outputs) {[]}
        body[:pages] = opts.fetch(:pages) {[]}
        body[:seeding_status] = opts.fetch(:seeding_status){ nil }
        body[:log_error] = opts[:log_error] if opts[:log_error]

        params = @options.merge({body: body.to_json})

        self.class.put("/jobs/#{job_id}/seeding_update", params)
      end

      # PUTs a finisher update for +job_id+. :outputs defaults to an empty
      # array, :finisher_status defaults to nil, and :log_error is sent
      # only when truthy.
      def finisher_update(job_id, opts={})
        body = {}
        body[:outputs] = opts.fetch(:outputs) {[]}
        body[:finisher_status] = opts.fetch(:finisher_status){ nil }
        body[:log_error] = opts[:log_error] if opts[:log_error]

        params = @options.merge({body: body.to_json})

        self.class.put("/jobs/#{job_id}/finisher_update", params)
      end

    end

  end
end
|