answersengine 0.10.1 → 0.10.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CODE_OF_CONDUCT.md +1 -1
- data/LICENSE.txt +1 -1
- data/README.md +3 -4
- data/answersengine.gemspec +6 -12
- data/exe/answersengine +3 -2
- data/lib/answersengine.rb +20 -3
- metadata +14 -152
- data/examples/fetchtest/libraries/hello.rb +0 -9
- data/examples/fetchtest/libraries/hello_fail.rb +0 -10
- data/examples/fetchtest/parsers/failed.rb +0 -2
- data/examples/fetchtest/parsers/find_outputs.rb +0 -18
- data/examples/fetchtest/parsers/home.rb +0 -50
- data/examples/fetchtest/parsers/nested_fail.rb +0 -3
- data/examples/fetchtest/parsers/simple.rb +0 -14
- data/examples/fetchtest/seeders/csv_seeder.rb +0 -12
- data/examples/fetchtest/seeders/failed.rb +0 -1
- data/examples/fetchtest/seeders/list_of_urls.csv +0 -5
- data/examples/fetchtest/seeders/seed.rb +0 -28
- data/examples/fetchtest/seeders/test_reset_page.rb +0 -4
- data/lib/answersengine/cli.rb +0 -45
- data/lib/answersengine/cli/env_var.rb +0 -48
- data/lib/answersengine/cli/finisher.rb +0 -40
- data/lib/answersengine/cli/global_page.rb +0 -39
- data/lib/answersengine/cli/job.rb +0 -30
- data/lib/answersengine/cli/job_output.rb +0 -69
- data/lib/answersengine/cli/parser.rb +0 -64
- data/lib/answersengine/cli/scraper.rb +0 -185
- data/lib/answersengine/cli/scraper_deployment.rb +0 -24
- data/lib/answersengine/cli/scraper_export.rb +0 -51
- data/lib/answersengine/cli/scraper_exporter.rb +0 -40
- data/lib/answersengine/cli/scraper_finisher.rb +0 -20
- data/lib/answersengine/cli/scraper_job.rb +0 -75
- data/lib/answersengine/cli/scraper_job_var.rb +0 -48
- data/lib/answersengine/cli/scraper_page.rb +0 -203
- data/lib/answersengine/cli/scraper_var.rb +0 -48
- data/lib/answersengine/cli/seeder.rb +0 -40
- data/lib/answersengine/client.rb +0 -29
- data/lib/answersengine/client/auth_token.rb +0 -50
- data/lib/answersengine/client/backblaze_content.rb +0 -45
- data/lib/answersengine/client/base.rb +0 -55
- data/lib/answersengine/client/deploy_key.rb +0 -21
- data/lib/answersengine/client/env_var.rb +0 -28
- data/lib/answersengine/client/export.rb +0 -10
- data/lib/answersengine/client/global_page.rb +0 -18
- data/lib/answersengine/client/job.rb +0 -64
- data/lib/answersengine/client/job_export.rb +0 -10
- data/lib/answersengine/client/job_log.rb +0 -26
- data/lib/answersengine/client/job_output.rb +0 -19
- data/lib/answersengine/client/job_page.rb +0 -58
- data/lib/answersengine/client/job_stat.rb +0 -16
- data/lib/answersengine/client/scraper.rb +0 -57
- data/lib/answersengine/client/scraper_deployment.rb +0 -18
- data/lib/answersengine/client/scraper_export.rb +0 -22
- data/lib/answersengine/client/scraper_exporter.rb +0 -14
- data/lib/answersengine/client/scraper_finisher.rb +0 -16
- data/lib/answersengine/client/scraper_job.rb +0 -49
- data/lib/answersengine/client/scraper_job_output.rb +0 -19
- data/lib/answersengine/client/scraper_job_page.rb +0 -67
- data/lib/answersengine/client/scraper_job_var.rb +0 -28
- data/lib/answersengine/client/scraper_var.rb +0 -28
- data/lib/answersengine/plugin.rb +0 -6
- data/lib/answersengine/plugin/context_exposer.rb +0 -55
- data/lib/answersengine/scraper.rb +0 -18
- data/lib/answersengine/scraper/executor.rb +0 -373
- data/lib/answersengine/scraper/finisher.rb +0 -18
- data/lib/answersengine/scraper/parser.rb +0 -18
- data/lib/answersengine/scraper/ruby_finisher_executor.rb +0 -116
- data/lib/answersengine/scraper/ruby_parser_executor.rb +0 -200
- data/lib/answersengine/scraper/ruby_seeder_executor.rb +0 -120
- data/lib/answersengine/scraper/seeder.rb +0 -18
- data/lib/answersengine/version.rb +0 -3
@@ -1,48 +0,0 @@
|
|
1
|
-
module AnswersEngine
  class CLI < Thor
    # Thor command group for the environment variables attached to a scraper.
    # Every command builds a Client::ScraperVar from the parsed CLI options
    # and prints whatever the client call returns.
    class ScraperVar < Thor
      package_name "scraper var"

      # Usage banner: executable name, package name, then the command usage.
      def self.banner(command, namespace = nil, subcommand = false)
        format("%s %s %s", basename, @package_name, command.usage)
      end

      desc "list <scraper_name>", "List environment variables on the scraper"
      long_desc <<-LONGDESC
          List all environment variables on the scraper.
      LONGDESC
      option :page, aliases: :p, type: :numeric, desc: 'Get the next set of records by page.'
      option :per_page, aliases: :P, type: :numeric, desc: 'Number of records per page. Max 500 per page.'
      def list(scraper_name)
        puts scraper_var_client.all(scraper_name).to_s
      end

      desc "set <scraper_name> <var_name> <value>", "Set an environment var on the scraper"
      long_desc <<-LONGDESC
          Creates an environment variable\x5
          <var_name>: Var name can only consist of alphabets, numbers, underscores. Name must be unique to your scraper, otherwise it will be overwritten.\x5
          <value>: Value of variable.\x5
      LONGDESC
      option :secret, type: :boolean, desc: 'Set true to make it decrypt the value. Default: false'
      def set(scraper_name, var_name, value)
        puts scraper_var_client.set(scraper_name, var_name, value, options).to_s
      end

      desc "show <scraper_name> <var_name>", "Show an environment variable on the scraper"
      def show(scraper_name, var_name)
        puts scraper_var_client.find(scraper_name, var_name).to_s
      end

      desc "unset <scraper_name> <var_name>", "Deletes an environment variable on the scraper"
      def unset(scraper_name, var_name)
        puts scraper_var_client.unset(scraper_name, var_name).to_s
      end

      private

      # Fresh API client configured from this invocation's options.
      # Private, so Thor does not treat it as a command.
      def scraper_var_client
        Client::ScraperVar.new(options)
      end
    end
  end

end
|
@@ -1,40 +0,0 @@
|
|
1
|
-
module AnswersEngine
  class CLI < Thor
    # Thor command group for running seeder scripts against a scraper job.
    class Seeder < Thor
      desc "try <scraper_name> <seeder_file>", "Tries a seeder file"
      long_desc <<-LONGDESC
          Takes a seeder script and tries to execute it without saving anything.\x5
          <seeder_file>: Seeder script file will be executed.\x5
      LONGDESC
      option :job, aliases: :j, type: :numeric, desc: 'Set a specific job ID'
      def try_seed(scraper_name, seeder_file)
        job_id = resolve_job_id(scraper_name)

        # Third argument `false`: the "try" variant of exec_seeder.
        puts AnswersEngine::Scraper::Seeder.exec_seeder(seeder_file, job_id, false)
      end

      desc "exec <scraper_name> <seeder_file>", "Executes a seeder script onto a scraper's current job."
      long_desc <<-LONGDESC
          Takes a seeder script and execute it against a job and enqueues the pages into the scraper's current job\x5
          <seeder_file>: Seeder script file that will be executed on the scraper's current job.\x5
      LONGDESC
      option :job, aliases: :j, type: :numeric, desc: 'Set a specific job ID'
      def exec_parse(scraper_name, seeder_file)
        job_id = resolve_job_id(scraper_name)

        # Third argument `true`: the "exec" variant of exec_seeder.
        puts AnswersEngine::Scraper::Seeder.exec_seeder(seeder_file, job_id, true)
      end

      private

      # Job id to run against: the --job option when given, otherwise the id
      # of the job returned by Client::ScraperJob#find for the scraper.
      def resolve_job_id(scraper_name)
        return options[:job] if options[:job]

        Client::ScraperJob.new(options).find(scraper_name)['id']
      end
    end
  end

end
|
data/lib/answersengine/client.rb
DELETED
@@ -1,29 +0,0 @@
|
|
1
|
-
require "answersengine/client/base"
|
2
|
-
require "answersengine/client/auth_token"
|
3
|
-
require "answersengine/client/deploy_key"
|
4
|
-
require 'answersengine/client/export'
|
5
|
-
require "answersengine/client/scraper"
|
6
|
-
require "answersengine/client/scraper_deployment"
|
7
|
-
require "answersengine/client/scraper_job_output"
|
8
|
-
require "answersengine/client/scraper_job_page"
|
9
|
-
require "answersengine/client/scraper_exporter"
|
10
|
-
require "answersengine/client/scraper_export"
|
11
|
-
require "answersengine/client/scraper_job"
|
12
|
-
require "answersengine/client/scraper_finisher"
|
13
|
-
require 'answersengine/client/job_export'
|
14
|
-
require "answersengine/client/job"
|
15
|
-
require "answersengine/client/job_log"
|
16
|
-
require "answersengine/client/global_page"
|
17
|
-
require "answersengine/client/job_page"
|
18
|
-
require "answersengine/client/job_output"
|
19
|
-
require "answersengine/client/job_stat"
|
20
|
-
require "answersengine/client/backblaze_content"
|
21
|
-
require "answersengine/client/env_var"
|
22
|
-
require "answersengine/client/scraper_var"
|
23
|
-
require "answersengine/client/scraper_job_var"
|
24
|
-
|
25
|
-
|
26
|
-
# Namespace declaration only: the client classes themselves are defined in
# the files required above.
module AnswersEngine
  module Client
  end
end
|
@@ -1,50 +0,0 @@
|
|
1
|
-
module AnswersEngine
  module Client
    # API client for the `/auth_tokens` endpoints. Each method sends one HTTP
    # request through the class-level verb helpers, using the client's stored
    # request options (`@options`), and returns that call's result.
    class AuthToken < AnswersEngine::Client::Base

      # GET /auth_tokens/<token>
      def find(token)
        self.class.get("/auth_tokens/#{token}", @options)
      end

      # GET /auth_tokens, with `opts` merged over the stored request options.
      def all(opts={})
        params = @options.merge(opts)
        self.class.get("/auth_tokens", params)
      end

      # POST /auth_tokens with a JSON body of role and description.
      # `opts` is accepted for interface compatibility but is not used.
      def create(role, description, opts={})
        body = {
          role: role,
          description: description}

        params = @options.merge({body: body.to_json})
        self.class.post("/auth_tokens", params)
      end

      # POST /accounts/<account_id>/auth_tokens with a JSON body of role and
      # description.
      def create_on_account(account_id, role, description)
        body = {
          role: role,
          description: description}

        params = @options.merge({body: body.to_json})
        self.class.post("/accounts/#{account_id}/auth_tokens", params)
      end

      # PUT /auth_tokens/<token>. The role is always sent; the description is
      # sent only when it is not blank (nil, empty or whitespace-only).
      # `opts` is accepted for interface compatibility but is not used.
      def update(token, role, description="", opts={})
        body = {}

        body[:role] = role
        # Core-Ruby blank check: `present?` is an ActiveSupport extension and
        # raises NoMethodError when ActiveSupport has not been loaded.
        body[:description] = description unless description.to_s.strip.empty?
        params = @options.merge({body: body.to_json})

        self.class.put("/auth_tokens/#{token}", params)
      end

      # DELETE /auth_tokens/<token>, sending an empty JSON object as the body.
      # `opts` is accepted for interface compatibility but is not used.
      def delete(token, opts={})
        body = {}
        params = @options.merge({body: body.to_json})

        self.class.delete("/auth_tokens/#{token}", params)
      end
    end
  end
end
|
@@ -1,45 +0,0 @@
|
|
1
|
-
require 'zlib'
|
2
|
-
require 'httparty'
|
3
|
-
|
4
|
-
module AnswersEngine
  module Client
    # Downloads content from a URL and optionally gunzips it.
    class BackblazeContent
      include HTTParty

      # GET `url`, asking HTTParty to treat the body as plain text
      # (`format: :plain`).
      def get_content(url)
        self.class.get(url, format: :plain)
      end

      # Downloads `url` and returns the gunzipped body.
      def get_gunzipped_content(url)
        # Zlib.gunzip(get_content(url))
        gunzip(get_content(url))
      end

      # Decompresses a gzip-encoded string, reading it as binary
      # (ASCII-8BIT).
      #
      # When the reader fails with "unexpected end of file", falls back to
      # reading character by character until the reader reports EOF or fails
      # again; that second failure is printed and ignored, and whatever was
      # collected in the fallback loop is returned. Any other error is
      # re-raised. The reader is always closed.
      def gunzip(string)
        sio = StringIO.new(string)
        gz = Zlib::GzipReader.new(sio, encoding: Encoding::ASCII_8BIT)
        _content = ""
        begin
          _content = gz.read
        rescue => e
          # Only the truncated-stream case is tolerated; everything else
          # propagates to the caller.
          raise e unless e.to_s == 'unexpected end of file'

          begin
            # Append in place: `+=` would copy the whole buffer for every
            # character read.
            _content << gz.readchar until gz.eof?
          rescue => read_error
            puts "Ignored Zlib error: #{read_error.to_s}"
          end
        end

        return _content
      ensure
        # `gz` is nil here when GzipReader.new itself raised.
        gz.close if gz.respond_to?(:close)
      end
    end
  end
end
|
@@ -1,55 +0,0 @@
|
|
1
|
-
require 'httparty'
|
2
|
-
|
3
|
-
module AnswersEngine
  module Client
    # Shared base for the API clients: sets the HTTParty base URI, stores the
    # auth token and builds the default request options in `@options`.
    class Base
      include HTTParty

      # Token read from the ANSWERSENGINE_TOKEN environment variable
      # (nil when unset).
      def self.env_auth_token
        ENV['ANSWERSENGINE_TOKEN']
      end

      # API root: ANSWERSENGINE_API_URL when set, otherwise the default URL.
      def env_api_url
        ENV['ANSWERSENGINE_API_URL'] || 'https://fetch.answersengine.com/api/v1'
      end

      # Explicitly assigned token, falling back to the environment token.
      def auth_token
        @auth_token ||= self.class.env_auth_token
      end

      attr_writer :auth_token

      # Builds the default request options.
      #
      # Reads these `opts` keys: :auth_token, :page, :per_page, :fetch_fail,
      # :parse_fail, :status, :page_type, :gid and :query. :query may be a
      # Hash or a JSON String; any other type is ignored.
      def initialize(opts={})
        self.class.base_uri(env_api_url)
        self.auth_token = opts[:auth_token] unless opts[:auth_token].nil?

        @options = {
          headers: {
            "Authorization" => "Bearer #{auth_token}",
            "Content-Type" => "application/json",
          }
        }

        # Query parameter name => option key it is copied from (truthy values only).
        param_sources = {
          p: :page,
          pp: :per_page,
          fetchfail: :fetch_fail,
          parsefail: :parse_fail,
          status: :status,
          page_type: :page_type,
          gid: :gid,
        }

        query = {}
        param_sources.each do |param, key|
          query[param] = opts[key] if opts[key]
        end

        case opts[:query]
        when Hash
          query[:q] = opts[:query].to_json
        when String
          # Round-trip through the parser so invalid JSON raises here.
          query[:q] = JSON.parse(opts[:query]).to_json
        end

        @options.merge!(query: query) unless query.empty?
      end
    end
  end
end
|
@@ -1,21 +0,0 @@
|
|
1
|
-
module AnswersEngine
  module Client
    # API client for the `/deploy_key` endpoint. Every method merges `opts`
    # over the client's stored request options and returns the result of the
    # class-level HTTP helper.
    class DeployKey < AnswersEngine::Client::Base

      # GET /deploy_key
      def find(opts = {})
        self.class.get("/deploy_key", @options.merge(opts))
      end

      # POST /deploy_key
      def create(opts = {})
        self.class.post("/deploy_key", @options.merge(opts))
      end

      # DELETE /deploy_key
      def delete(opts = {})
        self.class.delete("/deploy_key", @options.merge(opts))
      end
    end
  end
end
|
@@ -1,28 +0,0 @@
|
|
1
|
-
module AnswersEngine
  module Client
    # API client for the `/env_vars` endpoints.
    class EnvVar < AnswersEngine::Client::Base

      # GET /env_vars/<name> with the stored request options.
      def find(name)
        self.class.get("/env_vars/#{name}", @options)
      end

      # GET /env_vars with `opts` merged over the stored request options.
      def all(opts = {})
        self.class.get("/env_vars", @options.merge(opts))
      end

      # PUT /env_vars/<name> with a JSON body holding the value and, when
      # opts[:secret] is truthy, the secret flag.
      def set(name, value, opts = {})
        payload = { value: value }
        payload[:secret] = opts[:secret] if opts[:secret]

        self.class.put("/env_vars/#{name}", @options.merge(body: payload.to_json))
      end

      # DELETE /env_vars/<name> with `opts` merged over the stored options.
      def unset(name, opts = {})
        self.class.delete("/env_vars/#{name}", @options.merge(opts))
      end
    end
  end
end
|
@@ -1,18 +0,0 @@
|
|
1
|
-
module AnswersEngine
  module Client
    # API client for the `/global_pages` endpoints. All requests are GETs
    # that use the client's stored request options.
    class GlobalPage < AnswersEngine::Client::Base
      # GET /global_pages/<gid>
      def find(gid)
        fetch_page_resource(gid)
      end

      # GET /global_pages/<gid>/content
      def find_content(gid)
        fetch_page_resource(gid, "/content")
      end

      # GET /global_pages/<gid>/failed_content
      def find_failed_content(gid)
        fetch_page_resource(gid, "/failed_content")
      end

      private

      # Issues the GET for a page path with an optional sub-resource suffix.
      def fetch_page_resource(gid, suffix = "")
        self.class.get("/global_pages/#{gid}#{suffix}", @options)
      end
    end
  end
end
|
18
|
-
|
@@ -1,64 +0,0 @@
|
|
1
|
-
module AnswersEngine
  module Client
    # API client for the `/jobs` endpoints.
    class Job < AnswersEngine::Client::Base
      # GET /jobs with `opts` merged over the stored request options.
      def all(opts={})
        params = @options.merge(opts)
        self.class.get("/jobs", params)
      end

      # GET /jobs/<job_id>
      def find(job_id)
        self.class.get("/jobs/#{job_id}", @options)
      end

      # PUT /jobs/<job_id>. The JSON body carries only the settings present
      # in `opts`: :status, :workers (sent as standard_worker_count) and
      # :browsers (sent as browser_worker_count).
      def update(job_id, opts={})
        body = {}
        body[:status] = opts[:status] if opts[:status]
        body[:standard_worker_count] = opts[:workers] if opts[:workers]
        body[:browser_worker_count] = opts[:browsers] if opts[:browsers]
        params = @options.merge({body: body.to_json})

        self.class.put("/jobs/#{job_id}", params)
      end

      # Updates the job with status 'cancelled'.
      # Works on a merged copy so the caller's hash is never modified, which
      # also keeps frozen option hashes usable.
      def cancel(job_id, opts={})
        update(job_id, opts.merge(status: 'cancelled'))
      end

      # Updates the job with status 'active' (caller's hash left untouched).
      def resume(job_id, opts={})
        update(job_id, opts.merge(status: 'active'))
      end

      # Updates the job with status 'paused' (caller's hash left untouched).
      def pause(job_id, opts={})
        update(job_id, opts.merge(status: 'paused'))
      end

      # PUT /jobs/<job_id>/seeding_update. The body always carries outputs
      # and pages (empty arrays when absent) and seeding_status (nil when
      # absent); log_error is added only when truthy.
      def seeding_update(job_id, opts={})
        body = {}
        body[:outputs] = opts.fetch(:outputs) {[]}
        body[:pages] = opts.fetch(:pages) {[]}
        body[:seeding_status] = opts.fetch(:seeding_status){ nil }
        body[:log_error] = opts[:log_error] if opts[:log_error]

        params = @options.merge({body: body.to_json})

        self.class.put("/jobs/#{job_id}/seeding_update", params)
      end

      # PUT /jobs/<job_id>/finisher_update. The body always carries outputs
      # (empty array when absent) and finisher_status (nil when absent);
      # log_error is added only when truthy.
      def finisher_update(job_id, opts={})
        body = {}
        body[:outputs] = opts.fetch(:outputs) {[]}
        body[:finisher_status] = opts.fetch(:finisher_status){ nil }
        body[:log_error] = opts[:log_error] if opts[:log_error]

        params = @options.merge({body: body.to_json})

        self.class.put("/jobs/#{job_id}/finisher_update", params)
      end

    end

  end
end
|
@@ -1,26 +0,0 @@
|
|
1
|
-
module AnswersEngine
  module Client
    # API client for job and job-page log endpoints. Every method is a GET
    # with `opts` merged over the client's stored request options.
    class JobLog < AnswersEngine::Client::Base
      # GET /jobs/<job_id>/pages/<gid>/log
      def all_job_page_log(job_id, gid, opts = {})
        fetch_log("/jobs/#{job_id}/pages/#{gid}/log", opts)
      end

      # GET /scrapers/<scraper_name>/current_job/pages/<gid>/log
      def scraper_all_job_page_log(scraper_name, gid, opts = {})
        fetch_log("/scrapers/#{scraper_name}/current_job/pages/#{gid}/log", opts)
      end

      # GET /jobs/<job_id>/log
      def all_job_log(job_id, opts = {})
        fetch_log("/jobs/#{job_id}/log", opts)
      end

      # GET /scrapers/<scraper_name>/current_job/log
      def scraper_all_job_log(scraper_name, opts = {})
        fetch_log("/scrapers/#{scraper_name}/current_job/log", opts)
      end

      private

      # Shared GET helper: merges the per-call options and issues the request.
      def fetch_log(path, opts)
        self.class.get(path, @options.merge(opts))
      end
    end
  end
end
|