answersengine 0.10.1 → 0.10.2
- checksums.yaml +4 -4
- data/CODE_OF_CONDUCT.md +1 -1
- data/LICENSE.txt +1 -1
- data/README.md +3 -4
- data/answersengine.gemspec +6 -12
- data/exe/answersengine +3 -2
- data/lib/answersengine.rb +20 -3
- metadata +14 -152
- data/examples/fetchtest/libraries/hello.rb +0 -9
- data/examples/fetchtest/libraries/hello_fail.rb +0 -10
- data/examples/fetchtest/parsers/failed.rb +0 -2
- data/examples/fetchtest/parsers/find_outputs.rb +0 -18
- data/examples/fetchtest/parsers/home.rb +0 -50
- data/examples/fetchtest/parsers/nested_fail.rb +0 -3
- data/examples/fetchtest/parsers/simple.rb +0 -14
- data/examples/fetchtest/seeders/csv_seeder.rb +0 -12
- data/examples/fetchtest/seeders/failed.rb +0 -1
- data/examples/fetchtest/seeders/list_of_urls.csv +0 -5
- data/examples/fetchtest/seeders/seed.rb +0 -28
- data/examples/fetchtest/seeders/test_reset_page.rb +0 -4
- data/lib/answersengine/cli.rb +0 -45
- data/lib/answersengine/cli/env_var.rb +0 -48
- data/lib/answersengine/cli/finisher.rb +0 -40
- data/lib/answersengine/cli/global_page.rb +0 -39
- data/lib/answersengine/cli/job.rb +0 -30
- data/lib/answersengine/cli/job_output.rb +0 -69
- data/lib/answersengine/cli/parser.rb +0 -64
- data/lib/answersengine/cli/scraper.rb +0 -185
- data/lib/answersengine/cli/scraper_deployment.rb +0 -24
- data/lib/answersengine/cli/scraper_export.rb +0 -51
- data/lib/answersengine/cli/scraper_exporter.rb +0 -40
- data/lib/answersengine/cli/scraper_finisher.rb +0 -20
- data/lib/answersengine/cli/scraper_job.rb +0 -75
- data/lib/answersengine/cli/scraper_job_var.rb +0 -48
- data/lib/answersengine/cli/scraper_page.rb +0 -203
- data/lib/answersengine/cli/scraper_var.rb +0 -48
- data/lib/answersengine/cli/seeder.rb +0 -40
- data/lib/answersengine/client.rb +0 -29
- data/lib/answersengine/client/auth_token.rb +0 -50
- data/lib/answersengine/client/backblaze_content.rb +0 -45
- data/lib/answersengine/client/base.rb +0 -55
- data/lib/answersengine/client/deploy_key.rb +0 -21
- data/lib/answersengine/client/env_var.rb +0 -28
- data/lib/answersengine/client/export.rb +0 -10
- data/lib/answersengine/client/global_page.rb +0 -18
- data/lib/answersengine/client/job.rb +0 -64
- data/lib/answersengine/client/job_export.rb +0 -10
- data/lib/answersengine/client/job_log.rb +0 -26
- data/lib/answersengine/client/job_output.rb +0 -19
- data/lib/answersengine/client/job_page.rb +0 -58
- data/lib/answersengine/client/job_stat.rb +0 -16
- data/lib/answersengine/client/scraper.rb +0 -57
- data/lib/answersengine/client/scraper_deployment.rb +0 -18
- data/lib/answersengine/client/scraper_export.rb +0 -22
- data/lib/answersengine/client/scraper_exporter.rb +0 -14
- data/lib/answersengine/client/scraper_finisher.rb +0 -16
- data/lib/answersengine/client/scraper_job.rb +0 -49
- data/lib/answersengine/client/scraper_job_output.rb +0 -19
- data/lib/answersengine/client/scraper_job_page.rb +0 -67
- data/lib/answersengine/client/scraper_job_var.rb +0 -28
- data/lib/answersengine/client/scraper_var.rb +0 -28
- data/lib/answersengine/plugin.rb +0 -6
- data/lib/answersengine/plugin/context_exposer.rb +0 -55
- data/lib/answersengine/scraper.rb +0 -18
- data/lib/answersengine/scraper/executor.rb +0 -373
- data/lib/answersengine/scraper/finisher.rb +0 -18
- data/lib/answersengine/scraper/parser.rb +0 -18
- data/lib/answersengine/scraper/ruby_finisher_executor.rb +0 -116
- data/lib/answersengine/scraper/ruby_parser_executor.rb +0 -200
- data/lib/answersengine/scraper/ruby_seeder_executor.rb +0 -120
- data/lib/answersengine/scraper/seeder.rb +0 -18
- data/lib/answersengine/version.rb +0 -3
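
The summary above shows 0.10.2 hollowing out the gem: the fetchtest example project, the Thor CLI, the HTTP client classes, and the scraper executors are all deleted, and metadata sheds most of the gem's dependencies (+14 -152). Only thin entry points remain (data/lib/answersengine.rb +20 -3, data/exe/answersengine +3 -2), which is the usual shape of a release that turns a gem into a compatibility shim over a successor library. A purely hypothetical sketch of such a shim — the replacement library's name and the real contents of the slimmed files are not shown in this diff:

# lib/answersengine.rb -- illustrative only; NOT the actual 0.10.2 file
require 'successor_gem' # hypothetical gem that now carries the implementation

warn '[DEPRECATION] the answersengine gem is deprecated; use its successor instead.'

# Keep legacy AnswersEngine::... references working by aliasing the namespace.
AnswersEngine = SuccessorGem unless defined?(AnswersEngine)

The hunks below reproduce the files removed in this release.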
data/examples/fetchtest/seeders/csv_seeder.rb
DELETED
@@ -1,12 +0,0 @@
-CSV.foreach("./seeders/list_of_urls.csv",:headers => true) do |row|
-  pages << {
-    url: row['url'],
-    page_type: row['page_type'],
-    vars: {"abc":[1,2,3], "def": "defcontent"}
-  }
-
-  # Save pages to the job partially if record counts will be too large
-  max_records = 100
-  save_pages(pages) if $. % max_records == 0
-end
-
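
This deleted example shows the batched seeding pattern: each CSV row is queued into pages, and save_pages flushes the queue every max_records rows, keyed off Ruby's $. (the line number of the last line read from input). A self-contained sketch of the same pattern, with a stub save_pages standing in for the framework method:

require 'csv'

# Stub standing in for the framework's save_pages; the real method
# uploads the queued pages to the scrape job.
def save_pages(pages)
  puts "flushing #{pages.size} page(s)"
  pages.clear
end

pages = []
max_records = 100

CSV.foreach('./seeders/list_of_urls.csv', headers: true) do |row|
  pages << { url: row['url'], page_type: row['page_type'] }
  # $. holds the input line number, as in the deleted example above
  save_pages(pages) if $. % max_records == 0
end

save_pages(pages) unless pages.empty? # flush any remainder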
data/examples/fetchtest/seeders/failed.rb
DELETED
@@ -1 +0,0 @@
-raise "fail from seeder"
data/examples/fetchtest/seeders/seed.rb
DELETED
@@ -1,28 +0,0 @@
-puts "hello from seeder"
-
-pages << {
-  url: "http://fetchtest.datahen.com/statuses/200?q=queuedFromParser",
-  vars: {"abc":[1], "def": "defcontent"}
-}
-
-pages << {
-  url: "http://fetchtest.datahen.com/statuses/200?q=queuedFromParser2",
-  vars: {"abc":[2], "def": "defcontent"}
-}
-
-save_pages(pages)
-
-pages << {
-  url: "http://fetchtest.datahen.com/statuses/200?q=queuedFromParser3",
-  vars: {"abc":[3], "def": "defcontent"}
-}
-
-pages << {
-  url: "http://fetchtest.datahen.com/statuses/200?q=queuedFromParser4",
-  vars: {"abc":[3], "def": "defcontent"}
-}
-
-pages << {
-  url: "http://fetchtest.datahen.com/statuses/200?q=queuedFromParser5",
-  vars: {"abc":[3], "def": "defcontent"}
-}
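
Note the flow in this example: two pages are flushed explicitly with save_pages, but the last three appends are never followed by another call. That only works if the executor saves whatever is still queued in pages once the seeder script returns — presumably a final implicit flush. A standalone sketch of that implied contract, with hypothetical names:

# Hypothetical harness illustrating the implied seeder contract:
# the script sees `pages` and `save_pages`, and anything still
# queued when it returns gets one final flush.
class SeederContext
  attr_reader :pages

  def initialize
    @pages = []
  end

  def save_pages(pages = @pages)
    puts "saving #{pages.size} page(s)"
    pages.clear
  end

  def run(script_path)
    instance_eval(File.read(script_path), script_path)
    save_pages unless pages.empty? # final implicit flush
  end
end

SeederContext.new.run('./seeders/seed.rb')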
data/lib/answersengine/cli.rb
DELETED
@@ -1,45 +0,0 @@
-require 'thor'
-require 'answersengine/scraper'
-require 'answersengine/cli/scraper_var'
-require 'answersengine/cli/scraper_exporter'
-require 'answersengine/cli/scraper_export'
-require 'answersengine/cli/scraper_job_var'
-require 'answersengine/cli/scraper_job'
-require 'answersengine/cli/scraper_finisher'
-require 'answersengine/cli/global_page'
-require 'answersengine/cli/scraper_page'
-require 'answersengine/cli/job_output'
-require 'answersengine/cli/job'
-require 'answersengine/cli/scraper_deployment'
-require 'answersengine/cli/scraper'
-require 'answersengine/cli/parser'
-require 'answersengine/cli/seeder'
-require 'answersengine/cli/finisher'
-require 'answersengine/cli/env_var'
-
-
-
-module AnswersEngine
-  class CLI < Thor
-    desc "scraper SUBCOMMAND ...ARGS", "manage scrapers"
-    subcommand "scraper", Scraper
-
-    desc "job SUBCOMMAND ...ARGS", "manage scrapers jobs"
-    subcommand "job", Job
-
-    desc "globalpage SUBCOMMAND ...ARGS", "interacts with global page"
-    subcommand "globalpage", GlobalPage
-
-    desc "parser SUBCOMMAND ...ARGS", "for parsing related activities"
-    subcommand "parser", Parser
-
-    desc "seeder SUBCOMMAND ...ARGS", "for seeding related activities"
-    subcommand "seeder", Seeder
-
-    desc "seeder SUBCOMMAND ...ARGS", "for seeding related activities"
-    subcommand "finisher", Finisher
-
-    desc "var SUBCOMMAND ...ARGS", "for environment variable related activities"
-    subcommand "var", EnvVar
-  end
-end
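
The deleted entry point is a standard Thor command tree: each desc/subcommand pair mounts a nested Thor class under a top-level command (note the copy-pasted desc on the finisher mount, which still reads "seeder"). A minimal runnable sketch of the same pattern using Thor's real API:

require 'thor' # gem install thor

# Nested command group, mounted below as `mycli greet ...`
class Greet < Thor
  desc 'hello NAME', 'say hello'
  def hello(name)
    puts "Hello, #{name}!"
  end
end

class MyCLI < Thor
  desc 'greet SUBCOMMAND ...ARGS', 'greeting commands'
  subcommand 'greet', Greet
end

MyCLI.start(ARGV) # e.g. `ruby mycli.rb greet hello World`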
data/lib/answersengine/cli/env_var.rb
DELETED
@@ -1,48 +0,0 @@
-module AnswersEngine
-  class CLI < Thor
-    class EnvVar < Thor
-      desc "list", "List environment variables on the account"
-
-      long_desc <<-LONGDESC
-        List all environment variables on the account.
-      LONGDESC
-      option :page, :aliases => :p, type: :numeric, desc: 'Get the next set of records by page.'
-      option :per_page, :aliases => :P, type: :numeric, desc: 'Number of records per page. Max 500 per page.'
-      def list
-        client = Client::EnvVar.new(options)
-        puts "#{client.all}"
-      end
-
-      desc "set <name> <value>", "Set an environment var on the account"
-      long_desc <<-LONGDESC
-        Creates an environment variable\x5
-        <name>: Var name can only consist of alphabets, numbers, underscores. Name must be unique to your account, otherwise it will be overwritten.\x5
-        <value>: Value of variable.\x5
-      LONGDESC
-      option :secret, type: :boolean, desc: 'Set true to make it decrypt the value. Default: false'
-      def set(name, value)
-        # puts "options #{options}"
-        client = Client::EnvVar.new(options)
-        puts "#{client.set(name, value, options)}"
-      end
-
-      desc "show <name>", "Show an environment variable on the account"
-      def show(name)
-        client = Client::EnvVar.new(options)
-        puts "#{client.find(name)}"
-      end
-
-      desc "unset <name>", "Deletes an environment variable on the account"
-      def unset(name)
-        client = Client::EnvVar.new(options)
-        puts "#{client.unset(name)}"
-      end
-
-
-
-
-
-    end
-  end
-
-end
data/lib/answersengine/cli/finisher.rb
DELETED
@@ -1,40 +0,0 @@
-module AnswersEngine
-  class CLI < Thor
-    class Finisher < Thor
-      desc "try <scraper_name> <finisher_file>", "Tries a finisher file"
-      long_desc <<-LONGDESC
-        Takes a finisher script and tries to execute it without saving anything.\x5
-        <seeder_file>: Finisher script file will be executed.\x5
-      LONGDESC
-      option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
-      def try_finisher(scraper_name, finisher_file)
-        if options[:job]
-          job_id = options[:job]
-        else
-          job = Client::ScraperJob.new(options).find(scraper_name)
-          job_id = job['id']
-        end
-
-        puts AnswersEngine::Scraper::Finisher.exec_finisher(finisher_file, job_id, false)
-      end
-
-      desc "exec <scraper_name> <finisher_file>", "Executes a finisher script onto a scraper's current job."
-      long_desc <<-LONGDESC
-        Takes a finisher script and execute it against a job and save outputs into the scraper's current job\x5
-        <finisher_file>: Finisher script file that will be executed on the scraper's current job.\x5
-      LONGDESC
-      option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
-      def exec_parse(scraper_name, finisher_file)
-        if options[:job]
-          job_id = options[:job]
-        else
-          job = Client::ScraperJob.new(options).find(scraper_name)
-          job_id = job['id']
-        end
-
-        puts AnswersEngine::Scraper::Finisher.exec_finisher(finisher_file, job_id, true)
-      end
-    end
-  end
-
-end
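
Both commands resolve a job ID the same way — take --job when given, otherwise look up the scraper's current job — and the identical block reappears in the parser, output, log, and stats commands below. (Note the exec command's method is named exec_parse, another copy-paste carryover.) The repeated logic could be a small helper; a hypothetical, self-contained extraction where the block stands in for the Client::ScraperJob lookup:

# Hypothetical helper consolidating the repeated --job fallback logic
# from the deleted commands. `current_job_lookup` stands in for
# Client::ScraperJob#find, which is out of scope here.
def resolve_job_id(options, &current_job_lookup)
  options[:job] || current_job_lookup.call['id']
end

p resolve_job_id({ job: 42 }) { raise 'not reached' } # => 42
p resolve_job_id({}) { { 'id' => 7 } }                # => 7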
data/lib/answersengine/cli/global_page.rb
DELETED
@@ -1,39 +0,0 @@
-module AnswersEngine
-  class CLI < Thor
-    class GlobalPage < Thor
-
-      desc "show <gid>", "Show a global page"
-      def show(gid)
-        client = Client::GlobalPage.new(options)
-        puts "#{client.find(gid)}"
-      end
-
-      desc "content <gid>", "Show content of a globalpage"
-      def content(gid)
-        client = Client::GlobalPage.new(options)
-        result = JSON.parse(client.find_content(gid).to_s)
-
-        if result['available'] == true
-          puts "Preview content url: \"#{result['preview_url']}\""
-          `open "#{result['preview_url']}"`
-        else
-          puts "Content does not exist"
-        end
-      end
-
-      desc "failedcontent <gid>", "Show failed content of a globalpage"
-      def failedcontent(gid)
-        client = Client::GlobalPage.new(options)
-        result = JSON.parse(client.find_failed_content(gid).to_s)
-
-        if result['available'] == true
-          puts "Preview failed content url: \"#{result['preview_url']}\""
-          `open "#{result['preview_url']}"`
-        else
-          puts "Failed Content does not exist"
-        end
-      end
-
-    end
-  end
-end
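
Both content commands shell out to `open`, a macOS-only launcher; on Linux or Windows the preview URL is printed but nothing opens. A portable variant (not from the gem) would pick the opener per platform:

require 'rbconfig'

# Hypothetical cross-platform replacement for the bare `open "..."`
# backticks in the deleted commands above.
def open_in_browser(url)
  opener =
    case RbConfig::CONFIG['host_os']
    when /darwin/      then 'open'
    when /mswin|mingw/ then 'start ""'
    else                    'xdg-open'
    end
  system(%(#{opener} "#{url}"))
end

open_in_browser('https://example.com')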
data/lib/answersengine/cli/job.rb
DELETED
@@ -1,30 +0,0 @@
-module AnswersEngine
-  class CLI < Thor
-    class Job < Thor
-      package_name "job"
-      def self.banner(command, namespace = nil, subcommand = false)
-        "#{basename} #{@package_name} #{command.usage}"
-      end
-
-
-      desc "list", "gets a list of jobs"
-      option :page, :aliases => :p, type: :numeric, desc: 'Get the next set of records by page.'
-      option :per_page, :aliases => :P, type: :numeric, desc: 'Number of records per page. Max 500 per page.'
-      long_desc <<-LONGDESC
-        List scrape jobs.
-      LONGDESC
-      def list()
-        client = Client::Job.new(options)
-        puts "#{client.all()}"
-      end
-
-      desc "show <job_id>", "Show a job"
-      def show(job_id)
-        client = Client::Job.new(options)
-        puts "#{client.find(job_id)}"
-      end
-
-    end
-  end
-
-end
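
package_name together with the self.banner override rewrites Thor's help output so usage lines read as `answersengine job <usage>` rather than Thor's default nested banner. A standalone sketch of the same trick against Thor's real banner hook:

require 'thor'

class Job < Thor
  package_name 'job'

  # Thor calls this when rendering help; @package_name is set by
  # package_name above, and basename is the executable's file name.
  def self.banner(command, namespace = nil, subcommand = false)
    "#{basename} #{@package_name} #{command.usage}"
  end

  desc 'list', 'gets a list of jobs'
  def list
    puts 'listing jobs...'
  end
end

Job.start(ARGV) # `ruby job.rb help list` shows usage as: job.rb job list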
data/lib/answersengine/cli/job_output.rb
DELETED
@@ -1,69 +0,0 @@
-module AnswersEngine
-  class CLI < Thor
-    class JobOutput < Thor
-
-      package_name "scraper output"
-      def self.banner(command, namespace = nil, subcommand = false)
-        "#{basename} #{@package_name} #{command.usage}"
-      end
-
-      desc "list <scraper_name>", "List output records in a collection that is in the current job"
-      long_desc <<-LONGDESC
-        List all output records in a collection that is in the current job of a scraper\n
-      LONGDESC
-      option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
-      option :page, :aliases => :p, type: :numeric, desc: 'Get the next set of records by page.'
-      option :per_page, :aliases => :P, type: :numeric, desc: 'Number of records per page. Max 500 per page.'
-      option :collection, :aliases => :c, desc: "Shows outputs from a specific collection.(defaults to 'default' collection)"
-      option :query, :aliases => :q, type: :string, banner: :JSON, desc: 'Set search query. Must be in json format. i.e: {"Foo":"bar"} '
-      def list(scraper_name)
-        collection = options.fetch(:collection) { 'default' }
-        if options[:job]
-          client = Client::JobOutput.new(options)
-          puts "#{client.all(options[:job], collection)}"
-        else
-          client = Client::ScraperJobOutput.new(options)
-          puts "#{client.all(scraper_name, collection)}"
-        end
-      end
-
-      desc "show <scraper_name> <record_id>", "Show one output record in a collection that is in the current job of a scraper"
-      long_desc <<-LONGDESC
-        Shows an output record in a collection that is in the current job of a scraper\n
-        <record_id>: ID of the output record.\x5
-      LONGDESC
-      option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
-      option :collection, :aliases => :c, desc: "Shows output from a specific collection.(defaults to 'default' collection)"
-      def show(scraper_name, id)
-        collection = options.fetch(:collection) { 'default' }
-        if options[:job]
-          client = Client::JobOutput.new(options)
-          puts "#{client.find(options[:job], collection, id)}"
-        else
-          client = Client::ScraperJobOutput.new(options)
-          puts "#{client.find(scraper_name, collection, id)}"
-        end
-      end
-
-      desc "collections <scraper_name>", "list job output collections that are inside a current job of a scraper."
-      long_desc <<-LONGDESC
-        List job output collections that are inside a current job of a scraper.\x5
-      LONGDESC
-      option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
-      option :page, :aliases => :p, type: :numeric, desc: 'Get the next set of records by page.'
-      option :per_page, :aliases => :P, type: :numeric, desc: 'Number of records per page. Max 500 per page.'
-      def collections(scraper_name)
-
-        if options[:job]
-          client = Client::JobOutput.new(options)
-          puts "#{client.collections(options[:job])}"
-        else
-          client = Client::ScraperJobOutput.new(options)
-          puts "#{client.collections(scraper_name)}"
-        end
-      end
-
-    end
-  end
-
-end
data/lib/answersengine/cli/parser.rb
DELETED
@@ -1,64 +0,0 @@
-module AnswersEngine
-  class CLI < Thor
-    class Parser < Thor
-      desc "try <scraper_name> <parser_file> <GID>", "Tries a parser on a Job Page"
-      long_desc <<-LONGDESC
-        Takes a parser script and runs it against a job page\x5
-        <parser_file>: Parser script file that will be executed on the page.\x5
-        <GID>: Global ID of the page.\x5
-      LONGDESC
-      option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
-      option :global, :aliases => :g, type: :boolean, default: false, desc: 'Use globalpage instead of a job page'
-      option :vars, :aliases => :v, type: :string, desc: 'Set user-defined page variables. Must be in json format. i.e: {"Foo":"bar"}'
-      def try_parse(scraper_name, parser_file, gid)
-        begin
-
-          if options[:job]
-            job_id = options[:job]
-          elsif options[:global]
-            job_id = nil
-          else
-            job = Client::ScraperJob.new(options).find(scraper_name)
-            job_id = job['id']
-          end
-
-
-          vars = JSON.parse(options[:vars]) if options[:vars]
-          puts AnswersEngine::Scraper::Parser.exec_parser_page(parser_file, gid, job_id, false, vars)
-
-        rescue JSON::ParserError
-          if options[:vars]
-            puts "Error: #{options[:vars]} on vars is not a valid JSON"
-          end
-        end
-      end
-
-      desc "exec <scraper_name> <parser_file> <GID>...<GID>", "Executes a parser script on one or more Job Pages within a scraper's current job"
-      long_desc <<-LONGDESC
-        Takes a parser script executes it against a job page(s) and save the output to the scraper's current job\x5
-        <parser_file>: Parser script file will be executed on the page.\x5
-        <GID>: Global ID of the page.\x5
-      LONGDESC
-      option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
-      def exec_parse(scraper_name, parser_file, *gids)
-        gids.each do |gid|
-          begin
-            puts "Parsing #{gid}"
-
-            if options[:job]
-              job_id = options[:job]
-            else
-              job = Client::ScraperJob.new(options).find(scraper_name)
-              job_id = job['id']
-            end
-
-            puts AnswersEngine::Scraper::Parser.exec_parser_page(parser_file, gid, job_id, true)
-          rescue => e
-            puts e
-          end
-        end
-      end
-    end
-  end
-
-end
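
Two error-handling details are worth noting in the deleted parser commands: try_parse validates the user-supplied --vars JSON via rescue JSON::ParserError, and exec_parse wraps each GID in its own begin/rescue so one failing page does not abort the rest of the batch. A compact standalone sketch of the --vars validation:

require 'json'

# Hypothetical up-front validation mirroring try_parse's rescue branch.
def parse_vars(raw)
  return {} if raw.nil? || raw.empty?

  JSON.parse(raw)
rescue JSON::ParserError
  abort "Error: #{raw} on vars is not a valid JSON"
end

p parse_vars('{"Foo":"bar"}') # => {"Foo"=>"bar"}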
data/lib/answersengine/cli/scraper.rb
DELETED
@@ -1,185 +0,0 @@
-module AnswersEngine
-  class CLI < Thor
-    class Scraper < Thor
-      desc "list", "List scrapers"
-
-      long_desc <<-LONGDESC
-        List all scrapers.
-      LONGDESC
-      option :page, :aliases => :p, type: :numeric, desc: 'Get the next set of records by page.'
-      option :per_page, :aliases => :P, type: :numeric, desc: 'Number of records per page. Max 500 per page.'
-      def list
-        client = Client::Scraper.new(options)
-        puts "#{client.all}"
-      end
-
-      desc "create <scraper_name> <git_repository>", "Create a scraper"
-      long_desc <<-LONGDESC
-        Creates a scraper\x5
-        <scraper_name>: Scraper name can only consist of alphabets, numbers, underscores and dashes. Name must be unique to your account.\x5
-        <git_repository>: URL to a valid Git repository.\x5
-      LONGDESC
-      option :branch, :aliases => :b, desc: 'Set the Git branch to use. Default: master'
-      option :freshness_type, :aliases => :t, desc: 'Set how fresh the page cache is. Possible values: day, week, month, year. Default: any'
-      option :proxy_type, desc: 'Set the Proxy type. Default: standard'
-      option :force_fetch, :aliases => :f, type: :boolean, desc: 'Set true to force fetch page that is not within freshness criteria. Default: false'
-      option :workers, :aliases => :w, type: :numeric, desc: 'Set how many standard workers to use. Default: 1'
-      option :browsers, type: :numeric, desc: 'Set how many browser workers to use. Default: 0'
-      option :disable_scheduler, type: :boolean, desc: 'Set true to disable scheduler. Default: false'
-      option :cancel_current_job, type: :boolean, desc: 'Set true to cancel currently active job if scheduler starts. Default: false'
-      option :schedule, type: :string, desc: 'Set the schedule of the scraper to run. Must be in CRON format.'
-      option :timezone, type: :string, desc: "Set the scheduler's timezone. Must be in IANA Timezone format. Defaults to \"America/Toronto\""
-      def create(scraper_name, git_repository)
-        # puts "options #{options}"
-        client = Client::Scraper.new(options)
-        puts "#{client.create(scraper_name, git_repository, options)}"
-      end
-
-      desc "update <scraper_name>", "Update a scraper"
-      long_desc <<-LONGDESC
-        Updates a scraper\x5
-      LONGDESC
-      option :branch, :aliases => :b, desc: 'Set the Git branch to use. Default: master'
-      option :name, :aliases => :n, desc: 'Set the scraper name. Name can only consist of alphabets, numbers, underscores and dashes. Name must be unique to your account'
-      option :repo, :aliases => :r, desc: 'Set the URL to a valid Git repository'
-      option :freshness_type, :aliases => :t, desc: 'Set how fresh the page cache is. Possible values: day, week, month, year. Default: any'
-      option :proxy_type, desc: 'Set the Proxy type. Default: standard'
-      option :force_fetch, :aliases => :f, type: :boolean, desc: 'Set true to force fetch page that is not within freshness criteria. Default: false'
-      option :workers, :aliases => :w, type: :numeric, desc: 'Set how many standard workers to use. Default: 1'
-      option :browsers, type: :numeric, desc: 'Set how many browser workers to use. Default: 0'
-      option :disable_scheduler, type: :boolean, desc: 'Set true to disable scheduler. Default: false'
-      option :cancel_current_job, type: :boolean, desc: 'Set true to cancel currently active job if scheduler starts. Default: false'
-      option :schedule, type: :string, desc: 'Set the schedule of the scraper to run. Must be in CRON format.'
-      option :timezone, type: :string, desc: "Set the scheduler's timezone. Must be in IANA Timezone format. Defaults to \"America/Toronto\""
-      def update(scraper_name)
-        client = Client::Scraper.new(options)
-        puts "#{client.update(scraper_name, options)}"
-      end
-
-
-      desc "show <scraper_name>", "Show a scraper"
-      def show(scraper_name)
-        client = Client::Scraper.new(options)
-        puts "#{client.find(scraper_name)}"
-      end
-
-      desc "delete <scraper_name>", "Delete a scraper and related records"
-      def delete(scraper_name)
-        client = Client::Scraper.new(options)
-        puts "#{client.delete(scraper_name)}"
-      end
-
-
-      desc "deploy <scraper_name>", "Deploy a scraper"
-      long_desc <<-LONGDESC
-        Deploys a scraper
-      LONGDESC
-      def deploy(scraper_name)
-        client = Client::ScraperDeployment.new()
-        puts "Deploying scraper. This may take a while..."
-        puts "#{client.deploy(scraper_name)}"
-      end
-
-      desc "start <scraper_name>", "Creates a scraping job and runs it"
-      long_desc <<-LONGDESC
-        Starts a scraper by creating an active scrape job\x5
-      LONGDESC
-      option :workers, :aliases => :w, type: :numeric, desc: 'Set how many standard workers to use. Default: 1'
-      option :browsers, type: :numeric, desc: 'Set how many browser workers to use. Default: 0'
-      option :proxy_type, desc: 'Set the Proxy type. Default: standard'
-      def start(scraper_name)
-        client = Client::ScraperJob.new(options)
-        puts "Starting a scrape job..."
-        puts "#{client.create(scraper_name, options)}"
-      end
-
-
-      desc "log <scraper_name>", "List log entries related to a scraper's current job"
-      long_desc <<-LONGDESC
-        Shows log related to a scraper's current job. Defaults to showing the most recent entries\x5
-      LONGDESC
-      option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
-      option :head, :aliases => :H, desc: 'Show the oldest log entries. If not set, newest entries is shown'
-      option :parsing, :aliases => :p, type: :boolean, desc: 'Show only log entries related to parsing errors'
-      option :seeding, :aliases => :s, type: :boolean, desc: 'Show only log entries related to seeding errors'
-      option :more, :aliases => :m, desc: 'Show next set of log entries. Enter the `More token`'
-      option :per_page, :aliases => :P, type: :numeric, desc: 'Number of records per page. Max 5000 per page.'
-      def log(scraper_name)
-        client = Client::JobLog.new(options)
-
-        query = {}
-        query["order"] = options.delete(:head) if options[:head]
-        query["job_type"] = "parsing" if options[:parsing]
-        query["job_type"] = "seeding" if options[:seeding]
-        query["page_token"] = options.delete(:more) if options[:more]
-        query["per_page"] = options.delete(:per_page) if options[:per_page]
-
-        puts "Fetching logs..."
-
-        if options[:job]
-          result = client.all_job_log(options[:job], {query: query})
-        else
-          result = client.scraper_all_job_log(scraper_name, {query: query})
-        end
-
-        if result['entries'].nil? || result["entries"].length == 0
-          puts "No logs yet, please try again later."
-        else
-          more_token = result["more_token"]
-
-          result["entries"].each do |entry|
-            puts "#{entry["timestamp"]} #{entry["severity"]}: #{entry["payload"]}" if entry.is_a?(Hash)
-          end
-
-          unless more_token.nil?
-            puts "-----------"
-            puts "To see more entries, add: \"--more #{more_token}\""
-          end
-        end
-      end
-
-      desc "stats <scraper_name>", "Get the current stat for a job"
-      long_desc <<-LONGDESC
-        Get stats for a scraper's current job\n
-      LONGDESC
-      option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
-      def stats(scraper_name)
-        client = Client::JobStat.new(options)
-        if options[:job]
-          puts "#{client.job_current_stats(options[:job])}"
-        else
-          puts "#{client.scraper_job_current_stats(scraper_name)}"
-        end
-
-      end
-
-
-      desc "job SUBCOMMAND ...ARGS", "manage scrapers jobs"
-      subcommand "job", ScraperJob
-
-      desc "deployment SUBCOMMAND ...ARGS", "manage scrapers deployments"
-      subcommand "deployment", ScraperDeployment
-
-      desc "finisher SUBCOMMAND ...ARGS", "manage scrapers finishers"
-      subcommand "finisher", ScraperFinisher
-
-      desc "output SUBCOMMAND ...ARGS", "view scraper outputs"
-      subcommand "output", JobOutput
-
-      desc "page SUBCOMMAND ...ARGS", "manage pages on a job"
-      subcommand "page", ScraperPage
-
-      desc "export SUBCOMMAND ...ARGS", "manage scraper's exports"
-      subcommand "export", ScraperExport
-
-      desc "exporter SUBCOMMAND ...ARGS", "manage scraper's exporters"
-      subcommand "exporter", ScraperExporter
-
-      desc "var SUBCOMMAND ...ARGS", "for managing scraper's variables"
-      subcommand "var", ScraperVar
-
-
-    end
-  end
-
-end
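
The log command above builds a query hash from the CLI flags and pages through results with a more_token cursor, printing the token so the user can re-run with --more. The same cursor can be drained in a single loop; a hypothetical sketch built on the all_job_log(job_id, query:) shape used in the deleted code:

# Hypothetical cursor-draining loop over the token-paginated log API.
# `client` is assumed to respond to all_job_log(job_id, query:) and
# return { 'entries' => [...], 'more_token' => String-or-nil }.
def each_log_entry(client, job_id, query = {})
  loop do
    result = client.all_job_log(job_id, query: query)
    Array(result['entries']).each { |e| yield e if e.is_a?(Hash) }

    token = result['more_token']
    break if token.nil?

    query = query.merge('page_token' => token)
  end
end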