answersengine 0.10.1 → 0.10.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CODE_OF_CONDUCT.md +1 -1
- data/LICENSE.txt +1 -1
- data/README.md +3 -4
- data/answersengine.gemspec +6 -12
- data/exe/answersengine +3 -2
- data/lib/answersengine.rb +20 -3
- metadata +14 -152
- data/examples/fetchtest/libraries/hello.rb +0 -9
- data/examples/fetchtest/libraries/hello_fail.rb +0 -10
- data/examples/fetchtest/parsers/failed.rb +0 -2
- data/examples/fetchtest/parsers/find_outputs.rb +0 -18
- data/examples/fetchtest/parsers/home.rb +0 -50
- data/examples/fetchtest/parsers/nested_fail.rb +0 -3
- data/examples/fetchtest/parsers/simple.rb +0 -14
- data/examples/fetchtest/seeders/csv_seeder.rb +0 -12
- data/examples/fetchtest/seeders/failed.rb +0 -1
- data/examples/fetchtest/seeders/list_of_urls.csv +0 -5
- data/examples/fetchtest/seeders/seed.rb +0 -28
- data/examples/fetchtest/seeders/test_reset_page.rb +0 -4
- data/lib/answersengine/cli.rb +0 -45
- data/lib/answersengine/cli/env_var.rb +0 -48
- data/lib/answersengine/cli/finisher.rb +0 -40
- data/lib/answersengine/cli/global_page.rb +0 -39
- data/lib/answersengine/cli/job.rb +0 -30
- data/lib/answersengine/cli/job_output.rb +0 -69
- data/lib/answersengine/cli/parser.rb +0 -64
- data/lib/answersengine/cli/scraper.rb +0 -185
- data/lib/answersengine/cli/scraper_deployment.rb +0 -24
- data/lib/answersengine/cli/scraper_export.rb +0 -51
- data/lib/answersengine/cli/scraper_exporter.rb +0 -40
- data/lib/answersengine/cli/scraper_finisher.rb +0 -20
- data/lib/answersengine/cli/scraper_job.rb +0 -75
- data/lib/answersengine/cli/scraper_job_var.rb +0 -48
- data/lib/answersengine/cli/scraper_page.rb +0 -203
- data/lib/answersengine/cli/scraper_var.rb +0 -48
- data/lib/answersengine/cli/seeder.rb +0 -40
- data/lib/answersengine/client.rb +0 -29
- data/lib/answersengine/client/auth_token.rb +0 -50
- data/lib/answersengine/client/backblaze_content.rb +0 -45
- data/lib/answersengine/client/base.rb +0 -55
- data/lib/answersengine/client/deploy_key.rb +0 -21
- data/lib/answersengine/client/env_var.rb +0 -28
- data/lib/answersengine/client/export.rb +0 -10
- data/lib/answersengine/client/global_page.rb +0 -18
- data/lib/answersengine/client/job.rb +0 -64
- data/lib/answersengine/client/job_export.rb +0 -10
- data/lib/answersengine/client/job_log.rb +0 -26
- data/lib/answersengine/client/job_output.rb +0 -19
- data/lib/answersengine/client/job_page.rb +0 -58
- data/lib/answersengine/client/job_stat.rb +0 -16
- data/lib/answersengine/client/scraper.rb +0 -57
- data/lib/answersengine/client/scraper_deployment.rb +0 -18
- data/lib/answersengine/client/scraper_export.rb +0 -22
- data/lib/answersengine/client/scraper_exporter.rb +0 -14
- data/lib/answersengine/client/scraper_finisher.rb +0 -16
- data/lib/answersengine/client/scraper_job.rb +0 -49
- data/lib/answersengine/client/scraper_job_output.rb +0 -19
- data/lib/answersengine/client/scraper_job_page.rb +0 -67
- data/lib/answersengine/client/scraper_job_var.rb +0 -28
- data/lib/answersengine/client/scraper_var.rb +0 -28
- data/lib/answersengine/plugin.rb +0 -6
- data/lib/answersengine/plugin/context_exposer.rb +0 -55
- data/lib/answersengine/scraper.rb +0 -18
- data/lib/answersengine/scraper/executor.rb +0 -373
- data/lib/answersengine/scraper/finisher.rb +0 -18
- data/lib/answersengine/scraper/parser.rb +0 -18
- data/lib/answersengine/scraper/ruby_finisher_executor.rb +0 -116
- data/lib/answersengine/scraper/ruby_parser_executor.rb +0 -200
- data/lib/answersengine/scraper/ruby_seeder_executor.rb +0 -120
- data/lib/answersengine/scraper/seeder.rb +0 -18
- data/lib/answersengine/version.rb +0 -3
@@ -1,12 +0,0 @@
|
|
1
|
-
# Example seeder: queues one page for every data row of
# ./seeders/list_of_urls.csv (the first row is treated as the header).
# `pages` and `save_pages` are not defined in this script; they are expected
# to be supplied by whatever runs the seeder.

# Flush threshold. It never changes, so it is set once instead of on every row.
max_records = 100

CSV.foreach("./seeders/list_of_urls.csv", headers: true) do |row|
  pages << {
    url: row['url'],
    page_type: row['page_type'],
    vars: {"abc":[1,2,3], "def": "defcontent"}
  }

  # Save pages to the job partially if record counts will be too large.
  # `$.` is Ruby's "last input line number" global, so the flush happens every
  # `max_records` lines read.
  # NOTE(review): presumably `$.` tracks the CSV file here -- confirm.
  save_pages(pages) if $. % max_records == 0
end
|
12
|
-
|
@@ -1 +0,0 @@
|
|
1
|
-
# Seeder that fails unconditionally: a bare `raise` with a message raises a
# RuntimeError as soon as the script is evaluated.
# NOTE(review): presumably a fixture for exercising seeder failure handling -- confirm.
raise "fail from seeder"
|
@@ -1,28 +0,0 @@
|
|
1
|
-
# Example seeder: queues two pages, saves them, then queues three more.
# `pages` and `save_pages` are not defined in this script; they are expected
# to be supplied by whatever runs the seeder.
puts "hello from seeder"

# First batch: saved explicitly right after being queued.
[
  {
    url: "http://fetchtest.datahen.com/statuses/200?q=queuedFromParser",
    vars: {"abc":[1], "def": "defcontent"}
  },
  {
    url: "http://fetchtest.datahen.com/statuses/200?q=queuedFromParser2",
    vars: {"abc":[2], "def": "defcontent"}
  }
].each { |page| pages << page }

save_pages(pages)

# Second batch: queued after the explicit save and left in `pages`.
[
  {
    url: "http://fetchtest.datahen.com/statuses/200?q=queuedFromParser3",
    vars: {"abc":[3], "def": "defcontent"}
  },
  {
    url: "http://fetchtest.datahen.com/statuses/200?q=queuedFromParser4",
    vars: {"abc":[3], "def": "defcontent"}
  },
  {
    url: "http://fetchtest.datahen.com/statuses/200?q=queuedFromParser5",
    vars: {"abc":[3], "def": "defcontent"}
  }
].each { |page| pages << page }
|
data/lib/answersengine/cli.rb
DELETED
@@ -1,45 +0,0 @@
|
|
1
|
-
require 'thor'
|
2
|
-
require 'answersengine/scraper'
|
3
|
-
require 'answersengine/cli/scraper_var'
|
4
|
-
require 'answersengine/cli/scraper_exporter'
|
5
|
-
require 'answersengine/cli/scraper_export'
|
6
|
-
require 'answersengine/cli/scraper_job_var'
|
7
|
-
require 'answersengine/cli/scraper_job'
|
8
|
-
require 'answersengine/cli/scraper_finisher'
|
9
|
-
require 'answersengine/cli/global_page'
|
10
|
-
require 'answersengine/cli/scraper_page'
|
11
|
-
require 'answersengine/cli/job_output'
|
12
|
-
require 'answersengine/cli/job'
|
13
|
-
require 'answersengine/cli/scraper_deployment'
|
14
|
-
require 'answersengine/cli/scraper'
|
15
|
-
require 'answersengine/cli/parser'
|
16
|
-
require 'answersengine/cli/seeder'
|
17
|
-
require 'answersengine/cli/finisher'
|
18
|
-
require 'answersengine/cli/env_var'
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
module AnswersEngine
  # Root Thor command-line interface. Each `desc`/`subcommand` pair registers
  # a command group implemented by a Thor class nested under this namespace.
  class CLI < Thor
    desc "scraper SUBCOMMAND ...ARGS", "manage scrapers"
    subcommand "scraper", Scraper

    desc "job SUBCOMMAND ...ARGS", "manage scrapers jobs"
    subcommand "job", Job

    desc "globalpage SUBCOMMAND ...ARGS", "interacts with global page"
    subcommand "globalpage", GlobalPage

    desc "parser SUBCOMMAND ...ARGS", "for parsing related activities"
    subcommand "parser", Parser

    desc "seeder SUBCOMMAND ...ARGS", "for seeding related activities"
    subcommand "seeder", Seeder

    # The usage and description must name "finisher": Thor prints the `desc`
    # text in the help listing, so seeder wording here mislabels this group.
    desc "finisher SUBCOMMAND ...ARGS", "for finisher related activities"
    subcommand "finisher", Finisher

    desc "var SUBCOMMAND ...ARGS", "for environment variable related activities"
    subcommand "var", EnvVar
  end
end
|
@@ -1,48 +0,0 @@
|
|
1
|
-
module AnswersEngine
  class CLI < Thor
    # Thor commands for account-level environment variables. Each command
    # prints the string form of whatever the Client::EnvVar call returns.
    class EnvVar < Thor
      desc "list", "List environment variables on the account"
      long_desc <<-LONGDESC
        List all environment variables on the account.
      LONGDESC
      option :page, aliases: :p, type: :numeric, desc: 'Get the next set of records by page.'
      option :per_page, aliases: :P, type: :numeric, desc: 'Number of records per page. Max 500 per page.'
      def list
        puts env_var_client.all.to_s
      end

      desc "set <name> <value>", "Set an environment var on the account"
      long_desc <<-LONGDESC
        Creates an environment variable\x5
        <name>: Var name can only consist of alphabets, numbers, underscores. Name must be unique to your account, otherwise it will be overwritten.\x5
        <value>: Value of variable.\x5
      LONGDESC
      option :secret, type: :boolean, desc: 'Set true to make it decrypt the value. Default: false'
      def set(name, value)
        # The command options are handed to both the client constructor and
        # the `set` call.
        puts env_var_client.set(name, value, options).to_s
      end

      desc "show <name>", "Show an environment variable on the account"
      def show(name)
        puts env_var_client.find(name).to_s
      end

      desc "unset <name>", "Deletes an environment variable on the account"
      def unset(name)
        puts env_var_client.unset(name).to_s
      end

      private

      # Builds the API client from the options of the running command.
      # Private, so Thor does not register it as a command.
      def env_var_client
        Client::EnvVar.new(options)
      end
    end
  end
end
|
@@ -1,40 +0,0 @@
|
|
1
|
-
module AnswersEngine
  class CLI < Thor
    # Thor commands that run a finisher script against a scraper job.
    # Both commands call AnswersEngine::Scraper::Finisher.exec_finisher and
    # differ only in its last argument (false for `try`, true for `exec`).
    # NOTE(review): presumably that flag controls whether results are saved,
    # as the command descriptions suggest -- confirm against exec_finisher.
    class Finisher < Thor
      desc "try <scraper_name> <finisher_file>", "Tries a finisher file"
      long_desc <<-LONGDESC
        Takes a finisher script and tries to execute it without saving anything.\x5
        <finisher_file>: Finisher script file will be executed.\x5
      LONGDESC
      option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
      def try_finisher(scraper_name, finisher_file)
        job_id = resolve_job_id(scraper_name)

        puts AnswersEngine::Scraper::Finisher.exec_finisher(finisher_file, job_id, false)
      end

      desc "exec <scraper_name> <finisher_file>", "Executes a finisher script onto a scraper's current job."
      long_desc <<-LONGDESC
        Takes a finisher script and execute it against a job and save outputs into the scraper's current job\x5
        <finisher_file>: Finisher script file that will be executed on the scraper's current job.\x5
      LONGDESC
      option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
      def exec_parse(scraper_name, finisher_file)
        job_id = resolve_job_id(scraper_name)

        puts AnswersEngine::Scraper::Finisher.exec_finisher(finisher_file, job_id, true)
      end

      private

      # Returns the job ID to operate on: the --job option when given,
      # otherwise the 'id' of the job Client::ScraperJob#find returns for
      # the scraper. Private, so Thor does not register it as a command.
      def resolve_job_id(scraper_name)
        return options[:job] if options[:job]

        job = Client::ScraperJob.new(options).find(scraper_name)
        job['id']
      end
    end
  end
end
|
@@ -1,39 +0,0 @@
|
|
1
|
-
module AnswersEngine
  class CLI < Thor
    # Thor commands for inspecting global pages by GID.
    class GlobalPage < Thor
      desc "show <gid>", "Show a global page"
      def show(gid)
        client = Client::GlobalPage.new(options)
        puts "#{client.find(gid)}"
      end

      desc "content <gid>", "Show content of a globalpage"
      def content(gid)
        client = Client::GlobalPage.new(options)
        result = JSON.parse(client.find_content(gid).to_s)

        preview(result, "Preview content url", "Content does not exist")
      end

      desc "failedcontent <gid>", "Show failed content of a globalpage"
      def failedcontent(gid)
        client = Client::GlobalPage.new(options)
        result = JSON.parse(client.find_failed_content(gid).to_s)

        preview(result, "Preview failed content url", "Failed Content does not exist")
      end

      private

      # Prints the preview URL and opens it when the parsed response reports
      # the content as available; otherwise prints +missing_message+.
      #
      # result          - Hash parsed from the API response.
      # label           - Text printed before the quoted preview URL.
      # missing_message - Text printed when 'available' is not exactly true.
      def preview(result, label, missing_message)
        if result['available'] == true
          url = result['preview_url'].to_s
          puts "#{label}: \"#{url}\""
          # Argument-vector form of `system`: the URL comes from the API
          # response and is passed as a single argument without going
          # through a shell, so shell metacharacters in it cannot run
          # commands. Unlike backticks, a missing `open` executable makes
          # `system` return nil instead of raising.
          system('open', url)
        else
          puts missing_message
        end
      end
    end
  end
end
|
@@ -1,30 +0,0 @@
|
|
1
|
-
module AnswersEngine
  class CLI < Thor
    # Thor commands for listing and showing jobs.
    class Job < Thor
      package_name "job"

      # Usage banner shown in help: the executable basename, this class's
      # package name, then the command's own usage string.
      def self.banner(command, namespace = nil, subcommand = false)
        format('%s %s %s', basename, @package_name, command.usage)
      end

      desc "list", "gets a list of jobs"
      option :page, aliases: :p, type: :numeric, desc: 'Get the next set of records by page.'
      option :per_page, aliases: :P, type: :numeric, desc: 'Number of records per page. Max 500 per page.'
      long_desc <<-LONGDESC
        List scrape jobs.
      LONGDESC
      def list
        puts Client::Job.new(options).all.to_s
      end

      desc "show <job_id>", "Show a job"
      def show(job_id)
        puts Client::Job.new(options).find(job_id).to_s
      end
    end
  end
end
|
@@ -1,69 +0,0 @@
|
|
1
|
-
module AnswersEngine
  class CLI < Thor
    # Thor commands for reading job output records and collections.
    # Every command targets the job given by --job when present (through
    # Client::JobOutput); otherwise it addresses the scraper by name
    # (through Client::ScraperJobOutput).
    class JobOutput < Thor
      package_name "scraper output"

      # Usage banner shown in help: the executable basename, this class's
      # package name, then the command's own usage string.
      def self.banner(command, namespace = nil, subcommand = false)
        format('%s %s %s', basename, @package_name, command.usage)
      end

      desc "list <scraper_name>", "List output records in a collection that is in the current job"
      long_desc <<-LONGDESC
        List all output records in a collection that is in the current job of a scraper\n
      LONGDESC
      option :job, aliases: :j, type: :numeric, desc: 'Set a specific job ID'
      option :page, aliases: :p, type: :numeric, desc: 'Get the next set of records by page.'
      option :per_page, aliases: :P, type: :numeric, desc: 'Number of records per page. Max 500 per page.'
      option :collection, aliases: :c, desc: "Shows outputs from a specific collection.(defaults to 'default' collection)"
      option :query, aliases: :q, type: :string, banner: :JSON, desc: 'Set search query. Must be in json format. i.e: {"Foo":"bar"} '
      def list(scraper_name)
        collection = options.fetch(:collection, 'default')
        job_id = options[:job]

        records =
          if job_id
            Client::JobOutput.new(options).all(job_id, collection)
          else
            Client::ScraperJobOutput.new(options).all(scraper_name, collection)
          end
        puts records.to_s
      end

      desc "show <scraper_name> <record_id>", "Show one output record in a collection that is in the current job of a scraper"
      long_desc <<-LONGDESC
        Shows an output record in a collection that is in the current job of a scraper\n
        <record_id>: ID of the output record.\x5
      LONGDESC
      option :job, aliases: :j, type: :numeric, desc: 'Set a specific job ID'
      option :collection, aliases: :c, desc: "Shows output from a specific collection.(defaults to 'default' collection)"
      def show(scraper_name, id)
        collection = options.fetch(:collection, 'default')
        job_id = options[:job]

        record =
          if job_id
            Client::JobOutput.new(options).find(job_id, collection, id)
          else
            Client::ScraperJobOutput.new(options).find(scraper_name, collection, id)
          end
        puts record.to_s
      end

      desc "collections <scraper_name>", "list job output collections that are inside a current job of a scraper."
      long_desc <<-LONGDESC
        List job output collections that are inside a current job of a scraper.\x5
      LONGDESC
      option :job, aliases: :j, type: :numeric, desc: 'Set a specific job ID'
      option :page, aliases: :p, type: :numeric, desc: 'Get the next set of records by page.'
      option :per_page, aliases: :P, type: :numeric, desc: 'Number of records per page. Max 500 per page.'
      def collections(scraper_name)
        job_id = options[:job]

        listing =
          if job_id
            Client::JobOutput.new(options).collections(job_id)
          else
            Client::ScraperJobOutput.new(options).collections(scraper_name)
          end
        puts listing.to_s
      end
    end
  end
end
|
@@ -1,64 +0,0 @@
|
|
1
|
-
module AnswersEngine
  class CLI < Thor
    # Thor commands that run a parser script against job pages or global pages.
    # Both commands call AnswersEngine::Scraper::Parser.exec_parser_page and
    # differ in its fourth argument (false for `try`, true for `exec`).
    # NOTE(review): presumably that flag controls whether results are saved,
    # as the command descriptions suggest -- confirm against exec_parser_page.
    class Parser < Thor
      desc "try <scraper_name> <parser_file> <GID>", "Tries a parser on a Job Page"
      long_desc <<-LONGDESC
        Takes a parser script and runs it against a job page\x5
        <parser_file>: Parser script file that will be executed on the page.\x5
        <GID>: Global ID of the page.\x5
      LONGDESC
      option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
      option :global, :aliases => :g, type: :boolean, default: false, desc: 'Use globalpage instead of a job page'
      option :vars, :aliases => :v, type: :string, desc: 'Set user-defined page variables. Must be in json format. i.e: {"Foo":"bar"}'
      def try_parse(scraper_name, parser_file, gid)
        # Only the --vars parse is guarded. A JSON::ParserError raised later
        # (job lookup or parser run) is unrelated to --vars and must not be
        # swallowed or reported as a --vars problem.
        begin
          vars = JSON.parse(options[:vars]) if options[:vars]
        rescue JSON::ParserError
          puts "Error: #{options[:vars]} on vars is not a valid JSON"
          return
        end

        job_id =
          if options[:job]
            options[:job]
          elsif options[:global]
            # Global pages are not tied to a job.
            nil
          else
            Client::ScraperJob.new(options).find(scraper_name)['id']
          end

        puts AnswersEngine::Scraper::Parser.exec_parser_page(parser_file, gid, job_id, false, vars)
      end

      desc "exec <scraper_name> <parser_file> <GID>...<GID>", "Executes a parser script on one or more Job Pages within a scraper's current job"
      long_desc <<-LONGDESC
        Takes a parser script executes it against a job page(s) and save the output to the scraper's current job\x5
        <parser_file>: Parser script file will be executed on the page.\x5
        <GID>: Global ID of the page.\x5
      LONGDESC
      option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
      def exec_parse(scraper_name, parser_file, *gids)
        job_id = options[:job]

        gids.each do |gid|
          begin
            puts "Parsing #{gid}"

            # Look the scraper's job up at most once instead of once per GID.
            # The lookup stays inside the rescue so a failure is still
            # printed per page and the loop carries on.
            job_id ||= Client::ScraperJob.new(options).find(scraper_name)['id']

            puts AnswersEngine::Scraper::Parser.exec_parser_page(parser_file, gid, job_id, true)
          rescue => e
            puts e
          end
        end
      end
    end
  end
end
|
@@ -1,185 +0,0 @@
|
|
1
|
-
module AnswersEngine
  class CLI < Thor
    # Thor commands for managing scrapers (create/update/show/delete/deploy),
    # starting jobs, reading job logs and stats, plus the nested command
    # groups registered at the bottom of the class.
    class Scraper < Thor
      desc "list", "List scrapers"
      long_desc <<-LONGDESC
        List all scrapers.
      LONGDESC
      option :page, aliases: :p, type: :numeric, desc: 'Get the next set of records by page.'
      option :per_page, aliases: :P, type: :numeric, desc: 'Number of records per page. Max 500 per page.'
      def list
        puts Client::Scraper.new(options).all.to_s
      end

      desc "create <scraper_name> <git_repository>", "Create a scraper"
      long_desc <<-LONGDESC
        Creates a scraper\x5
        <scraper_name>: Scraper name can only consist of alphabets, numbers, underscores and dashes. Name must be unique to your account.\x5
        <git_repository>: URL to a valid Git repository.\x5
      LONGDESC
      option :branch, aliases: :b, desc: 'Set the Git branch to use. Default: master'
      option :freshness_type, aliases: :t, desc: 'Set how fresh the page cache is. Possible values: day, week, month, year. Default: any'
      option :proxy_type, desc: 'Set the Proxy type. Default: standard'
      option :force_fetch, aliases: :f, type: :boolean, desc: 'Set true to force fetch page that is not within freshness criteria. Default: false'
      option :workers, aliases: :w, type: :numeric, desc: 'Set how many standard workers to use. Default: 1'
      option :browsers, type: :numeric, desc: 'Set how many browser workers to use. Default: 0'
      option :disable_scheduler, type: :boolean, desc: 'Set true to disable scheduler. Default: false'
      option :cancel_current_job, type: :boolean, desc: 'Set true to cancel currently active job if scheduler starts. Default: false'
      option :schedule, type: :string, desc: 'Set the schedule of the scraper to run. Must be in CRON format.'
      option :timezone, type: :string, desc: "Set the scheduler's timezone. Must be in IANA Timezone format. Defaults to \"America/Toronto\""
      def create(scraper_name, git_repository)
        scraper_client = Client::Scraper.new(options)
        puts scraper_client.create(scraper_name, git_repository, options).to_s
      end

      desc "update <scraper_name>", "Update a scraper"
      long_desc <<-LONGDESC
        Updates a scraper\x5
      LONGDESC
      option :branch, aliases: :b, desc: 'Set the Git branch to use. Default: master'
      option :name, aliases: :n, desc: 'Set the scraper name. Name can only consist of alphabets, numbers, underscores and dashes. Name must be unique to your account'
      option :repo, aliases: :r, desc: 'Set the URL to a valid Git repository'
      option :freshness_type, aliases: :t, desc: 'Set how fresh the page cache is. Possible values: day, week, month, year. Default: any'
      option :proxy_type, desc: 'Set the Proxy type. Default: standard'
      option :force_fetch, aliases: :f, type: :boolean, desc: 'Set true to force fetch page that is not within freshness criteria. Default: false'
      option :workers, aliases: :w, type: :numeric, desc: 'Set how many standard workers to use. Default: 1'
      option :browsers, type: :numeric, desc: 'Set how many browser workers to use. Default: 0'
      option :disable_scheduler, type: :boolean, desc: 'Set true to disable scheduler. Default: false'
      option :cancel_current_job, type: :boolean, desc: 'Set true to cancel currently active job if scheduler starts. Default: false'
      option :schedule, type: :string, desc: 'Set the schedule of the scraper to run. Must be in CRON format.'
      option :timezone, type: :string, desc: "Set the scheduler's timezone. Must be in IANA Timezone format. Defaults to \"America/Toronto\""
      def update(scraper_name)
        scraper_client = Client::Scraper.new(options)
        puts scraper_client.update(scraper_name, options).to_s
      end

      desc "show <scraper_name>", "Show a scraper"
      def show(scraper_name)
        puts Client::Scraper.new(options).find(scraper_name).to_s
      end

      desc "delete <scraper_name>", "Delete a scraper and related records"
      def delete(scraper_name)
        puts Client::Scraper.new(options).delete(scraper_name).to_s
      end

      desc "deploy <scraper_name>", "Deploy a scraper"
      long_desc <<-LONGDESC
        Deploys a scraper
      LONGDESC
      def deploy(scraper_name)
        # The deployment client is built without the command options.
        deployer = Client::ScraperDeployment.new
        puts "Deploying scraper. This may take a while..."
        puts deployer.deploy(scraper_name).to_s
      end

      desc "start <scraper_name>", "Creates a scraping job and runs it"
      long_desc <<-LONGDESC
        Starts a scraper by creating an active scrape job\x5
      LONGDESC
      option :workers, aliases: :w, type: :numeric, desc: 'Set how many standard workers to use. Default: 1'
      option :browsers, type: :numeric, desc: 'Set how many browser workers to use. Default: 0'
      option :proxy_type, desc: 'Set the Proxy type. Default: standard'
      def start(scraper_name)
        job_client = Client::ScraperJob.new(options)
        puts "Starting a scrape job..."
        puts job_client.create(scraper_name, options).to_s
      end

      desc "log <scraper_name>", "List log entries related to a scraper's current job"
      long_desc <<-LONGDESC
        Shows log related to a scraper's current job. Defaults to showing the most recent entries\x5
      LONGDESC
      option :job, aliases: :j, type: :numeric, desc: 'Set a specific job ID'
      option :head, aliases: :H, desc: 'Show the oldest log entries. If not set, newest entries is shown'
      option :parsing, aliases: :p, type: :boolean, desc: 'Show only log entries related to parsing errors'
      option :seeding, aliases: :s, type: :boolean, desc: 'Show only log entries related to seeding errors'
      option :more, aliases: :m, desc: 'Show next set of log entries. Enter the `More token`'
      option :per_page, aliases: :P, type: :numeric, desc: 'Number of records per page. Max 5000 per page.'
      def log(scraper_name)
        log_client = Client::JobLog.new(options)

        # Build the log query. :head, :more and :per_page are removed from
        # `options` as they are moved into the query.
        params = {}
        params["order"] = options.delete(:head) if options[:head]
        # When both flags are given, seeding takes precedence over parsing.
        if options[:seeding]
          params["job_type"] = "seeding"
        elsif options[:parsing]
          params["job_type"] = "parsing"
        end
        params["page_token"] = options.delete(:more) if options[:more]
        params["per_page"] = options.delete(:per_page) if options[:per_page]

        puts "Fetching logs..."

        request = { query: params }
        response =
          if options[:job]
            log_client.all_job_log(options[:job], request)
          else
            log_client.scraper_all_job_log(scraper_name, request)
          end

        if response['entries'].nil? || response["entries"].length.zero?
          puts "No logs yet, please try again later."
          return
        end

        next_token = response["more_token"]

        response["entries"].each do |entry|
          # Entries that are not hashes are skipped.
          next unless entry.is_a?(Hash)

          puts "#{entry["timestamp"]} #{entry["severity"]}: #{entry["payload"]}"
        end

        return if next_token.nil?

        puts "-----------"
        puts "To see more entries, add: \"--more #{next_token}\""
      end

      desc "stats <scraper_name>", "Get the current stat for a job"
      long_desc <<-LONGDESC
        Get stats for a scraper's current job\n
      LONGDESC
      option :job, aliases: :j, type: :numeric, desc: 'Set a specific job ID'
      def stats(scraper_name)
        stat_client = Client::JobStat.new(options)
        job_id = options[:job]

        current =
          if job_id
            stat_client.job_current_stats(job_id)
          else
            stat_client.scraper_job_current_stats(scraper_name)
          end
        puts current.to_s
      end

      # Nested command groups, each implemented by its own Thor class.
      desc "job SUBCOMMAND ...ARGS", "manage scrapers jobs"
      subcommand "job", ScraperJob

      desc "deployment SUBCOMMAND ...ARGS", "manage scrapers deployments"
      subcommand "deployment", ScraperDeployment

      desc "finisher SUBCOMMAND ...ARGS", "manage scrapers finishers"
      subcommand "finisher", ScraperFinisher

      desc "output SUBCOMMAND ...ARGS", "view scraper outputs"
      subcommand "output", JobOutput

      desc "page SUBCOMMAND ...ARGS", "manage pages on a job"
      subcommand "page", ScraperPage

      desc "export SUBCOMMAND ...ARGS", "manage scraper's exports"
      subcommand "export", ScraperExport

      desc "exporter SUBCOMMAND ...ARGS", "manage scraper's exporters"
      subcommand "exporter", ScraperExporter

      desc "var SUBCOMMAND ...ARGS", "for managing scraper's variables"
      subcommand "var", ScraperVar
    end
  end
end
|