answersengine 0.10.1 → 0.10.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (72)
  1. checksums.yaml +4 -4
  2. data/CODE_OF_CONDUCT.md +1 -1
  3. data/LICENSE.txt +1 -1
  4. data/README.md +3 -4
  5. data/answersengine.gemspec +6 -12
  6. data/exe/answersengine +3 -2
  7. data/lib/answersengine.rb +20 -3
  8. metadata +14 -152
  9. data/examples/fetchtest/libraries/hello.rb +0 -9
  10. data/examples/fetchtest/libraries/hello_fail.rb +0 -10
  11. data/examples/fetchtest/parsers/failed.rb +0 -2
  12. data/examples/fetchtest/parsers/find_outputs.rb +0 -18
  13. data/examples/fetchtest/parsers/home.rb +0 -50
  14. data/examples/fetchtest/parsers/nested_fail.rb +0 -3
  15. data/examples/fetchtest/parsers/simple.rb +0 -14
  16. data/examples/fetchtest/seeders/csv_seeder.rb +0 -12
  17. data/examples/fetchtest/seeders/failed.rb +0 -1
  18. data/examples/fetchtest/seeders/list_of_urls.csv +0 -5
  19. data/examples/fetchtest/seeders/seed.rb +0 -28
  20. data/examples/fetchtest/seeders/test_reset_page.rb +0 -4
  21. data/lib/answersengine/cli.rb +0 -45
  22. data/lib/answersengine/cli/env_var.rb +0 -48
  23. data/lib/answersengine/cli/finisher.rb +0 -40
  24. data/lib/answersengine/cli/global_page.rb +0 -39
  25. data/lib/answersengine/cli/job.rb +0 -30
  26. data/lib/answersengine/cli/job_output.rb +0 -69
  27. data/lib/answersengine/cli/parser.rb +0 -64
  28. data/lib/answersengine/cli/scraper.rb +0 -185
  29. data/lib/answersengine/cli/scraper_deployment.rb +0 -24
  30. data/lib/answersengine/cli/scraper_export.rb +0 -51
  31. data/lib/answersengine/cli/scraper_exporter.rb +0 -40
  32. data/lib/answersengine/cli/scraper_finisher.rb +0 -20
  33. data/lib/answersengine/cli/scraper_job.rb +0 -75
  34. data/lib/answersengine/cli/scraper_job_var.rb +0 -48
  35. data/lib/answersengine/cli/scraper_page.rb +0 -203
  36. data/lib/answersengine/cli/scraper_var.rb +0 -48
  37. data/lib/answersengine/cli/seeder.rb +0 -40
  38. data/lib/answersengine/client.rb +0 -29
  39. data/lib/answersengine/client/auth_token.rb +0 -50
  40. data/lib/answersengine/client/backblaze_content.rb +0 -45
  41. data/lib/answersengine/client/base.rb +0 -55
  42. data/lib/answersengine/client/deploy_key.rb +0 -21
  43. data/lib/answersengine/client/env_var.rb +0 -28
  44. data/lib/answersengine/client/export.rb +0 -10
  45. data/lib/answersengine/client/global_page.rb +0 -18
  46. data/lib/answersengine/client/job.rb +0 -64
  47. data/lib/answersengine/client/job_export.rb +0 -10
  48. data/lib/answersengine/client/job_log.rb +0 -26
  49. data/lib/answersengine/client/job_output.rb +0 -19
  50. data/lib/answersengine/client/job_page.rb +0 -58
  51. data/lib/answersengine/client/job_stat.rb +0 -16
  52. data/lib/answersengine/client/scraper.rb +0 -57
  53. data/lib/answersengine/client/scraper_deployment.rb +0 -18
  54. data/lib/answersengine/client/scraper_export.rb +0 -22
  55. data/lib/answersengine/client/scraper_exporter.rb +0 -14
  56. data/lib/answersengine/client/scraper_finisher.rb +0 -16
  57. data/lib/answersengine/client/scraper_job.rb +0 -49
  58. data/lib/answersengine/client/scraper_job_output.rb +0 -19
  59. data/lib/answersengine/client/scraper_job_page.rb +0 -67
  60. data/lib/answersengine/client/scraper_job_var.rb +0 -28
  61. data/lib/answersengine/client/scraper_var.rb +0 -28
  62. data/lib/answersengine/plugin.rb +0 -6
  63. data/lib/answersengine/plugin/context_exposer.rb +0 -55
  64. data/lib/answersengine/scraper.rb +0 -18
  65. data/lib/answersengine/scraper/executor.rb +0 -373
  66. data/lib/answersengine/scraper/finisher.rb +0 -18
  67. data/lib/answersengine/scraper/parser.rb +0 -18
  68. data/lib/answersengine/scraper/ruby_finisher_executor.rb +0 -116
  69. data/lib/answersengine/scraper/ruby_parser_executor.rb +0 -200
  70. data/lib/answersengine/scraper/ruby_seeder_executor.rb +0 -120
  71. data/lib/answersengine/scraper/seeder.rb +0 -18
  72. data/lib/answersengine/version.rb +0 -3
@@ -1,12 +0,0 @@
1
- CSV.foreach("./seeders/list_of_urls.csv",:headers => true) do |row|
2
- pages << {
3
- url: row['url'],
4
- page_type: row['page_type'],
5
- vars: {"abc":[1,2,3], "def": "defcontent"}
6
- }
7
-
8
- # Save pages to the job partially if record counts will be too large
9
- max_records = 100
10
- save_pages(pages) if $. % max_records == 0
11
- end
12
-
@@ -1 +0,0 @@
1
- raise "fail from seeder"
@@ -1,5 +0,0 @@
1
- url,page_type
2
- http://fetchtest.datahen.com,home
3
- http://fetchtest.datahen.com/statuses/200,statuses
4
- http://fetchtest.datahen.com/statuses/200?q=1,statuses
5
- http://fetchtest.datahen.com/statuses/200?q=2,statuses
@@ -1,28 +0,0 @@
1
- puts "hello from seeder"
2
-
3
- pages << {
4
- url: "http://fetchtest.datahen.com/statuses/200?q=queuedFromParser",
5
- vars: {"abc":[1], "def": "defcontent"}
6
- }
7
-
8
- pages << {
9
- url: "http://fetchtest.datahen.com/statuses/200?q=queuedFromParser2",
10
- vars: {"abc":[2], "def": "defcontent"}
11
- }
12
-
13
- save_pages(pages)
14
-
15
- pages << {
16
- url: "http://fetchtest.datahen.com/statuses/200?q=queuedFromParser3",
17
- vars: {"abc":[3], "def": "defcontent"}
18
- }
19
-
20
- pages << {
21
- url: "http://fetchtest.datahen.com/statuses/200?q=queuedFromParser4",
22
- vars: {"abc":[3], "def": "defcontent"}
23
- }
24
-
25
- pages << {
26
- url: "http://fetchtest.datahen.com/statuses/200?q=queuedFromParser5",
27
- vars: {"abc":[3], "def": "defcontent"}
28
- }
@@ -1,4 +0,0 @@
1
- pages << {
2
- url: "http://fetchtest.datahen.com/statuses/200?q=queuedFromParser",
3
- reset: true
4
- }
@@ -1,45 +0,0 @@
1
- require 'thor'
2
- require 'answersengine/scraper'
3
- require 'answersengine/cli/scraper_var'
4
- require 'answersengine/cli/scraper_exporter'
5
- require 'answersengine/cli/scraper_export'
6
- require 'answersengine/cli/scraper_job_var'
7
- require 'answersengine/cli/scraper_job'
8
- require 'answersengine/cli/scraper_finisher'
9
- require 'answersengine/cli/global_page'
10
- require 'answersengine/cli/scraper_page'
11
- require 'answersengine/cli/job_output'
12
- require 'answersengine/cli/job'
13
- require 'answersengine/cli/scraper_deployment'
14
- require 'answersengine/cli/scraper'
15
- require 'answersengine/cli/parser'
16
- require 'answersengine/cli/seeder'
17
- require 'answersengine/cli/finisher'
18
- require 'answersengine/cli/env_var'
19
-
20
-
21
-
22
- module AnswersEngine
23
- class CLI < Thor
24
- desc "scraper SUBCOMMAND ...ARGS", "manage scrapers"
25
- subcommand "scraper", Scraper
26
-
27
- desc "job SUBCOMMAND ...ARGS", "manage scrapers jobs"
28
- subcommand "job", Job
29
-
30
- desc "globalpage SUBCOMMAND ...ARGS", "interacts with global page"
31
- subcommand "globalpage", GlobalPage
32
-
33
- desc "parser SUBCOMMAND ...ARGS", "for parsing related activities"
34
- subcommand "parser", Parser
35
-
36
- desc "seeder SUBCOMMAND ...ARGS", "for seeding related activities"
37
- subcommand "seeder", Seeder
38
-
39
- desc "seeder SUBCOMMAND ...ARGS", "for seeding related activities"
40
- subcommand "finisher", Finisher
41
-
42
- desc "var SUBCOMMAND ...ARGS", "for environment variable related activities"
43
- subcommand "var", EnvVar
44
- end
45
- end
@@ -1,48 +0,0 @@
1
- module AnswersEngine
2
- class CLI < Thor
3
- class EnvVar < Thor
4
- desc "list", "List environment variables on the account"
5
-
6
- long_desc <<-LONGDESC
7
- List all environment variables on the account.
8
- LONGDESC
9
- option :page, :aliases => :p, type: :numeric, desc: 'Get the next set of records by page.'
10
- option :per_page, :aliases => :P, type: :numeric, desc: 'Number of records per page. Max 500 per page.'
11
- def list
12
- client = Client::EnvVar.new(options)
13
- puts "#{client.all}"
14
- end
15
-
16
- desc "set <name> <value>", "Set an environment var on the account"
17
- long_desc <<-LONGDESC
18
- Creates an environment variable\x5
19
- <name>: Var name can only consist of alphabets, numbers, underscores. Name must be unique to your account, otherwise it will be overwritten.\x5
20
- <value>: Value of variable.\x5
21
- LONGDESC
22
- option :secret, type: :boolean, desc: 'Set true to make it decrypt the value. Default: false'
23
- def set(name, value)
24
- # puts "options #{options}"
25
- client = Client::EnvVar.new(options)
26
- puts "#{client.set(name, value, options)}"
27
- end
28
-
29
- desc "show <name>", "Show an environment variable on the account"
30
- def show(name)
31
- client = Client::EnvVar.new(options)
32
- puts "#{client.find(name)}"
33
- end
34
-
35
- desc "unset <name>", "Deletes an environment variable on the account"
36
- def unset(name)
37
- client = Client::EnvVar.new(options)
38
- puts "#{client.unset(name)}"
39
- end
40
-
41
-
42
-
43
-
44
-
45
- end
46
- end
47
-
48
- end
@@ -1,40 +0,0 @@
1
- module AnswersEngine
2
- class CLI < Thor
3
- class Finisher < Thor
4
- desc "try <scraper_name> <finisher_file>", "Tries a finisher file"
5
- long_desc <<-LONGDESC
6
- Takes a finisher script and tries to execute it without saving anything.\x5
7
- <seeder_file>: Finisher script file will be executed.\x5
8
- LONGDESC
9
- option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
10
- def try_finisher(scraper_name, finisher_file)
11
- if options[:job]
12
- job_id = options[:job]
13
- else
14
- job = Client::ScraperJob.new(options).find(scraper_name)
15
- job_id = job['id']
16
- end
17
-
18
- puts AnswersEngine::Scraper::Finisher.exec_finisher(finisher_file, job_id, false)
19
- end
20
-
21
- desc "exec <scraper_name> <finisher_file>", "Executes a finisher script onto a scraper's current job."
22
- long_desc <<-LONGDESC
23
- Takes a finisher script and execute it against a job and save outputs into the scraper's current job\x5
24
- <finisher_file>: Finisher script file that will be executed on the scraper's current job.\x5
25
- LONGDESC
26
- option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
27
- def exec_parse(scraper_name, finisher_file)
28
- if options[:job]
29
- job_id = options[:job]
30
- else
31
- job = Client::ScraperJob.new(options).find(scraper_name)
32
- job_id = job['id']
33
- end
34
-
35
- puts AnswersEngine::Scraper::Finisher.exec_finisher(finisher_file, job_id, true)
36
- end
37
- end
38
- end
39
-
40
- end
@@ -1,39 +0,0 @@
1
- module AnswersEngine
2
- class CLI < Thor
3
- class GlobalPage < Thor
4
-
5
- desc "show <gid>", "Show a global page"
6
- def show(gid)
7
- client = Client::GlobalPage.new(options)
8
- puts "#{client.find(gid)}"
9
- end
10
-
11
- desc "content <gid>", "Show content of a globalpage"
12
- def content(gid)
13
- client = Client::GlobalPage.new(options)
14
- result = JSON.parse(client.find_content(gid).to_s)
15
-
16
- if result['available'] == true
17
- puts "Preview content url: \"#{result['preview_url']}\""
18
- `open "#{result['preview_url']}"`
19
- else
20
- puts "Content does not exist"
21
- end
22
- end
23
-
24
- desc "failedcontent <gid>", "Show failed content of a globalpage"
25
- def failedcontent(gid)
26
- client = Client::GlobalPage.new(options)
27
- result = JSON.parse(client.find_failed_content(gid).to_s)
28
-
29
- if result['available'] == true
30
- puts "Preview failed content url: \"#{result['preview_url']}\""
31
- `open "#{result['preview_url']}"`
32
- else
33
- puts "Failed Content does not exist"
34
- end
35
- end
36
-
37
- end
38
- end
39
- end
@@ -1,30 +0,0 @@
1
- module AnswersEngine
2
- class CLI < Thor
3
- class Job < Thor
4
- package_name "job"
5
- def self.banner(command, namespace = nil, subcommand = false)
6
- "#{basename} #{@package_name} #{command.usage}"
7
- end
8
-
9
-
10
- desc "list", "gets a list of jobs"
11
- option :page, :aliases => :p, type: :numeric, desc: 'Get the next set of records by page.'
12
- option :per_page, :aliases => :P, type: :numeric, desc: 'Number of records per page. Max 500 per page.'
13
- long_desc <<-LONGDESC
14
- List scrape jobs.
15
- LONGDESC
16
- def list()
17
- client = Client::Job.new(options)
18
- puts "#{client.all()}"
19
- end
20
-
21
- desc "show <job_id>", "Show a job"
22
- def show(job_id)
23
- client = Client::Job.new(options)
24
- puts "#{client.find(job_id)}"
25
- end
26
-
27
- end
28
- end
29
-
30
- end
@@ -1,69 +0,0 @@
1
- module AnswersEngine
2
- class CLI < Thor
3
- class JobOutput < Thor
4
-
5
- package_name "scraper output"
6
- def self.banner(command, namespace = nil, subcommand = false)
7
- "#{basename} #{@package_name} #{command.usage}"
8
- end
9
-
10
- desc "list <scraper_name>", "List output records in a collection that is in the current job"
11
- long_desc <<-LONGDESC
12
- List all output records in a collection that is in the current job of a scraper\n
13
- LONGDESC
14
- option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
15
- option :page, :aliases => :p, type: :numeric, desc: 'Get the next set of records by page.'
16
- option :per_page, :aliases => :P, type: :numeric, desc: 'Number of records per page. Max 500 per page.'
17
- option :collection, :aliases => :c, desc: "Shows outputs from a specific collection.(defaults to 'default' collection)"
18
- option :query, :aliases => :q, type: :string, banner: :JSON, desc: 'Set search query. Must be in json format. i.e: {"Foo":"bar"} '
19
- def list(scraper_name)
20
- collection = options.fetch(:collection) { 'default' }
21
- if options[:job]
22
- client = Client::JobOutput.new(options)
23
- puts "#{client.all(options[:job], collection)}"
24
- else
25
- client = Client::ScraperJobOutput.new(options)
26
- puts "#{client.all(scraper_name, collection)}"
27
- end
28
- end
29
-
30
- desc "show <scraper_name> <record_id>", "Show one output record in a collection that is in the current job of a scraper"
31
- long_desc <<-LONGDESC
32
- Shows an output record in a collection that is in the current job of a scraper\n
33
- <record_id>: ID of the output record.\x5
34
- LONGDESC
35
- option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
36
- option :collection, :aliases => :c, desc: "Shows output from a specific collection.(defaults to 'default' collection)"
37
- def show(scraper_name, id)
38
- collection = options.fetch(:collection) { 'default' }
39
- if options[:job]
40
- client = Client::JobOutput.new(options)
41
- puts "#{client.find(options[:job], collection, id)}"
42
- else
43
- client = Client::ScraperJobOutput.new(options)
44
- puts "#{client.find(scraper_name, collection, id)}"
45
- end
46
- end
47
-
48
- desc "collections <scraper_name>", "list job output collections that are inside a current job of a scraper."
49
- long_desc <<-LONGDESC
50
- List job output collections that are inside a current job of a scraper.\x5
51
- LONGDESC
52
- option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
53
- option :page, :aliases => :p, type: :numeric, desc: 'Get the next set of records by page.'
54
- option :per_page, :aliases => :P, type: :numeric, desc: 'Number of records per page. Max 500 per page.'
55
- def collections(scraper_name)
56
-
57
- if options[:job]
58
- client = Client::JobOutput.new(options)
59
- puts "#{client.collections(options[:job])}"
60
- else
61
- client = Client::ScraperJobOutput.new(options)
62
- puts "#{client.collections(scraper_name)}"
63
- end
64
- end
65
-
66
- end
67
- end
68
-
69
- end
@@ -1,64 +0,0 @@
1
- module AnswersEngine
2
- class CLI < Thor
3
- class Parser < Thor
4
- desc "try <scraper_name> <parser_file> <GID>", "Tries a parser on a Job Page"
5
- long_desc <<-LONGDESC
6
- Takes a parser script and runs it against a job page\x5
7
- <parser_file>: Parser script file that will be executed on the page.\x5
8
- <GID>: Global ID of the page.\x5
9
- LONGDESC
10
- option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
11
- option :global, :aliases => :g, type: :boolean, default: false, desc: 'Use globalpage instead of a job page'
12
- option :vars, :aliases => :v, type: :string, desc: 'Set user-defined page variables. Must be in json format. i.e: {"Foo":"bar"}'
13
- def try_parse(scraper_name, parser_file, gid)
14
- begin
15
-
16
- if options[:job]
17
- job_id = options[:job]
18
- elsif options[:global]
19
- job_id = nil
20
- else
21
- job = Client::ScraperJob.new(options).find(scraper_name)
22
- job_id = job['id']
23
- end
24
-
25
-
26
- vars = JSON.parse(options[:vars]) if options[:vars]
27
- puts AnswersEngine::Scraper::Parser.exec_parser_page(parser_file, gid, job_id, false, vars)
28
-
29
- rescue JSON::ParserError
30
- if options[:vars]
31
- puts "Error: #{options[:vars]} on vars is not a valid JSON"
32
- end
33
- end
34
- end
35
-
36
- desc "exec <scraper_name> <parser_file> <GID>...<GID>", "Executes a parser script on one or more Job Pages within a scraper's current job"
37
- long_desc <<-LONGDESC
38
- Takes a parser script executes it against a job page(s) and save the output to the scraper's current job\x5
39
- <parser_file>: Parser script file will be executed on the page.\x5
40
- <GID>: Global ID of the page.\x5
41
- LONGDESC
42
- option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
43
- def exec_parse(scraper_name, parser_file, *gids)
44
- gids.each do |gid|
45
- begin
46
- puts "Parsing #{gid}"
47
-
48
- if options[:job]
49
- job_id = options[:job]
50
- else
51
- job = Client::ScraperJob.new(options).find(scraper_name)
52
- job_id = job['id']
53
- end
54
-
55
- puts AnswersEngine::Scraper::Parser.exec_parser_page(parser_file, gid, job_id, true)
56
- rescue => e
57
- puts e
58
- end
59
- end
60
- end
61
- end
62
- end
63
-
64
- end
@@ -1,185 +0,0 @@
1
- module AnswersEngine
2
- class CLI < Thor
3
- class Scraper < Thor
4
- desc "list", "List scrapers"
5
-
6
- long_desc <<-LONGDESC
7
- List all scrapers.
8
- LONGDESC
9
- option :page, :aliases => :p, type: :numeric, desc: 'Get the next set of records by page.'
10
- option :per_page, :aliases => :P, type: :numeric, desc: 'Number of records per page. Max 500 per page.'
11
- def list
12
- client = Client::Scraper.new(options)
13
- puts "#{client.all}"
14
- end
15
-
16
- desc "create <scraper_name> <git_repository>", "Create a scraper"
17
- long_desc <<-LONGDESC
18
- Creates a scraper\x5
19
- <scraper_name>: Scraper name can only consist of alphabets, numbers, underscores and dashes. Name must be unique to your account.\x5
20
- <git_repository>: URL to a valid Git repository.\x5
21
- LONGDESC
22
- option :branch, :aliases => :b, desc: 'Set the Git branch to use. Default: master'
23
- option :freshness_type, :aliases => :t, desc: 'Set how fresh the page cache is. Possible values: day, week, month, year. Default: any'
24
- option :proxy_type, desc: 'Set the Proxy type. Default: standard'
25
- option :force_fetch, :aliases => :f, type: :boolean, desc: 'Set true to force fetch page that is not within freshness criteria. Default: false'
26
- option :workers, :aliases => :w, type: :numeric, desc: 'Set how many standard workers to use. Default: 1'
27
- option :browsers, type: :numeric, desc: 'Set how many browser workers to use. Default: 0'
28
- option :disable_scheduler, type: :boolean, desc: 'Set true to disable scheduler. Default: false'
29
- option :cancel_current_job, type: :boolean, desc: 'Set true to cancel currently active job if scheduler starts. Default: false'
30
- option :schedule, type: :string, desc: 'Set the schedule of the scraper to run. Must be in CRON format.'
31
- option :timezone, type: :string, desc: "Set the scheduler's timezone. Must be in IANA Timezone format. Defaults to \"America/Toronto\""
32
- def create(scraper_name, git_repository)
33
- # puts "options #{options}"
34
- client = Client::Scraper.new(options)
35
- puts "#{client.create(scraper_name, git_repository, options)}"
36
- end
37
-
38
- desc "update <scraper_name>", "Update a scraper"
39
- long_desc <<-LONGDESC
40
- Updates a scraper\x5
41
- LONGDESC
42
- option :branch, :aliases => :b, desc: 'Set the Git branch to use. Default: master'
43
- option :name, :aliases => :n, desc: 'Set the scraper name. Name can only consist of alphabets, numbers, underscores and dashes. Name must be unique to your account'
44
- option :repo, :aliases => :r, desc: 'Set the URL to a valid Git repository'
45
- option :freshness_type, :aliases => :t, desc: 'Set how fresh the page cache is. Possible values: day, week, month, year. Default: any'
46
- option :proxy_type, desc: 'Set the Proxy type. Default: standard'
47
- option :force_fetch, :aliases => :f, type: :boolean, desc: 'Set true to force fetch page that is not within freshness criteria. Default: false'
48
- option :workers, :aliases => :w, type: :numeric, desc: 'Set how many standard workers to use. Default: 1'
49
- option :browsers, type: :numeric, desc: 'Set how many browser workers to use. Default: 0'
50
- option :disable_scheduler, type: :boolean, desc: 'Set true to disable scheduler. Default: false'
51
- option :cancel_current_job, type: :boolean, desc: 'Set true to cancel currently active job if scheduler starts. Default: false'
52
- option :schedule, type: :string, desc: 'Set the schedule of the scraper to run. Must be in CRON format.'
53
- option :timezone, type: :string, desc: "Set the scheduler's timezone. Must be in IANA Timezone format. Defaults to \"America/Toronto\""
54
- def update(scraper_name)
55
- client = Client::Scraper.new(options)
56
- puts "#{client.update(scraper_name, options)}"
57
- end
58
-
59
-
60
- desc "show <scraper_name>", "Show a scraper"
61
- def show(scraper_name)
62
- client = Client::Scraper.new(options)
63
- puts "#{client.find(scraper_name)}"
64
- end
65
-
66
- desc "delete <scraper_name>", "Delete a scraper and related records"
67
- def delete(scraper_name)
68
- client = Client::Scraper.new(options)
69
- puts "#{client.delete(scraper_name)}"
70
- end
71
-
72
-
73
- desc "deploy <scraper_name>", "Deploy a scraper"
74
- long_desc <<-LONGDESC
75
- Deploys a scraper
76
- LONGDESC
77
- def deploy(scraper_name)
78
- client = Client::ScraperDeployment.new()
79
- puts "Deploying scraper. This may take a while..."
80
- puts "#{client.deploy(scraper_name)}"
81
- end
82
-
83
- desc "start <scraper_name>", "Creates a scraping job and runs it"
84
- long_desc <<-LONGDESC
85
- Starts a scraper by creating an active scrape job\x5
86
- LONGDESC
87
- option :workers, :aliases => :w, type: :numeric, desc: 'Set how many standard workers to use. Default: 1'
88
- option :browsers, type: :numeric, desc: 'Set how many browser workers to use. Default: 0'
89
- option :proxy_type, desc: 'Set the Proxy type. Default: standard'
90
- def start(scraper_name)
91
- client = Client::ScraperJob.new(options)
92
- puts "Starting a scrape job..."
93
- puts "#{client.create(scraper_name, options)}"
94
- end
95
-
96
-
97
- desc "log <scraper_name>", "List log entries related to a scraper's current job"
98
- long_desc <<-LONGDESC
99
- Shows log related to a scraper's current job. Defaults to showing the most recent entries\x5
100
- LONGDESC
101
- option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
102
- option :head, :aliases => :H, desc: 'Show the oldest log entries. If not set, newest entries is shown'
103
- option :parsing, :aliases => :p, type: :boolean, desc: 'Show only log entries related to parsing errors'
104
- option :seeding, :aliases => :s, type: :boolean, desc: 'Show only log entries related to seeding errors'
105
- option :more, :aliases => :m, desc: 'Show next set of log entries. Enter the `More token`'
106
- option :per_page, :aliases => :P, type: :numeric, desc: 'Number of records per page. Max 5000 per page.'
107
- def log(scraper_name)
108
- client = Client::JobLog.new(options)
109
-
110
- query = {}
111
- query["order"] = options.delete(:head) if options[:head]
112
- query["job_type"] = "parsing" if options[:parsing]
113
- query["job_type"] = "seeding" if options[:seeding]
114
- query["page_token"] = options.delete(:more) if options[:more]
115
- query["per_page"] = options.delete(:per_page) if options[:per_page]
116
-
117
- puts "Fetching logs..."
118
-
119
- if options[:job]
120
- result = client.all_job_log(options[:job], {query: query})
121
- else
122
- result = client.scraper_all_job_log(scraper_name, {query: query})
123
- end
124
-
125
- if result['entries'].nil? || result["entries"].length == 0
126
- puts "No logs yet, please try again later."
127
- else
128
- more_token = result["more_token"]
129
-
130
- result["entries"].each do |entry|
131
- puts "#{entry["timestamp"]} #{entry["severity"]}: #{entry["payload"]}" if entry.is_a?(Hash)
132
- end
133
-
134
- unless more_token.nil?
135
- puts "-----------"
136
- puts "To see more entries, add: \"--more #{more_token}\""
137
- end
138
- end
139
- end
140
-
141
- desc "stats <scraper_name>", "Get the current stat for a job"
142
- long_desc <<-LONGDESC
143
- Get stats for a scraper's current job\n
144
- LONGDESC
145
- option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
146
- def stats(scraper_name)
147
- client = Client::JobStat.new(options)
148
- if options[:job]
149
- puts "#{client.job_current_stats(options[:job])}"
150
- else
151
- puts "#{client.scraper_job_current_stats(scraper_name)}"
152
- end
153
-
154
- end
155
-
156
-
157
- desc "job SUBCOMMAND ...ARGS", "manage scrapers jobs"
158
- subcommand "job", ScraperJob
159
-
160
- desc "deployment SUBCOMMAND ...ARGS", "manage scrapers deployments"
161
- subcommand "deployment", ScraperDeployment
162
-
163
- desc "finisher SUBCOMMAND ...ARGS", "manage scrapers finishers"
164
- subcommand "finisher", ScraperFinisher
165
-
166
- desc "output SUBCOMMAND ...ARGS", "view scraper outputs"
167
- subcommand "output", JobOutput
168
-
169
- desc "page SUBCOMMAND ...ARGS", "manage pages on a job"
170
- subcommand "page", ScraperPage
171
-
172
- desc "export SUBCOMMAND ...ARGS", "manage scraper's exports"
173
- subcommand "export", ScraperExport
174
-
175
- desc "exporter SUBCOMMAND ...ARGS", "manage scraper's exporters"
176
- subcommand "exporter", ScraperExporter
177
-
178
- desc "var SUBCOMMAND ...ARGS", "for managing scraper's variables"
179
- subcommand "var", ScraperVar
180
-
181
-
182
- end
183
- end
184
-
185
- end