answersengine 0.10.1 → 0.10.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72)
  1. checksums.yaml +4 -4
  2. data/CODE_OF_CONDUCT.md +1 -1
  3. data/LICENSE.txt +1 -1
  4. data/README.md +3 -4
  5. data/answersengine.gemspec +6 -12
  6. data/exe/answersengine +3 -2
  7. data/lib/answersengine.rb +20 -3
  8. metadata +14 -152
  9. data/examples/fetchtest/libraries/hello.rb +0 -9
  10. data/examples/fetchtest/libraries/hello_fail.rb +0 -10
  11. data/examples/fetchtest/parsers/failed.rb +0 -2
  12. data/examples/fetchtest/parsers/find_outputs.rb +0 -18
  13. data/examples/fetchtest/parsers/home.rb +0 -50
  14. data/examples/fetchtest/parsers/nested_fail.rb +0 -3
  15. data/examples/fetchtest/parsers/simple.rb +0 -14
  16. data/examples/fetchtest/seeders/csv_seeder.rb +0 -12
  17. data/examples/fetchtest/seeders/failed.rb +0 -1
  18. data/examples/fetchtest/seeders/list_of_urls.csv +0 -5
  19. data/examples/fetchtest/seeders/seed.rb +0 -28
  20. data/examples/fetchtest/seeders/test_reset_page.rb +0 -4
  21. data/lib/answersengine/cli.rb +0 -45
  22. data/lib/answersengine/cli/env_var.rb +0 -48
  23. data/lib/answersengine/cli/finisher.rb +0 -40
  24. data/lib/answersengine/cli/global_page.rb +0 -39
  25. data/lib/answersengine/cli/job.rb +0 -30
  26. data/lib/answersengine/cli/job_output.rb +0 -69
  27. data/lib/answersengine/cli/parser.rb +0 -64
  28. data/lib/answersengine/cli/scraper.rb +0 -185
  29. data/lib/answersengine/cli/scraper_deployment.rb +0 -24
  30. data/lib/answersengine/cli/scraper_export.rb +0 -51
  31. data/lib/answersengine/cli/scraper_exporter.rb +0 -40
  32. data/lib/answersengine/cli/scraper_finisher.rb +0 -20
  33. data/lib/answersengine/cli/scraper_job.rb +0 -75
  34. data/lib/answersengine/cli/scraper_job_var.rb +0 -48
  35. data/lib/answersengine/cli/scraper_page.rb +0 -203
  36. data/lib/answersengine/cli/scraper_var.rb +0 -48
  37. data/lib/answersengine/cli/seeder.rb +0 -40
  38. data/lib/answersengine/client.rb +0 -29
  39. data/lib/answersengine/client/auth_token.rb +0 -50
  40. data/lib/answersengine/client/backblaze_content.rb +0 -45
  41. data/lib/answersengine/client/base.rb +0 -55
  42. data/lib/answersengine/client/deploy_key.rb +0 -21
  43. data/lib/answersengine/client/env_var.rb +0 -28
  44. data/lib/answersengine/client/export.rb +0 -10
  45. data/lib/answersengine/client/global_page.rb +0 -18
  46. data/lib/answersengine/client/job.rb +0 -64
  47. data/lib/answersengine/client/job_export.rb +0 -10
  48. data/lib/answersengine/client/job_log.rb +0 -26
  49. data/lib/answersengine/client/job_output.rb +0 -19
  50. data/lib/answersengine/client/job_page.rb +0 -58
  51. data/lib/answersengine/client/job_stat.rb +0 -16
  52. data/lib/answersengine/client/scraper.rb +0 -57
  53. data/lib/answersengine/client/scraper_deployment.rb +0 -18
  54. data/lib/answersengine/client/scraper_export.rb +0 -22
  55. data/lib/answersengine/client/scraper_exporter.rb +0 -14
  56. data/lib/answersengine/client/scraper_finisher.rb +0 -16
  57. data/lib/answersengine/client/scraper_job.rb +0 -49
  58. data/lib/answersengine/client/scraper_job_output.rb +0 -19
  59. data/lib/answersengine/client/scraper_job_page.rb +0 -67
  60. data/lib/answersengine/client/scraper_job_var.rb +0 -28
  61. data/lib/answersengine/client/scraper_var.rb +0 -28
  62. data/lib/answersengine/plugin.rb +0 -6
  63. data/lib/answersengine/plugin/context_exposer.rb +0 -55
  64. data/lib/answersengine/scraper.rb +0 -18
  65. data/lib/answersengine/scraper/executor.rb +0 -373
  66. data/lib/answersengine/scraper/finisher.rb +0 -18
  67. data/lib/answersengine/scraper/parser.rb +0 -18
  68. data/lib/answersengine/scraper/ruby_finisher_executor.rb +0 -116
  69. data/lib/answersengine/scraper/ruby_parser_executor.rb +0 -200
  70. data/lib/answersengine/scraper/ruby_seeder_executor.rb +0 -120
  71. data/lib/answersengine/scraper/seeder.rb +0 -18
  72. data/lib/answersengine/version.rb +0 -3
@@ -1,12 +0,0 @@
1
- CSV.foreach("./seeders/list_of_urls.csv",:headers => true) do |row|
2
- pages << {
3
- url: row['url'],
4
- page_type: row['page_type'],
5
- vars: {"abc":[1,2,3], "def": "defcontent"}
6
- }
7
-
8
- # Save pages to the job partially if record counts will be too large
9
- max_records = 100
10
- save_pages(pages) if $. % max_records == 0
11
- end
12
-
@@ -1 +0,0 @@
1
- raise "fail from seeder"
@@ -1,5 +0,0 @@
1
- url,page_type
2
- http://fetchtest.datahen.com,home
3
- http://fetchtest.datahen.com/statuses/200,statuses
4
- http://fetchtest.datahen.com/statuses/200?q=1,statuses
5
- http://fetchtest.datahen.com/statuses/200?q=2,statuses
@@ -1,28 +0,0 @@
1
- puts "hello from seeder"
2
-
3
- pages << {
4
- url: "http://fetchtest.datahen.com/statuses/200?q=queuedFromParser",
5
- vars: {"abc":[1], "def": "defcontent"}
6
- }
7
-
8
- pages << {
9
- url: "http://fetchtest.datahen.com/statuses/200?q=queuedFromParser2",
10
- vars: {"abc":[2], "def": "defcontent"}
11
- }
12
-
13
- save_pages(pages)
14
-
15
- pages << {
16
- url: "http://fetchtest.datahen.com/statuses/200?q=queuedFromParser3",
17
- vars: {"abc":[3], "def": "defcontent"}
18
- }
19
-
20
- pages << {
21
- url: "http://fetchtest.datahen.com/statuses/200?q=queuedFromParser4",
22
- vars: {"abc":[3], "def": "defcontent"}
23
- }
24
-
25
- pages << {
26
- url: "http://fetchtest.datahen.com/statuses/200?q=queuedFromParser5",
27
- vars: {"abc":[3], "def": "defcontent"}
28
- }
@@ -1,4 +0,0 @@
1
- pages << {
2
- url: "http://fetchtest.datahen.com/statuses/200?q=queuedFromParser",
3
- reset: true
4
- }
@@ -1,45 +0,0 @@
1
- require 'thor'
2
- require 'answersengine/scraper'
3
- require 'answersengine/cli/scraper_var'
4
- require 'answersengine/cli/scraper_exporter'
5
- require 'answersengine/cli/scraper_export'
6
- require 'answersengine/cli/scraper_job_var'
7
- require 'answersengine/cli/scraper_job'
8
- require 'answersengine/cli/scraper_finisher'
9
- require 'answersengine/cli/global_page'
10
- require 'answersengine/cli/scraper_page'
11
- require 'answersengine/cli/job_output'
12
- require 'answersengine/cli/job'
13
- require 'answersengine/cli/scraper_deployment'
14
- require 'answersengine/cli/scraper'
15
- require 'answersengine/cli/parser'
16
- require 'answersengine/cli/seeder'
17
- require 'answersengine/cli/finisher'
18
- require 'answersengine/cli/env_var'
19
-
20
-
21
-
22
- module AnswersEngine
23
- class CLI < Thor
24
- desc "scraper SUBCOMMAND ...ARGS", "manage scrapers"
25
- subcommand "scraper", Scraper
26
-
27
- desc "job SUBCOMMAND ...ARGS", "manage scrapers jobs"
28
- subcommand "job", Job
29
-
30
- desc "globalpage SUBCOMMAND ...ARGS", "interacts with global page"
31
- subcommand "globalpage", GlobalPage
32
-
33
- desc "parser SUBCOMMAND ...ARGS", "for parsing related activities"
34
- subcommand "parser", Parser
35
-
36
- desc "seeder SUBCOMMAND ...ARGS", "for seeding related activities"
37
- subcommand "seeder", Seeder
38
-
39
- desc "seeder SUBCOMMAND ...ARGS", "for seeding related activities"
40
- subcommand "finisher", Finisher
41
-
42
- desc "var SUBCOMMAND ...ARGS", "for environment variable related activities"
43
- subcommand "var", EnvVar
44
- end
45
- end
@@ -1,48 +0,0 @@
1
- module AnswersEngine
2
- class CLI < Thor
3
- class EnvVar < Thor
4
- desc "list", "List environment variables on the account"
5
-
6
- long_desc <<-LONGDESC
7
- List all environment variables on the account.
8
- LONGDESC
9
- option :page, :aliases => :p, type: :numeric, desc: 'Get the next set of records by page.'
10
- option :per_page, :aliases => :P, type: :numeric, desc: 'Number of records per page. Max 500 per page.'
11
- def list
12
- client = Client::EnvVar.new(options)
13
- puts "#{client.all}"
14
- end
15
-
16
- desc "set <name> <value>", "Set an environment var on the account"
17
- long_desc <<-LONGDESC
18
- Creates an environment variable\x5
19
- <name>: Var name can only consist of alphabets, numbers, underscores. Name must be unique to your account, otherwise it will be overwritten.\x5
20
- <value>: Value of variable.\x5
21
- LONGDESC
22
- option :secret, type: :boolean, desc: 'Set true to make it decrypt the value. Default: false'
23
- def set(name, value)
24
- # puts "options #{options}"
25
- client = Client::EnvVar.new(options)
26
- puts "#{client.set(name, value, options)}"
27
- end
28
-
29
- desc "show <name>", "Show an environment variable on the account"
30
- def show(name)
31
- client = Client::EnvVar.new(options)
32
- puts "#{client.find(name)}"
33
- end
34
-
35
- desc "unset <name>", "Deletes an environment variable on the account"
36
- def unset(name)
37
- client = Client::EnvVar.new(options)
38
- puts "#{client.unset(name)}"
39
- end
40
-
41
-
42
-
43
-
44
-
45
- end
46
- end
47
-
48
- end
@@ -1,40 +0,0 @@
1
- module AnswersEngine
2
- class CLI < Thor
3
- class Finisher < Thor
4
- desc "try <scraper_name> <finisher_file>", "Tries a finisher file"
5
- long_desc <<-LONGDESC
6
- Takes a finisher script and tries to execute it without saving anything.\x5
7
- <seeder_file>: Finisher script file will be executed.\x5
8
- LONGDESC
9
- option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
10
- def try_finisher(scraper_name, finisher_file)
11
- if options[:job]
12
- job_id = options[:job]
13
- else
14
- job = Client::ScraperJob.new(options).find(scraper_name)
15
- job_id = job['id']
16
- end
17
-
18
- puts AnswersEngine::Scraper::Finisher.exec_finisher(finisher_file, job_id, false)
19
- end
20
-
21
- desc "exec <scraper_name> <finisher_file>", "Executes a finisher script onto a scraper's current job."
22
- long_desc <<-LONGDESC
23
- Takes a finisher script and execute it against a job and save outputs into the scraper's current job\x5
24
- <finisher_file>: Finisher script file that will be executed on the scraper's current job.\x5
25
- LONGDESC
26
- option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
27
- def exec_parse(scraper_name, finisher_file)
28
- if options[:job]
29
- job_id = options[:job]
30
- else
31
- job = Client::ScraperJob.new(options).find(scraper_name)
32
- job_id = job['id']
33
- end
34
-
35
- puts AnswersEngine::Scraper::Finisher.exec_finisher(finisher_file, job_id, true)
36
- end
37
- end
38
- end
39
-
40
- end
@@ -1,39 +0,0 @@
1
- module AnswersEngine
2
- class CLI < Thor
3
- class GlobalPage < Thor
4
-
5
- desc "show <gid>", "Show a global page"
6
- def show(gid)
7
- client = Client::GlobalPage.new(options)
8
- puts "#{client.find(gid)}"
9
- end
10
-
11
- desc "content <gid>", "Show content of a globalpage"
12
- def content(gid)
13
- client = Client::GlobalPage.new(options)
14
- result = JSON.parse(client.find_content(gid).to_s)
15
-
16
- if result['available'] == true
17
- puts "Preview content url: \"#{result['preview_url']}\""
18
- `open "#{result['preview_url']}"`
19
- else
20
- puts "Content does not exist"
21
- end
22
- end
23
-
24
- desc "failedcontent <gid>", "Show failed content of a globalpage"
25
- def failedcontent(gid)
26
- client = Client::GlobalPage.new(options)
27
- result = JSON.parse(client.find_failed_content(gid).to_s)
28
-
29
- if result['available'] == true
30
- puts "Preview failed content url: \"#{result['preview_url']}\""
31
- `open "#{result['preview_url']}"`
32
- else
33
- puts "Failed Content does not exist"
34
- end
35
- end
36
-
37
- end
38
- end
39
- end
@@ -1,30 +0,0 @@
1
- module AnswersEngine
2
- class CLI < Thor
3
- class Job < Thor
4
- package_name "job"
5
- def self.banner(command, namespace = nil, subcommand = false)
6
- "#{basename} #{@package_name} #{command.usage}"
7
- end
8
-
9
-
10
- desc "list", "gets a list of jobs"
11
- option :page, :aliases => :p, type: :numeric, desc: 'Get the next set of records by page.'
12
- option :per_page, :aliases => :P, type: :numeric, desc: 'Number of records per page. Max 500 per page.'
13
- long_desc <<-LONGDESC
14
- List scrape jobs.
15
- LONGDESC
16
- def list()
17
- client = Client::Job.new(options)
18
- puts "#{client.all()}"
19
- end
20
-
21
- desc "show <job_id>", "Show a job"
22
- def show(job_id)
23
- client = Client::Job.new(options)
24
- puts "#{client.find(job_id)}"
25
- end
26
-
27
- end
28
- end
29
-
30
- end
@@ -1,69 +0,0 @@
1
- module AnswersEngine
2
- class CLI < Thor
3
- class JobOutput < Thor
4
-
5
- package_name "scraper output"
6
- def self.banner(command, namespace = nil, subcommand = false)
7
- "#{basename} #{@package_name} #{command.usage}"
8
- end
9
-
10
- desc "list <scraper_name>", "List output records in a collection that is in the current job"
11
- long_desc <<-LONGDESC
12
- List all output records in a collection that is in the current job of a scraper\n
13
- LONGDESC
14
- option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
15
- option :page, :aliases => :p, type: :numeric, desc: 'Get the next set of records by page.'
16
- option :per_page, :aliases => :P, type: :numeric, desc: 'Number of records per page. Max 500 per page.'
17
- option :collection, :aliases => :c, desc: "Shows outputs from a specific collection.(defaults to 'default' collection)"
18
- option :query, :aliases => :q, type: :string, banner: :JSON, desc: 'Set search query. Must be in json format. i.e: {"Foo":"bar"} '
19
- def list(scraper_name)
20
- collection = options.fetch(:collection) { 'default' }
21
- if options[:job]
22
- client = Client::JobOutput.new(options)
23
- puts "#{client.all(options[:job], collection)}"
24
- else
25
- client = Client::ScraperJobOutput.new(options)
26
- puts "#{client.all(scraper_name, collection)}"
27
- end
28
- end
29
-
30
- desc "show <scraper_name> <record_id>", "Show one output record in a collection that is in the current job of a scraper"
31
- long_desc <<-LONGDESC
32
- Shows an output record in a collection that is in the current job of a scraper\n
33
- <record_id>: ID of the output record.\x5
34
- LONGDESC
35
- option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
36
- option :collection, :aliases => :c, desc: "Shows output from a specific collection.(defaults to 'default' collection)"
37
- def show(scraper_name, id)
38
- collection = options.fetch(:collection) { 'default' }
39
- if options[:job]
40
- client = Client::JobOutput.new(options)
41
- puts "#{client.find(options[:job], collection, id)}"
42
- else
43
- client = Client::ScraperJobOutput.new(options)
44
- puts "#{client.find(scraper_name, collection, id)}"
45
- end
46
- end
47
-
48
- desc "collections <scraper_name>", "list job output collections that are inside a current job of a scraper."
49
- long_desc <<-LONGDESC
50
- List job output collections that are inside a current job of a scraper.\x5
51
- LONGDESC
52
- option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
53
- option :page, :aliases => :p, type: :numeric, desc: 'Get the next set of records by page.'
54
- option :per_page, :aliases => :P, type: :numeric, desc: 'Number of records per page. Max 500 per page.'
55
- def collections(scraper_name)
56
-
57
- if options[:job]
58
- client = Client::JobOutput.new(options)
59
- puts "#{client.collections(options[:job])}"
60
- else
61
- client = Client::ScraperJobOutput.new(options)
62
- puts "#{client.collections(scraper_name)}"
63
- end
64
- end
65
-
66
- end
67
- end
68
-
69
- end
@@ -1,64 +0,0 @@
1
- module AnswersEngine
2
- class CLI < Thor
3
- class Parser < Thor
4
- desc "try <scraper_name> <parser_file> <GID>", "Tries a parser on a Job Page"
5
- long_desc <<-LONGDESC
6
- Takes a parser script and runs it against a job page\x5
7
- <parser_file>: Parser script file that will be executed on the page.\x5
8
- <GID>: Global ID of the page.\x5
9
- LONGDESC
10
- option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
11
- option :global, :aliases => :g, type: :boolean, default: false, desc: 'Use globalpage instead of a job page'
12
- option :vars, :aliases => :v, type: :string, desc: 'Set user-defined page variables. Must be in json format. i.e: {"Foo":"bar"}'
13
- def try_parse(scraper_name, parser_file, gid)
14
- begin
15
-
16
- if options[:job]
17
- job_id = options[:job]
18
- elsif options[:global]
19
- job_id = nil
20
- else
21
- job = Client::ScraperJob.new(options).find(scraper_name)
22
- job_id = job['id']
23
- end
24
-
25
-
26
- vars = JSON.parse(options[:vars]) if options[:vars]
27
- puts AnswersEngine::Scraper::Parser.exec_parser_page(parser_file, gid, job_id, false, vars)
28
-
29
- rescue JSON::ParserError
30
- if options[:vars]
31
- puts "Error: #{options[:vars]} on vars is not a valid JSON"
32
- end
33
- end
34
- end
35
-
36
- desc "exec <scraper_name> <parser_file> <GID>...<GID>", "Executes a parser script on one or more Job Pages within a scraper's current job"
37
- long_desc <<-LONGDESC
38
- Takes a parser script executes it against a job page(s) and save the output to the scraper's current job\x5
39
- <parser_file>: Parser script file will be executed on the page.\x5
40
- <GID>: Global ID of the page.\x5
41
- LONGDESC
42
- option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
43
- def exec_parse(scraper_name, parser_file, *gids)
44
- gids.each do |gid|
45
- begin
46
- puts "Parsing #{gid}"
47
-
48
- if options[:job]
49
- job_id = options[:job]
50
- else
51
- job = Client::ScraperJob.new(options).find(scraper_name)
52
- job_id = job['id']
53
- end
54
-
55
- puts AnswersEngine::Scraper::Parser.exec_parser_page(parser_file, gid, job_id, true)
56
- rescue => e
57
- puts e
58
- end
59
- end
60
- end
61
- end
62
- end
63
-
64
- end
@@ -1,185 +0,0 @@
1
- module AnswersEngine
2
- class CLI < Thor
3
- class Scraper < Thor
4
- desc "list", "List scrapers"
5
-
6
- long_desc <<-LONGDESC
7
- List all scrapers.
8
- LONGDESC
9
- option :page, :aliases => :p, type: :numeric, desc: 'Get the next set of records by page.'
10
- option :per_page, :aliases => :P, type: :numeric, desc: 'Number of records per page. Max 500 per page.'
11
- def list
12
- client = Client::Scraper.new(options)
13
- puts "#{client.all}"
14
- end
15
-
16
- desc "create <scraper_name> <git_repository>", "Create a scraper"
17
- long_desc <<-LONGDESC
18
- Creates a scraper\x5
19
- <scraper_name>: Scraper name can only consist of alphabets, numbers, underscores and dashes. Name must be unique to your account.\x5
20
- <git_repository>: URL to a valid Git repository.\x5
21
- LONGDESC
22
- option :branch, :aliases => :b, desc: 'Set the Git branch to use. Default: master'
23
- option :freshness_type, :aliases => :t, desc: 'Set how fresh the page cache is. Possible values: day, week, month, year. Default: any'
24
- option :proxy_type, desc: 'Set the Proxy type. Default: standard'
25
- option :force_fetch, :aliases => :f, type: :boolean, desc: 'Set true to force fetch page that is not within freshness criteria. Default: false'
26
- option :workers, :aliases => :w, type: :numeric, desc: 'Set how many standard workers to use. Default: 1'
27
- option :browsers, type: :numeric, desc: 'Set how many browser workers to use. Default: 0'
28
- option :disable_scheduler, type: :boolean, desc: 'Set true to disable scheduler. Default: false'
29
- option :cancel_current_job, type: :boolean, desc: 'Set true to cancel currently active job if scheduler starts. Default: false'
30
- option :schedule, type: :string, desc: 'Set the schedule of the scraper to run. Must be in CRON format.'
31
- option :timezone, type: :string, desc: "Set the scheduler's timezone. Must be in IANA Timezone format. Defaults to \"America/Toronto\""
32
- def create(scraper_name, git_repository)
33
- # puts "options #{options}"
34
- client = Client::Scraper.new(options)
35
- puts "#{client.create(scraper_name, git_repository, options)}"
36
- end
37
-
38
- desc "update <scraper_name>", "Update a scraper"
39
- long_desc <<-LONGDESC
40
- Updates a scraper\x5
41
- LONGDESC
42
- option :branch, :aliases => :b, desc: 'Set the Git branch to use. Default: master'
43
- option :name, :aliases => :n, desc: 'Set the scraper name. Name can only consist of alphabets, numbers, underscores and dashes. Name must be unique to your account'
44
- option :repo, :aliases => :r, desc: 'Set the URL to a valid Git repository'
45
- option :freshness_type, :aliases => :t, desc: 'Set how fresh the page cache is. Possible values: day, week, month, year. Default: any'
46
- option :proxy_type, desc: 'Set the Proxy type. Default: standard'
47
- option :force_fetch, :aliases => :f, type: :boolean, desc: 'Set true to force fetch page that is not within freshness criteria. Default: false'
48
- option :workers, :aliases => :w, type: :numeric, desc: 'Set how many standard workers to use. Default: 1'
49
- option :browsers, type: :numeric, desc: 'Set how many browser workers to use. Default: 0'
50
- option :disable_scheduler, type: :boolean, desc: 'Set true to disable scheduler. Default: false'
51
- option :cancel_current_job, type: :boolean, desc: 'Set true to cancel currently active job if scheduler starts. Default: false'
52
- option :schedule, type: :string, desc: 'Set the schedule of the scraper to run. Must be in CRON format.'
53
- option :timezone, type: :string, desc: "Set the scheduler's timezone. Must be in IANA Timezone format. Defaults to \"America/Toronto\""
54
- def update(scraper_name)
55
- client = Client::Scraper.new(options)
56
- puts "#{client.update(scraper_name, options)}"
57
- end
58
-
59
-
60
- desc "show <scraper_name>", "Show a scraper"
61
- def show(scraper_name)
62
- client = Client::Scraper.new(options)
63
- puts "#{client.find(scraper_name)}"
64
- end
65
-
66
- desc "delete <scraper_name>", "Delete a scraper and related records"
67
- def delete(scraper_name)
68
- client = Client::Scraper.new(options)
69
- puts "#{client.delete(scraper_name)}"
70
- end
71
-
72
-
73
- desc "deploy <scraper_name>", "Deploy a scraper"
74
- long_desc <<-LONGDESC
75
- Deploys a scraper
76
- LONGDESC
77
- def deploy(scraper_name)
78
- client = Client::ScraperDeployment.new()
79
- puts "Deploying scraper. This may take a while..."
80
- puts "#{client.deploy(scraper_name)}"
81
- end
82
-
83
- desc "start <scraper_name>", "Creates a scraping job and runs it"
84
- long_desc <<-LONGDESC
85
- Starts a scraper by creating an active scrape job\x5
86
- LONGDESC
87
- option :workers, :aliases => :w, type: :numeric, desc: 'Set how many standard workers to use. Default: 1'
88
- option :browsers, type: :numeric, desc: 'Set how many browser workers to use. Default: 0'
89
- option :proxy_type, desc: 'Set the Proxy type. Default: standard'
90
- def start(scraper_name)
91
- client = Client::ScraperJob.new(options)
92
- puts "Starting a scrape job..."
93
- puts "#{client.create(scraper_name, options)}"
94
- end
95
-
96
-
97
- desc "log <scraper_name>", "List log entries related to a scraper's current job"
98
- long_desc <<-LONGDESC
99
- Shows log related to a scraper's current job. Defaults to showing the most recent entries\x5
100
- LONGDESC
101
- option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
102
- option :head, :aliases => :H, desc: 'Show the oldest log entries. If not set, newest entries is shown'
103
- option :parsing, :aliases => :p, type: :boolean, desc: 'Show only log entries related to parsing errors'
104
- option :seeding, :aliases => :s, type: :boolean, desc: 'Show only log entries related to seeding errors'
105
- option :more, :aliases => :m, desc: 'Show next set of log entries. Enter the `More token`'
106
- option :per_page, :aliases => :P, type: :numeric, desc: 'Number of records per page. Max 5000 per page.'
107
- def log(scraper_name)
108
- client = Client::JobLog.new(options)
109
-
110
- query = {}
111
- query["order"] = options.delete(:head) if options[:head]
112
- query["job_type"] = "parsing" if options[:parsing]
113
- query["job_type"] = "seeding" if options[:seeding]
114
- query["page_token"] = options.delete(:more) if options[:more]
115
- query["per_page"] = options.delete(:per_page) if options[:per_page]
116
-
117
- puts "Fetching logs..."
118
-
119
- if options[:job]
120
- result = client.all_job_log(options[:job], {query: query})
121
- else
122
- result = client.scraper_all_job_log(scraper_name, {query: query})
123
- end
124
-
125
- if result['entries'].nil? || result["entries"].length == 0
126
- puts "No logs yet, please try again later."
127
- else
128
- more_token = result["more_token"]
129
-
130
- result["entries"].each do |entry|
131
- puts "#{entry["timestamp"]} #{entry["severity"]}: #{entry["payload"]}" if entry.is_a?(Hash)
132
- end
133
-
134
- unless more_token.nil?
135
- puts "-----------"
136
- puts "To see more entries, add: \"--more #{more_token}\""
137
- end
138
- end
139
- end
140
-
141
- desc "stats <scraper_name>", "Get the current stat for a job"
142
- long_desc <<-LONGDESC
143
- Get stats for a scraper's current job\n
144
- LONGDESC
145
- option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
146
- def stats(scraper_name)
147
- client = Client::JobStat.new(options)
148
- if options[:job]
149
- puts "#{client.job_current_stats(options[:job])}"
150
- else
151
- puts "#{client.scraper_job_current_stats(scraper_name)}"
152
- end
153
-
154
- end
155
-
156
-
157
- desc "job SUBCOMMAND ...ARGS", "manage scrapers jobs"
158
- subcommand "job", ScraperJob
159
-
160
- desc "deployment SUBCOMMAND ...ARGS", "manage scrapers deployments"
161
- subcommand "deployment", ScraperDeployment
162
-
163
- desc "finisher SUBCOMMAND ...ARGS", "manage scrapers finishers"
164
- subcommand "finisher", ScraperFinisher
165
-
166
- desc "output SUBCOMMAND ...ARGS", "view scraper outputs"
167
- subcommand "output", JobOutput
168
-
169
- desc "page SUBCOMMAND ...ARGS", "manage pages on a job"
170
- subcommand "page", ScraperPage
171
-
172
- desc "export SUBCOMMAND ...ARGS", "manage scraper's exports"
173
- subcommand "export", ScraperExport
174
-
175
- desc "exporter SUBCOMMAND ...ARGS", "manage scraper's exporters"
176
- subcommand "exporter", ScraperExporter
177
-
178
- desc "var SUBCOMMAND ...ARGS", "for managing scraper's variables"
179
- subcommand "var", ScraperVar
180
-
181
-
182
- end
183
- end
184
-
185
- end