datahen 0.10.4

Files changed (78)
  1. checksums.yaml +7 -0
  2. data/.gitignore +12 -0
  3. data/.travis.yml +7 -0
  4. data/CODE_OF_CONDUCT.md +74 -0
  5. data/Gemfile +6 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +29 -0
  8. data/Rakefile +22 -0
  9. data/bin/console +14 -0
  10. data/bin/setup +8 -0
  11. data/datahen.gemspec +47 -0
  12. data/examples/fetchtest/libraries/hello.rb +9 -0
  13. data/examples/fetchtest/libraries/hello_fail.rb +10 -0
  14. data/examples/fetchtest/parsers/failed.rb +2 -0
  15. data/examples/fetchtest/parsers/find_outputs.rb +18 -0
  16. data/examples/fetchtest/parsers/home.rb +50 -0
  17. data/examples/fetchtest/parsers/nested_fail.rb +3 -0
  18. data/examples/fetchtest/parsers/simple.rb +14 -0
  19. data/examples/fetchtest/seeders/csv_seeder.rb +12 -0
  20. data/examples/fetchtest/seeders/failed.rb +1 -0
  21. data/examples/fetchtest/seeders/list_of_urls.csv +5 -0
  22. data/examples/fetchtest/seeders/seed.rb +28 -0
  23. data/examples/fetchtest/seeders/test_reset_page.rb +4 -0
  24. data/exe/hen +3 -0
  25. data/lib/datahen.rb +5 -0
  26. data/lib/datahen/cli.rb +45 -0
  27. data/lib/datahen/cli/env_var.rb +48 -0
  28. data/lib/datahen/cli/finisher.rb +40 -0
  29. data/lib/datahen/cli/global_page.rb +39 -0
  30. data/lib/datahen/cli/job.rb +30 -0
  31. data/lib/datahen/cli/job_output.rb +69 -0
  32. data/lib/datahen/cli/parser.rb +64 -0
  33. data/lib/datahen/cli/scraper.rb +185 -0
  34. data/lib/datahen/cli/scraper_deployment.rb +24 -0
  35. data/lib/datahen/cli/scraper_export.rb +51 -0
  36. data/lib/datahen/cli/scraper_exporter.rb +40 -0
  37. data/lib/datahen/cli/scraper_finisher.rb +20 -0
  38. data/lib/datahen/cli/scraper_job.rb +75 -0
  39. data/lib/datahen/cli/scraper_job_var.rb +48 -0
  40. data/lib/datahen/cli/scraper_page.rb +203 -0
  41. data/lib/datahen/cli/scraper_var.rb +48 -0
  42. data/lib/datahen/cli/seeder.rb +40 -0
  43. data/lib/datahen/client.rb +29 -0
  44. data/lib/datahen/client/auth_token.rb +50 -0
  45. data/lib/datahen/client/backblaze_content.rb +45 -0
  46. data/lib/datahen/client/base.rb +69 -0
  47. data/lib/datahen/client/deploy_key.rb +21 -0
  48. data/lib/datahen/client/env_var.rb +28 -0
  49. data/lib/datahen/client/export.rb +10 -0
  50. data/lib/datahen/client/global_page.rb +18 -0
  51. data/lib/datahen/client/job.rb +64 -0
  52. data/lib/datahen/client/job_export.rb +10 -0
  53. data/lib/datahen/client/job_log.rb +26 -0
  54. data/lib/datahen/client/job_output.rb +19 -0
  55. data/lib/datahen/client/job_page.rb +58 -0
  56. data/lib/datahen/client/job_stat.rb +16 -0
  57. data/lib/datahen/client/scraper.rb +57 -0
  58. data/lib/datahen/client/scraper_deployment.rb +18 -0
  59. data/lib/datahen/client/scraper_export.rb +22 -0
  60. data/lib/datahen/client/scraper_exporter.rb +14 -0
  61. data/lib/datahen/client/scraper_finisher.rb +16 -0
  62. data/lib/datahen/client/scraper_job.rb +49 -0
  63. data/lib/datahen/client/scraper_job_output.rb +19 -0
  64. data/lib/datahen/client/scraper_job_page.rb +67 -0
  65. data/lib/datahen/client/scraper_job_var.rb +28 -0
  66. data/lib/datahen/client/scraper_var.rb +28 -0
  67. data/lib/datahen/plugin.rb +6 -0
  68. data/lib/datahen/plugin/context_exposer.rb +55 -0
  69. data/lib/datahen/scraper.rb +18 -0
  70. data/lib/datahen/scraper/executor.rb +373 -0
  71. data/lib/datahen/scraper/finisher.rb +18 -0
  72. data/lib/datahen/scraper/parser.rb +18 -0
  73. data/lib/datahen/scraper/ruby_finisher_executor.rb +116 -0
  74. data/lib/datahen/scraper/ruby_parser_executor.rb +200 -0
  75. data/lib/datahen/scraper/ruby_seeder_executor.rb +120 -0
  76. data/lib/datahen/scraper/seeder.rb +18 -0
  77. data/lib/datahen/version.rb +3 -0
  78. metadata +270 -0
data/lib/datahen.rb
@@ -0,0 +1,5 @@
+ require "datahen/version"
+ require "datahen/scraper"
+
+ module Datahen
+ end
data/lib/datahen/cli.rb
@@ -0,0 +1,45 @@
+ require 'thor'
+ require 'datahen/scraper'
+ require 'datahen/cli/scraper_var'
+ require 'datahen/cli/scraper_exporter'
+ require 'datahen/cli/scraper_export'
+ require 'datahen/cli/scraper_job_var'
+ require 'datahen/cli/scraper_job'
+ require 'datahen/cli/scraper_finisher'
+ require 'datahen/cli/global_page'
+ require 'datahen/cli/scraper_page'
+ require 'datahen/cli/job_output'
+ require 'datahen/cli/job'
+ require 'datahen/cli/scraper_deployment'
+ require 'datahen/cli/scraper'
+ require 'datahen/cli/parser'
+ require 'datahen/cli/seeder'
+ require 'datahen/cli/finisher'
+ require 'datahen/cli/env_var'
+
+
+
+ module Datahen
+   class CLI < Thor
+     desc "scraper SUBCOMMAND ...ARGS", "manage scrapers"
+     subcommand "scraper", Scraper
+
+     desc "job SUBCOMMAND ...ARGS", "manage scraper jobs"
+     subcommand "job", Job
+
+     desc "globalpage SUBCOMMAND ...ARGS", "interacts with global pages"
+     subcommand "globalpage", GlobalPage
+
+     desc "parser SUBCOMMAND ...ARGS", "for parsing related activities"
+     subcommand "parser", Parser
+
+     desc "seeder SUBCOMMAND ...ARGS", "for seeding related activities"
+     subcommand "seeder", Seeder
+
+     desc "finisher SUBCOMMAND ...ARGS", "for finishing related activities"
+     subcommand "finisher", Finisher
+
+     desc "var SUBCOMMAND ...ARGS", "for environment variable related activities"
+     subcommand "var", EnvVar
+   end
+ end
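
A minimal usage sketch (assumptions: the hen executable in data/exe/hen simply calls Datahen::CLI.start(ARGV), the conventional Thor entry point, and the scraper/job names below are illustrative). Each subcommand declared above becomes a top-level command group:

  # Invoke the Thor CLI programmatically, exactly as the `hen` binary would.
  require 'datahen/cli'

  Datahen::CLI.start(%w[scraper list])   # routes to the Scraper subcommand's list task
  Datahen::CLI.start(%w[job show 12345]) # 12345 is a hypothetical job ID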
data/lib/datahen/cli/env_var.rb
@@ -0,0 +1,48 @@
+ module Datahen
+   class CLI < Thor
+     class EnvVar < Thor
+       desc "list", "List environment variables on the account"
+
+       long_desc <<-LONGDESC
+         List all environment variables on the account.
+       LONGDESC
+       option :page, :aliases => :p, type: :numeric, desc: 'Get the next set of records by page.'
+       option :per_page, :aliases => :P, type: :numeric, desc: 'Number of records per page. Max 500 per page.'
+       def list
+         client = Client::EnvVar.new(options)
+         puts "#{client.all}"
+       end
+
+       desc "set <name> <value>", "Set an environment var on the account"
+       long_desc <<-LONGDESC
+         Creates an environment variable\x5
+         <name>: Var name can only consist of letters, numbers and underscores. Name must be unique to your account, otherwise it will be overwritten.\x5
+         <value>: Value of variable.\x5
+       LONGDESC
+       option :secret, type: :boolean, desc: 'Set true to encrypt the value. Default: false'
+       def set(name, value)
+         # puts "options #{options}"
+         client = Client::EnvVar.new(options)
+         puts "#{client.set(name, value, options)}"
+       end
+
+       desc "show <name>", "Show an environment variable on the account"
+       def show(name)
+         client = Client::EnvVar.new(options)
+         puts "#{client.find(name)}"
+       end
+
+       desc "unset <name>", "Delete an environment variable on the account"
+       def unset(name)
+         client = Client::EnvVar.new(options)
+         puts "#{client.unset(name)}"
+       end
+
+
+
+
+
+     end
+   end
+
+ end
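
The EnvVar class is mounted as the top-level var group (see cli.rb above). A sketch of the four commands; the variable name and value are made up:

  require 'datahen/cli'

  Datahen::CLI.start(%w[var list --page 1])
  Datahen::CLI.start(%w[var set API_KEY s3cret --secret]) # --secret stores the value encrypted
  Datahen::CLI.start(%w[var show API_KEY])
  Datahen::CLI.start(%w[var unset API_KEY])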
data/lib/datahen/cli/finisher.rb
@@ -0,0 +1,40 @@
+ module Datahen
+   class CLI < Thor
+     class Finisher < Thor
+       desc "try <scraper_name> <finisher_file>", "Tries a finisher file"
+       long_desc <<-LONGDESC
+         Takes a finisher script and tries to execute it without saving anything.\x5
+         <finisher_file>: Finisher script file that will be executed.\x5
+       LONGDESC
+       option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
+       def try_finisher(scraper_name, finisher_file)
+         if options[:job]
+           job_id = options[:job]
+         else
+           job = Client::ScraperJob.new(options).find(scraper_name)
+           job_id = job['id']
+         end
+
+         puts Datahen::Scraper::Finisher.exec_finisher(finisher_file, job_id, false)
+       end
+
+       desc "exec <scraper_name> <finisher_file>", "Executes a finisher script onto a scraper's current job."
+       long_desc <<-LONGDESC
+         Takes a finisher script, executes it against a job, and saves outputs into the scraper's current job\x5
+         <finisher_file>: Finisher script file that will be executed on the scraper's current job.\x5
+       LONGDESC
+       option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
+       def exec_finisher(scraper_name, finisher_file)
+         if options[:job]
+           job_id = options[:job]
+         else
+           job = Client::ScraperJob.new(options).find(scraper_name)
+           job_id = job['id']
+         end
+
+         puts Datahen::Scraper::Finisher.exec_finisher(finisher_file, job_id, true)
+       end
+     end
+   end
+
+ end
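
One wrinkle worth noting: Thor derives command names from method names, not from the desc strings, so despite the usage banners reading "try" and "exec", these commands are invoked as try_finisher and exec_finisher. A sketch (scraper and file names hypothetical):

  require 'datahen/cli'

  Datahen::CLI.start(%w[finisher try_finisher my-scraper finishers/done.rb])
  Datahen::CLI.start(%w[finisher exec_finisher my-scraper finishers/done.rb --job 123])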
data/lib/datahen/cli/global_page.rb
@@ -0,0 +1,39 @@
+ module Datahen
+   class CLI < Thor
+     class GlobalPage < Thor
+
+       desc "show <gid>", "Show a global page"
+       def show(gid)
+         client = Client::GlobalPage.new(options)
+         puts "#{client.find(gid)}"
+       end
+
+       desc "content <gid>", "Show content of a globalpage"
+       def content(gid)
+         client = Client::GlobalPage.new(options)
+         result = JSON.parse(client.find_content(gid).to_s)
+
+         if result['available'] == true
+           puts "Preview content url: \"#{result['preview_url']}\""
+           `open "#{result['preview_url']}"`
+         else
+           puts "Content does not exist"
+         end
+       end
+
+       desc "failedcontent <gid>", "Show failed content of a globalpage"
+       def failedcontent(gid)
+         client = Client::GlobalPage.new(options)
+         result = JSON.parse(client.find_failed_content(gid).to_s)
+
+         if result['available'] == true
+           puts "Preview failed content url: \"#{result['preview_url']}\""
+           `open "#{result['preview_url']}"`
+         else
+           puts "Failed Content does not exist"
+         end
+       end
+
+     end
+   end
+ end
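
A sketch of the global-page commands (the GID below is made up). Note that content and failedcontent shell out to the macOS open command after printing the preview URL, so on other platforms only the printed URL is usable:

  require 'datahen/cli'

  Datahen::CLI.start(%w[globalpage show www.example.com-abc123])
  Datahen::CLI.start(%w[globalpage content www.example.com-abc123])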
data/lib/datahen/cli/job.rb
@@ -0,0 +1,30 @@
+ module Datahen
+   class CLI < Thor
+     class Job < Thor
+       package_name "job"
+       def self.banner(command, namespace = nil, subcommand = false)
+         "#{basename} #{@package_name} #{command.usage}"
+       end
+
+
+       desc "list", "Gets a list of jobs"
+       option :page, :aliases => :p, type: :numeric, desc: 'Get the next set of records by page.'
+       option :per_page, :aliases => :P, type: :numeric, desc: 'Number of records per page. Max 500 per page.'
+       long_desc <<-LONGDESC
+         List scrape jobs.
+       LONGDESC
+       def list()
+         client = Client::Job.new(options)
+         puts "#{client.all()}"
+       end
+
+       desc "show <job_id>", "Show a job"
+       def show(job_id)
+         client = Client::Job.new(options)
+         puts "#{client.find(job_id)}"
+       end
+
+     end
+   end
+
+ end
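
A sketch of paginated listing and a single-job lookup (the job ID is illustrative):

  require 'datahen/cli'

  Datahen::CLI.start(%w[job list --page 2])
  Datahen::CLI.start(%w[job show 12345])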
data/lib/datahen/cli/job_output.rb
@@ -0,0 +1,69 @@
+ module Datahen
+   class CLI < Thor
+     class JobOutput < Thor
+
+       package_name "scraper output"
+       def self.banner(command, namespace = nil, subcommand = false)
+         "#{basename} #{@package_name} #{command.usage}"
+       end
+
+       desc "list <scraper_name>", "List output records in a collection that is in the current job"
+       long_desc <<-LONGDESC
+         List all output records in a collection that is in the current job of a scraper\n
+       LONGDESC
+       option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
+       option :page, :aliases => :p, type: :numeric, desc: 'Get the next set of records by page.'
+       option :per_page, :aliases => :P, type: :numeric, desc: 'Number of records per page. Max 500 per page.'
+       option :collection, :aliases => :c, desc: "Shows outputs from a specific collection. (defaults to 'default' collection)"
+       option :query, :aliases => :q, type: :string, banner: :JSON, desc: 'Set search query. Must be in json format. i.e: {"Foo":"bar"}'
+       def list(scraper_name)
+         collection = options.fetch(:collection) { 'default' }
+         if options[:job]
+           client = Client::JobOutput.new(options)
+           puts "#{client.all(options[:job], collection)}"
+         else
+           client = Client::ScraperJobOutput.new(options)
+           puts "#{client.all(scraper_name, collection)}"
+         end
+       end
+
+       desc "show <scraper_name> <record_id>", "Show one output record in a collection that is in the current job of a scraper"
+       long_desc <<-LONGDESC
+         Shows an output record in a collection that is in the current job of a scraper\n
+         <record_id>: ID of the output record.\x5
+       LONGDESC
+       option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
+       option :collection, :aliases => :c, desc: "Shows output from a specific collection. (defaults to 'default' collection)"
+       def show(scraper_name, id)
+         collection = options.fetch(:collection) { 'default' }
+         if options[:job]
+           client = Client::JobOutput.new(options)
+           puts "#{client.find(options[:job], collection, id)}"
+         else
+           client = Client::ScraperJobOutput.new(options)
+           puts "#{client.find(scraper_name, collection, id)}"
+         end
+       end
+
+       desc "collections <scraper_name>", "List job output collections that are inside a current job of a scraper."
+       long_desc <<-LONGDESC
+         List job output collections that are inside a current job of a scraper.\x5
+       LONGDESC
+       option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
+       option :page, :aliases => :p, type: :numeric, desc: 'Get the next set of records by page.'
+       option :per_page, :aliases => :P, type: :numeric, desc: 'Number of records per page. Max 500 per page.'
+       def collections(scraper_name)
+
+         if options[:job]
+           client = Client::JobOutput.new(options)
+           puts "#{client.collections(options[:job])}"
+         else
+           client = Client::ScraperJobOutput.new(options)
+           puts "#{client.collections(scraper_name)}"
+         end
+       end
+
+     end
+   end
+
+ end
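
JobOutput is mounted under the scraper group as output (see scraper.rb below), matching its package_name "scraper output". A sketch; the scraper name, collection, and query are illustrative, and --job overrides the scraper's current job:

  require 'datahen/cli'

  Datahen::CLI.start(%w[scraper output list my-scraper --collection products])
  Datahen::CLI.start(['scraper', 'output', 'list', 'my-scraper', '--query', '{"Foo":"bar"}'])
  Datahen::CLI.start(%w[scraper output collections my-scraper --job 123])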
data/lib/datahen/cli/parser.rb
@@ -0,0 +1,64 @@
+ module Datahen
+   class CLI < Thor
+     class Parser < Thor
+       desc "try <scraper_name> <parser_file> <GID>", "Tries a parser on a Job Page"
+       long_desc <<-LONGDESC
+         Takes a parser script and runs it against a job page\x5
+         <parser_file>: Parser script file that will be executed on the page.\x5
+         <GID>: Global ID of the page.\x5
+       LONGDESC
+       option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
+       option :global, :aliases => :g, type: :boolean, default: false, desc: 'Use globalpage instead of a job page'
+       option :vars, :aliases => :v, type: :string, desc: 'Set user-defined page variables. Must be in json format. i.e: {"Foo":"bar"}'
+       def try_parse(scraper_name, parser_file, gid)
+         begin
+
+           if options[:job]
+             job_id = options[:job]
+           elsif options[:global]
+             job_id = nil
+           else
+             job = Client::ScraperJob.new(options).find(scraper_name)
+             job_id = job['id']
+           end
+
+
+           vars = JSON.parse(options[:vars]) if options[:vars]
+           puts Datahen::Scraper::Parser.exec_parser_page(parser_file, gid, job_id, false, vars)
+
+         rescue JSON::ParserError
+           if options[:vars]
+             puts "Error: #{options[:vars]} on vars is not valid JSON"
+           end
+         end
+       end
+
+       desc "exec <scraper_name> <parser_file> <GID>...<GID>", "Executes a parser script on one or more Job Pages within a scraper's current job"
+       long_desc <<-LONGDESC
+         Takes a parser script, executes it against one or more job pages, and saves the output to the scraper's current job\x5
+         <parser_file>: Parser script file that will be executed on the page.\x5
+         <GID>: Global ID of the page.\x5
+       LONGDESC
+       option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
+       def exec_parse(scraper_name, parser_file, *gids)
+         gids.each do |gid|
+           begin
+             puts "Parsing #{gid}"
+
+             if options[:job]
+               job_id = options[:job]
+             else
+               job = Client::ScraperJob.new(options).find(scraper_name)
+               job_id = job['id']
+             end
+
+             puts Datahen::Scraper::Parser.exec_parser_page(parser_file, gid, job_id, true)
+           rescue => e
+             puts e
+           end
+         end
+       end
+     end
+   end
+
+ end
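
A sketch of both parser commands (again invoked by method name, so try_parse and exec_parse; the scraper, parser file, GIDs, and vars are all made up). try_parse runs without persisting anything, while exec_parse saves results to the current job and accepts multiple GIDs:

  require 'datahen/cli'

  Datahen::CLI.start(['parser', 'try_parse', 'my-scraper', 'parsers/home.rb',
                      'www.example.com-abc123', '--vars', '{"category":"books"}'])
  Datahen::CLI.start(%w[parser exec_parse my-scraper parsers/home.rb gid1 gid2])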
data/lib/datahen/cli/scraper.rb
@@ -0,0 +1,185 @@
+ module Datahen
+   class CLI < Thor
+     class Scraper < Thor
+       desc "list", "List scrapers"
+
+       long_desc <<-LONGDESC
+         List all scrapers.
+       LONGDESC
+       option :page, :aliases => :p, type: :numeric, desc: 'Get the next set of records by page.'
+       option :per_page, :aliases => :P, type: :numeric, desc: 'Number of records per page. Max 500 per page.'
+       def list
+         client = Client::Scraper.new(options)
+         puts "#{client.all}"
+       end
+
+       desc "create <scraper_name> <git_repository>", "Create a scraper"
+       long_desc <<-LONGDESC
+         Creates a scraper\x5
+         <scraper_name>: Scraper name can only consist of letters, numbers, underscores and dashes. Name must be unique to your account.\x5
+         <git_repository>: URL to a valid Git repository.\x5
+       LONGDESC
+       option :branch, :aliases => :b, desc: 'Set the Git branch to use. Default: master'
+       option :freshness_type, :aliases => :t, desc: 'Set how fresh the page cache is. Possible values: day, week, month, year. Default: any'
+       option :proxy_type, desc: 'Set the Proxy type. Default: standard'
+       option :force_fetch, :aliases => :f, type: :boolean, desc: 'Set true to force fetch a page that is not within the freshness criteria. Default: false'
+       option :workers, :aliases => :w, type: :numeric, desc: 'Set how many standard workers to use. Default: 1'
+       option :browsers, type: :numeric, desc: 'Set how many browser workers to use. Default: 0'
+       option :disable_scheduler, type: :boolean, desc: 'Set true to disable scheduler. Default: false'
+       option :cancel_current_job, type: :boolean, desc: 'Set true to cancel the currently active job if the scheduler starts. Default: false'
+       option :schedule, type: :string, desc: 'Set the schedule of the scraper to run. Must be in CRON format.'
+       option :timezone, type: :string, desc: "Set the scheduler's timezone. Must be in IANA Timezone format. Defaults to \"America/Toronto\""
+       def create(scraper_name, git_repository)
+         # puts "options #{options}"
+         client = Client::Scraper.new(options)
+         puts "#{client.create(scraper_name, git_repository, options)}"
+       end
+
+       desc "update <scraper_name>", "Update a scraper"
+       long_desc <<-LONGDESC
+         Updates a scraper\x5
+       LONGDESC
+       option :branch, :aliases => :b, desc: 'Set the Git branch to use. Default: master'
+       option :name, :aliases => :n, desc: 'Set the scraper name. Name can only consist of letters, numbers, underscores and dashes. Name must be unique to your account'
+       option :repo, :aliases => :r, desc: 'Set the URL to a valid Git repository'
+       option :freshness_type, :aliases => :t, desc: 'Set how fresh the page cache is. Possible values: day, week, month, year. Default: any'
+       option :proxy_type, desc: 'Set the Proxy type. Default: standard'
+       option :force_fetch, :aliases => :f, type: :boolean, desc: 'Set true to force fetch a page that is not within the freshness criteria. Default: false'
+       option :workers, :aliases => :w, type: :numeric, desc: 'Set how many standard workers to use. Default: 1'
+       option :browsers, type: :numeric, desc: 'Set how many browser workers to use. Default: 0'
+       option :disable_scheduler, type: :boolean, desc: 'Set true to disable scheduler. Default: false'
+       option :cancel_current_job, type: :boolean, desc: 'Set true to cancel the currently active job if the scheduler starts. Default: false'
+       option :schedule, type: :string, desc: 'Set the schedule of the scraper to run. Must be in CRON format.'
+       option :timezone, type: :string, desc: "Set the scheduler's timezone. Must be in IANA Timezone format. Defaults to \"America/Toronto\""
+       def update(scraper_name)
+         client = Client::Scraper.new(options)
+         puts "#{client.update(scraper_name, options)}"
+       end
+
+
+       desc "show <scraper_name>", "Show a scraper"
+       def show(scraper_name)
+         client = Client::Scraper.new(options)
+         puts "#{client.find(scraper_name)}"
+       end
+
+       desc "delete <scraper_name>", "Delete a scraper and related records"
+       def delete(scraper_name)
+         client = Client::Scraper.new(options)
+         puts "#{client.delete(scraper_name)}"
+       end
+
+
+       desc "deploy <scraper_name>", "Deploy a scraper"
+       long_desc <<-LONGDESC
+         Deploys a scraper
+       LONGDESC
+       def deploy(scraper_name)
+         client = Client::ScraperDeployment.new()
+         puts "Deploying scraper. This may take a while..."
+         puts "#{client.deploy(scraper_name)}"
+       end
+
+       desc "start <scraper_name>", "Creates a scraping job and runs it"
+       long_desc <<-LONGDESC
+         Starts a scraper by creating an active scrape job\x5
+       LONGDESC
+       option :workers, :aliases => :w, type: :numeric, desc: 'Set how many standard workers to use. Default: 1'
+       option :browsers, type: :numeric, desc: 'Set how many browser workers to use. Default: 0'
+       option :proxy_type, desc: 'Set the Proxy type. Default: standard'
+       def start(scraper_name)
+         client = Client::ScraperJob.new(options)
+         puts "Starting a scrape job..."
+         puts "#{client.create(scraper_name, options)}"
+       end
+
+
+       desc "log <scraper_name>", "List log entries related to a scraper's current job"
+       long_desc <<-LONGDESC
+         Shows logs related to a scraper's current job. Defaults to showing the most recent entries\x5
+       LONGDESC
+       option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
+       option :head, :aliases => :H, desc: 'Show the oldest log entries. If not set, the newest entries are shown'
+       option :parsing, :aliases => :p, type: :boolean, desc: 'Show only log entries related to parsing errors'
+       option :seeding, :aliases => :s, type: :boolean, desc: 'Show only log entries related to seeding errors'
+       option :more, :aliases => :m, desc: 'Show next set of log entries. Enter the `More token`'
+       option :per_page, :aliases => :P, type: :numeric, desc: 'Number of records per page. Max 5000 per page.'
+       def log(scraper_name)
+         client = Client::JobLog.new(options)
+
+         query = {}
+         query["order"] = options.delete(:head) if options[:head]
+         query["job_type"] = "parsing" if options[:parsing]
+         query["job_type"] = "seeding" if options[:seeding]
+         query["page_token"] = options.delete(:more) if options[:more]
+         query["per_page"] = options.delete(:per_page) if options[:per_page]
+
+         puts "Fetching logs..."
+
+         if options[:job]
+           result = client.all_job_log(options[:job], {query: query})
+         else
+           result = client.scraper_all_job_log(scraper_name, {query: query})
+         end
+
+         if result['entries'].nil? || result["entries"].length == 0
+           puts "No logs yet, please try again later."
+         else
+           more_token = result["more_token"]
+
+           result["entries"].each do |entry|
+             puts "#{entry["timestamp"]} #{entry["severity"]}: #{entry["payload"]}" if entry.is_a?(Hash)
+           end
+
+           unless more_token.nil?
+             puts "-----------"
+             puts "To see more entries, add: \"--more #{more_token}\""
+           end
+         end
+       end
+
+       desc "stats <scraper_name>", "Get the current stats for a job"
+       long_desc <<-LONGDESC
+         Get stats for a scraper's current job\n
+       LONGDESC
+       option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
+       def stats(scraper_name)
+         client = Client::JobStat.new(options)
+         if options[:job]
+           puts "#{client.job_current_stats(options[:job])}"
+         else
+           puts "#{client.scraper_job_current_stats(scraper_name)}"
+         end
+
+       end
+
+
+       desc "job SUBCOMMAND ...ARGS", "manage scraper jobs"
+       subcommand "job", ScraperJob
+
+       desc "deployment SUBCOMMAND ...ARGS", "manage scraper deployments"
+       subcommand "deployment", ScraperDeployment
+
+       desc "finisher SUBCOMMAND ...ARGS", "manage scraper finishers"
+       subcommand "finisher", ScraperFinisher
+
+       desc "output SUBCOMMAND ...ARGS", "view scraper outputs"
+       subcommand "output", JobOutput
+
+       desc "page SUBCOMMAND ...ARGS", "manage pages on a job"
+       subcommand "page", ScraperPage
+
+       desc "export SUBCOMMAND ...ARGS", "manage scraper's exports"
+       subcommand "export", ScraperExport
+
+       desc "exporter SUBCOMMAND ...ARGS", "manage scraper's exporters"
+       subcommand "exporter", ScraperExporter
+
+       desc "var SUBCOMMAND ...ARGS", "for managing scraper's variables"
+       subcommand "var", ScraperVar
+
+
+     end
+   end
+
+ end
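
A sketch of a typical scraper lifecycle tying the commands above together (the scraper name and repository URL are illustrative):

  require 'datahen/cli'

  Datahen::CLI.start(%w[scraper create my-scraper https://github.com/example/scraper.git])
  Datahen::CLI.start(%w[scraper deploy my-scraper])          # pulls the Git repo onto the platform
  Datahen::CLI.start(%w[scraper start my-scraper --workers 2])
  Datahen::CLI.start(%w[scraper log my-scraper --parsing])   # only parsing-related entries
  Datahen::CLI.start(%w[scraper stats my-scraper])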