answersengine 0.2.33

Sign up to get free protection for your applications and to get access to all the features.
Files changed (65)
  1. checksums.yaml +7 -0
  2. data/.gitignore +12 -0
  3. data/.travis.yml +7 -0
  4. data/CODE_OF_CONDUCT.md +74 -0
  5. data/Gemfile +6 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +30 -0
  8. data/Rakefile +22 -0
  9. data/answersengine.gemspec +45 -0
  10. data/bin/console +14 -0
  11. data/bin/setup +8 -0
  12. data/examples/fetchtest/libraries/hello.rb +9 -0
  13. data/examples/fetchtest/libraries/hello_fail.rb +10 -0
  14. data/examples/fetchtest/parsers/failed.rb +2 -0
  15. data/examples/fetchtest/parsers/find_outputs.rb +18 -0
  16. data/examples/fetchtest/parsers/home.rb +50 -0
  17. data/examples/fetchtest/parsers/nested_fail.rb +3 -0
  18. data/examples/fetchtest/parsers/simple.rb +14 -0
  19. data/examples/fetchtest/seeders/csv_seeder.rb +12 -0
  20. data/examples/fetchtest/seeders/failed.rb +1 -0
  21. data/examples/fetchtest/seeders/list_of_urls.csv +5 -0
  22. data/examples/fetchtest/seeders/seed.rb +28 -0
  23. data/examples/fetchtest/seeders/test_reset_page.rb +4 -0
  24. data/exe/answersengine +3 -0
  25. data/lib/answersengine.rb +5 -0
  26. data/lib/answersengine/cli.rb +33 -0
  27. data/lib/answersengine/cli/global_page.rb +39 -0
  28. data/lib/answersengine/cli/job.rb +30 -0
  29. data/lib/answersengine/cli/job_output.rb +69 -0
  30. data/lib/answersengine/cli/parser.rb +64 -0
  31. data/lib/answersengine/cli/scraper.rb +172 -0
  32. data/lib/answersengine/cli/scraper_deployment.rb +24 -0
  33. data/lib/answersengine/cli/scraper_export.rb +51 -0
  34. data/lib/answersengine/cli/scraper_exporter.rb +40 -0
  35. data/lib/answersengine/cli/scraper_job.rb +71 -0
  36. data/lib/answersengine/cli/scraper_page.rb +200 -0
  37. data/lib/answersengine/cli/seeder.rb +40 -0
  38. data/lib/answersengine/client.rb +23 -0
  39. data/lib/answersengine/client/backblaze_content.rb +45 -0
  40. data/lib/answersengine/client/base.rb +50 -0
  41. data/lib/answersengine/client/export.rb +10 -0
  42. data/lib/answersengine/client/global_page.rb +18 -0
  43. data/lib/answersengine/client/job.rb +53 -0
  44. data/lib/answersengine/client/job_export.rb +10 -0
  45. data/lib/answersengine/client/job_log.rb +27 -0
  46. data/lib/answersengine/client/job_output.rb +19 -0
  47. data/lib/answersengine/client/job_page.rb +62 -0
  48. data/lib/answersengine/client/job_stat.rb +16 -0
  49. data/lib/answersengine/client/scraper.rb +54 -0
  50. data/lib/answersengine/client/scraper_deployment.rb +17 -0
  51. data/lib/answersengine/client/scraper_export.rb +22 -0
  52. data/lib/answersengine/client/scraper_exporter.rb +14 -0
  53. data/lib/answersengine/client/scraper_job.rb +49 -0
  54. data/lib/answersengine/client/scraper_job_output.rb +19 -0
  55. data/lib/answersengine/client/scraper_job_page.rb +55 -0
  56. data/lib/answersengine/plugin.rb +6 -0
  57. data/lib/answersengine/plugin/context_exposer.rb +55 -0
  58. data/lib/answersengine/scraper.rb +16 -0
  59. data/lib/answersengine/scraper/executor.rb +292 -0
  60. data/lib/answersengine/scraper/parser.rb +18 -0
  61. data/lib/answersengine/scraper/ruby_parser_executor.rb +141 -0
  62. data/lib/answersengine/scraper/ruby_seeder_executor.rb +114 -0
  63. data/lib/answersengine/scraper/seeder.rb +18 -0
  64. data/lib/answersengine/version.rb +3 -0
  65. metadata +255 -0
data/exe/answersengine ADDED
@@ -0,0 +1,3 @@
#!/usr/bin/env ruby
# Executable entry point for the `answersengine` gem.
require 'answersengine/cli'
# Hand control to the Thor-based command-line interface.
AnswersEngine::CLI.start
@@ -0,0 +1,5 @@
# Top-level require file for the AnswersEngine gem: pulls in the gem
# version constant and the scraper subsystem, and declares the root
# namespace module.
require "answersengine/version"
require "answersengine/scraper"

# Root namespace under which all gem classes (CLI, Client, Scraper, ...)
# are defined.
module AnswersEngine
end
@@ -0,0 +1,33 @@
require 'thor'
require 'answersengine/scraper'
require 'answersengine/cli/scraper_exporter'
require 'answersengine/cli/scraper_export'
require 'answersengine/cli/scraper_job'
require 'answersengine/cli/global_page'
require 'answersengine/cli/scraper_page'
require 'answersengine/cli/job_output'
require 'answersengine/cli/job'
require 'answersengine/cli/scraper_deployment'
require 'answersengine/cli/scraper'
require 'answersengine/cli/parser'
require 'answersengine/cli/seeder'


module AnswersEngine
  # Root Thor command for the `answersengine` executable. Each top-level
  # subcommand is delegated to its own Thor subclass (required above from
  # answersengine/cli/*).
  class CLI < Thor
    desc "scraper SUBCOMMAND ...ARGS", "manage scrapers"
    subcommand "scraper", Scraper

    desc "job SUBCOMMAND ...ARGS", "manage scrapers jobs"
    subcommand "job", Job

    desc "globalpage SUBCOMMAND ...ARGS", "interacts with global page"
    subcommand "globalpage", GlobalPage

    desc "parser SUBCOMMAND ...ARGS", "for parsing related activities"
    subcommand "parser", Parser

    desc "seeder SUBCOMMAND ...ARGS", "for seeding related activities"
    subcommand "seeder", Seeder
  end
end
@@ -0,0 +1,39 @@
module AnswersEngine
  class CLI < Thor
    # Thor subcommand group for inspecting global pages (the shared page
    # cache) and their fetched content.
    class GlobalPage < Thor

      desc "show <gid>", "Show a global page"
      # Prints the global page record for the given GID.
      def show(gid)
        client = Client::GlobalPage.new(options)
        puts "#{client.find(gid)}"
      end

      desc "content <gid>", "Show content of a globalpage"
      # Previews the successfully fetched content of a global page.
      def content(gid)
        client = Client::GlobalPage.new(options)
        preview_result(client.find_content(gid), 'content', 'Content')
      end

      desc "failedcontent <gid>", "Show failed content of a globalpage"
      # Previews the failed-fetch content of a global page.
      def failedcontent(gid)
        client = Client::GlobalPage.new(options)
        preview_result(client.find_failed_content(gid), 'failed content', 'Failed Content')
      end

      no_commands do
        # Shared preview logic for `content` and `failedcontent`.
        #
        # response: raw API response (stringified and JSON-parsed here).
        # label:    lowercase phrase for the "Preview ... url" line.
        # missing:  capitalized phrase for the "... does not exist" line.
        def preview_result(response, label, missing)
          result = JSON.parse(response.to_s)

          if result['available'] == true
            puts "Preview #{label} url: \"#{result['preview_url']}\""
            # Security: use system() with separate arguments instead of
            # backtick interpolation so a hostile preview_url cannot inject
            # shell commands. (`open` is the macOS URL opener, as before.)
            system('open', result['preview_url'])
          else
            puts "#{missing} does not exist"
          end
        end
      end

    end
  end
end
@@ -0,0 +1,30 @@
module AnswersEngine
  class CLI < Thor
    # Thor subcommand group for querying scrape jobs.
    class Job < Thor
      package_name "job"

      # Override Thor's banner so help output reads `answersengine job <usage>`.
      def self.banner(command, namespace = nil, subcommand = false)
        "#{basename} #{@package_name} #{command.usage}"
      end


      desc "list", "gets a list of jobs"
      option :page, :aliases => :p, type: :numeric, desc: 'Get the next set of records by page.'
      option :per_page, :aliases => :P, type: :numeric, desc: 'Number of records per page. Max 500 per page.'
      long_desc <<-LONGDESC
        List scrape jobs.
      LONGDESC
      # Prints all jobs visible to the account (paginated via options).
      def list()
        jobs = Client::Job.new(options)
        puts jobs.all().to_s
      end

      desc "show <job_id>", "Show a job"
      # Prints a single job record by its numeric ID.
      def show(job_id)
        jobs = Client::Job.new(options)
        puts jobs.find(job_id).to_s
      end

    end
  end

end
@@ -0,0 +1,69 @@
module AnswersEngine
  class CLI < Thor
    # Thor subcommand group for browsing output records and collections
    # produced by a scraper's job.
    class JobOutput < Thor

      package_name "scraper output"

      # Render help as `answersengine scraper output <usage>`.
      def self.banner(command, namespace = nil, subcommand = false)
        "#{basename} #{@package_name} #{command.usage}"
      end

      desc "list <scraper_name>", "List output records in a collection that is in the current job"
      long_desc <<-LONGDESC
        List all output records in a collection that is in the current job of a scraper\n
      LONGDESC
      option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
      option :page, :aliases => :p, type: :numeric, desc: 'Get the next set of records by page.'
      option :per_page, :aliases => :P, type: :numeric, desc: 'Number of records per page. Max 500 per page.'
      option :collection, :aliases => :c, desc: "Shows outputs from a specific collection.(defaults to 'default' collection)"
      option :query, :aliases => :q, type: :string, banner: :JSON, desc: 'Set search query. Must be in json format. i.e: {"Foo":"bar"} '
      # Lists output records; an explicit --job targets that job directly,
      # otherwise the scraper's current job is used.
      def list(scraper_name)
        collection = options.fetch(:collection) { 'default' }
        job_id = options[:job]
        if job_id
          puts Client::JobOutput.new(options).all(job_id, collection).to_s
        else
          puts Client::ScraperJobOutput.new(options).all(scraper_name, collection).to_s
        end
      end

      desc "show <scraper_name> <record_id>", "Show one output record in a collection that is in the current job of a scraper"
      long_desc <<-LONGDESC
        Shows an output record in a collection that is in the current job of a scraper\n
        <record_id>: ID of the output record.\x5
      LONGDESC
      option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
      option :collection, :aliases => :c, desc: "Shows output from a specific collection.(defaults to 'default' collection)"
      # Shows one output record by ID, from either --job or the current job.
      def show(scraper_name, id)
        collection = options.fetch(:collection) { 'default' }
        job_id = options[:job]
        if job_id
          puts Client::JobOutput.new(options).find(job_id, collection, id).to_s
        else
          puts Client::ScraperJobOutput.new(options).find(scraper_name, collection, id).to_s
        end
      end

      desc "collections <scraper_name>", "list job output collections that are inside a current job of a scraper."
      long_desc <<-LONGDESC
        List job output collections that are inside a current job of a scraper.\x5
      LONGDESC
      option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
      option :page, :aliases => :p, type: :numeric, desc: 'Get the next set of records by page.'
      option :per_page, :aliases => :P, type: :numeric, desc: 'Number of records per page. Max 500 per page.'
      # Lists the collections present in the targeted job.
      def collections(scraper_name)
        job_id = options[:job]
        if job_id
          puts Client::JobOutput.new(options).collections(job_id).to_s
        else
          puts Client::ScraperJobOutput.new(options).collections(scraper_name).to_s
        end
      end

    end
  end

end
@@ -0,0 +1,64 @@
module AnswersEngine
  class CLI < Thor
    # Thor subcommand group for trying and executing parser scripts
    # against job pages or global pages.
    class Parser < Thor
      desc "try <scraper_name> <parser_file> <GID>", "Tries a parser on a Job Page"
      long_desc <<-LONGDESC
        Takes a parser script and runs it against a job page\x5
        <parser_file>: Parser script file that will be executed on the page.\x5
        <GID>: Global ID of the page.\x5
      LONGDESC
      option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
      option :global, :aliases => :g, type: :boolean, default: false, desc: 'Use globalpage instead of a job page'
      option :vars, :aliases => :v, type: :string, desc: 'Set user-defined page variables. Must be in json format. i.e: {"Foo":"bar"}'
      # Dry-runs a parser against one page; nothing is saved (save flag is
      # false in the exec_parser_page call below).
      def try_parse(scraper_name, parser_file, gid)
        begin

          # Resolve the job context: explicit --job wins, --global means no
          # job (global page), otherwise look up the scraper's current job.
          if options[:job]
            job_id = options[:job]
          elsif options[:global]
            job_id = nil
          else
            job = Client::ScraperJob.new(options).find(scraper_name)
            job_id = job['id']
          end


          vars = JSON.parse(options[:vars]) if options[:vars]
          puts AnswersEngine::Scraper::Parser.exec_parser_page(parser_file, gid, job_id, false, vars)

        rescue JSON::ParserError
          # NOTE(review): when --vars is absent, a JSON::ParserError raised by
          # anything else in the begin block is swallowed silently here —
          # consider re-raising in that case.
          if options[:vars]
            puts "Error: #{options[:vars]} on vars is not a valid JSON"
          end
        end
      end

      desc "exec <scraper_name> <parser_file> <GID>...<GID>", "Executes a parser script on one or more Job Pages within a scraper's current job"
      long_desc <<-LONGDESC
        Takes a parser script executes it against a job page(s) and save the output to the scraper's current job\x5
        <parser_file>: Parser script file will be executed on the page.\x5
        <GID>: Global ID of the page.\x5
      LONGDESC
      option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
      # Runs the parser against each GID in turn, saving output to the job
      # (save flag is true). Errors on one GID are printed and do not stop
      # processing of the remaining GIDs.
      def exec_parse(scraper_name, parser_file, *gids)
        gids.each do |gid|
          begin
            puts "Parsing #{gid}"

            # Explicit --job wins; otherwise use the scraper's current job.
            if options[:job]
              job_id = options[:job]
            else
              job = Client::ScraperJob.new(options).find(scraper_name)
              job_id = job['id']
            end

            puts AnswersEngine::Scraper::Parser.exec_parser_page(parser_file, gid, job_id, true)
          rescue => e
            puts e
          end
        end
      end
    end
  end

end
@@ -0,0 +1,172 @@
module AnswersEngine
  class CLI < Thor
    # Thor subcommand group for managing scrapers: CRUD, deploy, start,
    # logs, stats, plus nested subcommands (job, deployment, output, page,
    # export, exporter).
    class Scraper < Thor
      desc "list", "List scrapers"

      long_desc <<-LONGDESC
        List all scrapers.
      LONGDESC
      option :page, :aliases => :p, type: :numeric, desc: 'Get the next set of records by page.'
      option :per_page, :aliases => :P, type: :numeric, desc: 'Number of records per page. Max 500 per page.'
      # Prints all scrapers on the account (paginated via options).
      def list
        client = Client::Scraper.new(options)
        puts "#{client.all}"
      end

      desc "create <scraper_name> <git_repository>", "Create a scraper"
      long_desc <<-LONGDESC
        Creates a scraper\x5
        <scraper_name>: Scraper name can only consist of alphabets, numbers, underscores and dashes. Name must be unique to your account.\x5
        <git_repository>: URL to a valid Git repository.\x5
      LONGDESC
      option :branch, :aliases => :b, desc: 'Set the Git branch to use. Default: master'
      option :freshness_type, :aliases => :t, desc: 'Set how fresh the page cache is. Possible values: day, week, month, year. Default: any'
      option :proxy_type, desc: 'Set the Proxy type. Default: standard'
      option :force_fetch, :aliases => :f, type: :boolean, desc: 'Set true to force fetch page that is not within freshness criteria. Default: false'
      option :workers, :aliases => :w, type: :numeric, desc: 'Set how many standard workers to use. Default: 1'
      option :browsers, type: :numeric, desc: 'Set how many browser workers to use. Default: 0'
      option :disable_scheduler, type: :boolean, desc: 'Set true to disable scheduler. Default: false'
      option :cancel_current_job, type: :boolean, desc: 'Set true to cancel currently active job if scheduler starts. Default: false'
      option :schedule, type: :string, desc: 'Set the schedule of the scraper to run. Must be in CRON format.'
      option :timezone, type: :string, desc: "Set the scheduler's timezone. Must be in IANA Timezone format. Defaults to \"America/Toronto\""
      # Creates a new scraper from a git repository.
      def create(scraper_name, git_repository)
        # NOTE(review): this echoes all CLI options to stdout — looks like a
        # leftover debug statement; consider removing.
        puts "options #{options}"
        client = Client::Scraper.new(options)
        puts "#{client.create(scraper_name, git_repository, options)}"
      end

      desc "update <scraper_name>", "Update a scraper"
      long_desc <<-LONGDESC
        Updates a scraper\x5
      LONGDESC
      option :branch, :aliases => :b, desc: 'Set the Git branch to use. Default: master'
      option :name, :aliases => :n, desc: 'Set the scraper name. Name can only consist of alphabets, numbers, underscores and dashes. Name must be unique to your account'
      option :repo, :aliases => :r, desc: 'Set the URL to a valid Git repository'
      option :freshness_type, :aliases => :t, desc: 'Set how fresh the page cache is. Possible values: day, week, month, year. Default: any'
      option :proxy_type, desc: 'Set the Proxy type. Default: standard'
      option :force_fetch, :aliases => :f, type: :boolean, desc: 'Set true to force fetch page that is not within freshness criteria. Default: false'
      option :workers, :aliases => :w, type: :numeric, desc: 'Set how many standard workers to use. Default: 1'
      option :browsers, type: :numeric, desc: 'Set how many browser workers to use. Default: 0'
      option :disable_scheduler, type: :boolean, desc: 'Set true to disable scheduler. Default: false'
      option :cancel_current_job, type: :boolean, desc: 'Set true to cancel currently active job if scheduler starts. Default: false'
      option :schedule, type: :string, desc: 'Set the schedule of the scraper to run. Must be in CRON format.'
      option :timezone, type: :string, desc: "Set the scheduler's timezone. Must be in IANA Timezone format. Defaults to \"America/Toronto\""
      # Updates an existing scraper's settings.
      def update(scraper_name)
        client = Client::Scraper.new(options)
        puts "#{client.update(scraper_name, options)}"
      end


      desc "show <scraper_name>", "Show a scraper"
      # Prints a single scraper record by name.
      def show(scraper_name)
        client = Client::Scraper.new(options)
        puts "#{client.find(scraper_name)}"
      end

      desc "deploy <scraper_name>", "Deploy a scraper"
      long_desc <<-LONGDESC
        Deploys a scraper
      LONGDESC
      # Triggers a deployment from the scraper's configured git repository.
      def deploy(scraper_name)
        client = Client::ScraperDeployment.new()
        puts "Deploying scraper. This may take a while..."
        puts "#{client.deploy(scraper_name)}"
      end

      desc "start <scraper_name>", "Creates a scraping job and runs it"
      long_desc <<-LONGDESC
        Starts a scraper by creating an active scrape job\x5
      LONGDESC
      option :workers, :aliases => :w, type: :numeric, desc: 'Set how many standard workers to use. Default: 1'
      option :browsers, type: :numeric, desc: 'Set how many browser workers to use. Default: 0'
      option :proxy_type, desc: 'Set the Proxy type. Default: standard'
      # Creates (and thereby starts) a new scrape job for the scraper.
      def start(scraper_name)
        client = Client::ScraperJob.new(options)
        puts "Starting a scrape job..."
        puts "#{client.create(scraper_name, options)}"
      end


      desc "log <scraper_name>", "List log entries related to a scraper's current job"
      long_desc <<-LONGDESC
        Shows log related to a scraper's current job. Defaults to showing the most recent entries\x5
      LONGDESC
      option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
      option :head, :aliases => :H, desc: 'Show the oldest log entries. If not set, newest entries is shown'
      option :parsing, :aliases => :p, type: :boolean, desc: 'Show only log entries related to parsing errors'
      option :seeding, :aliases => :s, type: :boolean, desc: 'Show only log entries related to seeding errors'
      option :more, :aliases => :m, desc: 'Show next set of log entries. Enter the `More token`'
      option :per_page, :aliases => :P, type: :numeric, desc: 'Number of records per page. Max 5000 per page.'
      # Fetches and prints log entries for a job (explicit --job) or the
      # scraper's current job, with optional filtering and pagination.
      def log(scraper_name)
        client = Client::JobLog.new(options)

        # Build the API query string from the CLI flags.
        # NOTE(review): `options.delete(...)` mutates Thor's options hash —
        # on Thor versions that freeze options this raises; verify.
        query = {}
        query["order"] = options.delete(:head) if options[:head]
        query["job_type"] = "parsing" if options[:parsing]
        query["job_type"] = "seeding" if options[:seeding]
        query["page_token"] = options.delete(:more) if options[:more]
        query["per_page"] = options.delete(:per_page) if options[:per_page]

        puts "Fetching logs..."

        if options[:job]
          result = client.all_job_log(options[:job], {query: query})
        else
          result = client.scraper_all_job_log(scraper_name, {query: query})
        end

        if result['entries'].nil? || result["entries"].length == 0
          puts "No logs yet, please try again later."
        else
          # more_token paginates: the user passes it back via --more.
          more_token = result["more_token"]

          result["entries"].each do |entry|
            puts "#{entry["timestamp"]} #{entry["severity"]}: #{entry["payload"]}" if entry.is_a?(Hash)
          end

          unless more_token.nil?
            puts "-----------"
            puts "To see more entries, add: \"--more #{more_token}\""
          end
        end
      end

      desc "stats <scraper_name>", "Get the current stat for a job"
      long_desc <<-LONGDESC
        Get stats for a scraper's current job\n
      LONGDESC
      option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
      # Prints current stats for --job, or for the scraper's current job.
      def stats(scraper_name)
        client = Client::JobStat.new(options)
        if options[:job]
          puts "#{client.job_current_stats(options[:job])}"
        else
          puts "#{client.scraper_job_current_stats(scraper_name)}"
        end

      end


      desc "job SUBCOMMAND ...ARGS", "manage scrapers jobs"
      subcommand "job", ScraperJob

      desc "deployment SUBCOMMAND ...ARGS", "manage scrapers deployments"
      subcommand "deployment", ScraperDeployment

      desc "output SUBCOMMAND ...ARGS", "view scraper outputs"
      subcommand "output", JobOutput

      desc "page SUBCOMMAND ...ARGS", "manage pages on a job"
      subcommand "page", ScraperPage

      desc "export SUBCOMMAND ...ARGS", "manage scraper's exports"
      subcommand "export", ScraperExport

      desc "exporter SUBCOMMAND ...ARGS", "manage scraper's exporters"
      subcommand "exporter", ScraperExporter


    end
  end

end
@@ -0,0 +1,24 @@
module AnswersEngine
  class CLI < Thor
    # Thor subcommand group for listing scraper deployments.
    class ScraperDeployment < Thor

      package_name "scraper deployment"

      # Render help as `answersengine scraper deployment <usage>`.
      def self.banner(command, namespace = nil, subcommand = false)
        "#{basename} #{@package_name} #{command.usage}"
      end


      desc "list <scraper_name>", "List deployments on a scraper"
      long_desc <<-LONGDESC
        List deployments on a scraper.
      LONGDESC
      option :page, :aliases => :p, type: :numeric, desc: 'Get the next set of records by page.'
      option :per_page, :aliases => :P, type: :numeric, desc: 'Number of records per page. Max 500 per page.'
      # Prints every deployment recorded for the named scraper.
      def list(scraper_name)
        deployments = Client::ScraperDeployment.new(options)
        puts deployments.all(scraper_name).to_s
      end
    end
  end

end