answersengine 0.2.33

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65)
  1. checksums.yaml +7 -0
  2. data/.gitignore +12 -0
  3. data/.travis.yml +7 -0
  4. data/CODE_OF_CONDUCT.md +74 -0
  5. data/Gemfile +6 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +30 -0
  8. data/Rakefile +22 -0
  9. data/answersengine.gemspec +45 -0
  10. data/bin/console +14 -0
  11. data/bin/setup +8 -0
  12. data/examples/fetchtest/libraries/hello.rb +9 -0
  13. data/examples/fetchtest/libraries/hello_fail.rb +10 -0
  14. data/examples/fetchtest/parsers/failed.rb +2 -0
  15. data/examples/fetchtest/parsers/find_outputs.rb +18 -0
  16. data/examples/fetchtest/parsers/home.rb +50 -0
  17. data/examples/fetchtest/parsers/nested_fail.rb +3 -0
  18. data/examples/fetchtest/parsers/simple.rb +14 -0
  19. data/examples/fetchtest/seeders/csv_seeder.rb +12 -0
  20. data/examples/fetchtest/seeders/failed.rb +1 -0
  21. data/examples/fetchtest/seeders/list_of_urls.csv +5 -0
  22. data/examples/fetchtest/seeders/seed.rb +28 -0
  23. data/examples/fetchtest/seeders/test_reset_page.rb +4 -0
  24. data/exe/answersengine +3 -0
  25. data/lib/answersengine.rb +5 -0
  26. data/lib/answersengine/cli.rb +33 -0
  27. data/lib/answersengine/cli/global_page.rb +39 -0
  28. data/lib/answersengine/cli/job.rb +30 -0
  29. data/lib/answersengine/cli/job_output.rb +69 -0
  30. data/lib/answersengine/cli/parser.rb +64 -0
  31. data/lib/answersengine/cli/scraper.rb +172 -0
  32. data/lib/answersengine/cli/scraper_deployment.rb +24 -0
  33. data/lib/answersengine/cli/scraper_export.rb +51 -0
  34. data/lib/answersengine/cli/scraper_exporter.rb +40 -0
  35. data/lib/answersengine/cli/scraper_job.rb +71 -0
  36. data/lib/answersengine/cli/scraper_page.rb +200 -0
  37. data/lib/answersengine/cli/seeder.rb +40 -0
  38. data/lib/answersengine/client.rb +23 -0
  39. data/lib/answersengine/client/backblaze_content.rb +45 -0
  40. data/lib/answersengine/client/base.rb +50 -0
  41. data/lib/answersengine/client/export.rb +10 -0
  42. data/lib/answersengine/client/global_page.rb +18 -0
  43. data/lib/answersengine/client/job.rb +53 -0
  44. data/lib/answersengine/client/job_export.rb +10 -0
  45. data/lib/answersengine/client/job_log.rb +27 -0
  46. data/lib/answersengine/client/job_output.rb +19 -0
  47. data/lib/answersengine/client/job_page.rb +62 -0
  48. data/lib/answersengine/client/job_stat.rb +16 -0
  49. data/lib/answersengine/client/scraper.rb +54 -0
  50. data/lib/answersengine/client/scraper_deployment.rb +17 -0
  51. data/lib/answersengine/client/scraper_export.rb +22 -0
  52. data/lib/answersengine/client/scraper_exporter.rb +14 -0
  53. data/lib/answersengine/client/scraper_job.rb +49 -0
  54. data/lib/answersengine/client/scraper_job_output.rb +19 -0
  55. data/lib/answersengine/client/scraper_job_page.rb +55 -0
  56. data/lib/answersengine/plugin.rb +6 -0
  57. data/lib/answersengine/plugin/context_exposer.rb +55 -0
  58. data/lib/answersengine/scraper.rb +16 -0
  59. data/lib/answersengine/scraper/executor.rb +292 -0
  60. data/lib/answersengine/scraper/parser.rb +18 -0
  61. data/lib/answersengine/scraper/ruby_parser_executor.rb +141 -0
  62. data/lib/answersengine/scraper/ruby_seeder_executor.rb +114 -0
  63. data/lib/answersengine/scraper/seeder.rb +18 -0
  64. data/lib/answersengine/version.rb +3 -0
  65. metadata +255 -0
data/exe/answersengine ADDED
@@ -0,0 +1,3 @@
1
+ #!/usr/bin/env ruby
2
+ require 'answersengine/cli'
3
+ AnswersEngine::CLI.start
@@ -0,0 +1,5 @@
1
+ require "answersengine/version"
2
+ require "answersengine/scraper"
3
+
4
+ module AnswersEngine
5
+ end
@@ -0,0 +1,33 @@
1
+ require 'thor'
2
+ require 'answersengine/scraper'
3
+ require 'answersengine/cli/scraper_exporter'
4
+ require 'answersengine/cli/scraper_export'
5
+ require 'answersengine/cli/scraper_job'
6
+ require 'answersengine/cli/global_page'
7
+ require 'answersengine/cli/scraper_page'
8
+ require 'answersengine/cli/job_output'
9
+ require 'answersengine/cli/job'
10
+ require 'answersengine/cli/scraper_deployment'
11
+ require 'answersengine/cli/scraper'
12
+ require 'answersengine/cli/parser'
13
+ require 'answersengine/cli/seeder'
14
+
15
+
16
+ module AnswersEngine
17
+ class CLI < Thor
18
+ desc "scraper SUBCOMMAND ...ARGS", "manage scrapers"
19
+ subcommand "scraper", Scraper
20
+
21
+ desc "job SUBCOMMAND ...ARGS", "manage scrapers jobs"
22
+ subcommand "job", Job
23
+
24
+ desc "globalpage SUBCOMMAND ...ARGS", "interacts with global page"
25
+ subcommand "globalpage", GlobalPage
26
+
27
+ desc "parser SUBCOMMAND ...ARGS", "for parsing related activities"
28
+ subcommand "parser", Parser
29
+
30
+ desc "seeder SUBCOMMAND ...ARGS", "for seeding related activities"
31
+ subcommand "seeder", Seeder
32
+ end
33
+ end
@@ -0,0 +1,39 @@
1
+ module AnswersEngine
2
+ class CLI < Thor
3
+ class GlobalPage < Thor
4
+
5
+ desc "show <gid>", "Show a global page"
6
+ def show(gid)
7
+ client = Client::GlobalPage.new(options)
8
+ puts "#{client.find(gid)}"
9
+ end
10
+
11
+ desc "content <gid>", "Show content of a globalpage"
12
+ def content(gid)
13
+ client = Client::GlobalPage.new(options)
14
+ result = JSON.parse(client.find_content(gid).to_s)
15
+
16
+ if result['available'] == true
17
+ puts "Preview content url: \"#{result['preview_url']}\""
18
+ `open "#{result['preview_url']}"`
19
+ else
20
+ puts "Content does not exist"
21
+ end
22
+ end
23
+
24
+ desc "failedcontent <gid>", "Show failed content of a globalpage"
25
+ def failedcontent(gid)
26
+ client = Client::GlobalPage.new(options)
27
+ result = JSON.parse(client.find_failed_content(gid).to_s)
28
+
29
+ if result['available'] == true
30
+ puts "Preview failed content url: \"#{result['preview_url']}\""
31
+ `open "#{result['preview_url']}"`
32
+ else
33
+ puts "Failed Content does not exist"
34
+ end
35
+ end
36
+
37
+ end
38
+ end
39
+ end
@@ -0,0 +1,30 @@
1
+ module AnswersEngine
2
+ class CLI < Thor
3
+ class Job < Thor
4
+ package_name "job"
5
+ def self.banner(command, namespace = nil, subcommand = false)
6
+ "#{basename} #{@package_name} #{command.usage}"
7
+ end
8
+
9
+
10
+ desc "list", "gets a list of jobs"
11
+ option :page, :aliases => :p, type: :numeric, desc: 'Get the next set of records by page.'
12
+ option :per_page, :aliases => :P, type: :numeric, desc: 'Number of records per page. Max 500 per page.'
13
+ long_desc <<-LONGDESC
14
+ List scrape jobs.
15
+ LONGDESC
16
+ def list()
17
+ client = Client::Job.new(options)
18
+ puts "#{client.all()}"
19
+ end
20
+
21
+ desc "show <job_id>", "Show a job"
22
+ def show(job_id)
23
+ client = Client::Job.new(options)
24
+ puts "#{client.find(job_id)}"
25
+ end
26
+
27
+ end
28
+ end
29
+
30
+ end
@@ -0,0 +1,69 @@
1
+ module AnswersEngine
2
+ class CLI < Thor
3
+ class JobOutput < Thor
4
+
5
+ package_name "scraper output"
6
+ def self.banner(command, namespace = nil, subcommand = false)
7
+ "#{basename} #{@package_name} #{command.usage}"
8
+ end
9
+
10
+ desc "list <scraper_name>", "List output records in a collection that is in the current job"
11
+ long_desc <<-LONGDESC
12
+ List all output records in a collection that is in the current job of a scraper\n
13
+ LONGDESC
14
+ option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
15
+ option :page, :aliases => :p, type: :numeric, desc: 'Get the next set of records by page.'
16
+ option :per_page, :aliases => :P, type: :numeric, desc: 'Number of records per page. Max 500 per page.'
17
+ option :collection, :aliases => :c, desc: "Shows outputs from a specific collection.(defaults to 'default' collection)"
18
+ option :query, :aliases => :q, type: :string, banner: :JSON, desc: 'Set search query. Must be in json format. i.e: {"Foo":"bar"} '
19
+ def list(scraper_name)
20
+ collection = options.fetch(:collection) { 'default' }
21
+ if options[:job]
22
+ client = Client::JobOutput.new(options)
23
+ puts "#{client.all(options[:job], collection)}"
24
+ else
25
+ client = Client::ScraperJobOutput.new(options)
26
+ puts "#{client.all(scraper_name, collection)}"
27
+ end
28
+ end
29
+
30
+ desc "show <scraper_name> <record_id>", "Show one output record in a collection that is in the current job of a scraper"
31
+ long_desc <<-LONGDESC
32
+ Shows an output record in a collection that is in the current job of a scraper\n
33
+ <record_id>: ID of the output record.\x5
34
+ LONGDESC
35
+ option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
36
+ option :collection, :aliases => :c, desc: "Shows output from a specific collection.(defaults to 'default' collection)"
37
+ def show(scraper_name, id)
38
+ collection = options.fetch(:collection) { 'default' }
39
+ if options[:job]
40
+ client = Client::JobOutput.new(options)
41
+ puts "#{client.find(options[:job], collection, id)}"
42
+ else
43
+ client = Client::ScraperJobOutput.new(options)
44
+ puts "#{client.find(scraper_name, collection, id)}"
45
+ end
46
+ end
47
+
48
+ desc "collections <scraper_name>", "list job output collections that are inside a current job of a scraper."
49
+ long_desc <<-LONGDESC
50
+ List job output collections that are inside a current job of a scraper.\x5
51
+ LONGDESC
52
+ option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
53
+ option :page, :aliases => :p, type: :numeric, desc: 'Get the next set of records by page.'
54
+ option :per_page, :aliases => :P, type: :numeric, desc: 'Number of records per page. Max 500 per page.'
55
+ def collections(scraper_name)
56
+
57
+ if options[:job]
58
+ client = Client::JobOutput.new(options)
59
+ puts "#{client.collections(options[:job])}"
60
+ else
61
+ client = Client::ScraperJobOutput.new(options)
62
+ puts "#{client.collections(scraper_name)}"
63
+ end
64
+ end
65
+
66
+ end
67
+ end
68
+
69
+ end
@@ -0,0 +1,64 @@
1
+ module AnswersEngine
2
+ class CLI < Thor
3
+ class Parser < Thor
4
+ desc "try <scraper_name> <parser_file> <GID>", "Tries a parser on a Job Page"
5
+ long_desc <<-LONGDESC
6
+ Takes a parser script and runs it against a job page\x5
7
+ <parser_file>: Parser script file that will be executed on the page.\x5
8
+ <GID>: Global ID of the page.\x5
9
+ LONGDESC
10
+ option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
11
+ option :global, :aliases => :g, type: :boolean, default: false, desc: 'Use globalpage instead of a job page'
12
+ option :vars, :aliases => :v, type: :string, desc: 'Set user-defined page variables. Must be in json format. i.e: {"Foo":"bar"}'
13
+ def try_parse(scraper_name, parser_file, gid)
14
+ begin
15
+
16
+ if options[:job]
17
+ job_id = options[:job]
18
+ elsif options[:global]
19
+ job_id = nil
20
+ else
21
+ job = Client::ScraperJob.new(options).find(scraper_name)
22
+ job_id = job['id']
23
+ end
24
+
25
+
26
+ vars = JSON.parse(options[:vars]) if options[:vars]
27
+ puts AnswersEngine::Scraper::Parser.exec_parser_page(parser_file, gid, job_id, false, vars)
28
+
29
+ rescue JSON::ParserError
30
+ if options[:vars]
31
+ puts "Error: #{options[:vars]} on vars is not a valid JSON"
32
+ end
33
+ end
34
+ end
35
+
36
+ desc "exec <scraper_name> <parser_file> <GID>...<GID>", "Executes a parser script on one or more Job Pages within a scraper's current job"
37
+ long_desc <<-LONGDESC
38
+ Takes a parser script executes it against a job page(s) and save the output to the scraper's current job\x5
39
+ <parser_file>: Parser script file will be executed on the page.\x5
40
+ <GID>: Global ID of the page.\x5
41
+ LONGDESC
42
+ option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
43
+ def exec_parse(scraper_name, parser_file, *gids)
44
+ gids.each do |gid|
45
+ begin
46
+ puts "Parsing #{gid}"
47
+
48
+ if options[:job]
49
+ job_id = options[:job]
50
+ else
51
+ job = Client::ScraperJob.new(options).find(scraper_name)
52
+ job_id = job['id']
53
+ end
54
+
55
+ puts AnswersEngine::Scraper::Parser.exec_parser_page(parser_file, gid, job_id, true)
56
+ rescue => e
57
+ puts e
58
+ end
59
+ end
60
+ end
61
+ end
62
+ end
63
+
64
+ end
@@ -0,0 +1,172 @@
1
+ module AnswersEngine
2
+ class CLI < Thor
3
+ class Scraper < Thor
4
+ desc "list", "List scrapers"
5
+
6
+ long_desc <<-LONGDESC
7
+ List all scrapers.
8
+ LONGDESC
9
+ option :page, :aliases => :p, type: :numeric, desc: 'Get the next set of records by page.'
10
+ option :per_page, :aliases => :P, type: :numeric, desc: 'Number of records per page. Max 500 per page.'
11
+ def list
12
+ client = Client::Scraper.new(options)
13
+ puts "#{client.all}"
14
+ end
15
+
16
+ desc "create <scraper_name> <git_repository>", "Create a scraper"
17
+ long_desc <<-LONGDESC
18
+ Creates a scraper\x5
19
+ <scraper_name>: Scraper name can only consist of alphabets, numbers, underscores and dashes. Name must be unique to your account.\x5
20
+ <git_repository>: URL to a valid Git repository.\x5
21
+ LONGDESC
22
+ option :branch, :aliases => :b, desc: 'Set the Git branch to use. Default: master'
23
+ option :freshness_type, :aliases => :t, desc: 'Set how fresh the page cache is. Possible values: day, week, month, year. Default: any'
24
+ option :proxy_type, desc: 'Set the Proxy type. Default: standard'
25
+ option :force_fetch, :aliases => :f, type: :boolean, desc: 'Set true to force fetch page that is not within freshness criteria. Default: false'
26
+ option :workers, :aliases => :w, type: :numeric, desc: 'Set how many standard workers to use. Default: 1'
27
+ option :browsers, type: :numeric, desc: 'Set how many browser workers to use. Default: 0'
28
+ option :disable_scheduler, type: :boolean, desc: 'Set true to disable scheduler. Default: false'
29
+ option :cancel_current_job, type: :boolean, desc: 'Set true to cancel currently active job if scheduler starts. Default: false'
30
+ option :schedule, type: :string, desc: 'Set the schedule of the scraper to run. Must be in CRON format.'
31
+ option :timezone, type: :string, desc: "Set the scheduler's timezone. Must be in IANA Timezone format. Defaults to \"America/Toronto\""
32
+ def create(scraper_name, git_repository)
33
+ puts "options #{options}"
34
+ client = Client::Scraper.new(options)
35
+ puts "#{client.create(scraper_name, git_repository, options)}"
36
+ end
37
+
38
+ desc "update <scraper_name>", "Update a scraper"
39
+ long_desc <<-LONGDESC
40
+ Updates a scraper\x5
41
+ LONGDESC
42
+ option :branch, :aliases => :b, desc: 'Set the Git branch to use. Default: master'
43
+ option :name, :aliases => :n, desc: 'Set the scraper name. Name can only consist of alphabets, numbers, underscores and dashes. Name must be unique to your account'
44
+ option :repo, :aliases => :r, desc: 'Set the URL to a valid Git repository'
45
+ option :freshness_type, :aliases => :t, desc: 'Set how fresh the page cache is. Possible values: day, week, month, year. Default: any'
46
+ option :proxy_type, desc: 'Set the Proxy type. Default: standard'
47
+ option :force_fetch, :aliases => :f, type: :boolean, desc: 'Set true to force fetch page that is not within freshness criteria. Default: false'
48
+ option :workers, :aliases => :w, type: :numeric, desc: 'Set how many standard workers to use. Default: 1'
49
+ option :browsers, type: :numeric, desc: 'Set how many browser workers to use. Default: 0'
50
+ option :disable_scheduler, type: :boolean, desc: 'Set true to disable scheduler. Default: false'
51
+ option :cancel_current_job, type: :boolean, desc: 'Set true to cancel currently active job if scheduler starts. Default: false'
52
+ option :schedule, type: :string, desc: 'Set the schedule of the scraper to run. Must be in CRON format.'
53
+ option :timezone, type: :string, desc: "Set the scheduler's timezone. Must be in IANA Timezone format. Defaults to \"America/Toronto\""
54
+ def update(scraper_name)
55
+ client = Client::Scraper.new(options)
56
+ puts "#{client.update(scraper_name, options)}"
57
+ end
58
+
59
+
60
+ desc "show <scraper_name>", "Show a scraper"
61
+ def show(scraper_name)
62
+ client = Client::Scraper.new(options)
63
+ puts "#{client.find(scraper_name)}"
64
+ end
65
+
66
+ desc "deploy <scraper_name>", "Deploy a scraper"
67
+ long_desc <<-LONGDESC
68
+ Deploys a scraper
69
+ LONGDESC
70
+ def deploy(scraper_name)
71
+ client = Client::ScraperDeployment.new()
72
+ puts "Deploying scraper. This may take a while..."
73
+ puts "#{client.deploy(scraper_name)}"
74
+ end
75
+
76
+ desc "start <scraper_name>", "Creates a scraping job and runs it"
77
+ long_desc <<-LONGDESC
78
+ Starts a scraper by creating an active scrape job\x5
79
+ LONGDESC
80
+ option :workers, :aliases => :w, type: :numeric, desc: 'Set how many standard workers to use. Default: 1'
81
+ option :browsers, type: :numeric, desc: 'Set how many browser workers to use. Default: 0'
82
+ option :proxy_type, desc: 'Set the Proxy type. Default: standard'
83
+ def start(scraper_name)
84
+ client = Client::ScraperJob.new(options)
85
+ puts "Starting a scrape job..."
86
+ puts "#{client.create(scraper_name, options)}"
87
+ end
88
+
89
+
90
+ desc "log <scraper_name>", "List log entries related to a scraper's current job"
91
+ long_desc <<-LONGDESC
92
+ Shows log related to a scraper's current job. Defaults to showing the most recent entries\x5
93
+ LONGDESC
94
+ option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
95
+ option :head, :aliases => :H, desc: 'Show the oldest log entries. If not set, newest entries is shown'
96
+ option :parsing, :aliases => :p, type: :boolean, desc: 'Show only log entries related to parsing errors'
97
+ option :seeding, :aliases => :s, type: :boolean, desc: 'Show only log entries related to seeding errors'
98
+ option :more, :aliases => :m, desc: 'Show next set of log entries. Enter the `More token`'
99
+ option :per_page, :aliases => :P, type: :numeric, desc: 'Number of records per page. Max 5000 per page.'
100
+ def log(scraper_name)
101
+ client = Client::JobLog.new(options)
102
+
103
+ query = {}
104
+ query["order"] = options.delete(:head) if options[:head]
105
+ query["job_type"] = "parsing" if options[:parsing]
106
+ query["job_type"] = "seeding" if options[:seeding]
107
+ query["page_token"] = options.delete(:more) if options[:more]
108
+ query["per_page"] = options.delete(:per_page) if options[:per_page]
109
+
110
+ puts "Fetching logs..."
111
+
112
+ if options[:job]
113
+ result = client.all_job_log(options[:job], {query: query})
114
+ else
115
+ result = client.scraper_all_job_log(scraper_name, {query: query})
116
+ end
117
+
118
+ if result['entries'].nil? || result["entries"].length == 0
119
+ puts "No logs yet, please try again later."
120
+ else
121
+ more_token = result["more_token"]
122
+
123
+ result["entries"].each do |entry|
124
+ puts "#{entry["timestamp"]} #{entry["severity"]}: #{entry["payload"]}" if entry.is_a?(Hash)
125
+ end
126
+
127
+ unless more_token.nil?
128
+ puts "-----------"
129
+ puts "To see more entries, add: \"--more #{more_token}\""
130
+ end
131
+ end
132
+ end
133
+
134
+ desc "stats <scraper_name>", "Get the current stat for a job"
135
+ long_desc <<-LONGDESC
136
+ Get stats for a scraper's current job\n
137
+ LONGDESC
138
+ option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
139
+ def stats(scraper_name)
140
+ client = Client::JobStat.new(options)
141
+ if options[:job]
142
+ puts "#{client.job_current_stats(options[:job])}"
143
+ else
144
+ puts "#{client.scraper_job_current_stats(scraper_name)}"
145
+ end
146
+
147
+ end
148
+
149
+
150
+ desc "job SUBCOMMAND ...ARGS", "manage scrapers jobs"
151
+ subcommand "job", ScraperJob
152
+
153
+ desc "deployment SUBCOMMAND ...ARGS", "manage scrapers deployments"
154
+ subcommand "deployment", ScraperDeployment
155
+
156
+ desc "output SUBCOMMAND ...ARGS", "view scraper outputs"
157
+ subcommand "output", JobOutput
158
+
159
+ desc "page SUBCOMMAND ...ARGS", "manage pages on a job"
160
+ subcommand "page", ScraperPage
161
+
162
+ desc "export SUBCOMMAND ...ARGS", "manage scraper's exports"
163
+ subcommand "export", ScraperExport
164
+
165
+ desc "exporter SUBCOMMAND ...ARGS", "manage scraper's exporters"
166
+ subcommand "exporter", ScraperExporter
167
+
168
+
169
+ end
170
+ end
171
+
172
+ end
@@ -0,0 +1,24 @@
1
+ module AnswersEngine
2
+ class CLI < Thor
3
+ class ScraperDeployment < Thor
4
+
5
+ package_name "scraper deployment"
6
+ def self.banner(command, namespace = nil, subcommand = false)
7
+ "#{basename} #{@package_name} #{command.usage}"
8
+ end
9
+
10
+
11
+ desc "list <scraper_name>", "List deployments on a scraper"
12
+ long_desc <<-LONGDESC
13
+ List deployments on a scraper.
14
+ LONGDESC
15
+ option :page, :aliases => :p, type: :numeric, desc: 'Get the next set of records by page.'
16
+ option :per_page, :aliases => :P, type: :numeric, desc: 'Number of records per page. Max 500 per page.'
17
+ def list(scraper_name)
18
+ client = Client::ScraperDeployment.new(options)
19
+ puts "#{client.all(scraper_name)}"
20
+ end
21
+ end
22
+ end
23
+
24
+ end