answersengine 0.2.33

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65)
  1. checksums.yaml +7 -0
  2. data/.gitignore +12 -0
  3. data/.travis.yml +7 -0
  4. data/CODE_OF_CONDUCT.md +74 -0
  5. data/Gemfile +6 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +30 -0
  8. data/Rakefile +22 -0
  9. data/answersengine.gemspec +45 -0
  10. data/bin/console +14 -0
  11. data/bin/setup +8 -0
  12. data/examples/fetchtest/libraries/hello.rb +9 -0
  13. data/examples/fetchtest/libraries/hello_fail.rb +10 -0
  14. data/examples/fetchtest/parsers/failed.rb +2 -0
  15. data/examples/fetchtest/parsers/find_outputs.rb +18 -0
  16. data/examples/fetchtest/parsers/home.rb +50 -0
  17. data/examples/fetchtest/parsers/nested_fail.rb +3 -0
  18. data/examples/fetchtest/parsers/simple.rb +14 -0
  19. data/examples/fetchtest/seeders/csv_seeder.rb +12 -0
  20. data/examples/fetchtest/seeders/failed.rb +1 -0
  21. data/examples/fetchtest/seeders/list_of_urls.csv +5 -0
  22. data/examples/fetchtest/seeders/seed.rb +28 -0
  23. data/examples/fetchtest/seeders/test_reset_page.rb +4 -0
  24. data/exe/answersengine +3 -0
  25. data/lib/answersengine.rb +5 -0
  26. data/lib/answersengine/cli.rb +33 -0
  27. data/lib/answersengine/cli/global_page.rb +39 -0
  28. data/lib/answersengine/cli/job.rb +30 -0
  29. data/lib/answersengine/cli/job_output.rb +69 -0
  30. data/lib/answersengine/cli/parser.rb +64 -0
  31. data/lib/answersengine/cli/scraper.rb +172 -0
  32. data/lib/answersengine/cli/scraper_deployment.rb +24 -0
  33. data/lib/answersengine/cli/scraper_export.rb +51 -0
  34. data/lib/answersengine/cli/scraper_exporter.rb +40 -0
  35. data/lib/answersengine/cli/scraper_job.rb +71 -0
  36. data/lib/answersengine/cli/scraper_page.rb +200 -0
  37. data/lib/answersengine/cli/seeder.rb +40 -0
  38. data/lib/answersengine/client.rb +23 -0
  39. data/lib/answersengine/client/backblaze_content.rb +45 -0
  40. data/lib/answersengine/client/base.rb +50 -0
  41. data/lib/answersengine/client/export.rb +10 -0
  42. data/lib/answersengine/client/global_page.rb +18 -0
  43. data/lib/answersengine/client/job.rb +53 -0
  44. data/lib/answersengine/client/job_export.rb +10 -0
  45. data/lib/answersengine/client/job_log.rb +27 -0
  46. data/lib/answersengine/client/job_output.rb +19 -0
  47. data/lib/answersengine/client/job_page.rb +62 -0
  48. data/lib/answersengine/client/job_stat.rb +16 -0
  49. data/lib/answersengine/client/scraper.rb +54 -0
  50. data/lib/answersengine/client/scraper_deployment.rb +17 -0
  51. data/lib/answersengine/client/scraper_export.rb +22 -0
  52. data/lib/answersengine/client/scraper_exporter.rb +14 -0
  53. data/lib/answersengine/client/scraper_job.rb +49 -0
  54. data/lib/answersengine/client/scraper_job_output.rb +19 -0
  55. data/lib/answersengine/client/scraper_job_page.rb +55 -0
  56. data/lib/answersengine/plugin.rb +6 -0
  57. data/lib/answersengine/plugin/context_exposer.rb +55 -0
  58. data/lib/answersengine/scraper.rb +16 -0
  59. data/lib/answersengine/scraper/executor.rb +292 -0
  60. data/lib/answersengine/scraper/parser.rb +18 -0
  61. data/lib/answersengine/scraper/ruby_parser_executor.rb +141 -0
  62. data/lib/answersengine/scraper/ruby_seeder_executor.rb +114 -0
  63. data/lib/answersengine/scraper/seeder.rb +18 -0
  64. data/lib/answersengine/version.rb +3 -0
  65. metadata +255 -0
data/lib/answersengine/client/scraper_job_output.rb
@@ -0,0 +1,19 @@
+ module AnswersEngine
+   module Client
+     class ScraperJobOutput < AnswersEngine::Client::Base
+       def find(scraper_name, collection, id)
+         self.class.get("/scrapers/#{scraper_name}/current_job/output/collections/#{collection}/records/#{id}", @options)
+       end
+ 
+       def all(scraper_name, collection = 'default')
+ 
+         self.class.get("/scrapers/#{scraper_name}/current_job/output/collections/#{collection}/records", @options)
+       end
+ 
+       def collections(scraper_name)
+         self.class.get("/scrapers/#{scraper_name}/current_job/output/collections", @options)
+       end
+     end
+   end
+ end
+ 
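For orientation, the three endpoints above read the current job's output API. A minimal usage sketch, assuming AnswersEngine::Client::Base has been configured with an API token (the gem reads ANSWERSENGINE_TOKEN elsewhere in this diff) and that the HTTParty-style parsed_response is available on the returned responses; the scraper and record names are invented:

    require 'answersengine'

    client = AnswersEngine::Client::ScraperJobOutput.new

    # List the output collections of the scraper's current job
    puts client.collections('my_scraper').parsed_response

    # All records in the 'default' collection, then one record by id
    puts client.all('my_scraper').parsed_response
    puts client.find('my_scraper', 'default', 'some_record_id').parsed_response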
data/lib/answersengine/client/scraper_job_page.rb
@@ -0,0 +1,55 @@
+ module AnswersEngine
+   module Client
+     class ScraperJobPage < AnswersEngine::Client::Base
+       def find(scraper_name, gid)
+         self.class.get("/scrapers/#{scraper_name}/current_job/pages/#{gid}", @options)
+       end
+ 
+       def all(scraper_name, opts={})
+         self.class.get("/scrapers/#{scraper_name}/current_job/pages", @options)
+       end
+ 
+       def update(scraper_name, gid, opts={})
+         body = {}
+         body[:page_type] = opts[:page_type] if opts[:page_type]
+         body[:priority] = opts[:priority] if opts[:priority]
+         body[:vars] = opts[:vars] if opts[:vars]
+ 
+         @options.merge!({body: body.to_json})
+ 
+         self.class.put("/scrapers/#{scraper_name}/current_job/pages/#{gid}", @options)
+       end
+ 
+       def refetch(scraper_name, opts={})
+         self.class.put("/scrapers/#{scraper_name}/current_job/pages/refetch", @options)
+       end
+ 
+       def reset(scraper_name, gid, opts={})
+         self.class.put("/scrapers/#{scraper_name}/current_job/pages/#{gid}/reset", @options)
+       end
+ 
+       def enqueue(scraper_name, method, url, opts={})
+         body = {}
+         body[:method] = method != "" ? method : "GET"
+         body[:url] = url
+         body[:page_type] = opts[:page_type] if opts[:page_type]
+         body[:priority] = opts[:priority] if opts[:priority]
+         body[:fetch_type] = opts[:fetch_type] if opts[:fetch_type]
+         body[:body] = opts[:body] if opts[:body]
+         body[:headers] = opts[:headers] if opts[:headers]
+         body[:vars] = opts[:vars] if opts[:vars]
+         body[:force_fetch] = opts[:force_fetch] if opts[:force_fetch]
+         body[:freshness] = opts[:freshness] if opts[:freshness]
+         body[:ua_type] = opts[:ua_type] if opts[:ua_type]
+         body[:no_redirect] = opts[:no_redirect] if opts[:no_redirect]
+         body[:cookie] = opts[:cookie] if opts[:cookie]
+ 
+         @options.merge!({body: body.to_json})
+ 
+         self.class.post("/scrapers/#{scraper_name}/current_job/pages", @options)
+       end
+ 
+     end
+   end
+ end
+ 
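Of the page endpoints above, enqueue carries the most options. A hedged sketch of driving it (the scraper name, URL, page_type, and vars are illustrative; note how an empty method string falls back to "GET" in body[:method]):

    require 'answersengine'

    pages = AnswersEngine::Client::ScraperJobPage.new

    # Enqueue a new GET page on the scraper's current job
    response = pages.enqueue('my_scraper', '', 'https://example.com/products',
      page_type: 'listings',
      priority: 1,
      vars: { category: 'books' })
    puts response.parsed_response

    # Later, a page can be updated or reset by its GID (hypothetical value):
    # pages.update('my_scraper', some_gid, page_type: 'details')
    # pages.reset('my_scraper', some_gid)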
data/lib/answersengine/plugin.rb
@@ -0,0 +1,6 @@
+ require 'answersengine/plugin/context_exposer'
+ 
+ module AnswersEngine
+   module Plugin
+   end
+ end
data/lib/answersengine/plugin/context_exposer.rb
@@ -0,0 +1,55 @@
+ module AnswersEngine
+   module Plugin
+     module ContextExposer
+       def self.exposed_methods
+         raise NotImplementedError.new('Specify methods exposed to isolated env')
+       end
+ 
+       def exposed_methods
+         self.class.exposed_methods
+       end
+ 
+       # Create lambda to retrieve a variable or call instance method
+       def var_or_proc vars, key
+         myself = self # Avoid stack overflow
+         return lambda{vars[key]} if vars.has_key?(key)
+         lambda{|*args| myself.send(key, *args)}
+       end
+ 
+       def exposed_env vars
+         keys = exposed_methods + vars.keys
+         Hash[keys.uniq.map{|key|[key, var_or_proc(vars, key)]}]
+       end
+ 
+       def expose_to object, env
+         metaclass = class << object; self; end
+         env.each do |key, block|
+           metaclass.send(:define_method, key, block)
+         end
+         object
+       end
+ 
+       # Create isolated context object from self
+       def create_context vars = {}
+         create_top_object_script = '(
+           lambda do
+             object = Object.new
+             metaclass = class << object
+               define_method(:context_binding){binding}
+             end
+             object
+           end
+         ).call'
+         object = TOPLEVEL_BINDING.eval(create_top_object_script)
+         env = exposed_env(vars)
+         expose_to object, env
+         object
+       end
+ 
+       # Create an isolated binding
+       def isolated_binding vars = {}
+         create_context(vars).context_binding
+       end
+     end
+   end
+ end
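To make the mechanics above concrete: exposed_methods declares a whitelist, var_or_proc wraps each injected variable or whitelisted method in a lambda, and expose_to pins those lambdas onto a bare top-level object whose binding is then handed out. A small sketch under that reading (the Greeter class and its method are invented for illustration):

    require 'answersengine'

    class Greeter
      include AnswersEngine::Plugin::ContextExposer

      # Whitelist of methods visible inside the isolated binding
      def self.exposed_methods
        [:greet]
      end

      def greet(name)
        "hello #{name}"
      end
    end

    # :who arrives as a variable lambda, :greet as a method-forwarding
    # lambda; nothing else from Greeter leaks into the evaluated code.
    ctx = Greeter.new.isolated_binding(who: 'world')
    puts eval('greet(who)', ctx)   # => "hello world"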
data/lib/answersengine/scraper.rb
@@ -0,0 +1,16 @@
+ require "answersengine/plugin"
+ require "answersengine/scraper/parser"
+ require "answersengine/scraper/seeder"
+ require "answersengine/scraper/executor"
+ require "answersengine/scraper/ruby_parser_executor"
+ require "answersengine/scraper/ruby_seeder_executor"
+ require "answersengine/client"
+ 
+ module AnswersEngine
+   module Scraper
+     # def self.list(opts={})
+     #   scraper = Client::Scraper.new(opts)
+     #   "Listing scrapers #{ENV['ANSWERSENGINE_TOKEN']} for #{scraper.all}"
+     # end
+   end
+ end
data/lib/answersengine/scraper/executor.rb
@@ -0,0 +1,292 @@
+ require 'nokogiri'
+ module AnswersEngine
+   module Scraper
+     # @abstract
+     class Executor
+       # Max allowed page size when querying outputs (see #find_outputs).
+       MAX_FIND_OUTPUTS_PER_PAGE = 500
+ 
+       attr_accessor :filename, :gid, :job_id
+ 
+       include AnswersEngine::Plugin::ContextExposer
+ 
+       def exec_parser(save=false)
+         raise "should be implemented in subclass"
+       end
+ 
+       def init_page()
+         if job_id
+           puts "getting Job Page"
+           init_job_page
+         else
+           puts "getting Global Page"
+           init_global_page()
+         end
+ 
+       end
+ 
+       def init_job_page()
+         client = Client::JobPage.new()
+         job_page = client.find(job_id, gid)
+         unless job_page.code == 200
+           raise "Job #{job_id} or GID #{gid} not found. Aborting execution!"
+         else
+           job_page
+         end
+ 
+       end
+ 
+       def parsing_update(options={})
+         client = Client::JobPage.new()
+         job_id = options.fetch(:job_id)
+         gid = options.fetch(:gid)
+ 
+         client.parsing_update(job_id, gid, options)
+       end
+ 
+       def seeding_update(options={})
+         client = Client::Job.new()
+         job_id = options.fetch(:job_id)
+ 
+         client.seeding_update(job_id, options)
+       end
+ 
+       def init_global_page()
+         client = Client::GlobalPage.new()
+         client.find(gid)
+       end
+ 
+       def get_content(gid)
+         client = Client::GlobalPage.new()
+         content_json = client.find_content(gid)
+ 
+         if content_json['available']
+           signed_url = content_json['signed_url']
+           Client::BackblazeContent.new.get_gunzipped_content(signed_url)
+         else
+           nil
+         end
+       end
+ 
+       def get_failed_content(gid)
+         client = Client::GlobalPage.new()
+         content_json = client.find_failed_content(gid)
+ 
+         if content_json['available']
+           signed_url = content_json['signed_url']
+           Client::BackblazeContent.new.get_gunzipped_content(signed_url)
+         else
+           nil
+         end
+       end
+ 
+       # Get the current job id from a scraper, or a default when scraper_name is nil.
+       #
+       # @param [String|nil] scraper_name Scraper name.
+       # @param [Integer|nil] default (nil) Default job id when no scraper name.
+       #
+       # @raise [Exception] When scraper name is not nil, and the scraper doesn't
+       #   exist or has no current job.
+       def get_job_id scraper_name, default = nil
+         return default if scraper_name.nil?
+         job = Client::ScraperJob.new().find(scraper_name)
+         raise JSON.pretty_generate(job) if job['id'].nil?
+         job['id']
+       end
+ 
+       # Find outputs by collection and query with pagination.
+       #
+       # @param [String] collection ('default') Collection name.
+       # @param [Hash] query ({}) Filters to query.
+       # @param [Integer] page (1) Page number.
+       # @param [Integer] per_page (30) Page size.
+       # @param [Hash] opts ({}) Configuration options.
+       # @option opts [String|nil] :scraper_name (nil) Scraper name to query
+       #   from.
+       # @option opts [Integer|nil] :job_id (nil) Job's id to query from.
+       #
+       # @raise [ArgumentError] +collection+ is not a String.
+       # @raise [ArgumentError] +query+ is not a Hash.
+       # @raise [ArgumentError] +page+ is not an Integer greater than 0.
+       # @raise [ArgumentError] +per_page+ is not an Integer between 1 and 500.
+       #
+       # @return [Array]
+       #
+       # @example
+       #   find_outputs
+       # @example
+       #   find_outputs 'my_collection'
+       # @example
+       #   find_outputs 'my_collection', {}
+       # @example
+       #   find_outputs 'my_collection', {}, 1
+       # @example
+       #   find_outputs 'my_collection', {}, 1, 30
+       # @example Find from another scraper by name
+       #   find_outputs 'my_collection', {}, 1, 30, scraper_name: 'my_scraper'
+       # @example Find from another scraper by job_id
+       #   find_outputs 'my_collection', {}, 1, 30, job_id: 123
+       #
+       # @note The +opts+ `:job_id` option is prioritized over `:scraper_name`
+       #   when both exist. If neither is provided, or both are nil, the
+       #   current job is queried instead; this is the default behavior.
+       def find_outputs(collection='default', query={}, page=1, per_page=30, opts = {})
+         # Validate parameters to fail fast with clear errors for the user.
+         raise ArgumentError.new("collection needs to be a String") unless collection.is_a?(String)
+         raise ArgumentError.new("query needs to be a Hash, instead of: #{query}") unless query.is_a?(Hash)
+         unless page.is_a?(Integer) && page > 0
+           raise ArgumentError.new("page needs to be an Integer greater than 0")
+         end
+         unless per_page.is_a?(Integer) && per_page > 0 && per_page <= MAX_FIND_OUTPUTS_PER_PAGE
+           raise ArgumentError.new("per_page needs to be an Integer between 1 and #{MAX_FIND_OUTPUTS_PER_PAGE}")
+         end
+ 
+         options = {
+           query: query,
+           page: page,
+           per_page: per_page}
+ 
+         # Get job_id
+         query_job_id = opts[:job_id] || get_job_id(opts[:scraper_name], self.job_id)
+ 
+         client = Client::JobOutput.new(options)
+         response = client.all(query_job_id, collection)
+ 
+         if response.code != 200
+           raise "response_code: #{response.code}|#{response.parsed_response}"
+         end
+         (response.body != 'null') ? response.parsed_response : []
+       end
+ 
+       # Find one output by collection and query.
+       #
+       # @param [String] collection ('default') Collection name.
+       # @param [Hash] query ({}) Filters to query.
+       # @param [Hash] opts ({}) Configuration options.
+       # @option opts [String|nil] :scraper_name (nil) Scraper name to query
+       #   from.
+       # @option opts [Integer|nil] :job_id (nil) Job's id to query from.
+       #
+       # @raise [ArgumentError] +collection+ is not a String.
+       # @raise [ArgumentError] +query+ is not a Hash.
+       #
+       # @return [Hash|nil] `Hash` when found, and `nil` when no output is found.
+       #
+       # @example
+       #   find_output
+       # @example
+       #   find_output 'my_collection'
+       # @example
+       #   find_output 'my_collection', {}
+       # @example Find from another scraper by name
+       #   find_output 'my_collection', {}, scraper_name: 'my_scraper'
+       # @example Find from another scraper by job_id
+       #   find_output 'my_collection', {}, job_id: 123
+       #
+       # @note The +opts+ `:job_id` option is prioritized over `:scraper_name`
+       #   when both exist. If neither is provided, or both are nil, the
+       #   current job is queried instead; this is the default behavior.
+       def find_output(collection='default', query={}, opts = {})
+         result = find_outputs(collection, query, 1, 1, opts)
+         result.respond_to?(:first) ? result.first : nil
+       end
+ 
+       def save_pages_and_outputs(pages = [], outputs = [], status)
+         total_pages = pages.count
+         total_outputs = outputs.count
+         records_per_slice = 100
+         until pages.empty? && outputs.empty?
+           pages_slice = pages.shift(records_per_slice)
+           outputs_slice = outputs.shift(records_per_slice)
+ 
+           log_msgs = []
+           unless pages_slice.empty?
+             log_msgs << "#{pages_slice.count} out of #{total_pages} Pages"
+             unless save
+               puts '----------------------------------------'
+               puts "Would have saved #{log_msgs.last}"
+               puts JSON.pretty_generate pages_slice
+             end
+           end
+ 
+           unless outputs_slice.empty?
+             log_msgs << "#{outputs_slice.count} out of #{total_outputs} Outputs"
+             unless save
+               puts '----------------------------------------'
+               puts "Would have saved #{log_msgs.last}"
+               puts JSON.pretty_generate outputs_slice
+             end
+           end
+ 
+           next unless save
+           log_msg = "Saving #{log_msgs.join(' and ')}."
+           puts "#{log_msg}"
+ 
+           # saving to server
+           response = update_to_server(
+             job_id: job_id,
+             gid: gid,
+             pages: pages_slice,
+             outputs: outputs_slice,
+             status: status)
+ 
+           if response.code == 200
+             log_msg = "Saved."
+             puts "#{log_msg}"
+           else
+             puts "Error: Unable to save Pages and/or Outputs to server: #{response.body}"
+             raise "Unable to save Pages and/or Outputs to server: #{response.body}"
+           end
+         end
+       end
+ 
+       def update_to_server(opts = {})
+         raise "Implemented in Subclass"
+       end
+ 
+       def clean_backtrace(backtrace)
+         i = backtrace.index{|x| x =~ /gems\/answersengine/i}
+         if i.to_i < 1
+           return []
+         else
+           return backtrace[0..(i-1)]
+         end
+       end
+ 
+       def save_type
+         raise NotImplementedError.new('Need to implement "save_type" method.')
+       end
+ 
+       # Saves pages from an array and clears it.
+       #
+       # @param [Array] pages ([]) Page array to save. Warning: all elements will
+       #   be removed from the array.
+       #
+       # @note IMPORTANT: +pages+ array's elements will be removed.
+       def save_pages(pages=[])
+         save_pages_and_outputs(pages, [], save_type)
+       end
+ 
+       # Saves outputs from an array and clears it.
+       #
+       # @param [Array] outputs ([]) Output array to save. Warning: all elements
+       #   will be removed from the array.
+       #
+       # @note IMPORTANT: +outputs+ array's elements will be removed.
+       def save_outputs(outputs=[])
+         save_pages_and_outputs([], outputs, save_type)
+       end
+ 
+       # Eval a file with a custom binding.
+       #
+       # @param [String] file_path File path to read.
+       # @param [Binding] context Context binding to evaluate with.
+       #
+       # @note Using this method allows scripts to contain `return` to
+       #   exit the script early, along with some improved security.
+       def eval_with_context file_path, context
+         eval(File.read(file_path), context, file_path)
+       end
+     end
+   end
+ end
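Executor is abstract: exec_parser, update_to_server, and save_type are deferred to subclasses such as RubyParserExecutor and RubySeederExecutor (also in this release). A rough sketch of paging through outputs with the find_outputs/find_output helpers documented above, under the assumption that they are exposed to a scraper script's binding via ContextExposer; the collection and field names are invented:

    # Walk an entire collection at the maximum page size (500).
    page = 1
    loop do
      records = find_outputs('products', {}, page, 500)
      break if records.empty?
      records.each { |record| puts record['_id] } rescue nil  # assumes an _id field
      page += 1
    end

    # Or fetch a single record matching a query.
    product = find_output('products', { '_id' => 'abc123' })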