answersengine 0.10.1 → 0.10.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72)
  1. checksums.yaml +4 -4
  2. data/CODE_OF_CONDUCT.md +1 -1
  3. data/LICENSE.txt +1 -1
  4. data/README.md +3 -4
  5. data/answersengine.gemspec +6 -12
  6. data/exe/answersengine +3 -2
  7. data/lib/answersengine.rb +20 -3
  8. metadata +14 -152
  9. data/examples/fetchtest/libraries/hello.rb +0 -9
  10. data/examples/fetchtest/libraries/hello_fail.rb +0 -10
  11. data/examples/fetchtest/parsers/failed.rb +0 -2
  12. data/examples/fetchtest/parsers/find_outputs.rb +0 -18
  13. data/examples/fetchtest/parsers/home.rb +0 -50
  14. data/examples/fetchtest/parsers/nested_fail.rb +0 -3
  15. data/examples/fetchtest/parsers/simple.rb +0 -14
  16. data/examples/fetchtest/seeders/csv_seeder.rb +0 -12
  17. data/examples/fetchtest/seeders/failed.rb +0 -1
  18. data/examples/fetchtest/seeders/list_of_urls.csv +0 -5
  19. data/examples/fetchtest/seeders/seed.rb +0 -28
  20. data/examples/fetchtest/seeders/test_reset_page.rb +0 -4
  21. data/lib/answersengine/cli.rb +0 -45
  22. data/lib/answersengine/cli/env_var.rb +0 -48
  23. data/lib/answersengine/cli/finisher.rb +0 -40
  24. data/lib/answersengine/cli/global_page.rb +0 -39
  25. data/lib/answersengine/cli/job.rb +0 -30
  26. data/lib/answersengine/cli/job_output.rb +0 -69
  27. data/lib/answersengine/cli/parser.rb +0 -64
  28. data/lib/answersengine/cli/scraper.rb +0 -185
  29. data/lib/answersengine/cli/scraper_deployment.rb +0 -24
  30. data/lib/answersengine/cli/scraper_export.rb +0 -51
  31. data/lib/answersengine/cli/scraper_exporter.rb +0 -40
  32. data/lib/answersengine/cli/scraper_finisher.rb +0 -20
  33. data/lib/answersengine/cli/scraper_job.rb +0 -75
  34. data/lib/answersengine/cli/scraper_job_var.rb +0 -48
  35. data/lib/answersengine/cli/scraper_page.rb +0 -203
  36. data/lib/answersengine/cli/scraper_var.rb +0 -48
  37. data/lib/answersengine/cli/seeder.rb +0 -40
  38. data/lib/answersengine/client.rb +0 -29
  39. data/lib/answersengine/client/auth_token.rb +0 -50
  40. data/lib/answersengine/client/backblaze_content.rb +0 -45
  41. data/lib/answersengine/client/base.rb +0 -55
  42. data/lib/answersengine/client/deploy_key.rb +0 -21
  43. data/lib/answersengine/client/env_var.rb +0 -28
  44. data/lib/answersengine/client/export.rb +0 -10
  45. data/lib/answersengine/client/global_page.rb +0 -18
  46. data/lib/answersengine/client/job.rb +0 -64
  47. data/lib/answersengine/client/job_export.rb +0 -10
  48. data/lib/answersengine/client/job_log.rb +0 -26
  49. data/lib/answersengine/client/job_output.rb +0 -19
  50. data/lib/answersengine/client/job_page.rb +0 -58
  51. data/lib/answersengine/client/job_stat.rb +0 -16
  52. data/lib/answersengine/client/scraper.rb +0 -57
  53. data/lib/answersengine/client/scraper_deployment.rb +0 -18
  54. data/lib/answersengine/client/scraper_export.rb +0 -22
  55. data/lib/answersengine/client/scraper_exporter.rb +0 -14
  56. data/lib/answersengine/client/scraper_finisher.rb +0 -16
  57. data/lib/answersengine/client/scraper_job.rb +0 -49
  58. data/lib/answersengine/client/scraper_job_output.rb +0 -19
  59. data/lib/answersengine/client/scraper_job_page.rb +0 -67
  60. data/lib/answersengine/client/scraper_job_var.rb +0 -28
  61. data/lib/answersengine/client/scraper_var.rb +0 -28
  62. data/lib/answersengine/plugin.rb +0 -6
  63. data/lib/answersengine/plugin/context_exposer.rb +0 -55
  64. data/lib/answersengine/scraper.rb +0 -18
  65. data/lib/answersengine/scraper/executor.rb +0 -373
  66. data/lib/answersengine/scraper/finisher.rb +0 -18
  67. data/lib/answersengine/scraper/parser.rb +0 -18
  68. data/lib/answersengine/scraper/ruby_finisher_executor.rb +0 -116
  69. data/lib/answersengine/scraper/ruby_parser_executor.rb +0 -200
  70. data/lib/answersengine/scraper/ruby_seeder_executor.rb +0 -120
  71. data/lib/answersengine/scraper/seeder.rb +0 -18
  72. data/lib/answersengine/version.rb +0 -3
@@ -1,18 +0,0 @@
1
- require "answersengine/plugin"
2
- require "answersengine/scraper/parser"
3
- require "answersengine/scraper/seeder"
4
- require "answersengine/scraper/finisher"
5
- require "answersengine/scraper/executor"
6
- require "answersengine/scraper/ruby_parser_executor"
7
- require "answersengine/scraper/ruby_seeder_executor"
8
- require "answersengine/scraper/ruby_finisher_executor"
9
- require "answersengine/client"
10
-
11
- module AnswersEngine
12
- module Scraper
13
- # def self.list(opts={})
14
- # scraper = Client::Scraper.new(opts)
15
- # "Listing scrapers #{ENV['ANSWERSENGINE_TOKEN']} for #{scraper.all}"
16
- # end
17
- end
18
- end
@@ -1,373 +0,0 @@
1
- require 'nokogiri'
2
- module AnswersEngine
3
- module Scraper
4
- # @abstract
5
- class Executor
6
- # Max allowed page size when query outputs (see #find_outputs).
7
- MAX_FIND_OUTPUTS_PER_PAGE = 500
8
-
9
- attr_accessor :filename, :gid, :job_id
10
-
11
- include AnswersEngine::Plugin::ContextExposer
12
-
13
- def exec_parser(save=false)
14
- raise "should be implemented in subclass"
15
- end
16
-
17
- def init_page()
18
- if job_id
19
- puts "getting Job Page"
20
- init_job_page
21
- else
22
- puts "getting Global Page"
23
- init_global_page()
24
- end
25
-
26
- end
27
-
28
- def init_job_page()
29
- client = Client::JobPage.new()
30
- job_page = client.find(job_id, gid)
31
- unless job_page.code == 200
32
- raise "Job #{job_id} or GID #{gid} not found. Aborting execution!"
33
- else
34
- job_page
35
- end
36
-
37
- end
38
-
39
- def parsing_update(options={})
40
- client = Client::JobPage.new()
41
- job_id = options.fetch(:job_id)
42
- gid = options.fetch(:gid)
43
-
44
- client.parsing_update(job_id, gid, options)
45
- end
46
-
47
- def seeding_update(options={})
48
- client = Client::Job.new()
49
- job_id = options.fetch(:job_id)
50
-
51
- client.seeding_update(job_id, options)
52
- end
53
-
54
- def finisher_update(options={})
55
- client = Client::Job.new()
56
- job_id = options.fetch(:job_id)
57
-
58
- client.finisher_update(job_id, options)
59
- end
60
-
61
- def init_global_page()
62
- client = Client::GlobalPage.new()
63
- client.find(gid)
64
- end
65
-
66
- def get_content(gid)
67
- client = Client::GlobalPage.new()
68
- content_json = client.find_content(gid)
69
-
70
- if content_json['available']
71
- signed_url = content_json['signed_url']
72
- Client::BackblazeContent.new.get_gunzipped_content(signed_url)
73
- else
74
- nil
75
- end
76
- end
77
-
78
- def get_failed_content(gid)
79
- client = Client::GlobalPage.new()
80
- content_json = client.find_failed_content(gid)
81
-
82
- if content_json['available']
83
- signed_url = content_json['signed_url']
84
- Client::BackblazeContent.new.get_gunzipped_content(signed_url)
85
- else
86
- nil
87
- end
88
- end
89
-
90
- # Get current job id from scraper or default when scraper_name is null.
91
- #
92
- # @param [String|nil] scraper_name Scraper name.
93
- # @param [Integer|nil] default (nil) Default job id when no scraper name.
94
- #
95
- # @raise [Exception] When scraper name is not null, and scraper doesn't
96
- # exist or it has no current job.
97
- def get_job_id scraper_name, default = nil
98
- return default if scraper_name.nil?
99
- job = Client::ScraperJob.new().find(scraper_name)
100
- raise JSON.pretty_generate(job) if job['id'].nil?
101
- job['id']
102
- end
103
-
104
- # Find outputs by collection and query with pagination.
105
- #
106
- # @param [String] collection ('default') Collection name.
107
- # @param [Hash] query ({}) Filters to query.
108
- # @param [Integer] page (1) Page number.
109
- # @param [Integer] per_page (100) Page size.
110
- # @param [Hash] opts ({}) Configuration options.
111
- # @option opts [String|nil] :scraper_name (nil) Scraper name to query
112
- # from.
113
- # @option opts [Integer|nil] :job_id (nil) Job's id to query from.
114
- #
115
- # @raise [ArgumentError] +collection+ is not String.
116
- # @raise [ArgumentError] +query+ is not a Hash.
117
- # @raise [ArgumentError] +page+ is not an Integer greater than 0.
118
- # @raise [ArgumentError] +per_page+ is not an Integer between 1 and 500.
119
- #
120
- # @return [Array]
121
- #
122
- # @example
123
- # find_outputs
124
- # @example
125
- # find_outputs 'my_collection'
126
- # @example
127
- # find_outputs 'my_collection', {}
128
- # @example
129
- # find_outputs 'my_collection', {}, 1
130
- # @example
131
- # find_outputs 'my_collection', {}, 1, 100
132
- # @example Find from another scraper by name
133
- # find_outputs 'my_collection', {}, 1, 100, scraper_name: 'my_scraper'
134
- # @example Find from another scraper by job_id
135
- # find_outputs 'my_collection', {}, 1, 100, job_id: 123
136
- #
137
- # @note *opts `:job_id` option is prioritized over `:scraper_name` when
138
- # both exist. If neither is provided, or both are nil, then the current
139
- # job will be used to query instead; this is the default behavior.
140
- def find_outputs(collection='default', query={}, page=1, per_page=100, opts = {})
141
- # Validate parameters out from nil for easier user usage.
142
- raise ArgumentError.new("collection needs to be a String") unless collection.is_a?(String)
143
- raise ArgumentError.new("query needs to be a Hash, instead of: #{query}") unless query.is_a?(Hash)
144
- unless page.is_a?(Integer) && page > 0
145
- raise ArgumentError.new("page needs to be an Integer greater than 0")
146
- end
147
- unless per_page.is_a?(Integer) && per_page > 0 && per_page <= MAX_FIND_OUTPUTS_PER_PAGE
148
- raise ArgumentError.new("per_page needs to be an Integer between 1 and #{MAX_FIND_OUTPUTS_PER_PAGE}")
149
- end
150
-
151
- options = {
152
- query: query,
153
- page: page,
154
- per_page: per_page}
155
-
156
- # Get job_id
157
- query_job_id = opts[:job_id] || get_job_id(opts[:scraper_name], self.job_id)
158
-
159
- client = Client::JobOutput.new(options)
160
- response = client.all(query_job_id, collection)
161
-
162
- if response.code != 200
163
- raise "response_code: #{response.code}|#{response.parsed_response}"
164
- end
165
- (response.body != 'null') ? response.parsed_response : []
166
- end
167
-
168
- # Find one output by collection and query with pagination.
169
- #
170
- # @param [String] collection ('default') Collection name.
171
- # @param [Hash] query ({}) Filters to query.
172
- # @param [Hash] opts ({}) Configuration options.
173
- # @option opts [String|nil] :scraper_name (nil) Scraper name to query
174
- # from.
175
- # @option opts [Integer|nil] :job_id (nil) Job's id to query from.
176
- #
177
- # @raise [ArgumentError] +collection+ is not String.
178
- # @raise [ArgumentError] +query+ is not a Hash.
179
- #
180
- # @return [Hash|nil] `Hash` when found, and `nil` when no output is found.
181
- #
182
- # @example
183
- # find_output
184
- # @example
185
- # find_output 'my_collection'
186
- # @example
187
- # find_output 'my_collection', {}
188
- # @example Find from another scraper by name
189
- # find_output 'my_collection', {}, scraper_name: 'my_scraper'
190
- # @example Find from another scraper by job_id
191
- # find_output 'my_collection', {}, job_id: 123
192
- #
193
- # @note *opts `:job_id` option is prioritized over `:scraper_name` when
194
- # both exist. If neither is provided, or both are nil, then the current
195
- # job will be used to query instead; this is the default behavior.
196
- def find_output(collection='default', query={}, opts = {})
197
- result = find_outputs(collection, query, 1, 1, opts)
198
- result.respond_to?(:first) ? result.first : nil
199
- end
200
-
201
- # Remove dups by prioritizing the latest dup.
202
- #
203
- # @param [Array] list List of hashes to dedup.
204
- # @param [Hash] key_defaults Key and default value pair hash to use on
205
- # uniq validation.
206
- #
207
- # @return [Integer] Removed duplicated items count.
208
- def remove_old_dups!(list, key_defaults)
209
- raw_count = list.count
210
- keys = key_defaults.keys
211
- force_uniq = 0
212
- list.reverse!.uniq! do |item|
213
- # Extract stringify keys as hash
214
- key_hash = Hash[item.map{|k,v|keys.include?(k.to_s) ? [k.to_s,v] : nil}.select{|i|!i.nil?}]
215
-
216
- # Apply defaults for uniq validation
217
- key_defaults.each{|k,v| key_hash[k] = v if key_hash[k].nil?}
218
-
219
- # Don't dedup nil key defaults
220
- skip_dedup = !keys.find{|k| key_hash[k].nil?}.nil?
221
- skip_dedup ? (force_uniq += 1) : key_hash
222
- end
223
- list.reverse!
224
- dup_count = raw_count - list.count
225
- dup_count
226
- end
227
-
228
- # Remove page dups by prioritizing the latest dup.
229
- #
230
- # @param [Array] list List of pages to dedup.
231
- #
232
- # @return [Integer] Removed duplicated items count.
233
- #
234
- # @note It will not dedup for now as it is hard to build gid.
235
- # TODO: Build gid so we can dedup
236
- def remove_old_page_dups!(list)
237
- key_defaults = {
238
- 'gid' => nil
239
- }
240
- remove_old_dups! list, key_defaults
241
- end
242
-
243
- # Remove dups by prioritizing the latest dup.
244
- #
245
- # @param [Array] list List of outputs to dedup.
246
- #
247
- # @return [Integer] Removed duplicated items count.
248
- def remove_old_output_dups!(list)
249
- key_defaults = {
250
- '_id' => nil,
251
- '_collection' => 'default'
252
- }
253
- remove_old_dups! list, key_defaults
254
- end
255
-
256
- def save_pages_and_outputs(pages = [], outputs = [], status)
257
- total_pages = pages.count
258
- total_outputs = outputs.count
259
- records_per_slice = 100
260
- until pages.empty? && outputs.empty?
261
- pages_slice = pages.shift(records_per_slice)
262
- pages_dup_count = remove_old_page_dups! pages_slice
263
- outputs_slice = outputs.shift(records_per_slice)
264
- outputs_dup_count = remove_old_output_dups! outputs_slice
265
-
266
- log_msgs = []
267
- unless pages_slice.empty?
268
- page_dups_ignored = pages_dup_count > 0 ? " (#{pages_dup_count} dups ignored)" : ''
269
- log_msgs << "#{pages_slice.count} out of #{total_pages} Pages#{page_dups_ignored}"
270
-
271
- unless save
272
- puts '----------------------------------------'
273
- puts "Trying to validate #{log_msgs.last}#{page_dups_ignored}"
274
- puts JSON.pretty_generate pages_slice
275
- end
276
- end
277
-
278
- unless outputs_slice.empty?
279
- output_dups_ignored = outputs_dup_count > 0 ? " (#{outputs_dup_count} dups ignored)" : ''
280
- log_msgs << "#{outputs_slice.count} out of #{total_outputs} Outputs#{output_dups_ignored}"
281
-
282
- unless save
283
- puts '----------------------------------------'
284
- puts "Trying to validate #{log_msgs.last}#{output_dups_ignored}"
285
- puts JSON.pretty_generate outputs_slice
286
- end
287
- end
288
-
289
- # behave differently if it is a real save
290
- if save
291
- log_msg = "Saving #{log_msgs.join(' and ')}."
292
- puts "#{log_msg}"
293
- else
294
- status = "#{status}_try"
295
- end
296
-
297
- # saving to server
298
- response = update_to_server(
299
- job_id: job_id,
300
- gid: gid,
301
- pages: pages_slice,
302
- outputs: outputs_slice,
303
- status: status)
304
-
305
- if response.code == 200
306
- if save
307
- log_msg = "Saved."
308
- puts "#{log_msg}"
309
- else
310
- puts "Validation successful"
311
- end
312
- else
313
- if save
314
- puts "Error: Unable to save Pages and/or Outputs to server: #{response.body}"
315
- raise "Unable to save Pages and/or Outputs to server: #{response.body}"
316
- else
317
- puts "Error: Invalid Pages and/or Outputs: #{response.body}"
318
- raise "Invalid Pages and/or Outputs: #{response.body}"
319
- end
320
- end
321
- end
322
- end
323
-
324
- def update_to_server(opts = {})
325
- raise "Implemented in Subclass"
326
- end
327
-
328
- def clean_backtrace(backtrace)
329
- i = backtrace.index{|x| x =~ /gems\/answersengine/i}
330
- if i.to_i < 1
331
- return []
332
- else
333
- return backtrace[0..(i-1)]
334
- end
335
- end
336
-
337
- def save_type
338
- raise NotImplementedError.new('Need to implement "save_type" method.')
339
- end
340
-
341
- # Saves pages from an array and clear it.
342
- #
343
- # @param [Array] pages ([]) Page array to save. Warning: all elements will
344
- # be removed from the array.
345
- #
346
- # @note IMPORTANT: +pages+ array's elements will be removed.
347
- def save_pages(pages=[])
348
- save_pages_and_outputs(pages, [], save_type)
349
- end
350
-
351
- # Saves outputs from an array and clear it.
352
- #
353
- # @param [Array] outputs ([]) Output array to save. Warning: all elements
354
- # will be removed from the array.
355
- #
356
- # @note IMPORTANT: +outputs+ array's elements will be removed.
357
- def save_outputs(outputs=[])
358
- save_pages_and_outputs([], outputs, save_type)
359
- end
360
-
361
- # Eval a filename with a custom binding
362
- #
363
- # @param [String] file_path File path to read.
364
- # @param [Binding] context Context binding to evaluate with.
365
- #
366
- # @note Using this method will allow scripts to contain `return` to
367
- # exit the script sooner along some improved security.
368
- def eval_with_context file_path, context
369
- eval(File.read(file_path), context, file_path)
370
- end
371
- end
372
- end
373
- end
@@ -1,18 +0,0 @@
1
- module AnswersEngine
2
- module Scraper
3
- class Finisher
4
-
5
- def self.exec_finisher(filename, job_id=nil, save=false)
6
- extname = File.extname(filename)
7
- case extname
8
- when '.rb'
9
- executor = RubyFinisherExecutor.new(filename: filename, job_id: job_id)
10
- executor.exec_finisher(save)
11
- else
12
- puts "Unable to find a finisher executor for file type \"#{extname}\""
13
- end
14
- end
15
-
16
- end
17
- end
18
- end
@@ -1,18 +0,0 @@
1
- module AnswersEngine
2
- module Scraper
3
- class Parser
4
- def self.exec_parser_page(filename, gid, job_id=nil, save=false, vars = {})
5
- extname = File.extname(filename)
6
- case extname
7
- when '.rb'
8
- executor = RubyParserExecutor.new(filename: filename, gid: gid, job_id: job_id, vars: vars)
9
- executor.exec_parser(save)
10
- else
11
- puts "Unable to find a parser executor for file type \"#{extname}\""
12
- end
13
- end
14
-
15
-
16
- end
17
- end
18
- end
@@ -1,116 +0,0 @@
1
- module AnswersEngine
2
- module Scraper
3
- class RubyFinisherExecutor < Executor
4
- attr_accessor :save
5
-
6
- def initialize(options={})
7
- @filename = options.fetch(:filename) { raise "Filename is required"}
8
- @job_id = options[:job_id]
9
- end
10
-
11
- def self.exposed_methods
12
- [
13
- :outputs,
14
- :save_outputs,
15
- :find_output,
16
- :find_outputs
17
- ].freeze
18
- end
19
-
20
- def exec_finisher(save=false)
21
- @save = save
22
- if save
23
- puts "Executing finisher script"
24
- else
25
- puts "Trying finisher script"
26
- end
27
-
28
- eval_finisher_script(save)
29
- end
30
-
31
- def eval_finisher_script(save=false)
32
- update_finisher_starting_status
33
-
34
- proc = Proc.new do
35
- outputs = []
36
-
37
- begin
38
- context = isolated_binding({
39
- outputs: outputs,
40
- job_id: job_id
41
- })
42
- eval_with_context filename, context
43
- rescue SyntaxError => e
44
- handle_error(e) if save
45
- raise e
46
- rescue => e
47
- handle_error(e) if save
48
- raise e
49
- end
50
-
51
- puts "=========== Finisher Executed ==========="
52
- begin
53
- save_outputs(outputs)
54
- rescue => e
55
- handle_error(e) if save
56
- raise e
57
- end
58
-
59
- update_finisher_done_status
60
- end
61
- proc.call
62
- end
63
-
64
- def save_type
65
- :executing
66
- end
67
-
68
- def update_to_server(opts = {})
69
- finisher_update(
70
- job_id: opts[:job_id],
71
- outputs: opts[:outputs],
72
- finisher_status: opts[:status])
73
- end
74
-
75
- def update_finisher_starting_status
76
- return unless save
77
-
78
- response = finisher_update(
79
- job_id: job_id,
80
- finisher_status: :starting)
81
-
82
- if response.code == 200
83
- puts "Finisher Status Updated."
84
- else
85
- puts "Error: Unable to save Finisher Status to server: #{response.body}"
86
- raise "Unable to save Finisher Status to server: #{response.body}"
87
- end
88
- end
89
-
90
- def update_finisher_done_status
91
- return unless save
92
-
93
- response = finisher_update(
94
- job_id: job_id,
95
- finisher_status: :done)
96
-
97
- if response.code == 200
98
- puts "Finisher Done."
99
- else
100
- puts "Error: Unable to save Finisher Done Status to server: #{response.body}"
101
- raise "Unable to save Finisher Done Status to server: #{response.body}"
102
- end
103
- end
104
-
105
- def handle_error(e)
106
- error = ["Finisher #{e.class}: #{e.to_s} (Job:#{job_id}",clean_backtrace(e.backtrace)].join("\n")
107
-
108
- finisher_update(
109
- job_id: job_id,
110
- finisher_status: :failed,
111
- log_error: error)
112
- end
113
-
114
- end
115
- end
116
- end