answersengine 0.10.1 → 0.10.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (72):
  1. checksums.yaml +4 -4
  2. data/CODE_OF_CONDUCT.md +1 -1
  3. data/LICENSE.txt +1 -1
  4. data/README.md +3 -4
  5. data/answersengine.gemspec +6 -12
  6. data/exe/answersengine +3 -2
  7. data/lib/answersengine.rb +20 -3
  8. metadata +14 -152
  9. data/examples/fetchtest/libraries/hello.rb +0 -9
  10. data/examples/fetchtest/libraries/hello_fail.rb +0 -10
  11. data/examples/fetchtest/parsers/failed.rb +0 -2
  12. data/examples/fetchtest/parsers/find_outputs.rb +0 -18
  13. data/examples/fetchtest/parsers/home.rb +0 -50
  14. data/examples/fetchtest/parsers/nested_fail.rb +0 -3
  15. data/examples/fetchtest/parsers/simple.rb +0 -14
  16. data/examples/fetchtest/seeders/csv_seeder.rb +0 -12
  17. data/examples/fetchtest/seeders/failed.rb +0 -1
  18. data/examples/fetchtest/seeders/list_of_urls.csv +0 -5
  19. data/examples/fetchtest/seeders/seed.rb +0 -28
  20. data/examples/fetchtest/seeders/test_reset_page.rb +0 -4
  21. data/lib/answersengine/cli.rb +0 -45
  22. data/lib/answersengine/cli/env_var.rb +0 -48
  23. data/lib/answersengine/cli/finisher.rb +0 -40
  24. data/lib/answersengine/cli/global_page.rb +0 -39
  25. data/lib/answersengine/cli/job.rb +0 -30
  26. data/lib/answersengine/cli/job_output.rb +0 -69
  27. data/lib/answersengine/cli/parser.rb +0 -64
  28. data/lib/answersengine/cli/scraper.rb +0 -185
  29. data/lib/answersengine/cli/scraper_deployment.rb +0 -24
  30. data/lib/answersengine/cli/scraper_export.rb +0 -51
  31. data/lib/answersengine/cli/scraper_exporter.rb +0 -40
  32. data/lib/answersengine/cli/scraper_finisher.rb +0 -20
  33. data/lib/answersengine/cli/scraper_job.rb +0 -75
  34. data/lib/answersengine/cli/scraper_job_var.rb +0 -48
  35. data/lib/answersengine/cli/scraper_page.rb +0 -203
  36. data/lib/answersengine/cli/scraper_var.rb +0 -48
  37. data/lib/answersengine/cli/seeder.rb +0 -40
  38. data/lib/answersengine/client.rb +0 -29
  39. data/lib/answersengine/client/auth_token.rb +0 -50
  40. data/lib/answersengine/client/backblaze_content.rb +0 -45
  41. data/lib/answersengine/client/base.rb +0 -55
  42. data/lib/answersengine/client/deploy_key.rb +0 -21
  43. data/lib/answersengine/client/env_var.rb +0 -28
  44. data/lib/answersengine/client/export.rb +0 -10
  45. data/lib/answersengine/client/global_page.rb +0 -18
  46. data/lib/answersengine/client/job.rb +0 -64
  47. data/lib/answersengine/client/job_export.rb +0 -10
  48. data/lib/answersengine/client/job_log.rb +0 -26
  49. data/lib/answersengine/client/job_output.rb +0 -19
  50. data/lib/answersengine/client/job_page.rb +0 -58
  51. data/lib/answersengine/client/job_stat.rb +0 -16
  52. data/lib/answersengine/client/scraper.rb +0 -57
  53. data/lib/answersengine/client/scraper_deployment.rb +0 -18
  54. data/lib/answersengine/client/scraper_export.rb +0 -22
  55. data/lib/answersengine/client/scraper_exporter.rb +0 -14
  56. data/lib/answersengine/client/scraper_finisher.rb +0 -16
  57. data/lib/answersengine/client/scraper_job.rb +0 -49
  58. data/lib/answersengine/client/scraper_job_output.rb +0 -19
  59. data/lib/answersengine/client/scraper_job_page.rb +0 -67
  60. data/lib/answersengine/client/scraper_job_var.rb +0 -28
  61. data/lib/answersengine/client/scraper_var.rb +0 -28
  62. data/lib/answersengine/plugin.rb +0 -6
  63. data/lib/answersengine/plugin/context_exposer.rb +0 -55
  64. data/lib/answersengine/scraper.rb +0 -18
  65. data/lib/answersengine/scraper/executor.rb +0 -373
  66. data/lib/answersengine/scraper/finisher.rb +0 -18
  67. data/lib/answersengine/scraper/parser.rb +0 -18
  68. data/lib/answersengine/scraper/ruby_finisher_executor.rb +0 -116
  69. data/lib/answersengine/scraper/ruby_parser_executor.rb +0 -200
  70. data/lib/answersengine/scraper/ruby_seeder_executor.rb +0 -120
  71. data/lib/answersengine/scraper/seeder.rb +0 -18
  72. data/lib/answersengine/version.rb +0 -3
require "answersengine/plugin"
require "answersengine/scraper/parser"
require "answersengine/scraper/seeder"
require "answersengine/scraper/finisher"
require "answersengine/scraper/executor"
require "answersengine/scraper/ruby_parser_executor"
require "answersengine/scraper/ruby_seeder_executor"
require "answersengine/scraper/ruby_finisher_executor"
require "answersengine/client"

module AnswersEngine
  # Namespace for scraping components (parsers, seeders, finishers and
  # their executors). The requires above load every concrete implementation
  # so users only need to require "answersengine".
  module Scraper
    # def self.list(opts={})
    #   scraper = Client::Scraper.new(opts)
    #   "Listing scrapers #{ENV['ANSWERSENGINE_TOKEN']} for #{scraper.all}"
    # end
  end
end
require 'nokogiri'
module AnswersEngine
  module Scraper
    # @abstract Base class for script executors (parser/seeder/finisher).
    #   Subclasses implement #update_to_server, #save_type and an exec_*
    #   entry point. NOTE(review): #save_pages_and_outputs reads a `save`
    #   accessor that subclasses are expected to define — confirm every
    #   subclass provides it.
    class Executor
      # Max allowed page size when querying outputs (see #find_outputs).
      MAX_FIND_OUTPUTS_PER_PAGE = 500

      attr_accessor :filename, :gid, :job_id

      include AnswersEngine::Plugin::ContextExposer

      def exec_parser(save=false)
        raise "should be implemented in subclass"
      end

      # Fetch the page being executed: the job page when #job_id is set,
      # the global page otherwise.
      def init_page()
        if job_id
          puts "getting Job Page"
          init_job_page
        else
          puts "getting Global Page"
          init_global_page()
        end
      end

      # Fetch the job page for #job_id and #gid.
      #
      # @raise [RuntimeError] When the job page is not found (non-200).
      def init_job_page()
        client = Client::JobPage.new()
        job_page = client.find(job_id, gid)
        if job_page.code == 200
          job_page
        else
          raise "Job #{job_id} or GID #{gid} not found. Aborting execution!"
        end
      end

      # Report parsing status/results for a job page to the server.
      def parsing_update(options={})
        client = Client::JobPage.new()
        job_id = options.fetch(:job_id)
        gid = options.fetch(:gid)

        client.parsing_update(job_id, gid, options)
      end

      # Report seeding status/results for a job to the server.
      def seeding_update(options={})
        client = Client::Job.new()
        job_id = options.fetch(:job_id)

        client.seeding_update(job_id, options)
      end

      # Report finisher status/results for a job to the server.
      def finisher_update(options={})
        client = Client::Job.new()
        job_id = options.fetch(:job_id)

        client.finisher_update(job_id, options)
      end

      def init_global_page()
        client = Client::GlobalPage.new()
        client.find(gid)
      end

      # Fetch a page's content.
      #
      # @param [String] gid Page global id.
      #
      # @return [String|nil] Gunzipped content when available, else nil.
      def get_content(gid)
        client = Client::GlobalPage.new()
        content_json = client.find_content(gid)

        if content_json['available']
          signed_url = content_json['signed_url']
          Client::BackblazeContent.new.get_gunzipped_content(signed_url)
        else
          nil
        end
      end

      # Fetch a page's failed-fetch content.
      #
      # @param [String] gid Page global id.
      #
      # @return [String|nil] Gunzipped content when available, else nil.
      def get_failed_content(gid)
        client = Client::GlobalPage.new()
        content_json = client.find_failed_content(gid)

        if content_json['available']
          signed_url = content_json['signed_url']
          Client::BackblazeContent.new.get_gunzipped_content(signed_url)
        else
          nil
        end
      end

      # Get current job id from scraper or default when scraper_name is null.
      #
      # @param [String|nil] scraper_name Scraper name.
      # @param [Integer|nil] default (nil) Default job id when no scraper name.
      #
      # @raise [Exception] When scraper name is not null, and the scraper
      #   doesn't exist or it has no current job.
      def get_job_id scraper_name, default = nil
        return default if scraper_name.nil?
        job = Client::ScraperJob.new().find(scraper_name)
        raise JSON.pretty_generate(job) if job['id'].nil?
        job['id']
      end

      # Find outputs by collection and query with pagination.
      #
      # @param [String] collection ('default') Collection name.
      # @param [Hash] query ({}) Filters to query.
      # @param [Integer] page (1) Page number.
      # @param [Integer] per_page (100) Page size.
      # @param [Hash] opts ({}) Configuration options.
      # @option opts [String|nil] :scraper_name (nil) Scraper name to query
      #   from.
      # @option opts [Integer|nil] :job_id (nil) Job's id to query from.
      #
      # @raise [ArgumentError] +collection+ is not String.
      # @raise [ArgumentError] +query+ is not a Hash.
      # @raise [ArgumentError] +page+ is not an Integer greater than 0.
      # @raise [ArgumentError] +per_page+ is not an Integer between 1 and 500.
      #
      # @return [Array]
      #
      # @example
      #   find_outputs
      # @example
      #   find_outputs 'my_collection'
      # @example
      #   find_outputs 'my_collection', {}
      # @example
      #   find_outputs 'my_collection', {}, 1
      # @example
      #   find_outputs 'my_collection', {}, 1, 100
      # @example Find from another scraper by name
      #   find_outputs 'my_collection', {}, 1, 100, scraper_name: 'my_scraper'
      # @example Find from another scraper by job_id
      #   find_outputs 'my_collection', {}, 1, 100, job_id: 123
      #
      # @note *opts `:job_id` option is prioritized over `:scraper_name` when
      #   both exist. If neither is provided (or values are nil), then the
      #   current job will be used to query instead; this is the default
      #   behavior.
      def find_outputs(collection='default', query={}, page=1, per_page=100, opts = {})
        # Validate parameters out from nil for easier user usage.
        raise ArgumentError.new("collection needs to be a String") unless collection.is_a?(String)
        raise ArgumentError.new("query needs to be a Hash, instead of: #{query}") unless query.is_a?(Hash)
        unless page.is_a?(Integer) && page > 0
          raise ArgumentError.new("page needs to be an Integer greater than 0")
        end
        unless per_page.is_a?(Integer) && per_page > 0 && per_page <= MAX_FIND_OUTPUTS_PER_PAGE
          raise ArgumentError.new("per_page needs to be an Integer between 1 and #{MAX_FIND_OUTPUTS_PER_PAGE}")
        end

        options = {
          query: query,
          page: page,
          per_page: per_page}

        # Get job_id: explicit opts win, then scraper lookup, then current job.
        query_job_id = opts[:job_id] || get_job_id(opts[:scraper_name], self.job_id)

        client = Client::JobOutput.new(options)
        response = client.all(query_job_id, collection)

        if response.code != 200
          raise "response_code: #{response.code}|#{response.parsed_response}"
        end
        # Server returns literal 'null' body when nothing matches.
        (response.body != 'null') ? response.parsed_response : []
      end

      # Find one output by collection and query.
      #
      # @param [String] collection ('default') Collection name.
      # @param [Hash] query ({}) Filters to query.
      # @param [Hash] opts ({}) Configuration options.
      # @option opts [String|nil] :scraper_name (nil) Scraper name to query
      #   from.
      # @option opts [Integer|nil] :job_id (nil) Job's id to query from.
      #
      # @raise [ArgumentError] +collection+ is not String.
      # @raise [ArgumentError] +query+ is not a Hash.
      #
      # @return [Hash|nil] `Hash` when found, and `nil` when no output is found.
      #
      # @example
      #   find_output
      # @example
      #   find_output 'my_collection'
      # @example
      #   find_output 'my_collection', {}
      # @example Find from another scraper by name
      #   find_output 'my_collection', {}, scraper_name: 'my_scraper'
      # @example Find from another scraper by job_id
      #   find_output 'my_collection', {}, job_id: 123
      #
      # @note *opts `:job_id` option is prioritized over `:scraper_name` when
      #   both exist. If neither is provided (or values are nil), then the
      #   current job will be used to query instead; this is the default
      #   behavior.
      def find_output(collection='default', query={}, opts = {})
        result = find_outputs(collection, query, 1, 1, opts)
        result.respond_to?(:first) ? result.first : nil
      end

      # Remove dups by prioritizing the latest dup.
      #
      # @param [Array] list List of hashes to dedup.
      # @param [Hash] key_defaults Key and default value pair hash to use on
      #   uniq validation.
      #
      # @return [Integer] Removed duplicated items count.
      def remove_old_dups!(list, key_defaults)
        raw_count = list.count
        keys = key_defaults.keys
        force_uniq = 0
        # Reverse first so uniq! keeps the latest occurrence of each dup.
        list.reverse!.uniq! do |item|
          # Extract stringify keys as hash
          key_hash = Hash[item.map{|k,v|keys.include?(k.to_s) ? [k.to_s,v] : nil}.select{|i|!i.nil?}]

          # Apply defaults for uniq validation
          key_defaults.each{|k,v| key_hash[k] = v if key_hash[k].nil?}

          # Don't dedup nil key defaults: return an ever-increasing integer
          # instead of the key hash so uniq! treats the item as unique.
          skip_dedup = !keys.find{|k| key_hash[k].nil?}.nil?
          skip_dedup ? (force_uniq += 1) : key_hash
        end
        list.reverse!
        dup_count = raw_count - list.count
        dup_count
      end

      # Remove page dups by prioritizing the latest dup.
      #
      # @param [Array] list List of pages to dedup.
      #
      # @return [Integer] Removed duplicated items count.
      #
      # @note It will not dedup for now as it is hard to build gid.
      #   TODO: Build gid so we can dedup
      def remove_old_page_dups!(list)
        key_defaults = {
          'gid' => nil
        }
        remove_old_dups! list, key_defaults
      end

      # Remove output dups by prioritizing the latest dup.
      #
      # @param [Array] list List of outputs to dedup.
      #
      # @return [Integer] Removed duplicated items count.
      def remove_old_output_dups!(list)
        key_defaults = {
          '_id' => nil,
          '_collection' => 'default'
        }
        remove_old_dups! list, key_defaults
      end

      # Save pages and outputs to the server in slices of 100 records each,
      # deduplicating every slice (latest dup wins).
      #
      # @param [Array] pages ([]) Pages to save; emptied as slices are sent.
      # @param [Array] outputs ([]) Outputs to save; emptied as slices are sent.
      # @param [String|Symbol] status Status to report with each slice.
      #
      # @note Reads `save` (defined by subclasses) to decide between a real
      #   save and a dry-run validation.
      def save_pages_and_outputs(pages = [], outputs = [], status)
        total_pages = pages.count
        total_outputs = outputs.count
        records_per_slice = 100
        # Fixed: append "_try" once, before the loop. The original mutated
        # status inside the loop, reporting "foo_try_try..." on later slices.
        status = "#{status}_try" unless save
        until pages.empty? && outputs.empty?
          pages_slice = pages.shift(records_per_slice)
          pages_dup_count = remove_old_page_dups! pages_slice
          outputs_slice = outputs.shift(records_per_slice)
          outputs_dup_count = remove_old_output_dups! outputs_slice

          log_msgs = []
          unless pages_slice.empty?
            page_dups_ignored = pages_dup_count > 0 ? " (#{pages_dup_count} dups ignored)" : ''
            log_msgs << "#{pages_slice.count} out of #{total_pages} Pages#{page_dups_ignored}"

            unless save
              puts '----------------------------------------'
              # Fixed: the dups-ignored suffix was printed twice here
              # (log_msgs.last already ends with it).
              puts "Trying to validate #{log_msgs.last}"
              puts JSON.pretty_generate pages_slice
            end
          end

          unless outputs_slice.empty?
            output_dups_ignored = outputs_dup_count > 0 ? " (#{outputs_dup_count} dups ignored)" : ''
            log_msgs << "#{outputs_slice.count} out of #{total_outputs} Outputs#{output_dups_ignored}"

            unless save
              puts '----------------------------------------'
              # Fixed: the dups-ignored suffix was printed twice here too.
              puts "Trying to validate #{log_msgs.last}"
              puts JSON.pretty_generate outputs_slice
            end
          end

          # behave differently if it is a real save
          puts "Saving #{log_msgs.join(' and ')}." if save

          # saving to server
          response = update_to_server(
            job_id: job_id,
            gid: gid,
            pages: pages_slice,
            outputs: outputs_slice,
            status: status)

          if response.code == 200
            if save
              puts "Saved."
            else
              puts "Validation successful"
            end
          else
            if save
              puts "Error: Unable to save Pages and/or Outputs to server: #{response.body}"
              raise "Unable to save Pages and/or Outputs to server: #{response.body}"
            else
              puts "Error: Invalid Pages and/or Outputs: #{response.body}"
              raise "Invalid Pages and/or Outputs: #{response.body}"
            end
          end
        end
      end

      def update_to_server(opts = {})
        raise "Implemented in Subclass"
      end

      # Truncate a backtrace at the first frame inside the answersengine gem,
      # leaving only the user-script frames.
      #
      # @param [Array<String>] backtrace Raw exception backtrace.
      #
      # @return [Array<String>] User-facing portion of the backtrace.
      def clean_backtrace(backtrace)
        i = backtrace.index{|x| x =~ /gems\/answersengine/i}
        if i.to_i < 1
          return []
        else
          return backtrace[0..(i-1)]
        end
      end

      def save_type
        raise NotImplementedError.new('Need to implement "save_type" method.')
      end

      # Saves pages from an array and clear it.
      #
      # @param [Array] pages ([]) Page array to save. Warning: all elements will
      #   be removed from the array.
      #
      # @note IMPORTANT: +pages+ array's elements will be removed.
      def save_pages(pages=[])
        save_pages_and_outputs(pages, [], save_type)
      end

      # Saves outputs from an array and clear it.
      #
      # @param [Array] outputs ([]) Output array to save. Warning: all elements
      #   will be removed from the array.
      #
      # @note IMPORTANT: +outputs+ array's elements will be removed.
      def save_outputs(outputs=[])
        save_pages_and_outputs([], outputs, save_type)
      end

      # Eval a filename with a custom binding
      #
      # @param [String] file_path File path to read.
      # @param [Binding] context Context binding to evaluate with.
      #
      # @note Using this method will allow scripts to contain `return` to
      #   exit the script sooner along some improved security.
      # @note Security: eval of user-supplied script files is this gem's
      #   intended purpose — only run trusted scraper scripts.
      def eval_with_context file_path, context
        eval(File.read(file_path), context, file_path)
      end
    end
  end
end
module AnswersEngine
  module Scraper
    # Entry point for running finisher scripts, dispatching to the executor
    # that matches the script's file extension.
    class Finisher

      # Execute a finisher script.
      #
      # @param [String] filename Path of the finisher script.
      # @param [Integer|nil] job_id (nil) Job to run the finisher against.
      # @param [Boolean] save (false) When false, performs a dry-run only.
      #
      # @return [Object|nil] Executor result, or nil when no executor matches
      #   the file type (a message is printed in that case).
      def self.exec_finisher(filename, job_id=nil, save=false)
        extname = File.extname(filename)
        case extname
        when '.rb'
          executor = RubyFinisherExecutor.new(filename: filename, job_id: job_id)
          executor.exec_finisher(save)
        else
          puts "Unable to find a finisher executor for file type \"#{extname}\""
        end
      end

    end
  end
end
module AnswersEngine
  module Scraper
    # Entry point for running parser scripts against a page, dispatching to
    # the executor that matches the script's file extension.
    class Parser
      # Execute a parser script on a page.
      #
      # @param [String] filename Path of the parser script.
      # @param [String] gid Global id of the page to parse.
      # @param [Integer|nil] job_id (nil) Job to run the parser against.
      # @param [Boolean] save (false) When false, performs a dry-run only.
      # @param [Hash] vars ({}) Page variables exposed to the script.
      #
      # @return [Object|nil] Executor result, or nil when no executor matches
      #   the file type (a message is printed in that case).
      def self.exec_parser_page(filename, gid, job_id=nil, save=false, vars = {})
        extname = File.extname(filename)
        case extname
        when '.rb'
          executor = RubyParserExecutor.new(filename: filename, gid: gid, job_id: job_id, vars: vars)
          executor.exec_parser(save)
        else
          puts "Unable to find a parser executor for file type \"#{extname}\""
        end
      end

    end
  end
end
module AnswersEngine
  module Scraper
    # Executes a Ruby finisher script inside an isolated binding, saving its
    # outputs and reporting finisher status transitions to the server.
    class RubyFinisherExecutor < Executor
      # Whether this run persists results (true) or only validates (false).
      attr_accessor :save

      # @param [Hash] options
      # @option options [String] :filename (required) Finisher script path.
      # @option options [Integer|nil] :job_id Job the finisher belongs to.
      def initialize(options={})
        @filename = options.fetch(:filename) { raise "Filename is required"}
        @job_id = options[:job_id]
      end

      # Methods exposed to the finisher script's isolated binding.
      def self.exposed_methods
        [
          :outputs,
          :save_outputs,
          :find_output,
          :find_outputs
        ].freeze
      end

      # Run the finisher script.
      #
      # @param [Boolean] save (false) Persist results when true; dry-run
      #   otherwise.
      def exec_finisher(save=false)
        @save = save
        if save
          puts "Executing finisher script"
        else
          puts "Trying finisher script"
        end

        eval_finisher_script(save)
      end

      # Evaluate the script, save collected outputs and update job status.
      # Errors are reported to the server (when saving) and re-raised.
      def eval_finisher_script(save=false)
        update_finisher_starting_status

        proc = Proc.new do
          outputs = []

          begin
            # Script appends to `outputs` via the exposed binding.
            context = isolated_binding({
              outputs: outputs,
              job_id: job_id
            })
            eval_with_context filename, context
          rescue SyntaxError => e
            # SyntaxError is not a StandardError, so it needs its own rescue.
            handle_error(e) if save
            raise e
          rescue => e
            handle_error(e) if save
            raise e
          end

          puts "=========== Finisher Executed ==========="
          begin
            save_outputs(outputs)
          rescue => e
            handle_error(e) if save
            raise e
          end

          update_finisher_done_status
        end
        proc.call
      end

      def save_type
        :executing
      end

      def update_to_server(opts = {})
        finisher_update(
          job_id: opts[:job_id],
          outputs: opts[:outputs],
          finisher_status: opts[:status])
      end

      # Mark the finisher as :starting on the server (no-op on dry-run).
      #
      # @raise [RuntimeError] When the server rejects the update.
      def update_finisher_starting_status
        return unless save

        response = finisher_update(
          job_id: job_id,
          finisher_status: :starting)

        if response.code == 200
          puts "Finisher Status Updated."
        else
          puts "Error: Unable to save Finisher Status to server: #{response.body}"
          raise "Unable to save Finisher Status to server: #{response.body}"
        end
      end

      # Mark the finisher as :done on the server (no-op on dry-run).
      #
      # @raise [RuntimeError] When the server rejects the update.
      def update_finisher_done_status
        return unless save

        response = finisher_update(
          job_id: job_id,
          finisher_status: :done)

        if response.code == 200
          puts "Finisher Done."
        else
          puts "Error: Unable to save Finisher Done Status to server: #{response.body}"
          raise "Unable to save Finisher Done Status to server: #{response.body}"
        end
      end

      # Report a failed finisher run with a cleaned backtrace.
      def handle_error(e)
        # Fixed: the original message left "(Job:#{job_id}" unclosed.
        error = ["Finisher #{e.class}: #{e.to_s} (Job:#{job_id})",clean_backtrace(e.backtrace)].join("\n")

        finisher_update(
          job_id: job_id,
          finisher_status: :failed,
          log_error: error)
      end

    end
  end
end