answersengine 0.2.33

Files changed (65)
  1. checksums.yaml +7 -0
  2. data/.gitignore +12 -0
  3. data/.travis.yml +7 -0
  4. data/CODE_OF_CONDUCT.md +74 -0
  5. data/Gemfile +6 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +30 -0
  8. data/Rakefile +22 -0
  9. data/answersengine.gemspec +45 -0
  10. data/bin/console +14 -0
  11. data/bin/setup +8 -0
  12. data/examples/fetchtest/libraries/hello.rb +9 -0
  13. data/examples/fetchtest/libraries/hello_fail.rb +10 -0
  14. data/examples/fetchtest/parsers/failed.rb +2 -0
  15. data/examples/fetchtest/parsers/find_outputs.rb +18 -0
  16. data/examples/fetchtest/parsers/home.rb +50 -0
  17. data/examples/fetchtest/parsers/nested_fail.rb +3 -0
  18. data/examples/fetchtest/parsers/simple.rb +14 -0
  19. data/examples/fetchtest/seeders/csv_seeder.rb +12 -0
  20. data/examples/fetchtest/seeders/failed.rb +1 -0
  21. data/examples/fetchtest/seeders/list_of_urls.csv +5 -0
  22. data/examples/fetchtest/seeders/seed.rb +28 -0
  23. data/examples/fetchtest/seeders/test_reset_page.rb +4 -0
  24. data/exe/answersengine +3 -0
  25. data/lib/answersengine.rb +5 -0
  26. data/lib/answersengine/cli.rb +33 -0
  27. data/lib/answersengine/cli/global_page.rb +39 -0
  28. data/lib/answersengine/cli/job.rb +30 -0
  29. data/lib/answersengine/cli/job_output.rb +69 -0
  30. data/lib/answersengine/cli/parser.rb +64 -0
  31. data/lib/answersengine/cli/scraper.rb +172 -0
  32. data/lib/answersengine/cli/scraper_deployment.rb +24 -0
  33. data/lib/answersengine/cli/scraper_export.rb +51 -0
  34. data/lib/answersengine/cli/scraper_exporter.rb +40 -0
  35. data/lib/answersengine/cli/scraper_job.rb +71 -0
  36. data/lib/answersengine/cli/scraper_page.rb +200 -0
  37. data/lib/answersengine/cli/seeder.rb +40 -0
  38. data/lib/answersengine/client.rb +23 -0
  39. data/lib/answersengine/client/backblaze_content.rb +45 -0
  40. data/lib/answersengine/client/base.rb +50 -0
  41. data/lib/answersengine/client/export.rb +10 -0
  42. data/lib/answersengine/client/global_page.rb +18 -0
  43. data/lib/answersengine/client/job.rb +53 -0
  44. data/lib/answersengine/client/job_export.rb +10 -0
  45. data/lib/answersengine/client/job_log.rb +27 -0
  46. data/lib/answersengine/client/job_output.rb +19 -0
  47. data/lib/answersengine/client/job_page.rb +62 -0
  48. data/lib/answersengine/client/job_stat.rb +16 -0
  49. data/lib/answersengine/client/scraper.rb +54 -0
  50. data/lib/answersengine/client/scraper_deployment.rb +17 -0
  51. data/lib/answersengine/client/scraper_export.rb +22 -0
  52. data/lib/answersengine/client/scraper_exporter.rb +14 -0
  53. data/lib/answersengine/client/scraper_job.rb +49 -0
  54. data/lib/answersengine/client/scraper_job_output.rb +19 -0
  55. data/lib/answersengine/client/scraper_job_page.rb +55 -0
  56. data/lib/answersengine/plugin.rb +6 -0
  57. data/lib/answersengine/plugin/context_exposer.rb +55 -0
  58. data/lib/answersengine/scraper.rb +16 -0
  59. data/lib/answersengine/scraper/executor.rb +292 -0
  60. data/lib/answersengine/scraper/parser.rb +18 -0
  61. data/lib/answersengine/scraper/ruby_parser_executor.rb +141 -0
  62. data/lib/answersengine/scraper/ruby_seeder_executor.rb +114 -0
  63. data/lib/answersengine/scraper/seeder.rb +18 -0
  64. data/lib/answersengine/version.rb +3 -0
  65. metadata +255 -0
data/lib/answersengine/client/scraper_job_output.rb
@@ -0,0 +1,19 @@
+ module AnswersEngine
+   module Client
+     class ScraperJobOutput < AnswersEngine::Client::Base
+       def find(scraper_name, collection, id)
+         self.class.get("/scrapers/#{scraper_name}/current_job/output/collections/#{collection}/records/#{id}", @options)
+       end
+
+       def all(scraper_name, collection = 'default')
+
+         self.class.get("/scrapers/#{scraper_name}/current_job/output/collections/#{collection}/records", @options)
+       end
+
+       def collections(scraper_name)
+         self.class.get("/scrapers/#{scraper_name}/current_job/output/collections", @options)
+       end
+     end
+   end
+ end
+
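For orientation, a short usage sketch of this client. It assumes ANSWERSENGINE_TOKEN is set and that AnswersEngine::Client::Base (data/lib/answersengine/client/base.rb above) is an HTTParty-style client that wires the token into @options, so responses expose parsed_response; the scraper name and record id are hypothetical.

  require 'answersengine'

  client = AnswersEngine::Client::ScraperJobOutput.new

  # List the output collections of my_scraper's current job
  puts client.collections('my_scraper').parsed_response

  # Fetch records from one collection, then a single record by id
  records = client.all('my_scraper', 'default')
  record  = client.find('my_scraper', 'default', 'some_record_id')
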
data/lib/answersengine/client/scraper_job_page.rb
@@ -0,0 +1,55 @@
+ module AnswersEngine
+   module Client
+     class ScraperJobPage < AnswersEngine::Client::Base
+       def find(scraper_name, gid)
+         self.class.get("/scrapers/#{scraper_name}/current_job/pages/#{gid}", @options)
+       end
+
+       def all(scraper_name, opts={})
+         self.class.get("/scrapers/#{scraper_name}/current_job/pages", @options)
+       end
+
+       def update(scraper_name, gid, opts={})
+         body = {}
+         body[:page_type] = opts[:page_type] if opts[:page_type]
+         body[:priority] = opts[:priority] if opts[:priority]
+         body[:vars] = opts[:vars] if opts[:vars]
+
+         @options.merge!({body: body.to_json})
+
+         self.class.put("/scrapers/#{scraper_name}/current_job/pages/#{gid}", @options)
+       end
+
+       def refetch(scraper_name, opts={})
+         self.class.put("/scrapers/#{scraper_name}/current_job/pages/refetch", @options)
+       end
+
+       def reset(scraper_name, gid, opts={})
+         self.class.put("/scrapers/#{scraper_name}/current_job/pages/#{gid}/reset", @options)
+       end
+
+       def enqueue(scraper_name, method, url, opts={})
+         body = {}
+         body[:method] = method != "" ? method : "GET"
+         body[:url] = url
+         body[:page_type] = opts[:page_type] if opts[:page_type]
+         body[:priority] = opts[:priority] if opts[:priority]
+         body[:fetch_type] = opts[:fetch_type] if opts[:fetch_type]
+         body[:body] = opts[:body] if opts[:body]
+         body[:headers] = opts[:headers] if opts[:headers]
+         body[:vars] = opts[:vars] if opts[:vars]
+         body[:force_fetch] = opts[:force_fetch] if opts[:force_fetch]
+         body[:freshness] = opts[:freshness] if opts[:freshness]
+         body[:ua_type] = opts[:ua_type] if opts[:ua_type]
+         body[:no_redirect] = opts[:no_redirect] if opts[:no_redirect]
+         body[:cookie] = opts[:cookie] if opts[:cookie]
+
+         @options.merge!({body: body.to_json})
+
+         self.class.post("/scrapers/#{scraper_name}/current_job/pages", @options)
+       end
+
+     end
+   end
+ end
+
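A sketch of how enqueue is meant to be called, mirroring the option keys handled above (the scraper name and URL are made up; only options present in opts are serialized into the JSON body):

  require 'answersengine'

  client = AnswersEngine::Client::ScraperJobPage.new

  # Enqueue a page on my_scraper's current job; an empty method
  # string falls back to "GET" per the ternary above.
  client.enqueue('my_scraper', '', 'https://example.com/listings',
    page_type: 'listings',
    priority: 1,
    vars: { 'category' => 'books' })
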
data/lib/answersengine/plugin.rb
@@ -0,0 +1,6 @@
+ require 'answersengine/plugin/context_exposer'
+
+ module AnswersEngine
+   module Plugin
+   end
+ end
data/lib/answersengine/plugin/context_exposer.rb
@@ -0,0 +1,55 @@
+ module AnswersEngine
+   module Plugin
+     module ContextExposer
+       def self.exposed_methods
+         raise NotImplementedError.new('Specify methods exposed to isolated env')
+       end
+
+       def exposed_methods
+         self.class.exposed_methods
+       end
+
+       # Create lambda to retrieve a variable or call instance method
+       def var_or_proc vars, key
+         myself = self # Avoid stack overflow
+         return lambda{vars[key]} if vars.has_key?(key)
+         lambda{|*args| myself.send(key, *args)}
+       end
+
+       def exposed_env vars
+         keys = exposed_methods + vars.keys
+         Hash[keys.uniq.map{|key| [key, var_or_proc(vars, key)]}]
+       end
+
+       def expose_to object, env
+         metaclass = class << object; self; end
+         env.each do |key, block|
+           metaclass.send(:define_method, key, block)
+         end
+         object
+       end
+
+       # Create isolated context object from self
+       def create_context vars = {}
+         create_top_object_script = '(
+           lambda do
+             object = Object.new
+             metaclass = class << object
+               define_method(:context_binding){binding}
+             end
+             object
+           end
+         ).call'
+         object = TOPLEVEL_BINDING.eval(create_top_object_script)
+         env = exposed_env(vars)
+         expose_to object, env
+         object
+       end
+
+       # Create an isolated binding
+       def isolated_binding vars = {}
+         create_context(vars).context_binding
+       end
+     end
+   end
+ end
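A minimal sketch of what ContextExposer enables (the class and method names below are hypothetical): a host class declares exposed_methods, and script text evaluated against isolated_binding can reach only those methods plus the supplied vars, not the rest of the host object.

  class DemoHost
    include AnswersEngine::Plugin::ContextExposer

    def self.exposed_methods
      [:greet] # the only host method visible inside the isolated env
    end

    def greet(name)
      "hello #{name}"
    end
  end

  host = DemoHost.new
  env_binding = host.isolated_binding(page: { 'url' => 'https://example.com' })

  # The script sees greet and page, but nothing else from DemoHost:
  eval("greet('world') + ' from ' + page['url']", env_binding)
  # => "hello world from https://example.com"
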
data/lib/answersengine/scraper.rb
@@ -0,0 +1,16 @@
+ require "answersengine/plugin"
+ require "answersengine/scraper/parser"
+ require "answersengine/scraper/seeder"
+ require "answersengine/scraper/executor"
+ require "answersengine/scraper/ruby_parser_executor"
+ require "answersengine/scraper/ruby_seeder_executor"
+ require "answersengine/client"
+
+ module AnswersEngine
+   module Scraper
+     # def self.list(opts={})
+     #   scraper = Client::Scraper.new(opts)
+     #   "Listing scrapers #{ENV['ANSWERSENGINE_TOKEN']} for #{scraper.all}"
+     # end
+   end
+ end
data/lib/answersengine/scraper/executor.rb
@@ -0,0 +1,292 @@
+ require 'nokogiri'
+ module AnswersEngine
+   module Scraper
+     # @abstract
+     class Executor
+       # Max allowed page size when querying outputs (see #find_outputs).
+       MAX_FIND_OUTPUTS_PER_PAGE = 500
+
+       attr_accessor :filename, :gid, :job_id
+
+       include AnswersEngine::Plugin::ContextExposer
+
+       def exec_parser(save=false)
+         raise "should be implemented in subclass"
+       end
+
+       def init_page()
+         if job_id
+           puts "getting Job Page"
+           init_job_page
+         else
+           puts "getting Global Page"
+           init_global_page()
+         end
+
+       end
+
+       def init_job_page()
+         client = Client::JobPage.new()
+         job_page = client.find(job_id, gid)
+         unless job_page.code == 200
+           raise "Job #{job_id} or GID #{gid} not found. Aborting execution!"
+         else
+           job_page
+         end
+
+       end
+
+       def parsing_update(options={})
+         client = Client::JobPage.new()
+         job_id = options.fetch(:job_id)
+         gid = options.fetch(:gid)
+
+         client.parsing_update(job_id, gid, options)
+       end
+
+       def seeding_update(options={})
+         client = Client::Job.new()
+         job_id = options.fetch(:job_id)
+
+         client.seeding_update(job_id, options)
+       end
+
+       def init_global_page()
+         client = Client::GlobalPage.new()
+         client.find(gid)
+       end
+
+       def get_content(gid)
+         client = Client::GlobalPage.new()
+         content_json = client.find_content(gid)
+
+         if content_json['available']
+           signed_url = content_json['signed_url']
+           Client::BackblazeContent.new.get_gunzipped_content(signed_url)
+         else
+           nil
+         end
+       end
+
+       def get_failed_content(gid)
+         client = Client::GlobalPage.new()
+         content_json = client.find_failed_content(gid)
+
+         if content_json['available']
+           signed_url = content_json['signed_url']
+           Client::BackblazeContent.new.get_gunzipped_content(signed_url)
+         else
+           nil
+         end
+       end
+
+       # Get the current job id from a scraper, or the default when
+       # scraper_name is nil.
+       #
+       # @param [String|nil] scraper_name Scraper name.
+       # @param [Integer|nil] default (nil) Default job id when no scraper name.
+       #
+       # @raise [Exception] When scraper name is not nil, and the scraper
+       #   doesn't exist or has no current job.
+       def get_job_id scraper_name, default = nil
+         return default if scraper_name.nil?
+         job = Client::ScraperJob.new().find(scraper_name)
+         raise JSON.pretty_generate(job) if job['id'].nil?
+         job['id']
+       end
+
+       # Find outputs by collection and query with pagination.
+       #
+       # @param [String] collection ('default') Collection name.
+       # @param [Hash] query ({}) Filters to query.
+       # @param [Integer] page (1) Page number.
+       # @param [Integer] per_page (30) Page size.
+       # @param [Hash] opts ({}) Configuration options.
+       # @option opts [String|nil] :scraper_name (nil) Scraper name to query
+       #   from.
+       # @option opts [Integer|nil] :job_id (nil) Job's id to query from.
+       #
+       # @raise [ArgumentError] +collection+ is not a String.
+       # @raise [ArgumentError] +query+ is not a Hash.
+       # @raise [ArgumentError] +page+ is not an Integer greater than 0.
+       # @raise [ArgumentError] +per_page+ is not an Integer between 1 and 500.
+       #
+       # @return [Array]
+       #
+       # @example
+       #   find_outputs
+       # @example
+       #   find_outputs 'my_collection'
+       # @example
+       #   find_outputs 'my_collection', {}
+       # @example
+       #   find_outputs 'my_collection', {}, 1
+       # @example
+       #   find_outputs 'my_collection', {}, 1, 30
+       # @example Find from another scraper by name
+       #   find_outputs 'my_collection', {}, 1, 30, scraper_name: 'my_scraper'
+       # @example Find from another scraper by job_id
+       #   find_outputs 'my_collection', {}, 1, 30, job_id: 123
+       #
+       # @note The `:job_id` option takes priority over `:scraper_name` when
+       #   both exist. If neither is provided, or both are nil, the current
+       #   job is queried instead; this is the default behavior.
+       def find_outputs(collection='default', query={}, page=1, per_page=30, opts = {})
+         # Validate parameters for easier usage and clearer errors.
+         raise ArgumentError.new("collection needs to be a String") unless collection.is_a?(String)
+         raise ArgumentError.new("query needs to be a Hash, instead of: #{query}") unless query.is_a?(Hash)
+         unless page.is_a?(Integer) && page > 0
+           raise ArgumentError.new("page needs to be an Integer greater than 0")
+         end
+         unless per_page.is_a?(Integer) && per_page > 0 && per_page <= MAX_FIND_OUTPUTS_PER_PAGE
+           raise ArgumentError.new("per_page needs to be an Integer between 1 and #{MAX_FIND_OUTPUTS_PER_PAGE}")
+         end
+
+         options = {
+           query: query,
+           page: page,
+           per_page: per_page}
+
+         # Get job_id
+         query_job_id = opts[:job_id] || get_job_id(opts[:scraper_name], self.job_id)
+
+         client = Client::JobOutput.new(options)
+         response = client.all(query_job_id, collection)
+
+         if response.code != 200
+           raise "response_code: #{response.code}|#{response.parsed_response}"
+         end
+         (response.body != 'null') ? response.parsed_response : []
+       end
+
+       # Find one output by collection and query.
+       #
+       # @param [String] collection ('default') Collection name.
+       # @param [Hash] query ({}) Filters to query.
+       # @param [Hash] opts ({}) Configuration options.
+       # @option opts [String|nil] :scraper_name (nil) Scraper name to query
+       #   from.
+       # @option opts [Integer|nil] :job_id (nil) Job's id to query from.
+       #
+       # @raise [ArgumentError] +collection+ is not a String.
+       # @raise [ArgumentError] +query+ is not a Hash.
+       #
+       # @return [Hash|nil] `Hash` when found, and `nil` when no output is found.
+       #
+       # @example
+       #   find_output
+       # @example
+       #   find_output 'my_collection'
+       # @example
+       #   find_output 'my_collection', {}
+       # @example Find from another scraper by name
+       #   find_output 'my_collection', {}, scraper_name: 'my_scraper'
+       # @example Find from another scraper by job_id
+       #   find_output 'my_collection', {}, job_id: 123
+       #
+       # @note The `:job_id` option takes priority over `:scraper_name` when
+       #   both exist. If neither is provided, or both are nil, the current
+       #   job is queried instead; this is the default behavior.
+       def find_output(collection='default', query={}, opts = {})
+         result = find_outputs(collection, query, 1, 1, opts)
+         result.respond_to?(:first) ? result.first : nil
+       end
+
+       def save_pages_and_outputs(pages = [], outputs = [], status)
+         total_pages = pages.count
+         total_outputs = outputs.count
+         records_per_slice = 100
+         until pages.empty? && outputs.empty?
+           pages_slice = pages.shift(records_per_slice)
+           outputs_slice = outputs.shift(records_per_slice)
+
+           log_msgs = []
+           unless pages_slice.empty?
+             log_msgs << "#{pages_slice.count} out of #{total_pages} Pages"
+             unless save
+               puts '----------------------------------------'
+               puts "Would have saved #{log_msgs.last}"
+               puts JSON.pretty_generate pages_slice
+             end
+           end
+
+           unless outputs_slice.empty?
+             log_msgs << "#{outputs_slice.count} out of #{total_outputs} Outputs"
+             unless save
+               puts '----------------------------------------'
+               puts "Would have saved #{log_msgs.last}"
+               puts JSON.pretty_generate outputs_slice
+             end
+           end
+
+           next unless save
+           log_msg = "Saving #{log_msgs.join(' and ')}."
+           puts log_msg
+
+           # saving to server
+           response = update_to_server(
+             job_id: job_id,
+             gid: gid,
+             pages: pages_slice,
+             outputs: outputs_slice,
+             status: status)
+
+           if response.code == 200
+             log_msg = "Saved."
+             puts log_msg
+           else
+             puts "Error: Unable to save Pages and/or Outputs to server: #{response.body}"
+             raise "Unable to save Pages and/or Outputs to server: #{response.body}"
+           end
+         end
+       end
+
+       def update_to_server(opts = {})
+         raise "Implemented in Subclass"
+       end
+
+       def clean_backtrace(backtrace)
+         i = backtrace.index{|x| x =~ /gems\/answersengine/i}
+         if i.to_i < 1
+           return []
+         else
+           return backtrace[0..(i-1)]
+         end
+       end
+
+       def save_type
+         raise NotImplementedError.new('Need to implement "save_type" method.')
+       end
+
+       # Saves pages from an array and clears it.
+       #
+       # @param [Array] pages ([]) Page array to save. Warning: all elements will
+       #   be removed from the array.
+       #
+       # @note IMPORTANT: +pages+ array's elements will be removed.
+       def save_pages(pages=[])
+         save_pages_and_outputs(pages, [], save_type)
+       end
+
+       # Saves outputs from an array and clears it.
+       #
+       # @param [Array] outputs ([]) Output array to save. Warning: all elements
+       #   will be removed from the array.
+       #
+       # @note IMPORTANT: +outputs+ array's elements will be removed.
+       def save_outputs(outputs=[])
+         save_pages_and_outputs([], outputs, save_type)
+       end
+
+       # Evaluate a file's contents with a custom binding.
+       #
+       # @param [String] file_path File path to read.
+       # @param [Binding] context Context binding to evaluate with.
+       #
+       # @note Using this method allows scripts to contain `return` to exit
+       #   the script early, along with some improved security.
+       def eval_with_context file_path, context
+         eval(File.read(file_path), context, file_path)
+       end
+     end
+   end
+ end
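Executor is abstract: the concrete classes in this release (ruby_parser_executor.rb and ruby_seeder_executor.rb, listed above) supply save_type and update_to_server. Below is a hypothetical minimal subclass, just to make concrete the contract that save_pages_and_outputs relies on; the delegation to parsing_update is an assumption about the expected wiring, not code from this gem.

  class DemoExecutor < AnswersEngine::Scraper::Executor
    # Status value that save_pages/save_outputs pass through to
    # save_pages_and_outputs.
    def save_type
      :parsing
    end

    # Receives job_id:, gid:, pages:, outputs:, status: from
    # save_pages_and_outputs and must return a response whose #code
    # is 200 on success. Here we assume parsing_update accepts the
    # same keys and forwards them to the API.
    def update_to_server(opts = {})
      parsing_update(opts)
    end
  end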