datahen 0.10.4

Files changed (78)
  1. checksums.yaml +7 -0
  2. data/.gitignore +12 -0
  3. data/.travis.yml +7 -0
  4. data/CODE_OF_CONDUCT.md +74 -0
  5. data/Gemfile +6 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +29 -0
  8. data/Rakefile +22 -0
  9. data/bin/console +14 -0
  10. data/bin/setup +8 -0
  11. data/datahen.gemspec +47 -0
  12. data/examples/fetchtest/libraries/hello.rb +9 -0
  13. data/examples/fetchtest/libraries/hello_fail.rb +10 -0
  14. data/examples/fetchtest/parsers/failed.rb +2 -0
  15. data/examples/fetchtest/parsers/find_outputs.rb +18 -0
  16. data/examples/fetchtest/parsers/home.rb +50 -0
  17. data/examples/fetchtest/parsers/nested_fail.rb +3 -0
  18. data/examples/fetchtest/parsers/simple.rb +14 -0
  19. data/examples/fetchtest/seeders/csv_seeder.rb +12 -0
  20. data/examples/fetchtest/seeders/failed.rb +1 -0
  21. data/examples/fetchtest/seeders/list_of_urls.csv +5 -0
  22. data/examples/fetchtest/seeders/seed.rb +28 -0
  23. data/examples/fetchtest/seeders/test_reset_page.rb +4 -0
  24. data/exe/hen +3 -0
  25. data/lib/datahen.rb +5 -0
  26. data/lib/datahen/cli.rb +45 -0
  27. data/lib/datahen/cli/env_var.rb +48 -0
  28. data/lib/datahen/cli/finisher.rb +40 -0
  29. data/lib/datahen/cli/global_page.rb +39 -0
  30. data/lib/datahen/cli/job.rb +30 -0
  31. data/lib/datahen/cli/job_output.rb +69 -0
  32. data/lib/datahen/cli/parser.rb +64 -0
  33. data/lib/datahen/cli/scraper.rb +185 -0
  34. data/lib/datahen/cli/scraper_deployment.rb +24 -0
  35. data/lib/datahen/cli/scraper_export.rb +51 -0
  36. data/lib/datahen/cli/scraper_exporter.rb +40 -0
  37. data/lib/datahen/cli/scraper_finisher.rb +20 -0
  38. data/lib/datahen/cli/scraper_job.rb +75 -0
  39. data/lib/datahen/cli/scraper_job_var.rb +48 -0
  40. data/lib/datahen/cli/scraper_page.rb +203 -0
  41. data/lib/datahen/cli/scraper_var.rb +48 -0
  42. data/lib/datahen/cli/seeder.rb +40 -0
  43. data/lib/datahen/client.rb +29 -0
  44. data/lib/datahen/client/auth_token.rb +50 -0
  45. data/lib/datahen/client/backblaze_content.rb +45 -0
  46. data/lib/datahen/client/base.rb +69 -0
  47. data/lib/datahen/client/deploy_key.rb +21 -0
  48. data/lib/datahen/client/env_var.rb +28 -0
  49. data/lib/datahen/client/export.rb +10 -0
  50. data/lib/datahen/client/global_page.rb +18 -0
  51. data/lib/datahen/client/job.rb +64 -0
  52. data/lib/datahen/client/job_export.rb +10 -0
  53. data/lib/datahen/client/job_log.rb +26 -0
  54. data/lib/datahen/client/job_output.rb +19 -0
  55. data/lib/datahen/client/job_page.rb +58 -0
  56. data/lib/datahen/client/job_stat.rb +16 -0
  57. data/lib/datahen/client/scraper.rb +57 -0
  58. data/lib/datahen/client/scraper_deployment.rb +18 -0
  59. data/lib/datahen/client/scraper_export.rb +22 -0
  60. data/lib/datahen/client/scraper_exporter.rb +14 -0
  61. data/lib/datahen/client/scraper_finisher.rb +16 -0
  62. data/lib/datahen/client/scraper_job.rb +49 -0
  63. data/lib/datahen/client/scraper_job_output.rb +19 -0
  64. data/lib/datahen/client/scraper_job_page.rb +67 -0
  65. data/lib/datahen/client/scraper_job_var.rb +28 -0
  66. data/lib/datahen/client/scraper_var.rb +28 -0
  67. data/lib/datahen/plugin.rb +6 -0
  68. data/lib/datahen/plugin/context_exposer.rb +55 -0
  69. data/lib/datahen/scraper.rb +18 -0
  70. data/lib/datahen/scraper/executor.rb +373 -0
  71. data/lib/datahen/scraper/finisher.rb +18 -0
  72. data/lib/datahen/scraper/parser.rb +18 -0
  73. data/lib/datahen/scraper/ruby_finisher_executor.rb +116 -0
  74. data/lib/datahen/scraper/ruby_parser_executor.rb +200 -0
  75. data/lib/datahen/scraper/ruby_seeder_executor.rb +120 -0
  76. data/lib/datahen/scraper/seeder.rb +18 -0
  77. data/lib/datahen/version.rb +3 -0
  78. metadata +270 -0

data/lib/datahen/plugin.rb
@@ -0,0 +1,6 @@
+ require 'datahen/plugin/context_exposer'
+
+ module Datahen
+   module Plugin
+   end
+ end

data/lib/datahen/plugin/context_exposer.rb
@@ -0,0 +1,55 @@
+ module Datahen
+   module Plugin
+     module ContextExposer
+       def self.exposed_methods
+         raise NotImplementedError.new('Specify methods exposed to isolated env')
+       end
+
+       def exposed_methods
+         self.class.exposed_methods
+       end
+
+       # Create lambda to retrieve a variable or call instance method
+       def var_or_proc vars, key
+         myself = self # Avoid stack overflow
+         return lambda{vars[key]} if vars.has_key?(key)
+         lambda{|*args| myself.send(key, *args)}
+       end
+
+       def exposed_env vars
+         keys = exposed_methods + vars.keys
+         Hash[keys.uniq.map{|key|[key, var_or_proc(vars, key)]}]
+       end
+
+       def expose_to object, env
+         metaclass = class << object; self; end
+         env.each do |key, block|
+           metaclass.send(:define_method, key, block)
+         end
+         object
+       end
+
+       # Create isolated context object from self
+       def create_context vars = {}
+         create_top_object_script = '(
+           lambda do
+             object = Object.new
+             metaclass = class << object
+               define_method(:context_binding){binding}
+             end
+             object
+           end
+         ).call'
+         object = TOPLEVEL_BINDING.eval(create_top_object_script)
+         env = exposed_env(vars)
+         expose_to object, env
+         object
+       end
+
+       # Create an isolated binding
+       def isolated_binding vars = {}
+         create_context(vars).context_binding
+       end
+     end
+   end
+ end
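
As a usage sketch (the `Greeter` class below is hypothetical, not part of the gem): a class that includes ContextExposer declares which instance methods to expose via `exposed_methods`, passes extra values through `vars`, and gets back a binding whose receiver is a bare `Object`, so evaluated code only sees what was explicitly exposed:

    require 'datahen'

    class Greeter
      include Datahen::Plugin::ContextExposer

      # Required by ContextExposer: methods visible in the isolated env
      def self.exposed_methods
        [:greet]
      end

      def greet(name)
        "Hello, #{name}!"
      end
    end

    ctx = Greeter.new.isolated_binding(version: '0.10.4')
    puts ctx.eval('greet("world")') # => Hello, world!
    puts ctx.eval('version')        # => 0.10.4

Note that `var_or_proc` checks `vars` first, so a var with the same name as an exposed method shadows that method.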

data/lib/datahen/scraper.rb
@@ -0,0 +1,18 @@
+ require "datahen/plugin"
+ require "datahen/scraper/parser"
+ require "datahen/scraper/seeder"
+ require "datahen/scraper/finisher"
+ require "datahen/scraper/executor"
+ require "datahen/scraper/ruby_parser_executor"
+ require "datahen/scraper/ruby_seeder_executor"
+ require "datahen/scraper/ruby_finisher_executor"
+ require "datahen/client"
+
+ module Datahen
+   module Scraper
+     # def self.list(opts={})
+     #   scraper = Client::Scraper.new(opts)
+     #   "Listing scrapers #{ENV['DATAHEN_TOKEN']} for #{scraper.all}"
+     # end
+   end
+ end

data/lib/datahen/scraper/executor.rb
@@ -0,0 +1,373 @@
+ require 'nokogiri'
+ module Datahen
+   module Scraper
+     # @abstract
+     class Executor
+       # Max allowed page size when querying outputs (see #find_outputs).
+       MAX_FIND_OUTPUTS_PER_PAGE = 500
+
+       attr_accessor :filename, :gid, :job_id
+
+       include Datahen::Plugin::ContextExposer
+
+       def exec_parser(save=false)
+         raise "should be implemented in subclass"
+       end
+
+       def init_page()
+         if job_id
+           puts "getting Job Page"
+           init_job_page
+         else
+           puts "getting Global Page"
+           init_global_page()
+         end
+
+       end
+
+       def init_job_page()
+         client = Client::JobPage.new()
+         job_page = client.find(job_id, gid)
+         unless job_page.code == 200
+           raise "Job #{job_id} or GID #{gid} not found. Aborting execution!"
+         else
+           job_page
+         end
+
+       end
+
+       def parsing_update(options={})
+         client = Client::JobPage.new()
+         job_id = options.fetch(:job_id)
+         gid = options.fetch(:gid)
+
+         client.parsing_update(job_id, gid, options)
+       end
+
+       def seeding_update(options={})
+         client = Client::Job.new()
+         job_id = options.fetch(:job_id)
+
+         client.seeding_update(job_id, options)
+       end
+
+       def finisher_update(options={})
+         client = Client::Job.new()
+         job_id = options.fetch(:job_id)
+
+         client.finisher_update(job_id, options)
+       end
+
+       def init_global_page()
+         client = Client::GlobalPage.new()
+         client.find(gid)
+       end
+
+       def get_content(gid)
+         client = Client::GlobalPage.new()
+         content_json = client.find_content(gid)
+
+         if content_json['available']
+           signed_url = content_json['signed_url']
+           Client::BackblazeContent.new.get_gunzipped_content(signed_url)
+         else
+           nil
+         end
+       end
+
+       def get_failed_content(gid)
+         client = Client::GlobalPage.new()
+         content_json = client.find_failed_content(gid)
+
+         if content_json['available']
+           signed_url = content_json['signed_url']
+           Client::BackblazeContent.new.get_gunzipped_content(signed_url)
+         else
+           nil
+         end
+       end
+
+       # Get current job id from scraper, or default when scraper_name is nil.
+       #
+       # @param [String|nil] scraper_name Scraper name.
+       # @param [Integer|nil] default (nil) Default job id when no scraper name.
+       #
+       # @raise [Exception] When scraper name is not nil and the scraper
+       #   doesn't exist or has no current job.
+       def get_job_id scraper_name, default = nil
+         return default if scraper_name.nil?
+         job = Client::ScraperJob.new().find(scraper_name)
+         raise JSON.pretty_generate(job) if job['id'].nil?
+         job['id']
+       end
+
+       # Find outputs by collection and query with pagination.
+       #
+       # @param [String] collection ('default') Collection name.
+       # @param [Hash] query ({}) Filters to query.
+       # @param [Integer] page (1) Page number.
+       # @param [Integer] per_page (100) Page size.
+       # @param [Hash] opts ({}) Configuration options.
+       # @option opts [String|nil] :scraper_name (nil) Scraper name to query
+       #   from.
+       # @option opts [Integer|nil] :job_id (nil) Job's id to query from.
+       #
+       # @raise [ArgumentError] +collection+ is not String.
+       # @raise [ArgumentError] +query+ is not a Hash.
+       # @raise [ArgumentError] +page+ is not an Integer greater than 0.
+       # @raise [ArgumentError] +per_page+ is not an Integer between 1 and 500.
+       #
+       # @return [Array]
+       #
+       # @example
+       #   find_outputs
+       # @example
+       #   find_outputs 'my_collection'
+       # @example
+       #   find_outputs 'my_collection', {}
+       # @example
+       #   find_outputs 'my_collection', {}, 1
+       # @example
+       #   find_outputs 'my_collection', {}, 1, 100
+       # @example Find from another scraper by name
+       #   find_outputs 'my_collection', {}, 1, 100, scraper_name: 'my_scraper'
+       # @example Find from another scraper by job_id
+       #   find_outputs 'my_collection', {}, 1, 100, job_id: 123
+       #
+       # @note The `:job_id` option takes priority over `:scraper_name` when
+       #   both exist. If neither is provided (or both are nil), the current
+       #   job is queried instead; this is the default behavior.
+       def find_outputs(collection='default', query={}, page=1, per_page=100, opts = {})
+         # Validate parameters up front for easier usage.
+         raise ArgumentError.new("collection needs to be a String") unless collection.is_a?(String)
+         raise ArgumentError.new("query needs to be a Hash, instead of: #{query}") unless query.is_a?(Hash)
+         unless page.is_a?(Integer) && page > 0
+           raise ArgumentError.new("page needs to be an Integer greater than 0")
+         end
+         unless per_page.is_a?(Integer) && per_page > 0 && per_page <= MAX_FIND_OUTPUTS_PER_PAGE
+           raise ArgumentError.new("per_page needs to be an Integer between 1 and #{MAX_FIND_OUTPUTS_PER_PAGE}")
+         end
+
+         options = {
+           query: query,
+           page: page,
+           per_page: per_page}
+
+         # Get job_id
+         query_job_id = opts[:job_id] || get_job_id(opts[:scraper_name], self.job_id)
+
+         client = Client::JobOutput.new(options)
+         response = client.all(query_job_id, collection)
+
+         if response.code != 200
+           raise "response_code: #{response.code}|#{response.parsed_response}"
+         end
+         (response.body != 'null') ? response.parsed_response : []
+       end
+
+       # Find one output by collection and query.
+       #
+       # @param [String] collection ('default') Collection name.
+       # @param [Hash] query ({}) Filters to query.
+       # @param [Hash] opts ({}) Configuration options.
+       # @option opts [String|nil] :scraper_name (nil) Scraper name to query
+       #   from.
+       # @option opts [Integer|nil] :job_id (nil) Job's id to query from.
+       #
+       # @raise [ArgumentError] +collection+ is not String.
+       # @raise [ArgumentError] +query+ is not a Hash.
+       #
+       # @return [Hash|nil] `Hash` when found, and `nil` when no output is found.
+       #
+       # @example
+       #   find_output
+       # @example
+       #   find_output 'my_collection'
+       # @example
+       #   find_output 'my_collection', {}
+       # @example Find from another scraper by name
+       #   find_output 'my_collection', {}, scraper_name: 'my_scraper'
+       # @example Find from another scraper by job_id
+       #   find_output 'my_collection', {}, job_id: 123
+       #
+       # @note The `:job_id` option takes priority over `:scraper_name` when
+       #   both exist. If neither is provided (or both are nil), the current
+       #   job is queried instead; this is the default behavior.
+       def find_output(collection='default', query={}, opts = {})
+         result = find_outputs(collection, query, 1, 1, opts)
+         result.respond_to?(:first) ? result.first : nil
+       end
+
+       # Remove dups by prioritizing the latest dup.
+       #
+       # @param [Array] list List of hashes to dedup.
+       # @param [Hash] key_defaults Key and default value pair hash to use on
+       #   uniq validation.
+       #
+       # @return [Integer] Removed duplicated items count.
+       def remove_old_dups!(list, key_defaults)
+         raw_count = list.count
+         keys = key_defaults.keys
+         force_uniq = 0
+         list.reverse!.uniq! do |item|
+           # Extract stringified keys as hash
+           key_hash = Hash[item.map{|k,v|keys.include?(k.to_s) ? [k.to_s,v] : nil}.select{|i|!i.nil?}]
+
+           # Apply defaults for uniq validation
+           key_defaults.each{|k,v| key_hash[k] = v if key_hash[k].nil?}
+
+           # Don't dedup nil key defaults
+           skip_dedup = !keys.find{|k| key_hash[k].nil?}.nil?
+           skip_dedup ? (force_uniq += 1) : key_hash
+         end
+         list.reverse!
+         dup_count = raw_count - list.count
+         dup_count
+       end
+
+       # Remove page dups by prioritizing the latest dup.
+       #
+       # @param [Array] list List of pages to dedup.
+       #
+       # @return [Integer] Removed duplicated items count.
+       #
+       # @note It will not dedup for now as it is hard to build gid.
+       #   TODO: Build gid so we can dedup
+       def remove_old_page_dups!(list)
+         key_defaults = {
+           'gid' => nil
+         }
+         remove_old_dups! list, key_defaults
+       end
+
+       # Remove output dups by prioritizing the latest dup.
+       #
+       # @param [Array] list List of outputs to dedup.
+       #
+       # @return [Integer] Removed duplicated items count.
+       def remove_old_output_dups!(list)
+         key_defaults = {
+           '_id' => nil,
+           '_collection' => 'default'
+         }
+         remove_old_dups! list, key_defaults
+       end
+
+       def save_pages_and_outputs(pages = [], outputs = [], status)
+         total_pages = pages.count
+         total_outputs = outputs.count
+         records_per_slice = 100
+         until pages.empty? && outputs.empty?
+           pages_slice = pages.shift(records_per_slice)
+           pages_dup_count = remove_old_page_dups! pages_slice
+           outputs_slice = outputs.shift(records_per_slice)
+           outputs_dup_count = remove_old_output_dups! outputs_slice
+
+           log_msgs = []
+           unless pages_slice.empty?
+             page_dups_ignored = pages_dup_count > 0 ? " (#{pages_dup_count} dups ignored)" : ''
+             log_msgs << "#{pages_slice.count} out of #{total_pages} Pages#{page_dups_ignored}"
+
+             unless save
+               puts '----------------------------------------'
+               puts "Trying to validate #{log_msgs.last}#{page_dups_ignored}"
+               puts JSON.pretty_generate pages_slice
+             end
+           end
+
+           unless outputs_slice.empty?
+             output_dups_ignored = outputs_dup_count > 0 ? " (#{outputs_dup_count} dups ignored)" : ''
+             log_msgs << "#{outputs_slice.count} out of #{total_outputs} Outputs#{output_dups_ignored}"
+
+             unless save
+               puts '----------------------------------------'
+               puts "Trying to validate #{log_msgs.last}#{output_dups_ignored}"
+               puts JSON.pretty_generate outputs_slice
+             end
+           end
+
+           # behave differently if it is a real save
+           if save
+             log_msg = "Saving #{log_msgs.join(' and ')}."
+             puts "#{log_msg}"
+           else
+             status = "#{status}_try"
+           end
+
+           # saving to server
+           response = update_to_server(
+             job_id: job_id,
+             gid: gid,
+             pages: pages_slice,
+             outputs: outputs_slice,
+             status: status)
+
+           if response.code == 200
+             if save
+               log_msg = "Saved."
+               puts "#{log_msg}"
+             else
+               puts "Validation successful"
+             end
+           else
+             if save
+               puts "Error: Unable to save Pages and/or Outputs to server: #{response.body}"
+               raise "Unable to save Pages and/or Outputs to server: #{response.body}"
+             else
+               puts "Error: Invalid Pages and/or Outputs: #{response.body}"
+               raise "Invalid Pages and/or Outputs: #{response.body}"
+             end
+           end
+         end
+       end
+
+       def update_to_server(opts = {})
+         raise "Implemented in Subclass"
+       end
+
+       def clean_backtrace(backtrace)
+         i = backtrace.index{|x| x =~ /gems\/datahen/i}
+         if i.to_i < 1
+           return []
+         else
+           return backtrace[0..(i-1)]
+         end
+       end
+
+       def save_type
+         raise NotImplementedError.new('Need to implement "save_type" method.')
+       end
+
+       # Saves pages from an array and clears it.
+       #
+       # @param [Array] pages ([]) Page array to save. Warning: all elements will
+       #   be removed from the array.
+       #
+       # @note IMPORTANT: +pages+ array's elements will be removed.
+       def save_pages(pages=[])
+         save_pages_and_outputs(pages, [], save_type)
+       end
+
+       # Saves outputs from an array and clears it.
+       #
+       # @param [Array] outputs ([]) Output array to save. Warning: all elements
+       #   will be removed from the array.
+       #
+       # @note IMPORTANT: +outputs+ array's elements will be removed.
+       def save_outputs(outputs=[])
+         save_pages_and_outputs([], outputs, save_type)
+       end
+
+       # Evaluate a file with a custom binding.
+       #
+       # @param [String] file_path File path to read.
+       # @param [Binding] context Context binding to evaluate with.
+       #
+       # @note Using this method allows scripts to contain `return` to exit
+       #   the script early, along with some improved security.
+       def eval_with_context file_path, context
+         eval(File.read(file_path), context, file_path)
+       end
+     end
+   end
+ end
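
To make the dedup semantics concrete, here is a minimal sketch (it uses `allocate` purely for illustration, since Executor is abstract and `remove_old_output_dups!` touches no instance state): the latest duplicate wins, `_collection` falls back to 'default' during comparison, and records whose `_id` resolves to nil are never deduped because `force_uniq` feeds them a unique integer instead of a key:

    require 'datahen'

    executor = Datahen::Scraper::Executor.allocate

    outputs = [
      { '_id' => 'a', 'price' => 1 },                             # older dup, dropped
      { '_id' => 'a', '_collection' => 'default', 'price' => 2 }, # newer dup, kept
      { 'price' => 3 }                                            # nil _id, never deduped
    ]

    puts executor.remove_old_output_dups!(outputs) # => 1
    puts outputs.inspect
    # => [{"_id"=>"a", "_collection"=>"default", "price"=>2}, {"price"=>3}]

One caveat when reading `save_pages_and_outputs`: the `save` flag it branches on is not defined in this file; it is presumably supplied by the concrete executor subclasses (compare `exec_parser(save=false)`), with the `*_try` status driving the dry-run validation path.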