datahen 0.10.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78)
  1. checksums.yaml +7 -0
  2. data/.gitignore +12 -0
  3. data/.travis.yml +7 -0
  4. data/CODE_OF_CONDUCT.md +74 -0
  5. data/Gemfile +6 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +29 -0
  8. data/Rakefile +22 -0
  9. data/bin/console +14 -0
  10. data/bin/setup +8 -0
  11. data/datahen.gemspec +47 -0
  12. data/examples/fetchtest/libraries/hello.rb +9 -0
  13. data/examples/fetchtest/libraries/hello_fail.rb +10 -0
  14. data/examples/fetchtest/parsers/failed.rb +2 -0
  15. data/examples/fetchtest/parsers/find_outputs.rb +18 -0
  16. data/examples/fetchtest/parsers/home.rb +50 -0
  17. data/examples/fetchtest/parsers/nested_fail.rb +3 -0
  18. data/examples/fetchtest/parsers/simple.rb +14 -0
  19. data/examples/fetchtest/seeders/csv_seeder.rb +12 -0
  20. data/examples/fetchtest/seeders/failed.rb +1 -0
  21. data/examples/fetchtest/seeders/list_of_urls.csv +5 -0
  22. data/examples/fetchtest/seeders/seed.rb +28 -0
  23. data/examples/fetchtest/seeders/test_reset_page.rb +4 -0
  24. data/exe/hen +3 -0
  25. data/lib/datahen.rb +5 -0
  26. data/lib/datahen/cli.rb +45 -0
  27. data/lib/datahen/cli/env_var.rb +48 -0
  28. data/lib/datahen/cli/finisher.rb +40 -0
  29. data/lib/datahen/cli/global_page.rb +39 -0
  30. data/lib/datahen/cli/job.rb +30 -0
  31. data/lib/datahen/cli/job_output.rb +69 -0
  32. data/lib/datahen/cli/parser.rb +64 -0
  33. data/lib/datahen/cli/scraper.rb +185 -0
  34. data/lib/datahen/cli/scraper_deployment.rb +24 -0
  35. data/lib/datahen/cli/scraper_export.rb +51 -0
  36. data/lib/datahen/cli/scraper_exporter.rb +40 -0
  37. data/lib/datahen/cli/scraper_finisher.rb +20 -0
  38. data/lib/datahen/cli/scraper_job.rb +75 -0
  39. data/lib/datahen/cli/scraper_job_var.rb +48 -0
  40. data/lib/datahen/cli/scraper_page.rb +203 -0
  41. data/lib/datahen/cli/scraper_var.rb +48 -0
  42. data/lib/datahen/cli/seeder.rb +40 -0
  43. data/lib/datahen/client.rb +29 -0
  44. data/lib/datahen/client/auth_token.rb +50 -0
  45. data/lib/datahen/client/backblaze_content.rb +45 -0
  46. data/lib/datahen/client/base.rb +69 -0
  47. data/lib/datahen/client/deploy_key.rb +21 -0
  48. data/lib/datahen/client/env_var.rb +28 -0
  49. data/lib/datahen/client/export.rb +10 -0
  50. data/lib/datahen/client/global_page.rb +18 -0
  51. data/lib/datahen/client/job.rb +64 -0
  52. data/lib/datahen/client/job_export.rb +10 -0
  53. data/lib/datahen/client/job_log.rb +26 -0
  54. data/lib/datahen/client/job_output.rb +19 -0
  55. data/lib/datahen/client/job_page.rb +58 -0
  56. data/lib/datahen/client/job_stat.rb +16 -0
  57. data/lib/datahen/client/scraper.rb +57 -0
  58. data/lib/datahen/client/scraper_deployment.rb +18 -0
  59. data/lib/datahen/client/scraper_export.rb +22 -0
  60. data/lib/datahen/client/scraper_exporter.rb +14 -0
  61. data/lib/datahen/client/scraper_finisher.rb +16 -0
  62. data/lib/datahen/client/scraper_job.rb +49 -0
  63. data/lib/datahen/client/scraper_job_output.rb +19 -0
  64. data/lib/datahen/client/scraper_job_page.rb +67 -0
  65. data/lib/datahen/client/scraper_job_var.rb +28 -0
  66. data/lib/datahen/client/scraper_var.rb +28 -0
  67. data/lib/datahen/plugin.rb +6 -0
  68. data/lib/datahen/plugin/context_exposer.rb +55 -0
  69. data/lib/datahen/scraper.rb +18 -0
  70. data/lib/datahen/scraper/executor.rb +373 -0
  71. data/lib/datahen/scraper/finisher.rb +18 -0
  72. data/lib/datahen/scraper/parser.rb +18 -0
  73. data/lib/datahen/scraper/ruby_finisher_executor.rb +116 -0
  74. data/lib/datahen/scraper/ruby_parser_executor.rb +200 -0
  75. data/lib/datahen/scraper/ruby_seeder_executor.rb +120 -0
  76. data/lib/datahen/scraper/seeder.rb +18 -0
  77. data/lib/datahen/version.rb +3 -0
  78. metadata +270 -0
data/lib/datahen/plugin.rb
@@ -0,0 +1,6 @@
+ require 'datahen/plugin/context_exposer'
+
+ module Datahen
+   module Plugin
+   end
+ end
data/lib/datahen/plugin/context_exposer.rb
@@ -0,0 +1,55 @@
+ module Datahen
+   module Plugin
+     module ContextExposer
+       def self.exposed_methods
+         raise NotImplementedError.new('Specify methods exposed to isolated env')
+       end
+
+       def exposed_methods
+         self.class.exposed_methods
+       end
+
+       # Create lambda to retrieve a variable or call instance method
+       def var_or_proc vars, key
+         myself = self # Avoid stack overflow
+         return lambda{vars[key]} if vars.has_key?(key)
+         lambda{|*args| myself.send(key, *args)}
+       end
+
+       def exposed_env vars
+         keys = exposed_methods + vars.keys
+         Hash[keys.uniq.map{|key|[key, var_or_proc(vars, key)]}]
+       end
+
+       def expose_to object, env
+         metaclass = class << object; self; end
+         env.each do |key, block|
+           metaclass.send(:define_method, key, block)
+         end
+         object
+       end
+
+       # Create isolated context object from self
+       def create_context vars = {}
+         create_top_object_script = '(
+           lambda do
+             object = Object.new
+             metaclass = class << object
+               define_method(:context_binding){binding}
+             end
+             object
+           end
+         ).call'
+         object = TOPLEVEL_BINDING.eval(create_top_object_script)
+         env = exposed_env(vars)
+         expose_to object, env
+         object
+       end
+
+       # Create an isolated binding
+       def isolated_binding vars = {}
+         create_context(vars).context_binding
+       end
+     end
+   end
+ end
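
To see how `ContextExposer` is meant to be used, here is a minimal sketch: a host class includes the module, declares which of its methods are exposed, and evaluates script text against the isolated binding. `GreetingRunner`, its `greet` method, and the `page` variable are hypothetical names for illustration only; the pattern itself follows directly from the source above.

```ruby
require 'datahen/plugin/context_exposer'

# Hypothetical host class; only `greet` is exposed to the isolated env.
class GreetingRunner
  include Datahen::Plugin::ContextExposer

  def self.exposed_methods
    [:greet]
  end

  def greet(name)
    "Hello, #{name}!"
  end
end

runner = GreetingRunner.new
# Vars passed in win over methods of the same key (see var_or_proc).
ctx = runner.isolated_binding('page' => { 'url' => 'https://example.com' })
# The evaluated script sees only `greet` and `page`, not the rest of
# GreetingRunner or the calling scope.
puts ctx.eval("greet(page['url'])") # => Hello, https://example.com!
```

This is the same mechanism the executors rely on: `Executor` (in data/lib/datahen/scraper/executor.rb below) includes `ContextExposer` and runs user scripts through `eval_with_context` without leaking its own internals into them.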
data/lib/datahen/scraper.rb
@@ -0,0 +1,18 @@
+ require "datahen/plugin"
+ require "datahen/scraper/parser"
+ require "datahen/scraper/seeder"
+ require "datahen/scraper/finisher"
+ require "datahen/scraper/executor"
+ require "datahen/scraper/ruby_parser_executor"
+ require "datahen/scraper/ruby_seeder_executor"
+ require "datahen/scraper/ruby_finisher_executor"
+ require "datahen/client"
+
+ module Datahen
+   module Scraper
+     # def self.list(opts={})
+     #   scraper = Client::Scraper.new(opts)
+     #   "Listing scrapers #{ENV['DATAHEN_TOKEN']} for #{scraper.all}"
+     # end
+   end
+ end
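
The commented-out `self.list` above hints at how the client classes pulled in by these requires are used. A hedged sketch of that listing call, assuming a valid API token in `DATAHEN_TOKEN` (as the comment implies) and relying only on the `Client::Scraper#all` call it shows:

```ruby
require 'datahen'

# Assumes ENV['DATAHEN_TOKEN'] is set to a valid API token, as in the
# commented-out Datahen::Scraper.list above.
scraper = Datahen::Client::Scraper.new
response = scraper.all
# Responses expose code/parsed_response, as used in executor.rb below.
puts response.parsed_response if response.code == 200
```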
data/lib/datahen/scraper/executor.rb
@@ -0,0 +1,373 @@
+ require 'nokogiri'
+ module Datahen
+   module Scraper
+     # @abstract
+     class Executor
+       # Max allowed page size when querying outputs (see #find_outputs).
+       MAX_FIND_OUTPUTS_PER_PAGE = 500
+
+       attr_accessor :filename, :gid, :job_id
+
+       include Datahen::Plugin::ContextExposer
+
+       def exec_parser(save=false)
+         raise "should be implemented in subclass"
+       end
+
+       def init_page()
+         if job_id
+           puts "getting Job Page"
+           init_job_page
+         else
+           puts "getting Global Page"
+           init_global_page()
+         end
+
+       end
+
+       def init_job_page()
+         client = Client::JobPage.new()
+         job_page = client.find(job_id, gid)
+         unless job_page.code == 200
+           raise "Job #{job_id} or GID #{gid} not found. Aborting execution!"
+         else
+           job_page
+         end
+
+       end
+
+       def parsing_update(options={})
+         client = Client::JobPage.new()
+         job_id = options.fetch(:job_id)
+         gid = options.fetch(:gid)
+
+         client.parsing_update(job_id, gid, options)
+       end
+
+       def seeding_update(options={})
+         client = Client::Job.new()
+         job_id = options.fetch(:job_id)
+
+         client.seeding_update(job_id, options)
+       end
+
+       def finisher_update(options={})
+         client = Client::Job.new()
+         job_id = options.fetch(:job_id)
+
+         client.finisher_update(job_id, options)
+       end
+
+       def init_global_page()
+         client = Client::GlobalPage.new()
+         client.find(gid)
+       end
+
+       def get_content(gid)
+         client = Client::GlobalPage.new()
+         content_json = client.find_content(gid)
+
+         if content_json['available']
+           signed_url = content_json['signed_url']
+           Client::BackblazeContent.new.get_gunzipped_content(signed_url)
+         else
+           nil
+         end
+       end
+
+       def get_failed_content(gid)
+         client = Client::GlobalPage.new()
+         content_json = client.find_failed_content(gid)
+
+         if content_json['available']
+           signed_url = content_json['signed_url']
+           Client::BackblazeContent.new.get_gunzipped_content(signed_url)
+         else
+           nil
+         end
+       end
+
+       # Get current job id from scraper, or default when scraper_name is null.
+       #
+       # @param [String|nil] scraper_name Scraper name.
+       # @param [Integer|nil] default (nil) Default job id when no scraper name.
+       #
+       # @raise [Exception] When scraper name is not null, and the scraper
+       #   doesn't exist or has no current job.
+       def get_job_id scraper_name, default = nil
+         return default if scraper_name.nil?
+         job = Client::ScraperJob.new().find(scraper_name)
+         raise JSON.pretty_generate(job) if job['id'].nil?
+         job['id']
+       end
+
+       # Find outputs by collection and query with pagination.
+       #
+       # @param [String] collection ('default') Collection name.
+       # @param [Hash] query ({}) Filters to query.
+       # @param [Integer] page (1) Page number.
+       # @param [Integer] per_page (100) Page size.
+       # @param [Hash] opts ({}) Configuration options.
+       # @option opts [String|nil] :scraper_name (nil) Scraper name to query
+       #   from.
+       # @option opts [Integer|nil] :job_id (nil) Job's id to query from.
+       #
+       # @raise [ArgumentError] +collection+ is not String.
+       # @raise [ArgumentError] +query+ is not a Hash.
+       # @raise [ArgumentError] +page+ is not an Integer greater than 0.
+       # @raise [ArgumentError] +per_page+ is not an Integer between 1 and 500.
+       #
+       # @return [Array]
+       #
+       # @example
+       #   find_outputs
+       # @example
+       #   find_outputs 'my_collection'
+       # @example
+       #   find_outputs 'my_collection', {}
+       # @example
+       #   find_outputs 'my_collection', {}, 1
+       # @example
+       #   find_outputs 'my_collection', {}, 1, 100
+       # @example Find from another scraper by name
+       #   find_outputs 'my_collection', {}, 1, 100, scraper_name: 'my_scraper'
+       # @example Find from another scraper by job_id
+       #   find_outputs 'my_collection', {}, 1, 100, job_id: 123
+       #
+       # @note The `:job_id` option is prioritized over `:scraper_name` when
+       #   both exist. If neither is provided, or both are nil, then the
+       #   current job is queried instead; this is the default behavior.
+       def find_outputs(collection='default', query={}, page=1, per_page=100, opts = {})
+         # Validate parameters so nil values fail fast with clear errors.
+         raise ArgumentError.new("collection needs to be a String") unless collection.is_a?(String)
+         raise ArgumentError.new("query needs to be a Hash, instead of: #{query}") unless query.is_a?(Hash)
+         unless page.is_a?(Integer) && page > 0
+           raise ArgumentError.new("page needs to be an Integer greater than 0")
+         end
+         unless per_page.is_a?(Integer) && per_page > 0 && per_page <= MAX_FIND_OUTPUTS_PER_PAGE
+           raise ArgumentError.new("per_page needs to be an Integer between 1 and #{MAX_FIND_OUTPUTS_PER_PAGE}")
+         end
+
+         options = {
+           query: query,
+           page: page,
+           per_page: per_page}
+
+         # Get job_id
+         query_job_id = opts[:job_id] || get_job_id(opts[:scraper_name], self.job_id)
+
+         client = Client::JobOutput.new(options)
+         response = client.all(query_job_id, collection)
+
+         if response.code != 200
+           raise "response_code: #{response.code}|#{response.parsed_response}"
+         end
+         (response.body != 'null') ? response.parsed_response : []
+       end
+
+       # Find one output by collection and query.
+       #
+       # @param [String] collection ('default') Collection name.
+       # @param [Hash] query ({}) Filters to query.
+       # @param [Hash] opts ({}) Configuration options.
+       # @option opts [String|nil] :scraper_name (nil) Scraper name to query
+       #   from.
+       # @option opts [Integer|nil] :job_id (nil) Job's id to query from.
+       #
+       # @raise [ArgumentError] +collection+ is not String.
+       # @raise [ArgumentError] +query+ is not a Hash.
+       #
+       # @return [Hash|nil] `Hash` when found, and `nil` when no output is found.
+       #
+       # @example
+       #   find_output
+       # @example
+       #   find_output 'my_collection'
+       # @example
+       #   find_output 'my_collection', {}
+       # @example Find from another scraper by name
+       #   find_output 'my_collection', {}, scraper_name: 'my_scraper'
+       # @example Find from another scraper by job_id
+       #   find_output 'my_collection', {}, job_id: 123
+       #
+       # @note The `:job_id` option is prioritized over `:scraper_name` when
+       #   both exist. If neither is provided, or both are nil, then the
+       #   current job is queried instead; this is the default behavior.
+       def find_output(collection='default', query={}, opts = {})
+         result = find_outputs(collection, query, 1, 1, opts)
+         result.respond_to?(:first) ? result.first : nil
+       end
+
+       # Remove dups by prioritizing the latest dup.
+       #
+       # @param [Array] list List of hashes to dedup.
+       # @param [Hash] key_defaults Key and default value pair hash to use on
+       #   uniq validation.
+       #
+       # @return [Integer] Removed duplicated items count.
+       def remove_old_dups!(list, key_defaults)
+         raw_count = list.count
+         keys = key_defaults.keys
+         force_uniq = 0
+         list.reverse!.uniq! do |item|
+           # Extract stringified keys as a hash
+           key_hash = Hash[item.map{|k,v|keys.include?(k.to_s) ? [k.to_s,v] : nil}.select{|i|!i.nil?}]
+
+           # Apply defaults for uniq validation
+           key_defaults.each{|k,v| key_hash[k] = v if key_hash[k].nil?}
+
+           # Don't dedup nil key defaults
+           skip_dedup = !keys.find{|k| key_hash[k].nil?}.nil?
+           skip_dedup ? (force_uniq += 1) : key_hash
+         end
+         list.reverse!
+         dup_count = raw_count - list.count
+         dup_count
+       end
+
+       # Remove page dups by prioritizing the latest dup.
+       #
+       # @param [Array] list List of pages to dedup.
+       #
+       # @return [Integer] Removed duplicated items count.
+       #
+       # @note It will not dedup for now, as it is hard to build the gid.
+       #   TODO: Build gid so we can dedup
+       def remove_old_page_dups!(list)
+         key_defaults = {
+           'gid' => nil
+         }
+         remove_old_dups! list, key_defaults
+       end
+
+       # Remove output dups by prioritizing the latest dup.
+       #
+       # @param [Array] list List of outputs to dedup.
+       #
+       # @return [Integer] Removed duplicated items count.
+       def remove_old_output_dups!(list)
+         key_defaults = {
+           '_id' => nil,
+           '_collection' => 'default'
+         }
+         remove_old_dups! list, key_defaults
+       end
+
+       def save_pages_and_outputs(pages = [], outputs = [], status)
+         total_pages = pages.count
+         total_outputs = outputs.count
+         records_per_slice = 100
+         until pages.empty? && outputs.empty?
+           pages_slice = pages.shift(records_per_slice)
+           pages_dup_count = remove_old_page_dups! pages_slice
+           outputs_slice = outputs.shift(records_per_slice)
+           outputs_dup_count = remove_old_output_dups! outputs_slice
+
+           log_msgs = []
+           unless pages_slice.empty?
+             page_dups_ignored = pages_dup_count > 0 ? " (#{pages_dup_count} dups ignored)" : ''
+             log_msgs << "#{pages_slice.count} out of #{total_pages} Pages#{page_dups_ignored}"
+
+             unless save
+               puts '----------------------------------------'
+               puts "Trying to validate #{log_msgs.last}#{page_dups_ignored}"
+               puts JSON.pretty_generate pages_slice
+             end
+           end
+
+           unless outputs_slice.empty?
+             output_dups_ignored = outputs_dup_count > 0 ? " (#{outputs_dup_count} dups ignored)" : ''
+             log_msgs << "#{outputs_slice.count} out of #{total_outputs} Outputs#{output_dups_ignored}"
+
+             unless save
+               puts '----------------------------------------'
+               puts "Trying to validate #{log_msgs.last}#{output_dups_ignored}"
+               puts JSON.pretty_generate outputs_slice
+             end
+           end
+
+           # behave differently if it is a real save
+           if save
+             log_msg = "Saving #{log_msgs.join(' and ')}."
+             puts "#{log_msg}"
+           else
+             status = "#{status}_try"
+           end
+
+           # saving to server
+           response = update_to_server(
+             job_id: job_id,
+             gid: gid,
+             pages: pages_slice,
+             outputs: outputs_slice,
+             status: status)
+
+           if response.code == 200
+             if save
+               log_msg = "Saved."
+               puts "#{log_msg}"
+             else
+               puts "Validation successful"
+             end
+           else
+             if save
+               puts "Error: Unable to save Pages and/or Outputs to server: #{response.body}"
+               raise "Unable to save Pages and/or Outputs to server: #{response.body}"
+             else
+               puts "Error: Invalid Pages and/or Outputs: #{response.body}"
+               raise "Invalid Pages and/or Outputs: #{response.body}"
+             end
+           end
+         end
+       end
+
+       def update_to_server(opts = {})
+         raise "Implemented in Subclass"
+       end
+
+       def clean_backtrace(backtrace)
+         i = backtrace.index{|x| x =~ /gems\/datahen/i}
+         if i.to_i < 1
+           return []
+         else
+           return backtrace[0..(i-1)]
+         end
+       end
+
+       def save_type
+         raise NotImplementedError.new('Need to implement "save_type" method.')
+       end
+
+       # Saves pages from an array and clears it.
+       #
+       # @param [Array] pages ([]) Page array to save. Warning: all elements
+       #   will be removed from the array.
+       #
+       # @note IMPORTANT: +pages+ array's elements will be removed.
+       def save_pages(pages=[])
+         save_pages_and_outputs(pages, [], save_type)
+       end
+
+       # Saves outputs from an array and clears it.
+       #
+       # @param [Array] outputs ([]) Output array to save. Warning: all elements
+       #   will be removed from the array.
+       #
+       # @note IMPORTANT: +outputs+ array's elements will be removed.
+       def save_outputs(outputs=[])
+         save_pages_and_outputs([], outputs, save_type)
+       end
+
+       # Eval a filename with a custom binding
+       #
+       # @param [String] file_path File path to read.
+       # @param [Binding] context Context binding to evaluate with.
+       #
+       # @note Using this method allows scripts to contain `return` to exit
+       #   the script early, along with some improved security.
+       def eval_with_context file_path, context
+         eval(File.read(file_path), context, file_path)
+       end
+     end
+   end
+ end
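
Taken together, `Executor` is an abstract template: the `ruby_*_executor.rb` files listed above subclass it and must supply `save_type` and `update_to_server`, plus a `save` flag, since `save_pages_and_outputs` reads `save` without ever defining it. A minimal sketch of that contract with no real network calls; `NullExecutor`, its canned response, and the `:parsing` status label are hypothetical:

```ruby
require 'datahen'

# Hypothetical subclass illustrating the contract Executor expects;
# a real executor would send the slices to the DataHen API instead.
class NullExecutor < Datahen::Scraper::Executor
  FakeResponse = Struct.new(:code, :body)

  # Executor#save_pages_and_outputs reads `save` but never defines it,
  # so the subclass decides between persisting (true) and validating (false).
  attr_accessor :save

  def save_type
    :parsing # hypothetical status label passed to save_pages_and_outputs
  end

  def update_to_server(opts = {})
    FakeResponse.new(200, 'ok') # pretend the server accepted the slice
  end
end

executor = NullExecutor.new
executor.job_id = 123 # hypothetical job id
executor.save = true
pages = [{ 'url' => 'https://example.com' }]
executor.save_pages(pages) # slices by 100, dedups, then "uploads"
pages.empty? # => true: save_pages_and_outputs consumes the array via shift
```

Note the destructive batching: pages and outputs are drained with `shift` in slices of 100, each slice is deduped (latest duplicate wins, per `remove_old_dups!`), and a non-200 response from `update_to_server` raises and aborts the remaining slices.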