dh_easy-core 0.2.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (79) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +12 -0
  3. data/.travis.yml +7 -0
  4. data/.yardopts +1 -0
  5. data/CODE_OF_CONDUCT.md +74 -0
  6. data/Gemfile +6 -0
  7. data/LICENSE +21 -0
  8. data/README.md +20 -0
  9. data/Rakefile +22 -0
  10. data/dh_easy-core.gemspec +50 -0
  11. data/doc/DhEasy.html +117 -0
  12. data/doc/DhEasy/Core.html +1590 -0
  13. data/doc/DhEasy/Core/Config.html +311 -0
  14. data/doc/DhEasy/Core/Exception.html +117 -0
  15. data/doc/DhEasy/Core/Exception/OutdatedError.html +135 -0
  16. data/doc/DhEasy/Core/Helper.html +117 -0
  17. data/doc/DhEasy/Core/Helper/Cookie.html +1070 -0
  18. data/doc/DhEasy/Core/Mock.html +282 -0
  19. data/doc/DhEasy/Core/Mock/FakeDb.html +3779 -0
  20. data/doc/DhEasy/Core/Mock/FakeExecutor.html +3289 -0
  21. data/doc/DhEasy/Core/Mock/FakeFinisher.html +160 -0
  22. data/doc/DhEasy/Core/Mock/FakeParser.html +160 -0
  23. data/doc/DhEasy/Core/Mock/FakeSeeder.html +160 -0
  24. data/doc/DhEasy/Core/Plugin.html +117 -0
  25. data/doc/DhEasy/Core/Plugin/CollectionVault.html +299 -0
  26. data/doc/DhEasy/Core/Plugin/ConfigBehavior.html +541 -0
  27. data/doc/DhEasy/Core/Plugin/ContextIntegrator.html +445 -0
  28. data/doc/DhEasy/Core/Plugin/Executor.html +259 -0
  29. data/doc/DhEasy/Core/Plugin/ExecutorBehavior.html +344 -0
  30. data/doc/DhEasy/Core/Plugin/Finisher.html +265 -0
  31. data/doc/DhEasy/Core/Plugin/FinisherBehavior.html +142 -0
  32. data/doc/DhEasy/Core/Plugin/InitializeHook.html +220 -0
  33. data/doc/DhEasy/Core/Plugin/Parser.html +270 -0
  34. data/doc/DhEasy/Core/Plugin/ParserBehavior.html +235 -0
  35. data/doc/DhEasy/Core/Plugin/Seeder.html +674 -0
  36. data/doc/DhEasy/Core/Plugin/SeederBehavior.html +142 -0
  37. data/doc/DhEasy/Core/SmartCollection.html +1087 -0
  38. data/doc/_index.html +364 -0
  39. data/doc/class_list.html +51 -0
  40. data/doc/css/common.css +1 -0
  41. data/doc/css/full_list.css +58 -0
  42. data/doc/css/style.css +496 -0
  43. data/doc/file.README.html +91 -0
  44. data/doc/file_list.html +56 -0
  45. data/doc/frames.html +17 -0
  46. data/doc/index.html +91 -0
  47. data/doc/js/app.js +303 -0
  48. data/doc/js/full_list.js +216 -0
  49. data/doc/js/jquery.js +4 -0
  50. data/doc/method_list.html +939 -0
  51. data/doc/top-level-namespace.html +110 -0
  52. data/lib/dh_easy/core.rb +257 -0
  53. data/lib/dh_easy/core/config.rb +27 -0
  54. data/lib/dh_easy/core/exception.rb +8 -0
  55. data/lib/dh_easy/core/exception/outdated_error.rb +9 -0
  56. data/lib/dh_easy/core/helper.rb +8 -0
  57. data/lib/dh_easy/core/helper/cookie.rb +209 -0
  58. data/lib/dh_easy/core/mock.rb +45 -0
  59. data/lib/dh_easy/core/mock/fake_db.rb +561 -0
  60. data/lib/dh_easy/core/mock/fake_executor.rb +373 -0
  61. data/lib/dh_easy/core/mock/fake_finisher.rb +28 -0
  62. data/lib/dh_easy/core/mock/fake_parser.rb +33 -0
  63. data/lib/dh_easy/core/mock/fake_seeder.rb +28 -0
  64. data/lib/dh_easy/core/plugin.rb +19 -0
  65. data/lib/dh_easy/core/plugin/collection_vault.rb +23 -0
  66. data/lib/dh_easy/core/plugin/config_behavior.rb +43 -0
  67. data/lib/dh_easy/core/plugin/context_integrator.rb +60 -0
  68. data/lib/dh_easy/core/plugin/executor.rb +19 -0
  69. data/lib/dh_easy/core/plugin/executor_behavior.rb +32 -0
  70. data/lib/dh_easy/core/plugin/finisher.rb +19 -0
  71. data/lib/dh_easy/core/plugin/finisher_behavior.rb +9 -0
  72. data/lib/dh_easy/core/plugin/initialize_hook.rb +17 -0
  73. data/lib/dh_easy/core/plugin/parser.rb +19 -0
  74. data/lib/dh_easy/core/plugin/parser_behavior.rb +17 -0
  75. data/lib/dh_easy/core/plugin/seeder.rb +44 -0
  76. data/lib/dh_easy/core/plugin/seeder_behavior.rb +9 -0
  77. data/lib/dh_easy/core/smart_collection.rb +236 -0
  78. data/lib/dh_easy/core/version.rb +6 -0
  79. metadata +249 -0
@@ -0,0 +1,373 @@
1
+ module DhEasy
2
+ module Core
3
+ module Mock
4
+ # Fake executor that emulates `Datahen` executor.
5
+ module FakeExecutor
6
+ include Datahen::Plugin::ContextExposer
7
+
8
+ # Max allowed page size when query outputs (see #find_outputs).
9
+ MAX_FIND_OUTPUTS_PER_PAGE = 500
10
+
11
+ # Page content.
12
+ # @return [String,nil]
13
+ attr_accessor :content
14
+ # Failed page content.
15
+ # @return [String,nil]
16
+ attr_accessor :failed_content
17
+
18
+ # Validate executor methods compatibility.
19
+ # @private
20
+ #
21
+ # @param [Array] origin Datahen executor method collection.
22
+ # @param [Array] fragment Fake executor method collection.
23
+ #
24
+ # @return [Hash]
25
+ # @raise [DhEasy::Core::Exception::OutdatedError] When missing methods.
26
def self.check_compatibility(origin, fragment)
  # Build the report once; it is read through its :new, :missing and
  # :is_compatible keys below and returned to the caller on success.
  report = DhEasy::Core.analyze_compatibility origin, fragment
  unmapped = report[:new]

  # New methods on the datahen side only trigger a warning, never an error.
  if unmapped.count >= 1
    notice = <<-LONGDESC.gsub(/^\s+/,'')
      It seems datahen has new unmapped methods, try updating
      dh_easy-core gem or contacting gem maintainer to update it.
      New methods: #{unmapped.join ', '}
    LONGDESC
    warn notice
  end

  # Nothing missing: hand the report back.
  return report if report[:is_compatible]

  # Missing methods are fatal.
  message = <<-LONGDESC.gsub(/^\s+/,'')
    There are missing methods! Check your datahen gem version.
    Missing methods: #{report[:missing].join ', '}
  LONGDESC
  raise DhEasy::Core::Exception::OutdatedError.new(message)
end
49
+
50
+ # Draft pages, usually get saved after execution.
51
+ # @return [Array]
52
def pages
  # Create the draft page list on first access, then keep returning it.
  @pages = [] unless @pages
  @pages
end
55
+
56
+ # Draft outputs, usually get saved after execution.
57
+ # @return [Array]
58
def outputs
  # Create the draft output list on first access, then keep returning it.
  @outputs || (@outputs = [])
end
61
+
62
+ # Remove all elements on pages.
63
+ # @private
64
def clear_draft_pages
  # @pages is nil when the object was built without :pages and the draft
  # list has not been read yet, so initialize it before clearing instead of
  # calling #clear on nil.
  (@pages ||= []).clear
end
67
+
68
+ # Remove all elements on outputs.
69
+ # @private
70
def clear_draft_outputs
  # @outputs is nil when the object was built without :outputs and the draft
  # list has not been read yet, so initialize it before clearing instead of
  # calling #clear on nil.
  (@outputs ||= []).clear
end
73
+
74
+ # Fake database that represents what has been saved.
75
def db
  # Reuse the existing fake db; build one only on first access.
  return @db if @db

  @db = DhEasy::Core::Mock::FakeDb.new
end
78
+
79
+ # Initialize object.
80
+ #
81
+ # @param [Hash] opts ({}) Configuration options.
82
+ # @option opts [Array] :pages (nil) Array to initialize pages, can be nil for empty.
83
+ # @option opts [Array] :outputs (nil) Array to initialize outputs, can be nil for empty.
84
+ # @option opts [Integer] :job_id (nil) A number to represent the job_id.
85
+ # @option opts [Hash] :page (nil) Current page.
86
+ #
87
+ # @raise [ArgumentError] When pages or outputs are not Array.
88
def initialize(opts = {})
  # Draft pages: nil is accepted (the list is created lazily elsewhere),
  # anything else must be an Array.
  draft_pages = opts[:pages]
  if !draft_pages.nil? && !draft_pages.is_a?(Array)
    raise ArgumentError.new "Pages must be an array."
  end
  @pages = draft_pages

  # Draft outputs follow the same rule.
  draft_outputs = opts[:outputs]
  if !draft_outputs.nil? && !draft_outputs.is_a?(Array)
    raise ArgumentError.new "Outputs must be an array."
  end
  @outputs = draft_outputs

  # These go through the setters, in this order: job id, scraper name, page.
  self.job_id = opts[:job_id]
  self.scraper_name = opts[:scraper_name]
  self.page = opts[:page]
end
101
+
102
+ # Fake scraper name used by executor.
103
+ # @return [String,nil]
104
def scraper_name
  # The value lives on the fake db; this is a plain delegation.
  store = db
  store.scraper_name
end
107
+
108
+ # Set fake scraper name value.
109
def scraper_name=(value)
  # Store the name on the fake db.
  store = db
  store.scraper_name = value
end
112
+
113
+ # Fake job ID used by executor.
114
+ # @return [Integer,nil]
115
def job_id
  # The value lives on the fake db; this is a plain delegation.
  store = db
  store.job_id
end
118
+
119
+ # Set fake job ID value.
120
def job_id=(value)
  # Keep the fake db and the current page in sync with the new id.
  db.job_id = value
  current_page = page
  current_page['job_id'] = value
end
124
+
125
+ # Current page used by executor.
126
+ # @return [Hash,nil]
127
def page
  # Return the current page; when there is none, build a fake one tagged
  # with the current job id and remember it.
  return @page if @page

  @page = DhEasy::Core::Mock::FakeDb.build_fake_page job_id: job_id
end
130
+
131
+ # Set current page.
132
def page=(value)
  # nil simply unsets the current page.
  if value.nil?
    @page = nil
    return
  end

  # presumably build_page normalizes the given page - confirm in FakeDb
  built = DhEasy::Core::Mock::FakeDb.build_page value
  page_job_id = built['job_id']
  # A job id carried by the page becomes the current job id...
  self.job_id = page_job_id unless page_job_id.nil?
  # ...otherwise the page takes the current job id.
  built['job_id'] ||= job_id
  page_gid = built['gid']
  db.page_gid = page_gid unless page_gid.nil?
  @page = built
end
141
+
142
+ # Refetch self page flag.
143
+ # @return [Boolean]
144
+ # @note It is stronger than #reparse_self flag.
145
def refetch_self
  # The flag is false until it has been set to a truthy value.
  @refetch_self = false unless @refetch_self
  @refetch_self
end
148
+
149
+ # Set refetch self page flag.
150
def refetch_self=(value)
  # Plain writer for the refetch flag.
  @refetch_self = value
end
153
+
154
+ # Reparse self page flag.
155
+ # @return [Boolean]
156
def reparse_self
  # The flag is false until it has been set to a truthy value.
  @reparse_self = false unless @reparse_self
  @reparse_self
end
159
+
160
+ # Set reparse self page flag.
161
def reparse_self=(value)
  # Plain writer for the reparse flag.
  @reparse_self = value
end
164
+
165
+ # Retrieve a list of saved jobs.
166
def saved_jobs
  # Jobs are read straight from the fake db.
  store = db
  store.jobs
end
169
+
170
+ # Retrieve a list of saved pages. Drafted pages can be included.
171
def saved_pages
  # Pages are read straight from the fake db.
  store = db
  store.pages
end
174
+
175
+ # Retrieve a list of saved outputs.
176
def saved_outputs
  # Outputs are read straight from the fake db.
  store = db
  store.outputs
end
179
+
180
+ # Save a job collection on db and remove all the elements from +list+.
181
+ #
182
+ # @param [Array] list Collection of jobs to save.
183
def save_jobs(list)
  # Append every job to the db collection, one at a time...
  list.each do |job|
    db.jobs << job
  end
  # ...then empty the caller's list in place.
  list.clear
end
187
+
188
+ # Save a page collection on db and remove all the elements from +list+.
189
+ #
190
+ # @param [Array] list Collection of pages to save.
191
def save_pages(list)
  # Append every page to the db collection, one at a time...
  list.each do |draft|
    db.pages << draft
  end
  # ...then empty the caller's list in place.
  list.clear
end
195
+
196
+ # Save an output collection on db and remove all the elements from
197
+ # +list+.
198
+ #
199
+ # @param [Array] list Collection of outputs to save.
200
def save_outputs(list)
  # Append every output to the db collection, one at a time...
  list.each do |record|
    db.outputs << record
  end
  # ...then empty the caller's list in place.
  list.clear
end
204
+
205
+ # Execute any action applied to current page
206
def flush_self_actions
  # Save the current page before refetch/reparse.
  if refetch_self || reparse_self
    # Turn the gid override on only when it is currently off, and remember
    # that we did so, so that only our own change is undone afterwards.
    temp_page_gid_override = !db.allow_page_gid_override?
    db.enable_page_gid_override if temp_page_gid_override
    begin
      save_pages [page]
    ensure
      # Restore the flag even when saving raises, so a failed save does not
      # leave the db with the override switched on.
      db.disable_page_gid_override if temp_page_gid_override
    end
  end
  db.refetch(page['job_id'], page['gid']) if refetch_self
  db.reparse(page['job_id'], page['gid']) if reparse_self
end
217
+
218
+ # Save draft pages into db and clear draft queue.
219
def flush_pages
  # Persist the drafts, then make sure the draft queue is empty.
  drafts = pages
  save_pages(drafts)
  clear_draft_pages
end
223
+
224
+ # Save draft outputs into db and clear draft queue.
225
def flush_outputs
  # Persist the drafts, then make sure the draft queue is empty.
  drafts = outputs
  save_outputs(drafts)
  clear_draft_outputs
end
229
+
230
+ # Save all drafts into db and clear draft queues.
231
def flush
  # Order matters to callers of this method: pages first, then outputs,
  # and the self refetch/reparse actions last.
  flush_pages()
  flush_outputs()
  flush_self_actions()
end
236
+
237
+ # Get latest job by scraper_name.
238
+ #
239
+ # @param [String] scraper_name Scraper name.
240
+ # @param [Hash] filter ({}) Additional_filters.
241
+ #
242
+ # @return [Hash,nil] Return nil if no scraper_name or scraper_name is
243
+ # nil.
244
def latest_job_by(scraper_name, filter = {})
  # Without a scraper name there is nothing to look up.
  return nil if scraper_name.nil?

  # The scraper name is merged over the caller's filter.
  candidates = db.query(
    :jobs,
    filter.merge('scraper_name' => scraper_name)
  )
  # Pick the entry with the greatest 'created_at'.
  candidates.max do |left, right|
    left['created_at'] <=> right['created_at']
  end
end
249
+
250
+ # Find outputs by collection and query with pagination.
251
+ #
252
+ # @param [String] collection ('default') Collection name.
253
+ # @param [Hash] query ({}) Filters to query.
254
+ # @param [Integer] page (1) Page number.
255
+ # @param [Integer] per_page (30) Page size.
256
+ # @param [Hash] opts ({}) Configuration options.
257
+ # @option opts [String,nil] :scraper_name (nil) Scraper name to query
258
+ # from.
259
+ # @option opts [Integer,nil] :job_id (nil) Job's id to query from.
260
+ #
261
+ # @raise [ArgumentError] +collection+ is not String.
262
+ # @raise [ArgumentError] +query+ is not a Hash.
263
+ # @raise [ArgumentError] +page+ is not an Integer greater than 0.
264
+ # @raise [ArgumentError] +per_page+ is not an Integer between 1 and 500.
265
+ #
266
+ # @return [Array]
267
+ #
268
+ # @example
269
+ # find_outputs
270
+ # @example
271
+ # find_outputs 'my_collection'
272
+ # @example
273
+ # find_outputs 'my_collection', {}
274
+ # @example
275
+ # find_outputs 'my_collection', {}, 1
276
+ # @example
277
+ # find_outputs 'my_collection', {}, 1, 30
278
+ # @example Find from another scraper by name
279
+ # find_outputs 'my_collection', {}, 1, 30, scraper_name: 'my_scraper'
280
+ # @example Find from another scraper by job_id
281
+ # find_outputs 'my_collection', {}, 1, 30, job_id: 123
282
+ #
283
+ # @note The `:job_id` option is prioritized over `:scraper_name` when
283
+ # both exist. If neither is provided (or both are nil), the current job
284
+ # is used for the query instead; this is the default behavior.
286
def find_outputs(collection = 'default', query = {}, page = 1, per_page = 30, opts = {})
  # Validate every positional argument before touching the db.
  raise ArgumentError.new("collection needs to be a String.") unless collection.is_a?(String)
  raise ArgumentError.new("query needs to be a Hash.") unless query.is_a?(Hash)
  unless page.is_a?(Integer) && page > 0
    raise ArgumentError.new("page needs to be an Integer greater than 0.")
  end
  unless per_page.is_a?(Integer) && per_page > 0 && per_page <= MAX_FIND_OUTPUTS_PER_PAGE
    raise ArgumentError.new("per_page needs to be an Integer between 1 and #{MAX_FIND_OUTPUTS_PER_PAGE}.")
  end

  # Pages are 1-based; translate to a record offset.
  offset = (page - 1) * per_page

  # An explicit :job_id wins. Only when it is absent is the latest job of
  # :scraper_name looked up, falling back to the current job id when that
  # lookup yields nothing.
  target_job_id = opts[:job_id] || begin
    job = latest_job_by(opts[:scraper_name])
    job.nil? ? job_id : job['job_id']
  end

  # The collection and job id are merged over the caller's query.
  fixed_query = query.merge(
    '_collection' => collection,
    '_job_id' => target_job_id
  )
  db.query :outputs, fixed_query, offset, per_page
end
305
+
306
+ # Find the first output matching a collection and query.
307
+ #
308
+ # @param [String] collection ('default') Collection name.
309
+ # @param [Hash] query ({}) Filters to query.
310
+ # @param [Hash] opts ({}) Configuration options.
311
+ # @option opts [String,nil] :scraper_name (nil) Scraper name to query
312
+ # from.
313
+ # @option opts [Integer,nil] :job_id (nil) Job's id to query from.
314
+ #
315
+ # @raise [ArgumentError] +collection+ is not String.
316
+ # @raise [ArgumentError] +query+ is not a Hash.
317
+ #
318
+ # @return [Hash, nil]
319
+ #
320
+ # @example
321
+ # find_output
322
+ # @example
323
+ # find_output 'my_collection'
324
+ # @example
325
+ # find_output 'my_collection', {}
326
+ # @example Find from another scraper by name
327
+ # find_output 'my_collection', {}, scraper_name: 'my_scraper'
328
+ # @example Find from another scraper by job_id
329
+ # find_output 'my_collection', {}, job_id: 123
330
+ #
331
+ # @note The `:job_id` option is prioritized over `:scraper_name` when
331
+ # both exist. If neither is provided (or both are nil), the current job
332
+ # is used for the query instead; this is the default behavior.
334
def find_output(collection = 'default', query = {}, opts = {})
  # Ask for the first page with a page size of one and unwrap the single
  # record; a nil result is passed through as nil.
  matches = find_outputs(collection, query, 1, 1, opts)
  matches&.first
end
338
+
339
+ # Execute a script file as an executor.
340
+ #
341
+ # @param [String] file_path Script file path to execute.
342
def execute_script(file_path, vars = {})
  # NOTE(review): the file content is evaluated as Ruby code, so only
  # trusted script files should be passed in.
  source = File.read(file_path)
  # The binding comes from isolated_binding(vars); presumably it exposes
  # +vars+ to the script - confirm in its definition.
  context = isolated_binding(vars)
  # file_path is passed to eval as the script's file name.
  eval(source, context, file_path)
  # Flush once the script has run.
  flush
end
346
+
347
+ # Refetch a page by gid.
348
+ #
349
+ # @param [String] gid Page's gid to refetch.
350
def refetch(gid)
  raise ArgumentError.new("gid needs to be a String.") unless gid.is_a?(String)

  # Any page other than the current one is refetched through the db.
  unless page['gid'] == gid
    return db.refetch(job_id, gid)
  end

  # The current page is only flagged here; nothing is sent to the db.
  self.refetch_self = true
  nil
end
358
+
359
+ # Reparse a page by gid.
360
+ #
361
+ # @param [String] page_gid Page's gid to reparse.
362
def reparse(page_gid)
  raise ArgumentError.new("page_gid needs to be a String.") unless page_gid.is_a?(String)

  # Any page other than the current one is reparsed through the db.
  unless page['gid'] == page_gid
    return db.reparse(job_id, page_gid)
  end

  # The current page is only flagged here; nothing is sent to the db.
  self.reparse_self = true
  nil
end
370
+ end
371
+ end
372
+ end
373
+ end
@@ -0,0 +1,28 @@
1
+ module DhEasy
2
+ module Core
3
+ module Mock
4
+ # Fake finisher that emulates `Datahen` finisher executor.
5
class FakeFinisher
  include DhEasy::Core::Mock::FakeExecutor

  # Methods this fake finisher exposes to the isolated context.
  # @private
  #
  # @return [Array<Symbol>] Frozen list of method names.
  def self.exposed_methods
    datahen_methods = Datahen::Scraper::RubyFinisherExecutor.exposed_methods.uniq
    fake_methods = %i[outputs save_outputs find_output find_outputs]
    # The compatibility check runs against the list WITHOUT :job_id.
    DhEasy::Core::Mock::FakeExecutor.check_compatibility(datahen_methods, fake_methods)
    # :job_id is appended only after the check; the same array is then
    # frozen and returned.
    fake_methods.push(:job_id).freeze
  end
end
26
+ end
27
+ end
28
+ end
@@ -0,0 +1,33 @@
1
+ module DhEasy
2
+ module Core
3
+ module Mock
4
+ # Fake parser that emulates `Datahen` parser executor.
5
class FakeParser
  include DhEasy::Core::Mock::FakeExecutor

  # Methods this fake parser exposes to the isolated context.
  # @private
  #
  # @return [Array<Symbol>] Frozen list of method names.
  def self.exposed_methods
    datahen_methods = Datahen::Scraper::RubyParserExecutor.exposed_methods.uniq
    fake_methods = %i[
      content
      failed_content
      outputs
      pages
      page
      save_pages
      save_outputs
      find_output
      find_outputs
      refetch
      reparse
    ].freeze
    # Raises when the fake list names a method the real executor lacks.
    DhEasy::Core::Mock::FakeExecutor.check_compatibility(datahen_methods, fake_methods)
    fake_methods
  end
end
31
+ end
32
+ end
33
+ end
@@ -0,0 +1,28 @@
1
+ module DhEasy
2
+ module Core
3
+ module Mock
4
+ # Fake seeder that emulates `Datahen` seeder executor.
5
class FakeSeeder
  include DhEasy::Core::Mock::FakeExecutor

  # Methods this fake seeder exposes to the isolated context.
  # @private
  #
  # @return [Array<Symbol>] Frozen list of method names.
  def self.exposed_methods
    datahen_methods = Datahen::Scraper::RubySeederExecutor.exposed_methods.uniq
    fake_methods = %i[
      outputs
      pages
      save_pages
      save_outputs
      find_output
      find_outputs
    ].freeze
    # Raises when the fake list names a method the real executor lacks.
    DhEasy::Core::Mock::FakeExecutor.check_compatibility(datahen_methods, fake_methods)
    fake_methods
  end
end
26
+ end
27
+ end
28
+ end