mdarray-jcsv 0.6.3-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. checksums.yaml +7 -0
  2. data/LICENSE.txt +23 -0
  3. data/README.md +2 -0
  4. data/Rakefile +46 -0
  5. data/config.rb +104 -0
  6. data/lib/constraints.rb +205 -0
  7. data/lib/date_filters.rb +252 -0
  8. data/lib/dimensions.rb +276 -0
  9. data/lib/filters.rb +332 -0
  10. data/lib/jcsv.rb +107 -0
  11. data/lib/list_reader.rb +200 -0
  12. data/lib/locale.rb +192 -0
  13. data/lib/map_reader.rb +192 -0
  14. data/lib/mdarray-jcsv.rb +24 -0
  15. data/lib/mdarray_reader.rb +110 -0
  16. data/lib/numeric_filters.rb +225 -0
  17. data/lib/reader.rb +547 -0
  18. data/lib/supercsv_interface.rb +231 -0
  19. data/test/test_complete.rb +37 -0
  20. data/test/test_critbit.rb +442 -0
  21. data/test/test_customer_list.rb +436 -0
  22. data/test/test_customer_map.rb +209 -0
  23. data/test/test_customer_nhlist.rb +161 -0
  24. data/test/test_deep_map.rb +264 -0
  25. data/test/test_del.rb +73 -0
  26. data/test/test_dimensions.rb +231 -0
  27. data/test/test_example.rb +79 -0
  28. data/test/test_filters.rb +374 -0
  29. data/test/test_list_dimensions.rb +110 -0
  30. data/test/test_mdarray.rb +227 -0
  31. data/test/test_missing_data.rb +57 -0
  32. data/vendor/commons-beanutils-1.8.3.jar +0 -0
  33. data/vendor/commons-lang3-3.1.jar +0 -0
  34. data/vendor/dozer-5.4.0.jar +0 -0
  35. data/vendor/jcl-over-slf4j-1.6.6.jar +0 -0
  36. data/vendor/joda-time-2.7.jar +0 -0
  37. data/vendor/slf4j-api-1.7.5.jar +0 -0
  38. data/vendor/snakeyaml-1.14.jar +0 -0
  39. data/vendor/super-csv-2.4.0.jar +0 -0
  40. data/vendor/super-csv-dozer-2.4.0.jar +0 -0
  41. data/vendor/super-csv-java8-2.4.0.jar +0 -0
  42. data/vendor/super-csv-joda-2.4.0.jar +0 -0
  43. data/version.rb +2 -0
  44. metadata +196 -0
data/lib/reader.rb
@@ -0,0 +1,547 @@
+ # -*- coding: utf-8 -*-
+
+ ##########################################################################################
+ # @author Rodrigo Botafogo
+ #
+ # Copyright © 2015 Rodrigo Botafogo. All Rights Reserved. Permission to use, copy, modify,
+ # and distribute this software and its documentation, without fee and without a signed
+ # licensing agreement, is hereby granted, provided that the above copyright notice, this
+ # paragraph and the following two paragraphs appear in all copies, modifications, and
+ # distributions.
+ #
+ # IN NO EVENT SHALL RODRIGO BOTAFOGO BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, SPECIAL,
+ # INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF
+ # THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF RODRIGO BOTAFOGO HAS BEEN ADVISED OF THE
+ # POSSIBILITY OF SUCH DAMAGE.
+ #
+ # RODRIGO BOTAFOGO SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ # THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE
+ # SOFTWARE AND ACCOMPANYING DOCUMENTATION, IF ANY, PROVIDED HEREUNDER IS PROVIDED "AS IS".
+ # RODRIGO BOTAFOGO HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS,
+ # OR MODIFICATIONS.
+ ##########################################################################################
+
+ require_relative 'dimensions'
+
+ ##########################################################################################
+ #
+ ##########################################################################################
+
+ class String
+   def underscore
+     self.gsub(/::/, '/').
+       gsub(/([A-Z]+)([A-Z][a-z])/,'\1_\2').
+       gsub(/([a-z\d])([A-Z])/,'\1_\2').
+       tr("-", "_").
+       downcase
+   end
+ end
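The underscore monkey patch is what later turns raw CSV headers into Ruby symbols (see _prepare_headers further down). A small illustrative sketch of its behaviour, with hypothetical sample strings:

    "CamelCase".underscore    # => "camel_case"
    "HTMLParser".underscore   # => "html_parser"
    "Last-Name".underscore    # => "last_name"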
+
+ ##########################################################################################
+ #
+ ##########################################################################################
+
+ class Jcsv
+
+   #========================================================================================
+   #
+   #========================================================================================
+
+   module Header
+
+     #---------------------------------------------------------------------------------------
+     #
+     #---------------------------------------------------------------------------------------
+
+
+     #---------------------------------------------------------------------------------------
+     #
+     #---------------------------------------------------------------------------------------
+
+     def filters=(filters)
+
+       case filters
+       when Hash
+         filters = filters.inject({}){|memo,(k,v)| memo[k.to_sym] = v; memo} unless
+           @strings_as_keys
+         filters.each do |column_name, processor|
+           @filters[column_name] = processor
+         end
+       when Array
+         raise "One filter needed for each column. Filters size: #{filters.size}" if
+           headers.size != filters.size
+         filters.each_with_index do |processor, i|
+           @filters[i] = processor
+         end
+       else
+         raise ArgumentError.new("Filters parameters should either be a hash or an array of filters")
+       end
+
+     end
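For reference, a sketch of how a caller might set filters once a reader exists. The column names are hypothetical, and the Jcsv.int / Jcsv.double filter constructors are assumptions based on the gem's filter files (numeric_filters.rb, filters.rb), which are not shown in this hunk; Jcsv.optional appears below as the constructor's default filter:

    # keyed by header symbol, for a file with headers
    reader.filters = { age: Jcsv.int, balance: Jcsv.double }

    # or positional: one filter per column, in column order
    reader.filters = [Jcsv.optional, Jcsv.int, Jcsv.double]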
+
+     #---------------------------------------------------------------------------------------
+     # A chunk is either one row of the file, or an array with rows. One row can be either
+     # a one dimensional array with all columns or a hash with all columns (excluding the
+     # dimensions).
+     #---------------------------------------------------------------------------------------
+
+     def parse_with_block(&block)
+
+       # if there is a valid column_mapping, then we need to change the mapped_header
+       mapped_header = @headers
+       if (@column_mapping.mapping)
+         mapped_header = Array.new
+         @column_mapping.mapping.each_with_index do |map, index|
+           mapped_header[map] = @headers[index] if (map.is_a? Numeric)
+         end
+       end
+
+       while (!((chunk = read_chunk).nil?))
+         if (mapped_header.size == 0)
+           block.call(@reader.getLineNumber(), @reader.getRowNumber(), format(chunk))
+         else
+           block.call(@reader.getLineNumber(), @reader.getRowNumber(), format(chunk),
+                      mapped_header)
+         end
+       end
+
+     end
+
+   end
+
+   #========================================================================================
+   #
+   #========================================================================================
+
+   module Headerless
+
+     #---------------------------------------------------------------------------------------
+     #
+     #---------------------------------------------------------------------------------------
+
+     def filters=(filters)
+
+       case filters
+       when Hash
+         raise MissingHeadersError.new("CSV file does not have headers. Cannot match filters with headers")
+       when Array
+         @filters = []
+
+         # Add a 'values' method to filters so that it behaves like a hash and works the
+         # same way as it does for CSV files that have headers
+         def @filters.values
+           self
+         end
+
+         filters.each_with_index do |processor, i|
+           @filters[i] = processor
+         end
+       else
+         raise ArgumentError.new("Filters parameters should be an array of filters")
+       end
+
+     end
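The `def @filters.values` above defines a singleton method on that one array, so downstream code written against the hash-based (headered) path can keep calling `filters.values`. A minimal standalone sketch of the same technique, with illustrative names:

    filters = [:a, :b, :c]
    def filters.values
      self
    end
    filters.values   # => [:a, :b, :c], the same object, now answering like Hash#values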
+
+     #---------------------------------------------------------------------------------------
+     #
+     #---------------------------------------------------------------------------------------
+
+     def parse_with_block(&block)
+
+       while (!((chunk = read_chunk).nil?))
+         block.call(@reader.getLineNumber(), @reader.getRowNumber(), format(chunk), nil)
+       end
+
+     end
+
+   end
+
+   #========================================================================================
+   #
+   #========================================================================================
+
+   class Reader
+     include_package "org.supercsv.cellprocessor.ift"
+     include_package "org.supercsv.prefs"
+     include_package "org.supercsv.comment"
+
+     # Reader configuration parameters
+     attr_reader :filename
+     attr_reader :col_sep
+     attr_reader :comment_starts
+     attr_reader :comment_matches
+     attr_reader :ignore_empty_lines
+     attr_reader :surrounding_space_need_quotes
+     attr_reader :quote_char
+     attr_reader :strings_as_keys
+     attr_reader :format              # output format: list, map, vector, others...
+     attr_reader :suppress_warnings   # true if no warning messages should be shown
+
+     # chunk_size can be changed on the fly
+     attr_accessor :chunk_size
+
+     attr_reader :headers
+     attr_reader :data_labels
+     attr_reader :column_mapping
+     attr_reader :dimensions_names
+
+     # last processed column
+     attr_reader :processed_column
+
+     # Rows read. Returned when reading a chunk of data
+     attr_reader :rows
+
+     #---------------------------------------------------------------------------------------
+     #
+     #---------------------------------------------------------------------------------------
+
+     def [](dim)
+
+       case true
+       when (dim == :_data_)
+         @data_labels
+       when (@dimensions_names.include? dim)
+         @dimensions.dimensions[dim].labels.keys
+       else
+         raise ArgumentError.new("Unknown dimension #{dim}")
+       end
+
+     end
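A sketch of the dimension accessor in use. It assumes a reader created with dimensions; the Jcsv.reader factory (presumably defined in jcsv.rb) and the column names are assumptions, and exactly when the per-dimension labels are populated depends on Dimensions (dimensions.rb), neither of which is shown in this hunk:

    reader = Jcsv.reader("sales.csv", format: :map, dimensions: [:year, :region])

    reader[:_data_]   # => labels of the non-dimension columns, e.g. [:amount, :quantity]
    reader[:year]     # => the labels collected for the :year dimension
    reader[:price]    # => raises ArgumentError: unknown dimension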
+
+     #---------------------------------------------------------------------------------------
+     # Accepts the following options:
+     # @param comment_starts character at the beginning of the line that marks a comment
+     # @param comment_matches delimiters that mark a comment; the pattern needs to match both
+     #   the beginning and the end of the comment, e.g., <!.*!> comments out everything
+     #   between <! and !>
+     # @param quote_char The quote character (used when a cell contains special characters,
+     #   such as the delimiter char, a quote char, or spans multiple lines).
+     # @param col_sep the delimiter character (separates each cell in a row).
+     # @param surrounding_space_need_quotes Whether spaces surrounding a cell need quotes in
+     #   order to be preserved. The default value is false (quotes aren't required).
+     # @param ignore_empty_lines Whether empty lines (i.e. containing only end of line symbols)
+     #   are ignored. The default value is true (empty lines are ignored).
+     # @param format Format of the result: list, map, vector.
+     # @param deep_map When true, reads data as a deep map (hash), i.e., there is a hash for
+     #   the first dimension that holds all rows with this dimension. If there is a second
+     #   dimension, then this is also hashed across all rows, etc.
+     #---------------------------------------------------------------------------------------
+
+     def initialize(filename,
+                    col_sep: ",",
+                    comment_starts: false,
+                    comment_matches: false,
+                    ignore_empty_lines: true,
+                    surrounding_space_need_quotes: false,
+                    quote_char: "\"",
+                    default_filter: Jcsv.optional,
+                    strings_as_keys: false,
+                    format: :list,
+                    headers: true,
+                    custom_headers: nil,
+                    chunk_size: 0,
+                    deep_map: false,
+                    dimensions: nil,
+                    suppress_warnings: false)
+
+       @filename = filename
+       @col_sep = col_sep
+       @comment_starts = comment_starts
+       @comment_matches = comment_matches
+       @default_filter = default_filter
+       @filters = false
+       @strings_as_keys = strings_as_keys
+       @headers = headers
+       @custom_headers = custom_headers
+       @ignore_empty_lines = ignore_empty_lines
+       @format = format
+       @surrounding_space_need_quotes = surrounding_space_need_quotes
+       @quote_char = quote_char
+       @chunk_size = (chunk_size == :all)? 1.0/0.0 : chunk_size
+       @deep_map = (@format == :list)? false : deep_map
+       @dimensions_names = dimensions
+       @column_mapping = Mapping.new
+       @suppress_warnings = suppress_warnings
+
+       prepare_dimensions if dimensions
+
+       # Set all preferences. To create a new reader we need to have the dimensions already
+       # prepared, as this information will be sent to SuperCSV for processing.
+       new_reader(set_preferences)
+
+       # Dynamic class change without writing subclasses. When there are headers, extend this
+       # class with methods that assume a header is present; when there are no headers, extend
+       # it with methods that know there is no header. This could have been done with
+       # subclasses, but then every reader subclass would need two variants, one inheriting
+       # from a header class and one from a headerless class. This way we reduce the number
+       # of subclasses needed.
+       @headers? prepare_headers : (@custom_headers? set_headers(@custom_headers) :
+                                      headerless)
+
+       # If there are dimensions, then we need to prepare the mappings accordingly. With
+       # dimensions defined, users cannot define mappings.
+       dimensions_mappings if dimensions
+
+     end
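A usage sketch for the constructor options above. It assumes the gem's public entry point is a Jcsv.reader factory (presumably defined in jcsv.rb, not shown in this hunk) that forwards these keyword arguments to the appropriate Reader subclass; the file name and the comments are illustrative:

    require 'mdarray-jcsv'

    reader = Jcsv.reader("customers.csv",
                         format: :map,            # one hash per row instead of a list
                         col_sep: ";",
                         comment_starts: "#",     # skip lines starting with '#'
                         chunk_size: :all,        # read the whole file as one chunk
                         strings_as_keys: false)  # headers become symbols via String#underscore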
+ =begin
+     #---------------------------------------------------------------------------------------
+     # read the whole file at once if no block given, or pass each row or chunk to the
+     # block to be processed.
+     #---------------------------------------------------------------------------------------
+
+     def read(&block)
+
+       # When no block given, chunks read are stored in an array and returned to the user.
+       if (!block_given?)
+         @rows = Array.new
+         parse_with_block do |line_no, row_no, chunk, headers|
+           @rows << chunk
+         end
+         @rows
+       else
+         parse_with_block(&block)
+       end
+
+     end
+ =end
+
+     #---------------------------------------------------------------------------------------
+     #
+     #---------------------------------------------------------------------------------------
+
+     def each(&block)
+
+       if (!block_given?)
+         to_enum
+       else
+         parse_with_block(&block)
+       end
+
+     end
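A sketch of iterating with each. The block receives the current line number, the row number, the processed chunk, and (for files with headers) the mapped header names, as wired up in parse_with_block above; the Jcsv.reader factory and the file name are assumptions:

    reader = Jcsv.reader("customers.csv", chunk_size: 0)   # chunk_size 0: one row per call

    reader.each do |line_no, row_no, chunk, headers|
      puts "line #{line_no}, row #{row_no}: #{chunk.inspect}"
    end

    # Without a block, each returns an Enumerator (via to_enum).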
+
+     #---------------------------------------------------------------------------------------
+     # Both map_reader and list_reader have a mapping= method. Is this really necessary?
+     # FIX!!!!
+     #---------------------------------------------------------------------------------------
+
+     def mapping=(map, dim_set = false)
+       p "reader.rb mapping =. FIX!"
+       @column_mapping.map = map
+     end
+
+     #---------------------------------------------------------------------------------------
+     #
+     #---------------------------------------------------------------------------------------
+
+     def dimensions
+       @reader.dimensions
+     end
+
+     #---------------------------------------------------------------------------------------
+     #
+     #---------------------------------------------------------------------------------------
+
+     private
+
+     #---------------------------------------------------------------------------------------
+     # A chunk is either one row of the file, or an array with rows. One row can be either
+     # a one dimensional array with all columns or a hash with all columns (excluding the
+     # dimensions).
+     #---------------------------------------------------------------------------------------
+
+     def read_chunk
+
+       return @reader.read(@column_mapping, @filters) if @chunk_size == 0
+
+       rows = Array.new
+       (1..@chunk_size).each do |i|
+         if ((row = @reader.read(@column_mapping, @filters)).nil?)
+           break
+         else
+           rows << row
+         end
+       end
+
+       (rows.size == 0)? nil : rows
+
+     end
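The chunk_size option (and the chunk_size accessor above) controls what read_chunk hands to each iteration: 0 yields one row at a time, a positive integer yields arrays of up to that many rows, and :all (converted to infinity in the constructor) yields the whole file as a single chunk. A sketch, again assuming the Jcsv.reader factory and a hypothetical file:

    reader = Jcsv.reader("measurements.csv", chunk_size: 20)

    reader.each do |line_no, row_no, chunk, headers|
      # chunk is an Array of up to 20 processed rows
      puts chunk.size
    end

    reader.chunk_size = 50   # can be changed on the fly; note that the :all shortcut is
                             # only translated to infinity inside the constructor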
+
+     #---------------------------------------------------------------------------------------
+     #
+     #---------------------------------------------------------------------------------------
+
+     def set_preferences
+
+       # Prepare preferences
+       builder = CsvPreference::Builder.new(@quote_char.to_java(:char), @col_sep.ord, "\n")
+       builder.skipComments(CommentStartsWith.new(@comment_starts)) if @comment_starts
+       builder.skipComments(CommentMatches.new(@comment_matches)) if @comment_matches
+       builder.ignoreEmptyLines(@ignore_empty_lines)
+       builder.surroundingSpacesNeedQuotes(@surrounding_space_need_quotes)
+       builder.build
+
+     end
+
+     #---------------------------------------------------------------------------------------
+     # Initialize filters with the default_filter. Only possible if the file has headers.
+     #---------------------------------------------------------------------------------------
+
+     def init_filters
+
+       @filters = Hash.new
+
+       # set all column filters to the @default_filter
+       @headers.each do |column_name|
+         @filters[column_name] = @default_filter
+       end
+
+     end
+
+     #---------------------------------------------------------------------------------------
+     #
+     #---------------------------------------------------------------------------------------
+
+     def _prepare_headers
+
+       # Convert headers to symbols, unless the user specifically does not want it
+       @headers.map! do |head|
+         (head)? head.underscore.to_sym :
+           (raise MissingHeadersError.new("Column is missing header"))
+       end unless @strings_as_keys
+
+       if (@dimensions)
+         # Check dimension names against headers
+         @dimensions_names.each do |dim_name|
+           raise ArgumentError.new("Invalid dimension: #{dim_name} not in headers") if
+             !@headers.include?(dim_name)
+         end
+         @data_labels = @headers - @dimensions_names
+       end
+
+       # initialize filters with the default filter
+       init_filters
+
+     end
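A small illustration of what _prepare_headers does for a file read with dimensions (the column names are hypothetical):

    # headers as read from the file:  ["Year", "Region", "UnitPrice", "Quantity"]
    # after underscore.to_sym:        [:year, :region, :unit_price, :quantity]
    # with dimensions [:year, :region]:
    #   @data_labels == [:unit_price, :quantity]
    # every column filter starts out as the default_filter passed to the constructor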
+
+     #---------------------------------------------------------------------------------------
+     #
+     #---------------------------------------------------------------------------------------
+
+     def prepare_headers
+
+       extend Header
+       # Read headers
+       @headers = @reader.headers
+       _prepare_headers
+
+     end
+
+     #---------------------------------------------------------------------------------------
+     #
+     #---------------------------------------------------------------------------------------
+
+     def set_headers(headers)
+
+       extend Header
+       # set headers
+       @headers = headers
+       _prepare_headers
+
+     end
+
+     #---------------------------------------------------------------------------------------
+     #
+     #---------------------------------------------------------------------------------------
+
+     def headerless
+       extend Headerless
+     end
+
+     #---------------------------------------------------------------------------------------
+     #
+     #---------------------------------------------------------------------------------------
+
+     def prepare_dimensions
+
+       if ((!@dimensions_names.nil?) && (@dimensions_names.size != 0))
+         # || options[:keep_original_headers]
+         @dimensions_names.map! { |x| x.downcase.to_sym } unless @strings_as_keys
+         @dimensions = Dimensions.new(@dimensions_names)
+       end
+
+     end
+
+     #---------------------------------------------------------------------------------------
+     #
+     #---------------------------------------------------------------------------------------
+
+     def dimensions_mappings
+
+       # Build mapping for the dimensions: dimensions need to map to true
+       map = Hash.new
+       @dimensions.each do |dim|
+         map[dim.name] = true
+       end
+       # send(:mapping=, map, true)
+       send(:assign_mapping, map)
+
+     end
+
+   end
+
+ end
+
+ require_relative 'list_reader'
+ require_relative 'map_reader'
+ require_relative 'mdarray_reader'
+
+
+ =begin
+
+ Dialect: "escaped"
+
+ delimiter = ','    skipinitialspace = 0
+ doublequote = 0    quoting = QUOTE_NONE
+ quotechar = '"'    lineterminator = '\r\n'
+ escapechar = '\\'
+
+ col1,0,10/00/2010,Contains special chars: \" ' \, to be parsed
+ col1,1,10/01/2010,Contains special chars: \" ' \, to be parsed
+ col1,2,10/02/2010,Contains special chars: \" ' \, to be parsed
+
+
+ Dialect: "excel"
+
+ delimiter = ','    skipinitialspace = 0
+ doublequote = 1    quoting = QUOTE_MINIMAL
+ quotechar = '"'    lineterminator = '\r\n'
+ escapechar = None
+
+ col1,0,10/00/2010,"Contains special chars: "" ' , to be parsed"
+ col1,1,10/01/2010,"Contains special chars: "" ' , to be parsed"
+ col1,2,10/02/2010,"Contains special chars: "" ' , to be parsed"
+
+
+ Dialect: "excel-tab"
+
+ delimiter = '\t'   skipinitialspace = 0
+ doublequote = 1    quoting = QUOTE_MINIMAL
+ quotechar = '"'    lineterminator = '\r\n'
+ escapechar = None
+
+ col1 0 10/00/2010 "Contains special chars: "" ' to be parsed"
+ col1 1 10/01/2010 "Contains special chars: "" ' to be parsed"
+ col1 2 10/02/2010 "Contains special chars: "" ' to be parsed"
+
+
+ Dialect: "singlequote"
+
+ delimiter = ','    skipinitialspace = 0
+ doublequote = 1    quoting = QUOTE_ALL
+ quotechar = "'"    lineterminator = '\r\n'
+ escapechar = None
+
+ 'col1','0','10/00/2010','Contains special chars: " '' , to be parsed'
+ 'col1','1','10/01/2010','Contains special chars: " '' , to be parsed'
+ 'col1','2','10/02/2010','Contains special chars: " '' , to be parsed'
+
+ =end
+
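For reference, a sketch of how the Reader options above map onto these dialect notes; the file names are hypothetical and the Jcsv.reader factory from jcsv.rb is assumed (the "escaped" dialect, which relies on a backslash escape character rather than quoting, has no corresponding option in this constructor):

    # "excel" style: comma separated, double-quote quoting (the Reader defaults)
    Jcsv.reader("report.csv")

    # "excel-tab" style: tab separated
    Jcsv.reader("report.tsv", col_sep: "\t")

    # "singlequote" style: comma separated, quoted with single quotes
    Jcsv.reader("report.csv", quote_char: "'")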