mdarray-jcsv 0.6.3-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. checksums.yaml +7 -0
  2. data/LICENSE.txt +23 -0
  3. data/README.md +2 -0
  4. data/Rakefile +46 -0
  5. data/config.rb +104 -0
  6. data/lib/constraints.rb +205 -0
  7. data/lib/date_filters.rb +252 -0
  8. data/lib/dimensions.rb +276 -0
  9. data/lib/filters.rb +332 -0
  10. data/lib/jcsv.rb +107 -0
  11. data/lib/list_reader.rb +200 -0
  12. data/lib/locale.rb +192 -0
  13. data/lib/map_reader.rb +192 -0
  14. data/lib/mdarray-jcsv.rb +24 -0
  15. data/lib/mdarray_reader.rb +110 -0
  16. data/lib/numeric_filters.rb +225 -0
  17. data/lib/reader.rb +547 -0
  18. data/lib/supercsv_interface.rb +231 -0
  19. data/test/test_complete.rb +37 -0
  20. data/test/test_critbit.rb +442 -0
  21. data/test/test_customer_list.rb +436 -0
  22. data/test/test_customer_map.rb +209 -0
  23. data/test/test_customer_nhlist.rb +161 -0
  24. data/test/test_deep_map.rb +264 -0
  25. data/test/test_del.rb +73 -0
  26. data/test/test_dimensions.rb +231 -0
  27. data/test/test_example.rb +79 -0
  28. data/test/test_filters.rb +374 -0
  29. data/test/test_list_dimensions.rb +110 -0
  30. data/test/test_mdarray.rb +227 -0
  31. data/test/test_missing_data.rb +57 -0
  32. data/vendor/commons-beanutils-1.8.3.jar +0 -0
  33. data/vendor/commons-lang3-3.1.jar +0 -0
  34. data/vendor/dozer-5.4.0.jar +0 -0
  35. data/vendor/jcl-over-slf4j-1.6.6.jar +0 -0
  36. data/vendor/joda-time-2.7.jar +0 -0
  37. data/vendor/slf4j-api-1.7.5.jar +0 -0
  38. data/vendor/snakeyaml-1.14.jar +0 -0
  39. data/vendor/super-csv-2.4.0.jar +0 -0
  40. data/vendor/super-csv-dozer-2.4.0.jar +0 -0
  41. data/vendor/super-csv-java8-2.4.0.jar +0 -0
  42. data/vendor/super-csv-joda-2.4.0.jar +0 -0
  43. data/version.rb +2 -0
  44. metadata +196 -0
data/lib/reader.rb
@@ -0,0 +1,547 @@
+ # -*- coding: utf-8 -*-
+
+ ##########################################################################################
+ # @author Rodrigo Botafogo
+ #
+ # Copyright © 2015 Rodrigo Botafogo. All Rights Reserved. Permission to use, copy, modify,
+ # and distribute this software and its documentation, without fee and without a signed
+ # licensing agreement, is hereby granted, provided that the above copyright notice, this
+ # paragraph and the following two paragraphs appear in all copies, modifications, and
+ # distributions.
+ #
+ # IN NO EVENT SHALL RODRIGO BOTAFOGO BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, SPECIAL,
+ # INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF
+ # THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF RODRIGO BOTAFOGO HAS BEEN ADVISED OF THE
+ # POSSIBILITY OF SUCH DAMAGE.
+ #
+ # RODRIGO BOTAFOGO SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ # THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE
+ # SOFTWARE AND ACCOMPANYING DOCUMENTATION, IF ANY, PROVIDED HEREUNDER IS PROVIDED "AS IS".
+ # RODRIGO BOTAFOGO HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS,
+ # OR MODIFICATIONS.
+ ##########################################################################################
+
+ require_relative 'dimensions'
+
+ ##########################################################################################
+ #
+ ##########################################################################################
+
+ class String
+   def underscore
+     self.gsub(/::/, '/').
+       gsub(/([A-Z]+)([A-Z][a-z])/,'\1_\2').
+       gsub(/([a-z\d])([A-Z])/,'\1_\2').
+       tr("-", "_").
+       downcase
+   end
+ end
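The underscore monkey patch is what later turns raw CSV headers into Ruby symbols (see _prepare_headers further down). A small illustrative sketch of its behaviour, with hypothetical sample strings:

    "CamelCase".underscore    # => "camel_case"
    "HTMLParser".underscore   # => "html_parser"
    "Last-Name".underscore    # => "last_name"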
+
+ ##########################################################################################
+ #
+ ##########################################################################################
+
+ class Jcsv
+
+   #========================================================================================
+   #
+   #========================================================================================
+
+   module Header
+
+     #---------------------------------------------------------------------------------------
+     #
+     #---------------------------------------------------------------------------------------
+
+
+     #---------------------------------------------------------------------------------------
+     #
+     #---------------------------------------------------------------------------------------
+
+     def filters=(filters)
+
+       case filters
+       when Hash
+         filters = filters.inject({}){|memo,(k,v)| memo[k.to_sym] = v; memo} unless
+           @strings_as_keys
+         filters.each do |column_name, processor|
+           @filters[column_name] = processor
+         end
+       when Array
+         raise "One filter needed for each column. Filters size: #{filters.size}" if
+           headers.size != filters.size
+         filters.each_with_index do |processor, i|
+           @filters[i] = processor
+         end
+       else
+         raise ArgumentError.new("Filters parameters should either be a hash or an array of filters")
+       end
+
+     end
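For reference, a sketch of how a caller might set filters once a reader exists. The column names are hypothetical, and the Jcsv.int / Jcsv.double filter constructors are assumptions based on the gem's filter files (numeric_filters.rb, filters.rb), which are not shown in this hunk; Jcsv.optional appears below as the constructor's default filter:

    # keyed by header symbol, for a file with headers
    reader.filters = { age: Jcsv.int, balance: Jcsv.double }

    # or positional: one filter per column, in column order
    reader.filters = [Jcsv.optional, Jcsv.int, Jcsv.double]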
+
+     #---------------------------------------------------------------------------------------
+     # A chunk is either one row of the file, or an array with rows. One row can be either
+     # a one dimensional array with all columns or a hash with all columns (excluding the
+     # dimensions).
+     #---------------------------------------------------------------------------------------
+
+     def parse_with_block(&block)
+
+       # if there is a valid column_mapping, then we need to change the mapped_header
+       mapped_header = @headers
+       if (@column_mapping.mapping)
+         mapped_header = Array.new
+         @column_mapping.mapping.each_with_index do |map, index|
+           mapped_header[map] = @headers[index] if (map.is_a? Numeric)
+         end
+       end
+
+       while (!((chunk = read_chunk).nil?))
+         if (mapped_header.size == 0)
+           block.call(@reader.getLineNumber(), @reader.getRowNumber(), format(chunk))
+         else
+           block.call(@reader.getLineNumber(), @reader.getRowNumber(), format(chunk),
+                      mapped_header)
+         end
+       end
+
+     end
+
+   end
+
+   #========================================================================================
+   #
+   #========================================================================================
+
+   module Headerless
+
+     #---------------------------------------------------------------------------------------
+     #
+     #---------------------------------------------------------------------------------------
+
+     def filters=(filters)
+
+       case filters
+       when Hash
+         raise MissingHeadersError.new("CSV file does not have headers. Cannot match filters with headers")
+       when Array
+         @filters = []
+
+         # Add a 'values' method to filters so that it behaves like a hash and works the
+         # same way as it does for CSV files that have headers
+         def @filters.values
+           self
+         end
+
+         filters.each_with_index do |processor, i|
+           @filters[i] = processor
+         end
+       else
+         raise ArgumentError.new("Filters parameters should be an array of filters")
+       end
+
+     end
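The `def @filters.values` above defines a singleton method on that one array, so downstream code written against the hash-based (headered) path can keep calling `filters.values`. A minimal standalone sketch of the same technique, with illustrative names:

    filters = [:a, :b, :c]
    def filters.values
      self
    end
    filters.values   # => [:a, :b, :c], the same object, now answering like Hash#values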
+
+     #---------------------------------------------------------------------------------------
+     #
+     #---------------------------------------------------------------------------------------
+
+     def parse_with_block(&block)
+
+       while (!((chunk = read_chunk).nil?))
+         block.call(@reader.getLineNumber(), @reader.getRowNumber(), format(chunk), nil)
+       end
+
+     end
+
+   end
+
+   #========================================================================================
+   #
+   #========================================================================================
+
+   class Reader
+     include_package "org.supercsv.cellprocessor.ift"
+     include_package "org.supercsv.prefs"
+     include_package "org.supercsv.comment"
+
+     # Reader configuration parameters
+     attr_reader :filename
+     attr_reader :col_sep
+     attr_reader :comment_starts
+     attr_reader :comment_matches
+     attr_reader :ignore_empty_lines
+     attr_reader :surrounding_space_need_quotes
+     attr_reader :quote_char
+     attr_reader :strings_as_keys
+     attr_reader :format              # output format: list, map, vector, others...
+     attr_reader :suppress_warnings   # true if no warning messages should be shown
+
+     # chunk_size can be changed on the fly
+     attr_accessor :chunk_size
+
+     attr_reader :headers
+     attr_reader :data_labels
+     attr_reader :column_mapping
+     attr_reader :dimensions_names
+
+     # last processed column
+     attr_reader :processed_column
+
+     # Rows read. Returned when reading a chunk of data
+     attr_reader :rows
+
+     #---------------------------------------------------------------------------------------
+     #
+     #---------------------------------------------------------------------------------------
+
+     def [](dim)
+
+       case true
+       when (dim == :_data_)
+         @data_labels
+       when (@dimensions_names.include? dim)
+         @dimensions.dimensions[dim].labels.keys
+       else
+         raise ArgumentError.new("Unknown dimension #{dim}")
+       end
+
+     end
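A sketch of the dimension accessor in use. It assumes a reader created with dimensions; the Jcsv.reader factory (presumably defined in jcsv.rb) and the column names are assumptions, and exactly when the per-dimension labels are populated depends on Dimensions (dimensions.rb), neither of which is shown in this hunk:

    reader = Jcsv.reader("sales.csv", format: :map, dimensions: [:year, :region])

    reader[:_data_]   # => labels of the non-dimension columns, e.g. [:amount, :quantity]
    reader[:year]     # => the labels collected for the :year dimension
    reader[:price]    # => raises ArgumentError: unknown dimension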
+
+     #---------------------------------------------------------------------------------------
+     # Accepts the following options:
+     # @param comment_starts character at the beginning of the line that marks a comment
+     # @param comment_matches delimiters that mark a comment; the pattern needs to match both
+     #   the beginning and the end of the comment, e.g., <!.*!> comments out everything
+     #   between <! and !>
+     # @param quote_char The quote character (used when a cell contains special characters,
+     #   such as the delimiter char, a quote char, or spans multiple lines).
+     # @param col_sep the delimiter character (separates each cell in a row).
+     # @param surrounding_space_need_quotes Whether spaces surrounding a cell need quotes in
+     #   order to be preserved. The default value is false (quotes aren't required).
+     # @param ignore_empty_lines Whether empty lines (i.e. containing only end of line symbols)
+     #   are ignored. The default value is true (empty lines are ignored).
+     # @param format Format of the result: list, map, vector.
+     # @param deep_map When true, reads data as a deep map (hash), i.e., there is a hash for
+     #   the first dimension that holds all rows with this dimension. If there is a second
+     #   dimension, then this is also hashed across all rows, etc.
+     #---------------------------------------------------------------------------------------
+
+     def initialize(filename,
+                    col_sep: ",",
+                    comment_starts: false,
+                    comment_matches: false,
+                    ignore_empty_lines: true,
+                    surrounding_space_need_quotes: false,
+                    quote_char: "\"",
+                    default_filter: Jcsv.optional,
+                    strings_as_keys: false,
+                    format: :list,
+                    headers: true,
+                    custom_headers: nil,
+                    chunk_size: 0,
+                    deep_map: false,
+                    dimensions: nil,
+                    suppress_warnings: false)
+
+       @filename = filename
+       @col_sep = col_sep
+       @comment_starts = comment_starts
+       @comment_matches = comment_matches
+       @default_filter = default_filter
+       @filters = false
+       @strings_as_keys = strings_as_keys
+       @headers = headers
+       @custom_headers = custom_headers
+       @ignore_empty_lines = ignore_empty_lines
+       @format = format
+       @surrounding_space_need_quotes = surrounding_space_need_quotes
+       @quote_char = quote_char
+       @chunk_size = (chunk_size == :all)? 1.0/0.0 : chunk_size
+       @deep_map = (@format == :list)? false : deep_map
+       @dimensions_names = dimensions
+       @column_mapping = Mapping.new
+       @suppress_warnings = suppress_warnings
+
+       prepare_dimensions if dimensions
+
+       # Set all preferences. To create a new reader we need to have the dimensions already
+       # prepared, as this information will be sent to SuperCSV for processing.
+       new_reader(set_preferences)
+
+       # Dynamic class change without writing subclasses. When there are headers, extend this
+       # class with methods that assume a header is present; when there are no headers, extend
+       # it with methods that know there is no header. This could have been done with
+       # subclasses, but then every reader subclass would need two variants, one inheriting
+       # from a header class and one from a headerless class. This way we reduce the number
+       # of subclasses needed.
+       @headers? prepare_headers : (@custom_headers? set_headers(@custom_headers) :
+                                      headerless)
+
+       # If there are dimensions, then we need to prepare the mappings accordingly. With
+       # dimensions defined, users cannot define mappings.
+       dimensions_mappings if dimensions
+
+     end
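A usage sketch for the constructor options above. It assumes the gem's public entry point is a Jcsv.reader factory (presumably defined in jcsv.rb, not shown in this hunk) that forwards these keyword arguments to the appropriate Reader subclass; the file name and the comments are illustrative:

    require 'mdarray-jcsv'

    reader = Jcsv.reader("customers.csv",
                         format: :map,            # one hash per row instead of a list
                         col_sep: ";",
                         comment_starts: "#",     # skip lines starting with '#'
                         chunk_size: :all,        # read the whole file as one chunk
                         strings_as_keys: false)  # headers become symbols via String#underscore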
+ =begin
+     #---------------------------------------------------------------------------------------
+     # read the whole file at once if no block given, or pass each row or chunk to the
+     # block to be processed.
+     #---------------------------------------------------------------------------------------
+
+     def read(&block)
+
+       # When no block given, chunks read are stored in an array and returned to the user.
+       if (!block_given?)
+         @rows = Array.new
+         parse_with_block do |line_no, row_no, chunk, headers|
+           @rows << chunk
+         end
+         @rows
+       else
+         parse_with_block(&block)
+       end
+
+     end
+ =end
+
+     #---------------------------------------------------------------------------------------
+     #
+     #---------------------------------------------------------------------------------------
+
+     def each(&block)
+
+       if (!block_given?)
+         to_enum
+       else
+         parse_with_block(&block)
+       end
+
+     end
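A sketch of iterating with each. The block receives the current line number, the row number, the processed chunk, and (for files with headers) the mapped header names, as wired up in parse_with_block above; the Jcsv.reader factory and the file name are assumptions:

    reader = Jcsv.reader("customers.csv", chunk_size: 0)   # chunk_size 0: one row per call

    reader.each do |line_no, row_no, chunk, headers|
      puts "line #{line_no}, row #{row_no}: #{chunk.inspect}"
    end

    # Without a block, each returns an Enumerator (via to_enum).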
+
+     #---------------------------------------------------------------------------------------
+     # Both map_reader and list_reader have a mapping= method. Is this really necessary?
+     # FIX!!!!
+     #---------------------------------------------------------------------------------------
+
+     def mapping=(map, dim_set = false)
+       p "reader.rb mapping =. FIX!"
+       @column_mapping.map = map
+     end
+
+     #---------------------------------------------------------------------------------------
+     #
+     #---------------------------------------------------------------------------------------
+
+     def dimensions
+       @reader.dimensions
+     end
+
+     #---------------------------------------------------------------------------------------
+     #
+     #---------------------------------------------------------------------------------------
+
+     private
+
+     #---------------------------------------------------------------------------------------
+     # A chunk is either one row of the file, or an array with rows. One row can be either
+     # a one dimensional array with all columns or a hash with all columns (excluding the
+     # dimensions).
+     #---------------------------------------------------------------------------------------
+
+     def read_chunk
+
+       return @reader.read(@column_mapping, @filters) if @chunk_size == 0
+
+       rows = Array.new
+       (1..@chunk_size).each do |i|
+         if ((row = @reader.read(@column_mapping, @filters)).nil?)
+           break
+         else
+           rows << row
+         end
+       end
+
+       (rows.size == 0)? nil : rows
+
+     end
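The chunk_size option (and the chunk_size accessor above) controls what read_chunk hands to each iteration: 0 yields one row at a time, a positive integer yields arrays of up to that many rows, and :all (converted to infinity in the constructor) yields the whole file as a single chunk. A sketch, again assuming the Jcsv.reader factory and a hypothetical file:

    reader = Jcsv.reader("measurements.csv", chunk_size: 20)

    reader.each do |line_no, row_no, chunk, headers|
      # chunk is an Array of up to 20 processed rows
      puts chunk.size
    end

    reader.chunk_size = 50   # can be changed on the fly; note that the :all shortcut is
                             # only translated to infinity inside the constructor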
+
+     #---------------------------------------------------------------------------------------
+     #
+     #---------------------------------------------------------------------------------------
+
+     def set_preferences
+
+       # Prepare preferences
+       builder = CsvPreference::Builder.new(@quote_char.to_java(:char), @col_sep.ord, "\n")
+       builder.skipComments(CommentStartsWith.new(@comment_starts)) if @comment_starts
+       builder.skipComments(CommentMatches.new(@comment_matches)) if @comment_matches
+       builder.ignoreEmptyLines(@ignore_empty_lines)
+       builder.surroundingSpacesNeedQuotes(@surrounding_space_need_quotes)
+       builder.build
+
+     end
+
+     #---------------------------------------------------------------------------------------
+     # Initialize filters with the default_filter. Only possible if the file has headers.
+     #---------------------------------------------------------------------------------------
+
+     def init_filters
+
+       @filters = Hash.new
+
+       # set all column filters to the @default_filter
+       @headers.each do |column_name|
+         @filters[column_name] = @default_filter
+       end
+
+     end
+
+     #---------------------------------------------------------------------------------------
+     #
+     #---------------------------------------------------------------------------------------
+
+     def _prepare_headers
+
+       # Convert headers to symbols, unless the user specifically does not want it
+       @headers.map! do |head|
+         (head)? head.underscore.to_sym :
+           (raise MissingHeadersError.new("Column is missing header"))
+       end unless @strings_as_keys
+
+       if (@dimensions)
+         # Check dimension names against headers
+         @dimensions_names.each do |dim_name|
+           raise ArgumentError.new("Invalid dimension: #{dim_name} not in headers") if
+             !@headers.include?(dim_name)
+         end
+         @data_labels = @headers - @dimensions_names
+       end
+
+       # initialize filters with the default filter
+       init_filters
+
+     end
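A small illustration of what _prepare_headers does for a file read with dimensions (the column names are hypothetical):

    # headers as read from the file:  ["Year", "Region", "UnitPrice", "Quantity"]
    # after underscore.to_sym:        [:year, :region, :unit_price, :quantity]
    # with dimensions [:year, :region]:
    #   @data_labels == [:unit_price, :quantity]
    # every column filter starts out as the default_filter passed to the constructor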
+
+     #---------------------------------------------------------------------------------------
+     #
+     #---------------------------------------------------------------------------------------
+
+     def prepare_headers
+
+       extend Header
+       # Read headers
+       @headers = @reader.headers
+       _prepare_headers
+
+     end
+
+     #---------------------------------------------------------------------------------------
+     #
+     #---------------------------------------------------------------------------------------
+
+     def set_headers(headers)
+
+       extend Header
+       # set headers
+       @headers = headers
+       _prepare_headers
+
+     end
+
+     #---------------------------------------------------------------------------------------
+     #
+     #---------------------------------------------------------------------------------------
+
+     def headerless
+       extend Headerless
+     end
+
+     #---------------------------------------------------------------------------------------
+     #
+     #---------------------------------------------------------------------------------------
+
+     def prepare_dimensions
+
+       if ((!@dimensions_names.nil?) && (@dimensions_names.size != 0))
+         # || options[:keep_original_headers]
+         @dimensions_names.map! { |x| x.downcase.to_sym } unless @strings_as_keys
+         @dimensions = Dimensions.new(@dimensions_names)
+       end
+
+     end
+
+     #---------------------------------------------------------------------------------------
+     #
+     #---------------------------------------------------------------------------------------
+
+     def dimensions_mappings
+
+       # Build mapping for the dimensions: dimensions need to map to true
+       map = Hash.new
+       @dimensions.each do |dim|
+         map[dim.name] = true
+       end
+       # send(:mapping=, map, true)
+       send(:assign_mapping, map)
+
+     end
+
+   end
+
+ end
+
+ require_relative 'list_reader'
+ require_relative 'map_reader'
+ require_relative 'mdarray_reader'
+
+
+ =begin
+
+ Dialect: "escaped"
+
+ delimiter = ','    skipinitialspace = 0
+ doublequote = 0    quoting = QUOTE_NONE
+ quotechar = '"'    lineterminator = '\r\n'
+ escapechar = '\\'
+
+ col1,0,10/00/2010,Contains special chars: \" ' \, to be parsed
+ col1,1,10/01/2010,Contains special chars: \" ' \, to be parsed
+ col1,2,10/02/2010,Contains special chars: \" ' \, to be parsed
+
+
+ Dialect: "excel"
+
+ delimiter = ','    skipinitialspace = 0
+ doublequote = 1    quoting = QUOTE_MINIMAL
+ quotechar = '"'    lineterminator = '\r\n'
+ escapechar = None
+
+ col1,0,10/00/2010,"Contains special chars: "" ' , to be parsed"
+ col1,1,10/01/2010,"Contains special chars: "" ' , to be parsed"
+ col1,2,10/02/2010,"Contains special chars: "" ' , to be parsed"
+
+
+ Dialect: "excel-tab"
+
+ delimiter = '\t'   skipinitialspace = 0
+ doublequote = 1    quoting = QUOTE_MINIMAL
+ quotechar = '"'    lineterminator = '\r\n'
+ escapechar = None
+
+ col1 0 10/00/2010 "Contains special chars: "" ' to be parsed"
+ col1 1 10/01/2010 "Contains special chars: "" ' to be parsed"
+ col1 2 10/02/2010 "Contains special chars: "" ' to be parsed"
+
+
+ Dialect: "singlequote"
+
+ delimiter = ','    skipinitialspace = 0
+ doublequote = 1    quoting = QUOTE_ALL
+ quotechar = "'"    lineterminator = '\r\n'
+ escapechar = None
+
+ 'col1','0','10/00/2010','Contains special chars: " '' , to be parsed'
+ 'col1','1','10/01/2010','Contains special chars: " '' , to be parsed'
+ 'col1','2','10/02/2010','Contains special chars: " '' , to be parsed'
+
+ =end
+
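For reference, a sketch of how the Reader options above map onto these dialect notes; the file names are hypothetical and the Jcsv.reader factory from jcsv.rb is assumed (the "escaped" dialect, which relies on a backslash escape character rather than quoting, has no corresponding option in this constructor):

    # "excel" style: comma separated, double-quote quoting (the Reader defaults)
    Jcsv.reader("report.csv")

    # "excel-tab" style: tab separated
    Jcsv.reader("report.tsv", col_sep: "\t")

    # "singlequote" style: comma separated, quoted with single quotes
    Jcsv.reader("report.csv", quote_char: "'")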