RubyGems - uncsv - Versions diffs - 0.3.1 - Mend

uncsv 0.3.1

Files changed (27) hide show

data/lib/uncsv/config.rb ADDED Viewed

@@ -0,0 +1,204 @@
+# frozen_string_literal: true
+class Uncsv
+  # Configuration options for parsing CSVs. It is a struct-like object with
+  # attribute accessors.
+  class Config
+    # Options that directly map to Std-lib `CSV` options
+    CSV_OPTS = %i[
+      col_sep row_sep quote_char field_size_limit
+    ].freeze
+    # The default values applied if an attribute's value is not specified when
+    # constructing a new `Config` object.
+    DEFAULTS = {
+      col_sep: ',',
+      expand_headers: false,
+      field_size_limit: nil,
+      header_rows: [],
+      header_separator: '.',
+      nil_empty: true,
+      normalize_headers: false,
+      quote_char: '"',
+      row_sep: :auto,
+      skip_rows: [],
+      skip_blanks: false,
+      unique_headers: false
+    }.freeze
+    # The string that separates each field
+    #
+    # Default: `","`.
+    #
+    # @return [String] The column separator string
+    # @see (see #initialize)
+    attr_accessor :col_sep
+    # @!attribute expand_headers
+    #   Whether to fill empty headers with values from the left.
+    #
+    #   Default `false`. If set to `true`, blank header row cells will assume
+    #   the header of the cell to their left. This is useful for hierarchical
+    #   headers where not all the header cells are filled in. If set to an
+    #   array of header indexes, only the specified headers will be expanded.
+    #
+    #   @return [Array] An array of expanded header indexes
+    # The maximum size CSV will read ahead looking for a closing quote.
+    #
+    # Default: `nil`.
+    #
+    # @return [nil, Integer] The maximum field size
+    # @see (see #initialize)
+    attr_accessor :field_size_limit
+    # Indexes of the rows to use as headers
+    #
+    # Default: `[]`. Accepts an array of zero-based indexes or a single index.
+    # For example, it could be set to `0` to indicate a header in the first row.
+    # If set to an array of indexes (`[1,2]`), the header row text will be
+    # joined by the `:header_separator`. For example, if the cell (0,0) had
+    # the value `"Personal"` and cell (1,0) had the value "Name", the header
+    # would become `"Personal.Name"`. Any data above the last header row will be
+    # ignored.
+    #
+    # @return [Array] The header row indexes
+    attr_reader :header_rows
+    # The separator between multiple header fields
+    #
+    # Default: `"."`. When using multiple header rows, this is a string used
+    # to separate the individual header fields.
+    #
+    # @return [String] The separator string
+    attr_accessor :header_separator
+    # Whether to represent empty cells as `nil`.
+    #
+    # Default `true`. If `true`, empty cells will be set to `nil`, otherwise,
+    # they are set to an empty string.
+    #
+    # @return [Boolean] Whether empty cells will be `nil`ed
+    attr_accessor :nil_empty
+    # Whether to rewrite headers to a standard format
+    #
+    # Default `false`. If set to `true`, header field text will be normalized.
+    # The text will be lowercased, and non-alphanumeric characters will be
+    # replaced with underscores (`_`).
+    #
+    # If set to a string, those characters will
+    # be replaced with the string instead.
+    #
+    # If set to a hash, the hash will be treated as options to KeyNormalizer,
+    # accepting the `:separator`, and `:downcase` options.
+    #
+    # If set to another object, it is expected to respond to the
+    # `normalize(key)` method by returning a normalized string.
+    #
+    # @see KeyNormalizer
+    # @return [KeyNormalizer, Object] The KeyNormalizer object or equivalent
+    attr_reader :normalize_headers
+    # The character used to quote individual fields
+    #
+    # Default `'"'`. The character placed around a field to quote it. It is
+    # passed through to `CSV#new` as the `:quote_char` option.
+    #
+    # @return [String] The quote character
+    # @see (see #initialize)
+    attr_accessor :quote_char
+    # The string at the end of each row
+    #
+    # Default `:auto`.
+    #
+    # @return [:auto, String] The row separator
+    # @see (see #initialize)
+    attr_accessor :row_sep
+    # Whether to skip blank rows
+    #
+    # Default `false`. If `true`, rows whose fields are all empty will be
+    # skipped.
+    #
+    # @return [Boolean] Whether blank rows will be skipped
+    attr_accessor :skip_blanks
+    # An array of row indexes to skip
+    #
+    # Default `[]`. If set to an array of zero-based row indexes, those rows
+    # will be skipped. This option does not apply to header rows.
+    #
+    # @return [Hash] A lookup of the row indexes to skip, each mapped to `true`
+    attr_reader :skip_rows
+    # Whether to force headers to be unique
+    #
+    # Default `false`. If set to `true`, headers will be forced to be unique by
+    # appending numbers to duplicates. For example, if two header cells have the
+    # text `"Name"`, the headers will become `"Name.0"`, and `"Name.1"`. The
+    # separator between the text and the number can be set using the
+    # `:header_separator` option.
+    #
+    # @return [Boolean] Whether headers will be uniqued
+    attr_accessor :unique_headers
+    # Create a new `Config` object.
+    #
+    # Options will be set to the defaults unless overridden by the `opts`
+    # parameter.
+    #
+    # @param opts [Hash] A hash of configuration options. See the individual
+    #   attributes for detailed descriptions.
+    #
+    # @see http://ruby-doc.org/stdlib/libdoc/csv/rdoc/CSV.html#method-c-new
+    #   CSV#new
+    def initialize(opts = {})
+      DEFAULTS.merge(opts).each { |k, v| public_send("#{k}=", v) }
+    end
+    def skip_rows=(rows)
+      rows = [rows] unless rows.is_a?(Array)
+      @skip_rows = Hash[rows.map { |r| [r, true] }]
+    end
+    def header_rows=(rows)
+      rows = [rows] unless rows.is_a?(Array)
+      @header_rows = rows.sort
+    end
+    def expand_headers=(value)
+      value = [value] if value.is_a?(Integer)
+      @expand_headers = value
+    end
+    def normalize_headers=(value)
+      if value.is_a?(Hash)
+        value = KeyNormalizer.new(value)
+      elsif value.is_a?(String)
+        value = KeyNormalizer.new(separator: value)
+      elsif value == true
+        value = KeyNormalizer.new
+      end
+      @normalize_headers = value
+    end
+    def expand_headers
+      return header_rows if @expand_headers == true
+      return [] if @expand_headers == false
+      @expand_headers
+    end
+    # Get options passed through to `CSV#new`.
+    #
+    # @return [Hash] A hash of the CSV options
+    # @see (see #initialize)
+    def csv_opts
+      Hash[CSV_OPTS.map { |k| [k, public_send(k)] }]
+    end
+  end
+end

data/lib/uncsv/header.rb ADDED Viewed

@@ -0,0 +1,173 @@
+# frozen_string_literal: true
+class Uncsv
+  # A parsed CSV header.
+  class Header
+    # Create a new `Header` object
+    #
+    # @param headers [Array<Array<String>>] An array of header row values
+    # @param config [Config] Configuration options. Default options if `nil`.
+    def initialize(headers, config = nil)
+      @headers = headers
+      @config = config || Config.new
+      @to_a = nil
+    end
+    # Iterate over each header field
+    #
+    # @yield A block to run for each header field
+    # @yieldparam row [String] A header field
+    # @return [Enumerator] An enumerator over header field
+    def each(&block)
+      to_a.each(&block)
+    end
+    # Get an array of parsed header fields
+    #
+    # The header fields are cached, so consecutive calls to this method return
+    # the same array.
+    #
+    # @return [Array] The array of header fields
+    def to_a
+      @to_a ||= begin
+        headers = nil_empty(@headers)
+        headers = square(headers)
+        headers = normalize(headers) if @config.normalize_headers
+        headers = expand(headers)
+        combined = combine(headers)
+        combined = unique(combined) if @config.unique_headers
+        combined
+      end
+    end
+    class << self
+      # Parse headers from a CSV
+      #
+      # @param csv [CSV] A
+      #   {http://ruby-doc.org/stdlib/libdoc/csv/rdoc/CSV.html CSV} object.
+      # @param config [Config] Configuration options. Default options if `nil`.
+      # @return [OpenStruct] An object with the methods `header`, `index`, and
+      #   `rows`. `header` is the {Header} object. `index` is the next CSV row
+      #   index. `rows` is an array of the skipped rows including the header
+      #   rows.
+      def parse!(csv, config)
+        index = config.header_rows.empty? ? 0 : (config.header_rows.max + 1)
+        rows = read_rows(csv, index)
+        headers = config.header_rows.map { |i| rows[i] }
+        OpenStruct.new(
+          header: new(headers, config),
+          index: index,
+          rows: rows
+        )
+      end
+      private
+      # Read a given number of rows from a CSV
+      #
+      # @param csv [CSV] A
+      #   {http://ruby-doc.org/stdlib/libdoc/csv/rdoc/CSV.html CSV} object to
+      #   read rows from.
+      # @param count [Integer] The number of rows to read
+      # @return [Array<Array<String>>] An array of the read rows
+      def read_rows(csv, count)
+        (0...count).map { csv.shift }
+      end
+    end
+    private
+    # Combine multiple headers into a single header
+    #
+    # Joins individual headers with the `header_separator`.
+    #
+    # @param headers [Array<Array<String>>] The headers to combine
+    # @return [Array<String>] The combined header
+    def combine(headers)
+      headers.each_with_object([]) do |header, combined|
+        header.each_with_index do |key, index|
+          parts = [combined[index], key].compact
+          combined[index] = if parts.empty?
+            nil
+          else
+            parts.join(@config.header_separator)
+          end
+        end
+      end
+    end
+    # Fills in `nil` headers from the left
+    #
+    # @param headers [Array<Array<String>>] The headers to expand
+    # @return [Array<Array<String>>] The expanded headers
+    def expand(headers)
+      headers.each_with_index.map do |header, index|
+        next header unless @config.expand_headers.include?(index)
+        last = nil
+        header.map do |key|
+          key ? last = key : last
+        end
+      end
+    end
+    # Unique headers by adding numbers to the end
+    #
+    # @param combined [Array<String>] The combined headers to unique
+    # @return [Array<String>] The uniqued headers
+    def unique(combined)
+      combined = combined.dup
+      collate(combined).each do |key, indexes|
+        next if indexes.size == 1
+        indexes.each_with_index do |index, count|
+          combined[index] = [key, count].compact.join(@config.header_separator)
+        end
+      end
+      combined
+    end
+    # Create a hash of headers to arrays of their indexes
+    #
+    # Used for checking for header uniqueness
+    #
+    # @param header [Array<String>] The combined header to collate
+    # @return [Hash] The collated headers
+    def collate(header)
+      collated = {}
+      header.each_with_index do |key, index|
+        collated[key] = (collated[key] || []) << index
+      end
+      collated
+    end
+    # Normalize header values
+    #
+    # @param headers [Array<Array<String>>] The array of uncombined headers to
+    #   normalize
+    def normalize(headers)
+      headers.map do |header|
+        header.map do |key|
+          @config.normalize_headers.normalize(key)
+        end
+      end
+    end
+    # Make the headers all the same length
+    #
+    # @param headers [Array<Array<String>>] An array of headers to square
+    # @return [Array<Array<String>>] The squared headers
+    def square(headers)
+      length = headers.map(&:size).max
+      headers.map { |h| h.fill(nil, h.size, length - h.size) }
+    end
+    # Convert header empty strings to nil
+    #
+    # @param headers [Array<Array<String>>] An array of headers to convert
+    # @return [Array<Array<String>>] The converted headers
+    def nil_empty(headers)
+      headers.map { |h| h.map { |k| k == '' ? nil : k } }
+    end
+  end
+end

data/lib/uncsv/key_normalizer.rb ADDED Viewed

@@ -0,0 +1,61 @@
+# frozen_string_literal: true
+class Uncsv
+  # Normalizes strings into a consistent format
+  class KeyNormalizer
+    # The default values applied if an attribute's value is not specified when
+    # constructing a new `KeyNormalizer` object.
+    DEFAULTS = {
+      downcase: true,
+      separator: '_'
+    }.freeze
+    # A string to replace all non-alphanumeric characters in the key
+    #
+    # Default: '_'. Can be set to an empty string to remove non-alphanumeric
+    # characters without replacing them.
+    #
+    # @return [String] The separator string
+    attr_accessor :separator
+    # Sets keys to all lower-case if set to `true`
+    #
+    # Default: true
+    #
+    # @return [Boolean] Whether the key will be lower-cased
+    attr_accessor :downcase
+    # Create a new `KeyNormalizer` object.
+    #
+    # Options will be set to the defaults unless overridden by the `opts`
+    # parameter.
+    #
+    # @param opts [Hash] A hash of configuration options. See the individual
+    #   attributes for detailed descriptions.
+    def initialize(opts = {})
+      DEFAULTS.merge(opts).each { |k, v| public_send("#{k}=", v) }
+    end
+    # Normalize a key
+    #
+    # Replaces non-alphanumeric characters with `separator`, then
+    # deduplicates underscores and trims them from the ends of the key. Then
+    # the key is lower-cased if `downcase` is set.
+    #
+    # @param key [String, nil] The key field to normalize
+    # @return [String, nil] The normalized header field or `nil` if the input
+    #   key is `nil`.
+    def normalize(key)
+      return nil if key.nil?
+      key = key.gsub(/[^a-z0-9]+/i, separator)
+      unless separator.empty?
+        escaped_separator = Regexp.escape(separator)
+        key.gsub!(/#{escaped_separator}{2,}/, separator)
+        key.gsub!(/^#{escaped_separator}|#{escaped_separator}$/, '')
+      end
+      key.downcase! if downcase
+      key
+    end
+  end
+end

data/lib/uncsv/row.rb ADDED Viewed

@@ -0,0 +1,109 @@
+# frozen_string_literal: true
+class Uncsv
+  # A single data row from a CSV. Fields can be accessed by header or zero-based
+  # index.
+  class Row
+    include Enumerable
+    # The headers for each field
+    #
+    # If a header for a given field is not defined, it will be `nil`.
+    #
+    # @return [Array] An array of the field headers
+    attr_reader :header
+    # The fields ordered from left to right
+    #
+    # An array of zero-indexed field values. If a field is empty it will be
+    # `nil`, or `''` if `nil_empty` is `false`.
+    #
+    # @return [Array] An array of the field values
+    attr_reader :fields
+    # Create a new `Row` object
+    #
+    # The `header` and `fields` arrays do not need to be the same length. If
+    # they are not, the missing values will be filled with `nil`.
+    #
+    # @param header [Array] The field headers
+    # @param fields [Array] The field values
+    # @param config [Config] Configuration options. Default options if `nil`.
+    def initialize(header, fields, config = nil)
+      @config = config || Config.new
+      @header = square(header, fields.size)
+      @fields = square(fields, header.size).map { |f| process(f) }
+      @map = Hash[header.zip(@fields)]
+    end
+    # Get a field by index or header
+    #
+    # If `key` is an `Integer`, get a field by a zero-based index. If `key` is a
+    # header, access a field by it's header. If `key` is nil, or if a field does
+    # not exist, will return `nil`.
+    #
+    # @param key [Integer, String] The index or header
+    # @return [String, nil] The field value if it exists
+    def [](key)
+      return if key.nil?
+      value = key.is_a?(Integer) ? @fields[key] : @map[key]
+      process(value)
+    end
+    # Gets a hash of headers to fields
+    #
+    # `nil` headers will not be included in the hash.
+    #
+    # @return [Hash] A hash of headers to fields
+    def to_h
+      Hash[@header.compact.map { |h| [h, self[h]] }]
+    end
+    # Iterate over each pair of headers and fields
+    # @yield A block to run for each header and field pair
+    # @yieldparam header [String, nil] The header of the field
+    # @yieldparam field [String, nil] The value of the field
+    # @return [Enumerator] An enumerator over each pair
+    def each(&block)
+      @map.each_pair(&block)
+    end
+    # Get a field by index or header and specify a default
+    #
+    # Tries to get the field specified by key (see {#[]}). If the field
+    # is `nil`, returns the default. If a block is given, the default is the
+    # block's return value, otherwise the default is the `default` argument.
+    #
+    # @yield A block to run if the field is `nil`
+    # @yieldparam key [String] The `key` parameter
+    # @return [String, Object] The field value or default
+    def fetch(key, default = nil)
+      value = self[key]
+      return value unless value.nil?
+      block_given? ? yield(key) : default
+    end
+    private
+    # Fills an array with nil to extend it to the given size
+    #
+    # @param array [Array] The array to square
+    # @param size [Integer] The target array size
+    # @return [Array] The squared array
+    def square(array, size)
+      array.fill(nil, array.size, size - array.size)
+    end
+    # Transforms a field value according to the config options
+    #
+    # @param field [String] The field value to process
+    # @return [String] The processed field
+    def process(field)
+      field = '' if field.nil? && !@config.nil_empty
+      field = nil if field == '' && @config.nil_empty
+      field
+    end
+  end
+end

data/lib/uncsv/rows.rb ADDED Viewed

@@ -0,0 +1,99 @@
+# frozen_string_literal: true
+require 'csv'
+class Uncsv
+  # A collection of parsed rows from a CSV
+  class Rows
+    # Create a new `Rows` object
+    #
+    # @param csv [CSV] A
+    #   {http://ruby-doc.org/stdlib/libdoc/csv/rdoc/CSV.html CSV} object.
+    # @param config [Config] Configuration options. Default options if `nil`.
+    def initialize(csv, config = nil)
+      @csv = csv
+      @config = config || Config.new
+      @started = false
+      @parsed = nil
+    end
+    # Iterate over each row
+    #
+    # @yield A block to run for each row
+    # @yieldparam row [Row] A row object
+    # @return [Enumerator] An enumerator over each row
+    def each(&block)
+      Enumerator.new do |yielder|
+        start
+        index = parsed.index
+        loop do
+          break unless yield_row(yielder, index)
+          index += 1
+        end
+      end.each(&block)
+    end
+    # Get the CSV header
+    #
+    # @return [Array] An array of the CSV header fields
+    # @see Header#to_a
+    def header
+      parsed.header.to_a
+    end
+    private
+    # Whether the given row should be skipped
+    #
+    # @param fields [Array] An array of field values
+    # @param index [Integer] The zero-based row index
+    # @return [Boolean] Whether the row should be skipped
+    def should_skip?(fields, index)
+      return true if @config.skip_rows[index]
+      return true if @config.skip_blanks && fields.compact.empty?
+      false
+    end
+    # Yield a row from the CSV to the Enumerator yielder
+    #
+    # Reads a row from the CSV and yields a parsed row if necessary.
+    #
+    # @param yielder [Enumerator::Yielder] A yielder to yield the row to
+    # @param index [Integer] The next row index
+    # @return [Boolean] `false` if the CSV is ended
+    def yield_row(yielder, index)
+      fields = @csv.shift
+      return false unless fields
+      unless should_skip?(fields, index)
+        yielder << Row.new(header, fields, @config)
+      end
+      true
+    end
+    # Start reading the CSV
+    #
+    # If the CSV has already been read, it will be rewound and the header will
+    # be reset.
+    def start
+      if @started
+        @parsed = nil
+        @csv.rewind
+      else
+        @started = true
+      end
+    end
+    # Get the header parse object
+    #
+    # The parsed header is cached, so multiple calls will return the same
+    # instance.
+    #
+    # @return [OpenStruct] The parsed header object
+    def parsed
+      @parsed ||= Header.parse!(@csv, @config)
+    end
+  end
+end

data/lib/uncsv/version.rb ADDED Viewed

@@ -0,0 +1,6 @@
+# frozen_string_literal: true
+class Uncsv
+  # The current Uncsv library version
+  VERSION = '0.3.1'
+end