RubyGems - json_csv - Versions diffs - 0.0.6 → 0.0.7 - Mend

json_csv 0.0.6 → 0.0.7

Files changed (8) hide show

checksums.yaml +4 -4
data/lib/json_csv/csv_builder.rb +53 -0
data/lib/json_csv/csv_to_json.rb +97 -95
data/lib/json_csv/json_to_csv.rb +93 -35
data/lib/json_csv/version.rb +1 -1
data/lib/json_csv.rb +2 -2
metadata +2 -2
data/lib/json_csv/array_notation.rb +0 -23

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: cca6d71ea7947967ea15bfa7fd89195549e1f5b4
-  data.tar.gz: dca198bcf9db549b57a9c09bac5661194658b12e
+  metadata.gz: '0043834553b3243296e8b2b55c6806721b82673c'
+  data.tar.gz: 857ab74bc2b9f95ffb0f5d9d47ba520f8ba0a17f
 SHA512:
-  metadata.gz: 8929b7911dbb7da2e29a56c8e6a2b93df9ab203d3202d435b3932c782654f7392b3dce1d27215021b53544864a8569a6e5b230a0bd029a21745dc27ace2e2989
-  data.tar.gz: c00fed1d0fbed0b41c0261005909290ea48990cf7b4532bc0ff60f16cbaa1251c3aaa5244f6cad0cf695d66e43f822abf9f92a881632da1acdf49143b8e37d8a
+  metadata.gz: 427f4714f7b6999c798bef38b8e80bef3f0fe5033b31909e4a3aa064aee79e407e5f5957a207dfaa55d9d389b7da763d813c053ce184f3dd8bc97c0b38ead32a
+  data.tar.gz: cdf0311ca31c8c830f22daf426f18e2a36f77cb5ebc47ebdc97ceaf8eeecea1bf1b88f2dec60dd2155e4f6d94c17ccfa5aded6cb64efe9b399d1aaf27beb5738

data/lib/json_csv/csv_builder.rb ADDED Viewed

@@ -0,0 +1,53 @@
+require 'csv'
+require 'json_csv/json_to_csv'
+module JsonCsv
+  class CsvBuilder
+    private_class_method :new # private constructor. we don't want users to initialize this class.
+    attr_reader :known_headers_to_indexes # map of all headers seen by this CsvBuilder, mapped to their column order indexes
+    def initialize(open_csv_handle)
+      @known_headers_to_indexes = {}
+      @open_csv_handle = open_csv_handle
+    end
+    # Adds data from the given json hash to the CSV we're building.
+    def add(json_hash)
+      row_to_write = []
+      JsonCsv.json_hash_to_flat_csv_row_hash(json_hash).each do |column_header, cell_value|
+        known_headers_to_indexes[column_header] = known_headers_to_indexes.length unless known_headers_to_indexes.key?(column_header)
+        row_to_write[known_headers_to_indexes[column_header]] = cell_value
+      end
+      @open_csv_handle << row_to_write
+    end
+    # Writes out a CSV file that does NOT contain a header row. Only data values.
+    # Returns an array of headers that correspond to the written-out CSV file's columns.
+    #
+    # Why don't we include CSV headers in the CSV?  Because we don't know what set of headers
+    # we're working with while we dynamically create this CSV.  Different JSON documents may
+    # or may not all contain the same headers. For this reason, this is more of an internal
+    # method that isn't called directly by users of this gem.
+    def self.create_csv_without_headers(csv_outfile_path, csv_write_mode = 'wb')
+      csv_builder = nil
+      CSV.open(csv_outfile_path, csv_write_mode) do |csv|
+        csv_builder = new(csv)
+        yield csv_builder
+      end
+      csv_builder.known_headers_to_indexes.keys
+    end
+    def self.original_header_indexes_to_sorted_indexes(csv_headers, column_header_comparator)
+      original_headers_to_indexes = Hash[csv_headers.map.with_index { |header, index| [header, index] }]
+      headers_to_sorted_indexes = Hash[csv_headers.sort(&column_header_comparator).map.with_index { |header, index| [header, index] }]
+      original_to_sorted_index_map = {}
+      original_headers_to_indexes.each do |header, original_index|
+        original_to_sorted_index_map[original_index] = headers_to_sorted_indexes[header]
+      end
+      original_to_sorted_index_map
+    end
+  end
+end

data/lib/json_csv/csv_to_json.rb CHANGED Viewed

@@ -1,4 +1,3 @@
-require 'json_csv/array_notation'
 require 'json_csv/utils'
 require 'csv'
@@ -11,117 +10,120 @@ module JsonCsv
     TYPE_BOOLEAN = 'boolean'.freeze
     FIELD_CASTING_TYPES = [TYPE_STRING, TYPE_INTEGER, TYPE_FLOAT, TYPE_BOOLEAN].freeze
-    # Takes flat csv data and yields to a block for each row,
-    # presenting that row as un-flattened json.
-    # This method works for large CSVs and uses very little memory
-    # because it only keeps one row in memory at a time.
-    def csv_file_to_hierarchical_json_hash(path_to_csv, field_casting_rules = {}, array_notation = JsonCsv::ArrayNotation::BRACKETS, strip_value_whitespace = true)
-      i = 0
-      CSV.foreach(path_to_csv, headers: true, header_converters: lambda { |header|
-        if array_notation == JsonCsv::ArrayNotation::DASH
-          JsonCsv::ArrayNotation.dash_header_to_bracket_header(header).strip
-        else
-          header.strip
+    def self.included(base)
+      base.extend ClassMethods
+    end
+    module ClassMethods
+      # Takes flat csv data and yields to a block for each row,
+      # presenting that row as un-flattened json.
+      # This method works for large CSVs and uses very little memory
+      # because it only keeps one row in memory at a time.
+      # Sample usage: csv_file_to_hierarchical_json_hash(path_to_csv, field_casting_rules = {}, strip_value_whitespace = true) do |row_json_hash, row_number|
+      def csv_file_to_hierarchical_json_hash(path_to_csv, field_casting_rules = {}, strip_value_whitespace = true)
+        i = 1 # start with row 1 because this corresponds to the first row of 0-indexed CSV data
+        CSV.foreach(path_to_csv, headers: true, header_converters: lambda { |header|
+          header.strip # remove leading and trailing header whitespace
+        }) do |row_data_hash|
+          yield csv_row_hash_to_hierarchical_json_hash(row_data_hash, field_casting_rules, strip_value_whitespace), i
+          i += 1
         end
-      }) do |row_data_hash|
-        yield csv_row_hash_to_hierarchical_json_hash(row_data_hash, field_casting_rules, strip_value_whitespace), i
-        i += 1
       end
-    end
-    def csv_row_hash_to_hierarchical_json_hash(row_data_hash, field_casting_rules, strip_value_whitespace = true)
-      hierarchical_hash = {}
-      row_data_hash.each do |key, value|
-        next if value.nil? || value == '' # ignore nil or empty string values
-        put_value_at_json_path(hierarchical_hash, key, value, field_casting_rules)
+      def csv_row_hash_to_hierarchical_json_hash(row_data_hash, field_casting_rules, strip_value_whitespace = true)
+        hierarchical_hash = {}
+        row_data_hash.each do |key, value|
+          next if value.nil? || value == '' # ignore nil or empty string values
+          put_value_at_json_path(hierarchical_hash, key, value, field_casting_rules)
+        end
+        # Clean up empty array elements, which may have come about from CSV data
+        # that was 1-indexed instead of 0-indexed.
+        JsonCsv::Utils.recursively_remove_blank_fields!(hierarchical_hash)
+        JsonCsv::Utils.recursively_strip_value_whitespace!(hierarchical_hash) if strip_value_whitespace
+        hierarchical_hash
       end
-      # Clean up empty array elements, which may have come about from CSV data
-      # that was 1-indexed instead of 0-indexed.
-      JsonCsv::Utils.recursively_remove_blank_fields!(hierarchical_hash)
-      JsonCsv::Utils.recursively_strip_value_whitespace!(hierarchical_hash) if strip_value_whitespace
-      hierarchical_hash
-    end
-    # For the given obj, puts the given value at the given json_path,
-    # creating nested elements as needed. This method calls itself
-    # recursively when placing a value at a nested path, and during
-    # this sequence of calls the obj param may either be a hash or an array.
-    def put_value_at_json_path(obj, json_path, value, field_casting_rules = {}, full_json_path_from_top = json_path)
-      json_path_pieces = json_path_to_pieces(json_path)
+      # For the given obj, puts the given value at the given json_path,
+      # creating nested elements as needed. This method calls itself
+      # recursively when placing a value at a nested path, and during
+      # this sequence of calls the obj param may either be a hash or an array.
+      def put_value_at_json_path(obj, json_path, value, field_casting_rules = {}, full_json_path_from_top = json_path)
+        json_path_pieces = json_path_to_pieces(json_path)
-      if json_path_pieces.length == 1
-        # If the full_json_path_from_top matches one of the field_casting_rules,
-        # then case this field to the specified cast type
-        full_json_path_from_top_as_field_casting_rule_pattern = real_json_path_to_field_casting_rule_pattern(full_json_path_from_top)
-        obj[json_path_pieces[0]] = field_casting_rules.key?(full_json_path_from_top_as_field_casting_rule_pattern) ? apply_field_casting_type(value, field_casting_rules[full_json_path_from_top_as_field_casting_rule_pattern]) : value
-      else
-        obj[json_path_pieces[0]] ||= (json_path_pieces[1].is_a?(Integer) ? [] : {})
-        put_value_at_json_path(obj[json_path_pieces[0]], pieces_to_json_path(json_path_pieces[1..-1]), value, field_casting_rules, full_json_path_from_top)
+        if json_path_pieces.length == 1
+          # If the full_json_path_from_top matches one of the field_casting_rules,
+          # then cast this field to the specified cast type
+          full_json_path_from_top_as_field_casting_rule_pattern = real_json_path_to_field_casting_rule_pattern(full_json_path_from_top)
+          obj[json_path_pieces[0]] = field_casting_rules.key?(full_json_path_from_top_as_field_casting_rule_pattern) ? apply_field_casting_type(value, field_casting_rules[full_json_path_from_top_as_field_casting_rule_pattern]) : value
+        else
+          obj[json_path_pieces[0]] ||= (json_path_pieces[1].is_a?(Integer) ? [] : {})
+          put_value_at_json_path(obj[json_path_pieces[0]], pieces_to_json_path(json_path_pieces[1..-1]), value, field_casting_rules, full_json_path_from_top)
+        end
       end
-    end
-    # Takes a real json_path like "related_books[1].notes_from_reviewers[0]" and
-    # converts it to a field_casting_rule_pattern like: "related_books[x].notes_from_reviewers[x]"
-    def real_json_path_to_field_casting_rule_pattern(full_json_path_from_top)
-      full_json_path_from_top.gsub(/\d+/, 'x')
-    end
+      # Takes a real json_path like "related_books[1].notes_from_reviewers[0]" and
+      # converts it to a field_casting_rule_pattern like: "related_books[x].notes_from_reviewers[x]"
+      def real_json_path_to_field_casting_rule_pattern(full_json_path_from_top)
+        full_json_path_from_top.gsub(/\d+/, 'x')
+      end
-    def apply_field_casting_type(value, field_casting_type)
-      raise ArgumentError, "Invalid cast type #{field_casting_type}" unless FIELD_CASTING_TYPES.include?(field_casting_type)
+      def apply_field_casting_type(value, field_casting_type)
+        raise ArgumentError, "Invalid cast type #{field_casting_type}" unless FIELD_CASTING_TYPES.include?(field_casting_type)
-      case field_casting_type
-      when TYPE_INTEGER
-        raise ArgumentError, "\"#{value}\" is not an integer" unless value =~ /^[0-9]+$/
-        value.to_i
-      when TYPE_FLOAT
-        raise ArgumentError, "\"#{value}\" is not a float" unless value =~ /^[0-9]+(\.[0-9]+)*$/ || value =~ /^\.[0-9]+$/
-        value.to_f
-      when TYPE_BOOLEAN
-        if value.downcase == 'true'
-          true
-        elsif value.downcase == 'false'
-          false
+        case field_casting_type
+        when TYPE_INTEGER
+          raise ArgumentError, "\"#{value}\" is not an integer" unless value =~ /^[0-9]+$/
+          value.to_i
+        when TYPE_FLOAT
+          raise ArgumentError, "\"#{value}\" is not a float" unless value =~ /^[0-9]+(\.[0-9]+)*$/ || value =~ /^\.[0-9]+$/
+          value.to_f
+        when TYPE_BOOLEAN
+          if value.downcase == 'true'
+            true
+          elsif value.downcase == 'false'
+            false
+          else
+            raise ArgumentError, "\"#{value}\" is not a boolean"
+          end
         else
-          raise ArgumentError, "\"#{value}\" is not a boolean"
+          value # fall back to string, which is the original form
         end
-      else
-        value # fall back to string, which is the original form
       end
-    end
-    # Takes the given json_path and splits it into individual json path pieces.
-    # e.g. Takes "related_books[1].notes_from_reviewers[0]" and converts it to:
-    # ["related_books", 1, "notes_from_reviewers", 0]
-    def json_path_to_pieces(json_path)
-      # split on...
-      # '].' (when preceded by a number)
-      # OR
-      # '[' (when followed by a number)
-      # OR
-      # ']' (when preceded by a number)
-      # OR
-      # '.' (always)
-      # ...and remove empty elements (which only come up when you're working with
-      # a json_path like '[0]', which splits between the first bracket and the number)
-      pieces = json_path.split(/(?<=\d)\]\.|\[(?=\d)|(?<=\d)\]|\./).reject { |piece| piece == '' }
-      pieces.map { |piece| piece.to_i.to_s == piece ? piece.to_i : piece } # numeric pieces should be actual numbers
-    end
+      # Takes the given json_path and splits it into individual json path pieces.
+      # e.g. Takes "related_books[1].notes_from_reviewers[0]" and converts it to:
+      # ["related_books", 1, "notes_from_reviewers", 0]
+      def json_path_to_pieces(json_path)
+        # split on...
+        # '].' (when preceded by a number)
+        # OR
+        # '[' (when followed by a number)
+        # OR
+        # ']' (when preceded by a number)
+        # OR
+        # '.' (always)
+        # ...and remove empty elements (which only come up when you're working with
+        # a json_path like '[0]', which splits between the first bracket and the number)
+        pieces = json_path.split(/(?<=\d)\]\.|\[(?=\d)|(?<=\d)\]|\./).reject { |piece| piece == '' }
+        pieces.map { |piece| piece.to_i.to_s == piece ? piece.to_i : piece } # numeric pieces should be actual numbers
+      end
-    # Generates a string json path from the given pieces
-    # e.g. Takes ["related_books", 1, "notes_from_reviewers", 0] and converts it to:
-    # "related_books[1].notes_from_reviewers[0]"
-    def pieces_to_json_path(pieces)
-      json_path = ''
-      pieces.each do |piece|
-        if piece.is_a?(Integer)
-          json_path += "[#{piece}]"
-        else
-          json_path += '.' unless json_path.empty?
-          json_path += piece
+      # Generates a string json path from the given pieces
+      # e.g. Takes ["related_books", 1, "notes_from_reviewers", 0] and converts it to:
+      # "related_books[1].notes_from_reviewers[0]"
+      def pieces_to_json_path(pieces)
+        json_path = ''
+        pieces.each do |piece|
+          if piece.is_a?(Integer)
+            json_path += "[#{piece}]"
+          else
+            json_path += '.' unless json_path.empty?
+            json_path += piece
+          end
         end
+        json_path
       end
-      json_path
-    end
+    end
   end
 end

data/lib/json_csv/json_to_csv.rb CHANGED Viewed

@@ -1,50 +1,108 @@
 require 'json'
+require 'json_csv/csv_builder'
 module JsonCsv
   module JsonToCsv
-    # Converts the given json_hash into a flat csv hash, converting all values to
-    # strings (because CSVs are dumb and don't store info about data types)
-    # Set first_index to 1 if you want the first element in an array to
-    #
-    def json_hash_to_flat_csv_row_hash(json_hash, array_notation = JsonCsv::ArrayNotation::BRACKETS)
-      flat = flatten_hash(json_hash)
-      # Convert values to strings because in the CSV file, all values are strings
-      flat.each { |key, val| flat[key] = val.nil? ? '' : val.to_s }
-      # If we're using dash array notation, convert the headers
-      if array_notation == JsonCsv::ArrayNotation::DASH
-        Hash[flat.map { |key, val| [JsonCsv::ArrayNotation.bracket_header_to_dash_header(key), val] }]
-      else
-        flat
-      end
+    def self.included(base)
+      base.extend ClassMethods
     end
-    # This method calls itself recursively while flattening a hash, and during
-    # this sequence of calls the obj param may either be a hash or an array.
-    def flatten_hash(obj, parent_path = '', flat_hash_to_build = {})
-      if obj.is_a?(Hash)
-        obj.each do |key, val|
-          if key_contains_unallowed_characters?(key)
-            raise ArgumentError, 'Cannot deal with hash keys that contain "[" or "]" or "." because these characters have special meanings in CSV headers.'
+    module ClassMethods
+      DEFAULT_HEADER_SORT_COMPARATOR = lambda do |header1, header2|
+        # Ensure correct alphabetical sorting AND numeric sorting via zero-padding of numbers
+        header1_with_zero_padding = header1.gsub(/(?<=\[)\d+(?=\])/) { |capture| capture.to_i.to_s.rjust(5, '0') }
+        header2_with_zero_padding = header2.gsub(/(?<=\[)\d+(?=\])/) { |capture| capture.to_i.to_s.rjust(5, '0') }
+        header1_with_zero_padding <=> header2_with_zero_padding
+      end
+      def default_header_comparison(header1, header2)
+        DEFAULT_HEADER_SORT_COMPARATOR.call(header1, header2)
+      end
+      # Example usage:
+      # create_csv_for_json_records('/path/to/file.csv') do |csv_builder|
+      #   json_docs.each do |json_doc|
+      #     csv_builder.add(json_doc)
+      #   end
+      # end
+      def create_csv_for_json_records(csv_outfile_path, header_sort_comparator = DEFAULT_HEADER_SORT_COMPARATOR)
+        csv_temp_outfile_path = csv_outfile_path + '.temp'
+        begin
+          # Step 1: Build CSV with unsorted headers in temp file
+          csv_headers = JsonCsv::CsvBuilder.create_csv_without_headers(csv_temp_outfile_path, 'wb') do |csv_builder|
+            yield csv_builder
           end
-          path = parent_path + (parent_path.empty? ? '' : '.') + key
-          flatten_hash(val, path, flat_hash_to_build)
-        end
-      elsif obj.is_a?(Array)
-        obj.each_with_index do |el, index|
-          path = parent_path + "[#{index}]"
-          flatten_hash(el, path, flat_hash_to_build)
+          # Step 2: Sort CSV columns by header, based on header_sort_comparator
+          original_to_sorted_index_map = JsonCsv::CsvBuilder.original_header_indexes_to_sorted_indexes(csv_headers, header_sort_comparator)
+          CSV.open(csv_outfile_path, 'wb') do |final_csv|
+            # Open temporary CSV for reading
+            CSV.open(csv_temp_outfile_path, 'rb') do |temp_csv|
+              # write out ordered header row
+              reordered_header_row = []
+              csv_headers.each_with_index do |header, index|
+                reordered_header_row[original_to_sorted_index_map[index]] = header
+              end
+              final_csv << reordered_header_row
+              temp_csv.each do |temp_csv_row|
+                reordered_temp_csv_row = []
+                # write out ordered data row
+                temp_csv_row.each_with_index do |cell_value, index|
+                  reordered_temp_csv_row[original_to_sorted_index_map[index]] = cell_value
+                end
+                final_csv << reordered_temp_csv_row
+              end
+            end
+          end
+        ensure
+          # Always delete the temporary CSV
+          FileUtils.rm_f(csv_temp_outfile_path)
         end
-      else
-        flat_hash_to_build[parent_path] = obj unless obj.nil? || obj == '' # ignore nil or empty string values
       end
-      flat_hash_to_build
-    end
+      # Converts the given json_hash into a flat csv hash, converting all values to
+      # strings (because CSVs are dumb and don't store info about data types)
+      # Set first_index to 1 if you want the first element in an array to
+      #
+      def json_hash_to_flat_csv_row_hash(json_hash)
+        flat = flatten_hash(json_hash)
+        # Convert values to strings because in the CSV file, all values are strings
+        flat.each { |key, val| flat[key] = val.nil? ? '' : val.to_s }
+        flat
+      end
-    def key_contains_unallowed_characters?(key)
-      return true if key.index('[') || key.index(']') || key.index('.')
-      false
+      # This method calls itself recursively while flattening a hash, and during
+      # this sequence of calls the obj param may either be a hash or an array.
+      def flatten_hash(obj, parent_path = '', flat_hash_to_build = {})
+        if obj.is_a?(Hash)
+          obj.each do |key, val|
+            if key_contains_unallowed_characters?(key)
+              raise ArgumentError, 'Cannot deal with hash keys that contain "[" or "]" or "." because these characters have special meanings in CSV headers.'
+            end
+            path = parent_path + (parent_path.empty? ? '' : '.') + key
+            flatten_hash(val, path, flat_hash_to_build)
+          end
+        elsif obj.is_a?(Array)
+          obj.each_with_index do |el, index|
+            path = parent_path + "[#{index}]"
+            flatten_hash(el, path, flat_hash_to_build)
+          end
+        else
+          flat_hash_to_build[parent_path] = obj unless obj.nil? || obj == '' # ignore nil or empty string values
+        end
+        flat_hash_to_build
+      end
+      def key_contains_unallowed_characters?(key)
+        return true if key.index('[') || key.index(']') || key.index('.')
+        false
+      end
     end
   end

data/lib/json_csv/version.rb CHANGED Viewed

@@ -1,6 +1,6 @@
 module JsonCsv
-  VERSION = '0.0.6'.freeze
+  VERSION = '0.0.7'.freeze
   def self.version
     VERSION

data/lib/json_csv.rb CHANGED Viewed

@@ -3,6 +3,6 @@ require "json_csv/json_to_csv"
 require "json_csv/csv_to_json"
 module JsonCsv
-  extend JsonCsv::JsonToCsv
-  extend JsonCsv::CsvToJson
+  include JsonCsv::JsonToCsv
+  include JsonCsv::CsvToJson
 end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: json_csv
 version: !ruby/object:Gem::Version
-  version: 0.0.6
+  version: 0.0.7
 platform: ruby
 authors:
 - Eric O'Hanlon
@@ -88,7 +88,7 @@ extra_rdoc_files: []
 files:
 - README.md
 - lib/json_csv.rb
-- lib/json_csv/array_notation.rb
+- lib/json_csv/csv_builder.rb
 - lib/json_csv/csv_to_json.rb
 - lib/json_csv/json_to_csv.rb
 - lib/json_csv/utils.rb

data/lib/json_csv/array_notation.rb DELETED Viewed

@@ -1,23 +0,0 @@
-module JsonCsv
-  module ArrayNotation
-    BRACKETS = 'BRACKETS'.freeze
-    DASH = 'DASH'.freeze
-    VALID_ARRAY_NOTATIONS = [BRACKETS, DASH].freeze
-    def self.bracket_header_to_dash_header(bracket_header)
-      # e.g. replace occurrences of '[1]' with '-1'
-      bracket_header.gsub(/(\[(\d+)\])/, '-\2')
-    end
-    def self.dash_header_to_bracket_header(dash_header)
-      # e.g. replace occurrences of '-1' with '[1]'
-      dash_header.gsub(/(-(\d+))/, '[\2]')
-    end
-    def self.raise_error_if_invalid_array_notation_value!(error_class, array_notation)
-      raise error_class, "Invalid array notation. Must be one of #{VALID_ARRAY_NOTATIONS.join(' or ')}." unless VALID_ARRAY_NOTATIONS.include?(array_notation)
-    end
-  end
-end