csv-import-analyzer 0.0.3 → 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -0
  3. data/README.md +8 -1
  4. data/csv-import-analyzer.gemspec +1 -1
  5. data/lib/csv-import-analyzer.rb +6 -4
  6. data/lib/csv-import-analyzer/analyzer/csv_check_bounds.rb +30 -19
  7. data/lib/csv-import-analyzer/analyzer/delimiter_identifier.rb +44 -24
  8. data/lib/csv-import-analyzer/analyzer/file_type_assertion.rb +1 -5
  9. data/lib/csv-import-analyzer/csv_datatype_analysis.rb +25 -9
  10. data/lib/csv-import-analyzer/csv_sanitizer.rb +67 -17
  11. data/lib/csv-import-analyzer/export/metadata_analysis.rb +63 -7
  12. data/lib/csv-import-analyzer/helpers/common_functions.rb +4 -0
  13. data/lib/csv-import-analyzer/helpers/datatype_validation.rb +6 -6
  14. data/lib/csv-import-analyzer/helpers/string_class_extensions.rb +9 -3
  15. data/lib/csv-import-analyzer/query_builder/mysql_query_helper.rb +2 -2
  16. data/lib/csv-import-analyzer/query_builder/pg_query_helper.rb +1 -2
  17. data/lib/csv-import-analyzer/query_builder/query_helper.rb +2 -2
  18. data/lib/csv-import-analyzer/sql_query_builder.rb +27 -12
  19. data/lib/csv-import-analyzer/version.rb +1 -1
  20. data/spec/csv-import-analyzer/analyzer/csv_check_bounds_spec.rb +8 -8
  21. data/spec/csv-import-analyzer/analyzer/delimiter_identifier_spec.rb +13 -13
  22. data/spec/csv-import-analyzer/csv_sanitizer_spec.rb +10 -7
  23. data/spec/csv-import-analyzer/helpers/common_functions_spec.rb +20 -19
  24. data/spec/csv-import-analyzer/helpers/datatype_validation_spec.rb +28 -28
  25. data/spec/csv-import-analyzer/helpers/string_class_extension_spec.rb +6 -6
  26. data/spec/csv-import-analyzer/query_builder/mysql_query_helper_spec.rb +13 -13
  27. data/spec/csv-import-analyzer/query_builder/pg_query_helper_spec.rb +16 -16
  28. data/spec/csv-import-analyzer_spec.rb +3 -6
  29. data/spec/fixtures/sample.csv +2 -2
  30. data/spec/spec_helper.rb +3 -0
  31. metadata +17 -6
  32. data/lib/csv-import-analyzer/sampleTab.csv +0 -5
  33. data/samples/metadata_output.json +0 -70
  34. data/spec/csv-import-analyzer/csv_datatype_analysis_spec.rb +0 -1
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: 5342c73b7ed62a280eadd0bffbb914c9d782d815
- data.tar.gz: fdf0c897afaac56c4e558d5c7ee195914d7ffce7
+ metadata.gz: 7b62ce806e6c9ce5dbd5bb625cec61e9f62e861d
+ data.tar.gz: 097ec0f8a105b92a0adf7869f4e01d5330d62abc
  SHA512:
- metadata.gz: e683e160c6aeb7073027322837bb9af40a2fc898e739c6c549699b830ef53e8387ef10d507f5a099a7e0662aab872f5a896642002fab8298ee3d8b59cc2ae2b6
- data.tar.gz: dc2b67f259b8524e1bf155e8939511de5e2fb0e6099aa8800e876fa36d7b73c362ca2bbd0ef6adcff788264e5bf39f00d14ec1fc2ad969ce061b28039b04bcaf
+ metadata.gz: a0e7aff88f46560a7c263bee87f6840f2b918acd5003e0a3cd710a37910392d1452468d5fce3e8d0ee3aece44ab79f17c08da1aabab226ae70ef9bd58e9cf45c
+ data.tar.gz: 9778e7a00b0c972aaa45a6260a4a03b7ea699612c68740ab8acd10bbfc1369e4fc6f9e5c167a2b4936877c87c99a22df9965865d95cc760763541c6524f21a2f
data/.gitignore CHANGED
@@ -13,3 +13,4 @@
  *.a
  mkmf.log
  *.gem
+ /samples
data/README.md CHANGED
@@ -34,10 +34,17 @@ Calling process on a filename would generate a metadata_output.json which has th
 
  ## TODO:
  <ul>
+ <li> Handle control of processed input file to user </li>
  <li> Return the analysis as Json object.</li>
- <li> Structuring the analysis outputted to csv better </li>
+ <li> Better - Structuring the analysis outputted to csv</li>
+ <li> Add support to convert and import xlsx files to csv </li>
  </ul>
 
+ ## Additional Information
+
+ ### Dependencies
+ <ul><li><a href="https://github.com/tilo/smarter_csv">smarter_csv</a> - For processing the csv in chunks</li></ul>
+
  ## Contributing
 
  1. Fork it ( https://github.com/avinash-vllbh/csv-import-analyzer/fork )
data/csv-import-analyzer.gemspec CHANGED
@@ -24,5 +24,5 @@ Gem::Specification.new do |spec|
  spec.add_development_dependency "rspec", "~> 3.0"
  spec.add_development_dependency "simplecov", "~> 0.9"
 
- spec.add_runtime_dependency "smarter_csv", "~> 1.0.17"
+ spec.add_runtime_dependency "smarter_csv", "~> 1.0", ">= 1.0.17"
  end
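The loosened runtime dependency keeps 1.0.17 as the floor but now accepts any 1.x release of smarter_csv instead of only 1.0.x patch releases. A quick sketch of the difference using RubyGems' requirement API (the version being tested is illustrative):

    require "rubygems"

    old_req = Gem::Requirement.new("~> 1.0.17")            # >= 1.0.17, < 1.1
    new_req = Gem::Requirement.new("~> 1.0", ">= 1.0.17")  # >= 1.0.17, < 2.0

    puts old_req.satisfied_by?(Gem::Version.new("1.1.4"))  # => false
    puts new_req.satisfied_by?(Gem::Version.new("1.1.4"))  # => true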
data/lib/csv-import-analyzer.rb CHANGED
@@ -1,10 +1,15 @@
- require 'pry'
  require_relative "csv-import-analyzer/csv_sanitizer"
  require_relative "csv-import-analyzer/helpers/errors"
  module CsvImportAnalyzer
  # To identify the methods in the module as class methods
  extend self
 
+ ###
+ # main public interface to the library
+ # makes sure that the file exists and
+ # passes the file and any additional options given to CsvSanitizer
+ # returns FileNotFound if given file is invalid
+ ###
  def process(filename, options = {})
  if File::exist?(filename)
  CsvImportAnalyzer::CsvSanitizer.new().process(File.absolute_path(filename), options)
@@ -13,6 +18,3 @@ module CsvImportAnalyzer
  end
  end
  end
-
- CsvImportAnalyzer.process("sampleTab.csv", {:metadata_output => true, :unique => 2})
- # CsvImportAnalyzer.process("sampleTab.csv", {:metadata_output => true, :out_format => :csv})
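With the hard-coded sample call removed from the library file, the entry point is only invoked by callers now. A minimal usage sketch (the file path and option values are illustrative; the full option list lives in the defaults hash of csv_sanitizer.rb below):

    require "csv-import-analyzer"

    # Analyze a CSV file; :unique sets the distinct-value threshold and
    # :metadata_output asks for the analysis to be written to a file.
    CsvImportAnalyzer.process("data/customers.csv", :metadata_output => true, :unique => 5)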
data/lib/csv-import-analyzer/analyzer/csv_check_bounds.rb CHANGED
@@ -1,5 +1,4 @@
  require 'smarter_csv'
- require 'pry'
  require_relative "../helpers/common_functions"
  require_relative "../helpers/errors"
 
@@ -15,7 +14,6 @@ module CsvImportAnalyzer
  @min_max_bounds = {}
  @distinct_values = {}
  @nullable = options[:nullable] || []
-
  end
 
  def filename
@@ -31,10 +29,14 @@ module CsvImportAnalyzer
  @max_distinct_values ||= Integer(options[:unique]) + 1
  end
 
- # Public interface that is called - Processes the CSV file for min & max values for each column
+ # Public interface for CsvCheckBounds
+ # Processes the CSV file for min & max values and distinct values for each column
  def get_min_max_values
  unless filename.nil?
  if File.exist?(filename)
+ # Using SmarterCSV gem to retrieve the csv records in chunks
+ # Chunk size can be set by the user
+ # E.g. :chunk_size => 200 would retrieve 200 rows each time
  SmarterCSV.process(filename, {:col_sep => delimiter, :chunk_size => chunk_size,
  :remove_empty_values => false, :remove_zero_values => false}) do |chunk|
  chunk.each do |row|
@@ -59,9 +61,10 @@
 
  private
 
- ##
- #If the key is of String type then we find the max length of it
- ##
+ ###
+ # If the key is of String type then we find the max length of it
+ # Any other datatype would have a min and max ranges
+ ###
  def process_min_max_for_column(key, value)
  if min_max_bounds[key].nil?
  unless csv_column_datatypes[key] == :string
@@ -73,22 +76,30 @@
  add_bounds(key, value)
  end
 
- ##
- #Method which decides on the min max values for each key and according to the passsed in value
- ##
+ ###
+ # Method to decide on the min max values for each key
+ # Checks for length if key is of String format
+ # Check for values if key is of Numeric or Datetime format
+ ###
  def add_bounds(key, value)
- if csv_column_datatypes[key] == :string
- min_max_bounds[key][:min] = value.length if value.length < min_max_bounds[key][:min]
- min_max_bounds[key][:max] = value.length if value.length > min_max_bounds[key][:max]
- else
- min_max_bounds[key][:min] = value if value < min_max_bounds[key][:min]
- min_max_bounds[key][:max] = value if value > min_max_bounds[key][:max]
+ begin
+ if csv_column_datatypes[key] == :string
+ min_max_bounds[key][:min] = value.length if value.length < min_max_bounds[key][:min]
+ min_max_bounds[key][:max] = value.length if value.length > min_max_bounds[key][:max]
+ else
+ min_max_bounds[key][:min] = value if value < min_max_bounds[key][:min]
+ min_max_bounds[key][:max] = value if value > min_max_bounds[key][:max]
+ end
+ rescue ArgumentError, NoMethodError => e
+ ###
+ # TODO: Handle csv parse coversions of datatypes
+ ###
  end
  end
 
- ##
- #Processes the max number of distinct values set for each column
- ##
+ ###
+ # Processes the max number of distinct values set for each column
+ ###
  def process_distinct_values(key, value)
  if distinct_values[key].nil?
  distinct_values[key] = [value]
@@ -101,4 +112,4 @@
  end
 
  end
- end
+ end
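The new comments document the chunked read that get_min_max_values performs. For reference, a standalone sketch of the same SmarterCSV pattern, assuming a comma-delimited file at an illustrative path:

    require "smarter_csv"

    SmarterCSV.process("data/customers.csv",
                       :col_sep             => ",",
                       :chunk_size          => 200,
                       :remove_empty_values => false,
                       :remove_zero_values  => false) do |chunk|
      # Each chunk is an array of up to 200 rows; each row is a Hash keyed by
      # symbolized column headers, so memory use stays bounded on large files.
      chunk.each do |row|
        row.each { |column, value| puts "#{column}: #{value}" }
      end
    end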
data/lib/csv-import-analyzer/analyzer/delimiter_identifier.rb CHANGED
@@ -1,39 +1,31 @@
  require_relative "../helpers/string_class_extensions"
  require 'pry'
-
  module CsvImportAnalyzer
  module DelimiterIdentifier
 
- # attr_accessor :delimiter, :delimiter_count
-
+ ###
+ # Types of delimiters that the gem has to lookout for.
+ # Could be changed in future or to custom delimiters
+ # returns a @delimiter instance variable array
+ ###
  def delimiter
  @delimiter ||= [",", ";", "\t", "|"]
  end
 
+ ###
+ # Routine to intialize the delimiter_count hash with the delimiters defined above with a base count of 0
+ # Returns @delimiter_count instance variable
+ ###
  def delimiter_count
  @delimiter_count ||= Hash[delimiter.map {|v| [v,0]}]
  @delimiter_count
  end
 
- def getting_contents_of_quoted_values(input)
- #return a join of all the strings inside quotes inside a line
- input.scan(/".*?"/).join
- end
-
- def count_occurances_delimiter(line)
- delimiter_count.keys.each do |key|
- #Count the occurances of delimiter in a line
- total_count_delimiter = line.substr_count(key)
- #count the occurances of delimiter between quotes inside a line to disregard them
- quoted_delimiter_count = getting_contents_of_quoted_values(line).substr_count(key)
- delimiter_count[key] += total_count_delimiter - quoted_delimiter_count
- end
- end
-
- def return_plausible_delimiter
- return delimiter_count.key(delimiter_count.values.max)
- end
-
+ ###
+ # Method to analyze input data and determine delimiter
+ # Input can be either a csv file or even a array of strings
+ # returns delimiter
+ ###
  def identify_delimiter(filename_or_sample)
  #filename_or_sample input can be either a File or an Array or a string - Return delimiter for File or an Array of strings (if found)
  if filename_or_sample.class == String
@@ -60,7 +52,35 @@ module CsvImportAnalyzer
  InvalidInput.new
  end
  end
+
+ private
+
+ def getting_contents_of_quoted_values(input)
+ #return a join of all the strings inside quotes inside a line
+ input.scan(/".*?"/).join
+ end
+
+ ###
+ # Find the count of delimiter occurances in a line
+ # CSV files can have delimiters escaped between quotes
+ # valid count = total_count - delimiters inside quotes
+ ###
+ def count_occurances_delimiter(line)
+ delimiter_count.keys.each do |key|
+ #Count the occurances of delimiter in a line
+ total_count_delimiter = line.substr_count(key)
+ #count the occurances of delimiter between quotes inside a line to disregard them
+ quoted_delimiter_count = getting_contents_of_quoted_values(line).substr_count(key)
+ delimiter_count[key] += total_count_delimiter - quoted_delimiter_count
+ end
+ end
+
+ ###
+ # Plausible delimiter would be the one i.e. of most occurance of the set of rows
+ ###
+ def return_plausible_delimiter
+ return delimiter_count.key(delimiter_count.values.max)
+ end
+
  end
  end
-
- # puts CsvImportAnalyzer::DelimiterIdentifier.identify_delimiter("/home/avinash/Desktop/csv-import-analyzer/spec/fixtures/sample.csv")
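The counting rule in the new comments ("valid count = total_count - delimiters inside quotes") can be shown with plain String methods instead of the gem's substr_count extension; the sample line is made up:

    line = '101,"Doe, Jane",NY'

    total  = line.count(",")                     # 3 commas in the raw line
    quoted = line.scan(/".*?"/).join.count(",")  # 1 comma sits inside quotes
    valid  = total - quoted                      # 2 commas actually separate fields

    puts valid # => 2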
data/lib/csv-import-analyzer/analyzer/file_type_assertion.rb CHANGED
@@ -16,11 +16,7 @@ module CsvImportAnalyzer
  end
  end
 
- def self.convert_excel_to_csv
-
- end
-
- def csv_clean
+ def convert_excel_to_csv
 
  end
 
data/lib/csv-import-analyzer/csv_datatype_analysis.rb CHANGED
@@ -1,10 +1,10 @@
  require "smarter_csv"
  require "tempfile"
- require "pry"
  require_relative "helpers/datatype_validation"
  require_relative "analyzer/csv_check_bounds"
  require_relative "helpers/common_functions"
  require_relative "sql_query_builder"
+ require "pry"
 
  module CsvImportAnalyzer
  class CsvDatatypeAnalysis
@@ -27,8 +27,12 @@ module CsvImportAnalyzer
  @options[:filename]
  end
 
-
+ ###
  # Process a chunk of csv file for all possible datatypes towards each column in the row
+ # This datatype analysis is used for analyzing,
+ # Min - Max values of each column
+ # Distinct values of each column
+ # Enumeration eligibility
  def datatype_analysis
  SmarterCSV.process(filename, {:col_sep => delimiter, :chunk_size => chunk_size,
  :remove_empty_values => false, :remove_zero_values => false}) do |chunk|
@@ -61,14 +65,18 @@
  return options[:chunk]
  end
 
- #Call DatatypeValidator in helper module to process the possible datatype for the value
- #Is this the right way to hide dependency on the external classes or objects
- #May be a static would do ? Should I create an object and call method on the object each time rather than instantiate a new object each time ??
+ ###
+ # Call DatatypeValidator in helper module to process the possible datatype for the value
+ # Is this the right way to hide dependency on the external classes or objects
+ # May be a static would do ? Should I create an object and call method on the object each time rather than instantiate a new object each time ??
+ ###
  def determine_dataype(value)
  return validate_field(value)
  end
 
+ ###
  # Build the hash of hashes which hold the count of different possible datatypes for each row
+ ###
  def add_to_datatype(key, datatype)
  if csv_column_datatypes[key].nil?
  csv_column_datatypes[key] = {datatype => 1}
@@ -81,8 +89,11 @@
  end
  end
 
- #Finalize the datatype for each column, A column datatype would be set to varchar or string if atleast of it's values tend to be string
- #If the column doesn't have any possible strings then assign the datatype to column with maximum count of identified possibilites
+ ###
+ # Finalize the datatype for each column.
+ # A column datatype would be set to varchar or string if even one of it's values tend to be string
+ # If the column doesn't have any possible strings then assign the datatype to column with maximum count of identified possibilites
+ ###
  def finalize_datatypes_for_csv
  csv_column_datatypes.map { |column_name, possible_datatypes|
  #If there is string type even atleast 1 there is no other option but to set the datatype to string => varchar
@@ -95,7 +106,12 @@
  }
  end
 
- #Decide if simple datatype analysis is enough or proced further
+ ###
+ # Decide if simple datatype analysis is enough or proced further
+ # Proceed further would be to
+ # Identify min and max bounds for each column
+ # Identify if the number distinct values are less than set threshold
+ ###
  def take_further_actions
  if options[:check_bounds]
  min_max_bounds = CsvImportAnalyzer::CsvCheckBounds.new(options)
@@ -107,4 +123,4 @@
  query.generate_query
  end
  end
- end
+ end
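The comments on add_to_datatype and finalize_datatypes_for_csv describe a tally of candidate datatypes per column, resolved to string if any value looked like a string and otherwise to the most frequent candidate. A self-contained sketch of that resolution step, using a hypothetical tally hash rather than the class's instance state:

    csv_column_datatypes = {
      :zipcode => { :int => 198, :string => 2 },   # a couple of malformed rows
      :salary  => { :int => 150, :float => 50 }
    }

    final = csv_column_datatypes.map do |column_name, possible_datatypes|
      if possible_datatypes.key?(:string)
        [column_name, :string]                     # any string forces string/varchar
      else
        [column_name, possible_datatypes.max_by { |_, count| count }.first]
      end
    end.to_h

    puts final.inspect  # => {:zipcode=>:string, :salary=>:int}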
data/lib/csv-import-analyzer/csv_sanitizer.rb CHANGED
@@ -1,5 +1,6 @@
  require "smarter_csv"
  require "tempfile"
+ require "pry"
  require_relative "analyzer/delimiter_identifier"
  require_relative "helpers/string_class_extensions"
  require_relative "helpers/common_functions"
@@ -11,27 +12,46 @@ module CsvImportAnalyzer
  include CsvImportAnalyzer::Helper
  include CsvImportAnalyzer::DelimiterIdentifier
 
+ ###
+ # Public interface for the entire library
+ # What id does?
+ # Sets "options" varaible by merging default values and passed values
+ # Finds the delimiter by analyzing a sample
+ # Sanitizes or preprocesses the csv file by creating a temporary processed file
+ # Replacing null and empty values with NULL
+ # Replace single quotes with double quotes if needed
+ # Handle CSVMalformedError by logging the error to error report
+ # Passes the options to DatatypeAnalysis
+ ###
  def process(filename, options)
-
  options = defaults.merge(options)
  if File.exist?(filename)
- options[:filename] = filename
- #first thing to do - find the delimiter of the file.
  delimiter = identify_delimiter(filename)
  options[:delimiter] = delimiter
+ # create tempfiles to update any changes being made
+ temp_file, processed_file = create_tempfiles(filename, options)
+ options[:temp_file] = temp_file.path
+ line_count = 1
  File.foreach(filename) do |line|
- #Check if the line is empty - no point in processing empty lines
- if line.length > 1
+ if line.length > 1 #Check if the line is empty - no point in processing empty lines
  line = replace_line_single_quotes(line,delimiter)
  begin
  line = CSV.parse_line(line, {:col_sep => delimiter})
- rescue CSV::MalformedCSVError => error
- line = "#{line}\""
+ rescue CSV::MalformedCSVError
+ # MalformedCSVError is due to illegal quoting or unclosed quotes
+ # Try to add a quote at the end and resume processing
+ # Log the changes to report
+ temp_file.write("MalformedCSVError at line #{line_count}")
+ line = line.insert(-2, "\"")
  line = CSV.parse_line(line, {:col_sep => delimiter})
  end
  line = replace_null_values(line)
+ processed_file.write(line.to_csv({:col_sep => delimiter, :converters => :numeric}))
  end
+ line_count += 1
  end
+ temp_file.close
+ processed_file.close
  # Cleaned the file - Now analyze for datatypes
  CsvImportAnalyzer::CsvDatatypeAnalysis.new(options).datatype_analysis
  else
@@ -41,21 +61,30 @@
 
  private
 
+ ###
+ # Hash of default values that would be merged with user passed in values
+ # returns [Hash] defaults
+ ###
  def defaults
  {
- :metadata_output => nil,
- :processed_input => nil,
- :unique => 10,
- :check_bounds => true,
- :datatype_analysis => 200,
- :chunk => 20,
- :database => [:pg, :mysql],
- :quote_convert => true,
- :replace_nulls => true,
- :out_format => :json
+ :metadata_output => nil, # To be set if metadata needs to be printed to a file
+ :processed_input => nil, # To be set if processed input is needed
+ :unique => 10, # Threshold for number of defaults values that needs to identified
+ :check_bounds => true, # Option to check for min - max bounds for each column [true => find the bounds]
+ :datatype_analysis => 200, # Number of rows to be sampled for datatype analysis
+ :chunk => 200, # Chunk size (no of rows) that needs to processed in-memory [Important not to load entire file into memory]
+ :database => [:pg, :mysql], # Databases for which schema needs to be generated
+ :quote_convert => true, # Convert any single quotes to double quotes
+ :replace_nulls => true, # Replace nulls, empty's, nils, Null's with NULL
+ :out_format => :json # Set what type of output do you need as analysis
  }
  end
 
+ ###
+ # Replaces single quotes with doubles in each line
+ # Escapes the double quotes if it's between two single quotes before
+ # returns [String] result
+ ###
  def replace_line_single_quotes(line, delimiter)
  delimiter = "\\|" if delimiter == "|"
  pattern = "#{delimiter}'.*?'#{delimiter}" # set the pattern to opening and closing single quote found between delimiters
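The defaults above are merged with whatever the caller passes, and the caller's values win. A small sketch of that merge with a trimmed-down defaults hash and illustrative caller options:

    defaults = { :unique => 10, :chunk => 200, :out_format => :json }
    options  = defaults.merge(:unique => 2, :metadata_output => true)

    puts options.inspect
    # => {:unique=>2, :chunk=>200, :out_format=>:json, :metadata_output=>true}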
@@ -82,5 +111,26 @@
  end
  return line
  end
+
+ ###
+ # Uses ruby tempfile to create temp files for
+ # 1. Store processed file
+ # 2. Error reporting
+ # Returns the file handler for a temp file.
+ # This tempfile holds any modifications being done to the file.
+ ###
+ def create_tempfiles(filename, options)
+ options[:original_filename] = filename
+ filename = File.basename(filename)
+ processed_filename = File.join(Dir.tmpdir, "processed_"+filename)
+ options[:filename] = processed_filename
+ # filename += Time.now.strftime("%Y%m%d%H%M%S")
+ # temp_file = Tempfile.new(filename)
+ # temp_file = File.open(File.join(Dir.tmpdir, filename), "w+")
+ temp_file = File.join(Dir.tmpdir, "error_report_"+filename)
+ temp_file = File.open(temp_file, "w+")
+ processed_file = File.open(processed_filename, "w+")
+ return temp_file, processed_file
+ end
  end
  end
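The rescue branch added to process appends a closing double quote and re-parses when a line fails with illegal or unclosed quoting. A standalone sketch of that repair, with a made-up sample line:

    require "csv"

    line = "3,\"O'Brien,42\n"              # unclosed double quote fails to parse

    begin
      fields = CSV.parse_line(line, :col_sep => ",")
    rescue CSV::MalformedCSVError
      line = line.insert(-2, "\"")         # slip a closing quote in before the newline
      fields = CSV.parse_line(line, :col_sep => ",")
    end

    puts fields.inspect                    # => ["3", "O'Brien,42"]
    puts fields.to_csv(:col_sep => ",")    # write the repaired row back out as CSV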