RubyGems - csv-import-analyzer - Versions diffs - 0.0.1 - Mend

csv-import-analyzer 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (47) hide show

checksums.yaml +7 -0
data/.gitignore +14 -0
data/.rspec +3 -0
data/Gemfile +4 -0
data/LICENSE.txt +22 -0
data/README.md +35 -0
data/Rakefile +2 -0
data/csv-import-analyzer.gemspec +29 -0
data/lib/csv-import-analyzer.rb +18 -0
data/lib/csv-import-analyzer/analyzer/csv_check_bounds.rb +104 -0
data/lib/csv-import-analyzer/analyzer/delimiter_identifier.rb +66 -0
data/lib/csv-import-analyzer/analyzer/file_type_assertion.rb +29 -0
data/lib/csv-import-analyzer/csv_datatype_analysis.rb +110 -0
data/lib/csv-import-analyzer/csv_sanitizer.rb +86 -0
data/lib/csv-import-analyzer/export/metadata_analysis.rb +156 -0
data/lib/csv-import-analyzer/helpers/common_functions.rb +11 -0
data/lib/csv-import-analyzer/helpers/datatype_validation.rb +85 -0
data/lib/csv-import-analyzer/helpers/errors.rb +3 -0
data/lib/csv-import-analyzer/helpers/string_class_extensions.rb +8 -0
data/lib/csv-import-analyzer/query_builder/mysql_query_helper.rb +31 -0
data/lib/csv-import-analyzer/query_builder/pg_query_helper.rb +27 -0
data/lib/csv-import-analyzer/query_builder/query_helper.rb +27 -0
data/lib/csv-import-analyzer/sampleTab.csv +5 -0
data/lib/csv-import-analyzer/sql_query_builder.rb +125 -0
data/lib/csv-import-analyzer/version.rb +5 -0
data/lib/metadata_output.json +70 -0
data/lib/sampleTab.csv +5 -0
data/spec/csv-import-analyzer/analyzer/csv_check_bounds_spec.rb +43 -0
data/spec/csv-import-analyzer/analyzer/delimiter_identifier_spec.rb +61 -0
data/spec/csv-import-analyzer/analyzer/file_type_assertion_spec.rb +0 -0
data/spec/csv-import-analyzer/csv_datatype_analysis_spec.rb +1 -0
data/spec/csv-import-analyzer/csv_sanitizer_spec.rb +24 -0
data/spec/csv-import-analyzer/export/metadata_analysis_spec.rb +0 -0
data/spec/csv-import-analyzer/helpers/common_functions_spec.rb +31 -0
data/spec/csv-import-analyzer/helpers/csv_check_bounds_spec.rb +3 -0
data/spec/csv-import-analyzer/helpers/datatype_validation_spec.rb +75 -0
data/spec/csv-import-analyzer/helpers/mysql_query_helper_spec.rb +0 -0
data/spec/csv-import-analyzer/helpers/pq_query_helper_spec.rb +0 -0
data/spec/csv-import-analyzer/helpers/string_class_extension_spec.rb +18 -0
data/spec/csv-import-analyzer/query_builder/mysql_query_helper_spec.rb +54 -0
data/spec/csv-import-analyzer/query_builder/pg_query_helper_spec.rb +55 -0
data/spec/csv-import-analyzer_spec.rb +14 -0
data/spec/fixtures/sample.csv +5 -0
data/spec/fixtures/sample_options.yml +11 -0
data/spec/fixtures/semicolon-sample.csv +5 -0
data/spec/spec_helper.rb +84 -0
metadata +208 -0

checksums.yaml ADDED Viewed

@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: d57394db99e42dd7154c4c1506d94fa7310311db
+  data.tar.gz: 3b07ff8c14728eb61ceca8ecf107b1be6dea3344
+SHA512:
+  metadata.gz: 4942b31ef92123e2fbedbe1b73eb1aa2814ed459d2b875f4435fb607eb4c072d7f10164bae285ee4ed1af0404ccebab73194526eacb786f9422a3c3540e8f66b
+  data.tar.gz: ad717f58dac4d563f6cf1edfb89ec14ca6299c18e6714b45fa1ccf6f78d93f758f7fd66952b76fe52cab07c414405070523c16446f98a45644f7209b3087915f

data/.gitignore ADDED Viewed

@@ -0,0 +1,14 @@
+/.bundle/
+/.yardoc
+/Gemfile.lock
+/_yardoc/
+/coverage/
+/doc/
+/pkg/
+/spec/reports/
+/tmp/
+*.bundle
+*.so
+*.o
+*.a
+mkmf.log

data/.rspec ADDED Viewed

@@ -0,0 +1,3 @@
+--color
+--require spec_helper
+--format documentation

data/Gemfile ADDED Viewed

@@ -0,0 +1,4 @@
+source 'https://rubygems.org'
+# Specify your gem's dependencies in csv-import-analyzer.gemspec
+gemspec

data/LICENSE.txt ADDED Viewed

@@ -0,0 +1,22 @@
+Copyright (c) 2014 avinash vallabhaneni
+MIT License
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

data/README.md ADDED Viewed

@@ -0,0 +1,35 @@
+# Csv::Import::Analyzer
+Perform datatype analysis on desired chunk
+Calculate min-max bounds for each column
+Determine which columns are nullable in the csv file
+Note: This gem expects the first line to be a definitive header, i.e. the column names to use if the csv file is imported into a database.
+## Installation
+Add this line to your application's Gemfile:
+```ruby
+gem 'csv-import-analyzer'
+```
+And then execute:
+    $ bundle
+Or install it yourself as:
+    $ gem install csv-import-analyzer
+## Usage
+TODO: Write usage instructions here
+## Contributing
+1. Fork it ( https://github.com/avinash-vllbh/csv-import-analyzer/fork )
+2. Create your feature branch (`git checkout -b my-new-feature`)
+3. Commit your changes (`git commit -am 'Add some feature'`)
+4. Push to the branch (`git push origin my-new-feature`)
+5. Create a new Pull Request

data/Rakefile ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ require "bundler/gem_tasks"
2	+

data/csv-import-analyzer.gemspec ADDED Viewed

@@ -0,0 +1,29 @@
+# coding: utf-8
+# Gem specification for csv-import-analyzer.
+# Puts ./lib on the load path so the version constant can be required below.
+lib = File.expand_path('../lib', __FILE__)
+$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+require 'csv-import-analyzer/version'
+Gem::Specification.new do |spec|
+  spec.name          = "csv-import-analyzer"
+  spec.version       = CsvImportAnalyzer::Version::VERSION
+  spec.authors       = ["avinash vallabhaneni"]
+  spec.email         = ["avinash.vallab@gmail.com"]
+  spec.description   = %q{Santize large csv files and help in predicting datatypes including min max values for easy import to SQL}
+  spec.summary       = %q{To process large csv files and predict valid datatypes of each column for easy import into SQL}
+  spec.homepage      = "http://rubygems.org/gems/csv-import-analyzer"
+  spec.license       = "MIT"
+  # The packaged file list is whatever git tracks (NUL-separated output of `git ls-files -z`).
+  spec.files         = `git ls-files -z`.split("\x0")
+  # Executables are the basenames of tracked files under bin/.
+  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
+  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
+  spec.require_paths = ["lib"]
+  # Development-only tooling.
+  spec.add_development_dependency "bundler", "~> 1.6"
+  spec.add_development_dependency "rake", "~> 10"
+  # NOTE(review): the files under lib/ call `require 'pry'` at load time, yet pry
+  # is only declared as a development dependency here - confirm this is intended.
+  spec.add_development_dependency "pry", "~> 0.10"
+  spec.add_development_dependency "rspec", "~> 3.0"
+  spec.add_development_dependency "simplecov", "~> 0.9"
+  # Libraries needed at runtime.
+  spec.add_runtime_dependency "smarter_csv", "~> 1.0.17"
+  spec.add_runtime_dependency "roo", "~> 1.13"
+end

data/lib/csv-import-analyzer.rb ADDED Viewed

@@ -0,0 +1,18 @@
+require 'pry'
+require_relative "csv-import-analyzer/csv_sanitizer"
+require_relative "csv-import-analyzer/helpers/errors"
+# Top-level entry point of the gem.
+module CsvImportAnalyzer
+  # Expose the methods below as module-level methods (CsvImportAnalyzer.process)
+  extend self
+  # Hands an existing CSV file over to CsvSanitizer for processing.
+  #
+  # filename - path of the CSV file; expanded to an absolute path before use.
+  # options  - Hash forwarded untouched to CsvSanitizer#process.
+  #
+  # Returns whatever CsvSanitizer#process returns, or a FileNotFound instance
+  # (returned, not raised) when the file does not exist.
+  def process(filename, options = {})
+    return FileNotFound.new unless File.exist?(filename)
+    CsvImportAnalyzer::CsvSanitizer.new.process(File.absolute_path(filename), options)
+  end
+end
+# Manual smoke run. Guarded so that it only fires when this file is executed
+# directly; previously it ran on every `require`, analysing a hard-coded
+# sample file as a side effect of merely loading the library.
+if __FILE__ == $PROGRAM_NAME
+  CsvImportAnalyzer.process("sampleTab.csv", {:metadata_output => true, :unique => 2})
+end
+# CsvImportAnalyzer.process("sampleTab.csv", {:metadata_output => true, :out_format => :csv})

data/lib/csv-import-analyzer/analyzer/csv_check_bounds.rb ADDED Viewed

@@ -0,0 +1,104 @@
+require 'smarter_csv'
+require 'pry'
+require_relative "../helpers/common_functions"
+require_relative "../helpers/errors"
+module CsvImportAnalyzer
+  # Streams a CSV file through SmarterCSV and records, per column, the min/max
+  # bounds and a capped list of distinct values, while collecting the columns
+  # that contain null-like values.
+  class CsvCheckBounds
+    include CsvImportAnalyzer::Helper
+    attr_accessor :min_max_bounds, :distinct_values, :csv_column_datatypes, :options, :nullable, :max_distinct_values
+    # options - Hash; :csv_column_datatypes and :nullable are read here,
+    #           :filename, :chunk_size, :delimiter and :unique are read lazily.
+    def initialize(options = {})
+      @options = options
+      @csv_column_datatypes = options[:csv_column_datatypes]
+      @nullable = options[:nullable] || []
+      @min_max_bounds = {}
+      @distinct_values = {}
+    end
+    def filename
+      options[:filename]
+    end
+    def chunk_size
+      options[:chunk_size]
+    end
+    def delimiter
+      options[:delimiter]
+    end
+    # Cap used by process_distinct_values: the :unique option plus one.
+    # Integer() raises when :unique is missing or not integer-like.
+    def max_distinct_values
+      @max_distinct_values ||= Integer(options[:unique]) + 1
+    end
+    # Public interface that is called - Processes the CSV file for min & max values for each column
+    #
+    # Returns {:min_max => ..., :uniques => ...} on success. When no filename
+    # was given a MissingRequiredArguments instance is returned, and when the
+    # file does not exist a FileNotFound instance is returned (neither is raised).
+    def get_min_max_values
+      return MissingRequiredArguments.new("valid filename is required to check bounds") if filename.nil?
+      return FileNotFound.new unless File.exist?(filename)
+      reader_settings = {:col_sep => delimiter, :chunk_size => chunk_size,
+                         :remove_empty_values => false, :remove_zero_values => false}
+      SmarterCSV.process(filename, reader_settings) do |chunk|
+        chunk.each do |row|
+          row.each { |key, value| track_cell(key, value) }
+        end
+      end
+      {:min_max => min_max_bounds, :uniques => distinct_values}
+    end
+    private
+    ##
+    #Routes one cell: null-like cells mark the column nullable, all others feed the bounds and distinct-value tracking
+    ##
+    def track_cell(key, value)
+      if null_like?(value)
+        nullable.push(key) unless nullable.include?(key)
+      else
+        process_min_max_for_column(key, value)
+        process_distinct_values(key, value)
+      end
+    end
+    ##
+    #Quantity that is bounded for a column: the text length for :string columns, the value itself otherwise.
+    #to_s is applied first because a :string column can still hold numeric cells (the CSV reader may hand back
+    #Integer/Float objects), and those do not respond to #length
+    ##
+    def bound_candidate(key, value)
+      csv_column_datatypes[key] == :string ? value.to_s.length : value
+    end
+    ##
+    #If the key is of String type then we find the max length of it
+    ##
+    def process_min_max_for_column(key, value)
+      candidate = bound_candidate(key, value)
+      min_max_bounds[key] ||= {:min => candidate, :max => candidate}
+      add_bounds(key, value)
+    end
+    ##
+    #Method which decides on the min max values for each key and according to the passed in value
+    ##
+    def add_bounds(key, value)
+      bounds = min_max_bounds[key]
+      candidate = bound_candidate(key, value)
+      bounds[:min] = candidate if candidate < bounds[:min]
+      bounds[:max] = candidate if candidate > bounds[:max]
+    end
+    ##
+    #Processes the max number of distinct values set for each column
+    #New values are appended while the list holds at most max_distinct_values entries,
+    #so a list can end up with max_distinct_values + 1 entries
+    ##
+    def process_distinct_values(key, value)
+      seen = distinct_values[key]
+      if seen.nil?
+        distinct_values[key] = [value]
+      elsif seen.size <= max_distinct_values
+        seen.push(value) unless seen.include?(value)
+      end
+    end
+  end
+end

data/lib/csv-import-analyzer/analyzer/delimiter_identifier.rb ADDED Viewed

@@ -0,0 +1,66 @@
+require_relative "../helpers/string_class_extensions"
+require 'pry'
+module CsvImportAnalyzer
+  # Guesses the column delimiter of CSV content by counting how often each
+  # candidate character appears outside double-quoted sections.
+  module DelimiterIdentifier
+    # attr_accessor :delimiter, :delimiter_count
+    # Candidate delimiters (memoized).
+    def delimiter
+      @delimiter ||= [",", ";", "\t", "|"]
+    end
+    # Running tally of candidate => occurrences, seeded with zeros on first use.
+    # The hash is memoized, so counts keep accumulating across calls.
+    def delimiter_count
+      @delimiter_count ||= Hash[delimiter.map { |candidate| [candidate, 0] }]
+    end
+    # Returns all double-quoted segments of +input+ (quotes included), joined into one string.
+    def getting_contents_of_quoted_values(input)
+      input.scan(/".*?"/).join
+    end
+    # Adds to the tally, for every candidate, its occurrences in +line+ that
+    # are not inside double quotes.
+    def count_occurances_delimiter(line)
+      quoted_text = getting_contents_of_quoted_values(line)
+      delimiter_count.keys.each do |key|
+        # total occurrences minus the ones between quotes, which are data rather than separators
+        delimiter_count[key] += line.substr_count(key) - quoted_text.substr_count(key)
+      end
+    end
+    # Returns the first candidate holding the highest tally.
+    def return_plausible_delimiter
+      delimiter_count.key(delimiter_count.values.max)
+    end
+    # Identifies the delimiter of a CSV file or of sample lines.
+    #
+    # filename_or_sample - String path of a file (only its first four lines
+    #                      are sampled) or an Array of line Strings.
+    #
+    # Returns the most frequent candidate delimiter. A String that is not an
+    # existing file yields a FileNotFound instance and any other input type an
+    # InvalidInput instance (both returned, not raised).
+    def identify_delimiter(filename_or_sample)
+      case filename_or_sample
+      when String
+        # File.exist? replaces File.exists?, which was deprecated and then removed in Ruby 3.2
+        return FileNotFound.new unless File.exist?(filename_or_sample)
+        File.foreach(filename_or_sample).first(4).each { |line| count_occurances_delimiter(line) }
+        return_plausible_delimiter
+      when Array
+        filename_or_sample.each { |line| count_occurances_delimiter(line) }
+        return_plausible_delimiter
+      else
+        InvalidInput.new
+      end
+    end
+  end
+end
+# puts CsvImportAnalyzer::DelimiterIdentifier.identify_delimiter("/home/avinash/Desktop/csv-import-analyzer/spec/fixtures/sample.csv")

data/lib/csv-import-analyzer/analyzer/file_type_assertion.rb ADDED Viewed

@@ -0,0 +1,29 @@
+# file_type_assertion.rb
+require 'pry'
+module CsvImportAnalyzer
+  module Analyzer
+    # Placeholder for file-type detection; only the "csv" and "xlsx"
+    # extensions are distinguished so far.
+    class FileTypeAssertion
+      # Inspects the extension of +filename+ (the text after the last "." of
+      # its absolute path).
+      #
+      # Returns a new FileTypeAssertion for "csv" and nil for anything else;
+      # for "xlsx" the string "xlsx" is additionally printed to stdout.
+      def check_file_type(filename)
+        extension = File.absolute_path(filename).split(".").last
+        case extension
+        when "csv"
+          # This class defines no initialize, so .new takes no arguments; the
+          # hard-coded "sampleTab.csv" argument passed here before always
+          # raised ArgumentError.
+          Analyzer::FileTypeAssertion.new
+        when "xlsx"
+          #Try adding support for non csv files - xlsx, xls in future
+          puts "xlsx"
+        else
+          # return UnsupportedFileFormat.new
+        end
+      end
+      # Not implemented yet.
+      def self.convert_excel_to_csv
+      end
+      # Not implemented yet.
+      def csv_clean
+      end
+    end
+  end
+end

data/lib/csv-import-analyzer/csv_datatype_analysis.rb ADDED Viewed

@@ -0,0 +1,110 @@
+require "smarter_csv"
+require "tempfile"
+require "pry"
+require_relative "helpers/datatype_validation"
+require_relative "analyzer/csv_check_bounds"
+require_relative "helpers/common_functions"
+require_relative "sql_query_builder"
+module CsvImportAnalyzer
+  # Samples the first chunk of a CSV file, tallies the datatypes seen in each
+  # column, settles on one datatype per column and then hands over to bounds
+  # checking (optional) and SQL query generation.
+  class CsvDatatypeAnalysis
+    include CsvImportAnalyzer::Helper
+    include CsvImportAnalyzer::DatatypeValidator
+    attr_accessor :csv_column_datatypes, :nullable
+    attr_reader :options
+    # options - Hash; read for :filename, :delimiter, :chunk and :check_bounds,
+    #           and extended with the analysis results as processing goes on.
+    def initialize(options)
+      @options = options
+      @csv_column_datatypes = {}
+      @nullable = []
+    end
+    def filename
+      options[:filename]
+    end
+    # Process a chunk of csv file for all possible datatypes towards each column in the row
+    #
+    # Stores :csv_datatype_analysis, :csv_column_datatypes and :nullable in
+    # options and returns the result of take_further_actions.
+    def datatype_analysis
+      reader_settings = {:col_sep => delimiter, :chunk_size => chunk_size,
+                         :remove_empty_values => false, :remove_zero_values => false}
+      SmarterCSV.process(filename, reader_settings) do |rows|
+        rows.each do |record|
+          record.each { |column, cell| tally_cell(column, cell) }
+        end
+        # Only the first chunk is sampled.
+        break
+      end
+      # Snapshot the per-column tallies first; finalize_datatypes_for_csv replaces them with a single symbol each
+      options[:csv_datatype_analysis] = csv_column_datatypes.clone
+      finalize_datatypes_for_csv
+      options[:csv_column_datatypes] = csv_column_datatypes
+      options[:nullable] = nullable
+      take_further_actions
+    end
+    private
+    def delimiter
+      options[:delimiter]
+    end
+    def chunk_size
+      options[:chunk]
+    end
+    # Routes one cell: null-like cells mark the column nullable, every other cell casts a vote for its datatype
+    def tally_cell(column, cell)
+      if null_like?(cell)
+        nullable.push(column) unless nullable.include?(column)
+      else
+        add_to_datatype(column, determine_dataype(cell).to_sym)
+      end
+    end
+    # Thin wrapper around validate_field (from the DatatypeValidator mixin), kept so the dependency sits in one place
+    def determine_dataype(value)
+      validate_field(value)
+    end
+    # Build the hash of hashes which hold the count of different possible datatypes for each column
+    def add_to_datatype(key, datatype)
+      tallies = (csv_column_datatypes[key] ||= {})
+      tallies[datatype] = (tallies[datatype] || 0) + 1
+    end
+    # Settle on one datatype per column: :string wins as soon as a single value was seen as a string,
+    # otherwise the datatype with the highest tally is chosen
+    def finalize_datatypes_for_csv
+      csv_column_datatypes.each do |column_name, possible_datatypes|
+        csv_column_datatypes[column_name] =
+          if possible_datatypes.has_key?(:string)
+            :string
+          else
+            possible_datatypes.key(possible_datatypes.values.max)
+          end
+      end
+    end
+    # Runs the min/max and distinct-value pass when options[:check_bounds] is set, then always builds the query
+    def take_further_actions
+      if options[:check_bounds]
+        bounds = CsvImportAnalyzer::CsvCheckBounds.new(options).get_min_max_values
+        options[:min_max_bounds] = bounds[:min_max]
+        options[:uniques] = bounds[:uniques]
+      end
+      CsvImportAnalyzer::SqlQueryBuilder.new(options).generate_query
+    end
+  end
+end