RubyGems - csv-import-analyzer - Versions diffs - 0.0.3 → 0.0.4 - Mend

csv-import-analyzer 0.0.3 → 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (34) hide show

checksums.yaml +4 -4
data/.gitignore +1 -0
data/README.md +8 -1
data/csv-import-analyzer.gemspec +1 -1
data/lib/csv-import-analyzer.rb +6 -4
data/lib/csv-import-analyzer/analyzer/csv_check_bounds.rb +30 -19
data/lib/csv-import-analyzer/analyzer/delimiter_identifier.rb +44 -24
data/lib/csv-import-analyzer/analyzer/file_type_assertion.rb +1 -5
data/lib/csv-import-analyzer/csv_datatype_analysis.rb +25 -9
data/lib/csv-import-analyzer/csv_sanitizer.rb +67 -17
data/lib/csv-import-analyzer/export/metadata_analysis.rb +63 -7
data/lib/csv-import-analyzer/helpers/common_functions.rb +4 -0
data/lib/csv-import-analyzer/helpers/datatype_validation.rb +6 -6
data/lib/csv-import-analyzer/helpers/string_class_extensions.rb +9 -3
data/lib/csv-import-analyzer/query_builder/mysql_query_helper.rb +2 -2
data/lib/csv-import-analyzer/query_builder/pg_query_helper.rb +1 -2
data/lib/csv-import-analyzer/query_builder/query_helper.rb +2 -2
data/lib/csv-import-analyzer/sql_query_builder.rb +27 -12
data/lib/csv-import-analyzer/version.rb +1 -1
data/spec/csv-import-analyzer/analyzer/csv_check_bounds_spec.rb +8 -8
data/spec/csv-import-analyzer/analyzer/delimiter_identifier_spec.rb +13 -13
data/spec/csv-import-analyzer/csv_sanitizer_spec.rb +10 -7
data/spec/csv-import-analyzer/helpers/common_functions_spec.rb +20 -19
data/spec/csv-import-analyzer/helpers/datatype_validation_spec.rb +28 -28
data/spec/csv-import-analyzer/helpers/string_class_extension_spec.rb +6 -6
data/spec/csv-import-analyzer/query_builder/mysql_query_helper_spec.rb +13 -13
data/spec/csv-import-analyzer/query_builder/pg_query_helper_spec.rb +16 -16
data/spec/csv-import-analyzer_spec.rb +3 -6
data/spec/fixtures/sample.csv +2 -2
data/spec/spec_helper.rb +3 -0
metadata +17 -6
data/lib/csv-import-analyzer/sampleTab.csv +0 -5
data/samples/metadata_output.json +0 -70
data/spec/csv-import-analyzer/csv_datatype_analysis_spec.rb +0 -1

data/lib/csv-import-analyzer/export/metadata_analysis.rb CHANGED Viewed

@@ -1,4 +1,3 @@
-require 'pry'
 require 'json'
 module CsvImportAnalyzer
   class MetadataAnalysis
@@ -48,6 +47,11 @@ module CsvImportAnalyzer
       @max_distinct_values ||= Integer(options[:unique]) + 1
     end
+    ###
+    # Builds the metadata from the analysis done so far
+    # Creates a new json file with the analysis added to it if options[:metadata_output] is set
+    # returns JSON object of the metadata
+    ###
     def metadata_print
       build_metadata_output
       if options[:metadata_output]
@@ -64,16 +68,22 @@ module CsvImportAnalyzer
     private
+    ###
+    # Create or overwrite the metadata_output.json file if it already exists
+    # Write the metadata to the file and close it
+    ###
     def json_print_to_file
       outfile = File.open("metadata_output.json", "w")
       outfile << JSON.pretty_generate(metadata)
       outfile.close
     end
-    # Priniting that csv from json is a mess - How to make pretty print ?
+    ###
+    # Printing the metadata to csv - How to make sense of the csv print??
+    # TODO: ADD support for returning data analysis as csv file
+    ###
     def csv_print_to_file
       CSV.open("metadata_output.csv", "w") do |csv|
-        binding.pry
         metadata.each do |key, value|
           if value.class == Hash
             csv << [key]
@@ -85,6 +95,10 @@ module CsvImportAnalyzer
       end
     end
+    ###
+    # Handle the key => value pairs to be printed as CSV files
+    # Recursively prints the key and value
+    ###
     def print_hash_to_csv(hash, csv_handler)
       if hash.class == Hash
         hash.each do |key, value|
@@ -96,6 +110,11 @@ module CsvImportAnalyzer
       end
     end
+    ###
+    # Build the metadata hash with the needed key-value pairs
+    # Add the analysis data to @metadata instance variable
+    # E.g. metadata[:csv_file] means the metadata for csv file
+    ###
     def build_metadata_output
       metadata[:csv_file] = add_file_metadata
       metadata[:data_manipulations] = add_data_manipulations
@@ -103,16 +122,33 @@ module CsvImportAnalyzer
       metadata[:sql] = add_sql_data
     end
+    ###
+    # Metadata of the file
+    # adds the filename, file_path, record delimiter of the file along with processed file metadata
+    # Returns a hash of file data
+    ###
     def add_file_metadata
       file_data = {}
-      file_data[:filename] = File.basename(options[:filename])
-      file_data[:file_size] = File.size(options[:filename])
+      file_data[:filename] = File.basename(options[:original_filename])
+      file_data[:file_size] = File.size(options[:original_filename])
+      file_data[:record_delimiter] = options[:delimiter]
+      file_data[:processed_filename] = File.basename(options[:filename])
+      file_data[:processed_file_path] = options[:filename]
+      file_data[:processed_file_size] = File.size(options[:filename])
+      file_data[:error_report] = options[:temp_file]
       # file_data[:rows] = options[:rows]
       # file_data[:columns] = options[:columns]
-      file_data[:record_delimiter] = options[:delimiter]
       return file_data
     end
+    ###
+    # Add the data manipulations done to the processed file
+    # Currently only two types of manipulations
+    #   replace all the nulls and empty values with NULL
+    #   replace single quotes with double quotes
+    # returns hash of data_manipulations
+    ###
     def add_data_manipulations
       data_manipulations = {}
       data_manipulations[:replace_nulls] = options[:replace_nulls]
@@ -120,6 +156,22 @@ module CsvImportAnalyzer
       return data_manipulations
     end
+    ###
+    # builds a columns hash with metadata of each column
+    # E.g
+    # "photo_id": {
+    #   "datatype": "int",        => Tells the datatype is int
+    #   "datatype_analysis": {    => gives the results of the datatype analysis done
+    #                                  even though the column is determined to be int
+    #                                  in reality it could have "int": 20, "float": "5"
+    #                                  This would help the analyst to get a sense of the data later on
+    #     "int": 20
+    #   },
+    #   "distinct_values": "11+"  => Contains an array of distinct values,
+    #                                  if they are less than the threshold set
+    #                      or
+    #                   [1, 2, 3]
+    # },
     def add_header_metadata
       columns = {}
       header_datatypes.keys.each do |column_name|
@@ -142,6 +194,10 @@ module CsvImportAnalyzer
       return columns
     end
+    ###
+    # Add the queries for each database type specified
+    # build an sql hash with both create and import statements
+    ###
     def add_sql_data
       sql = {}
       databases.each do |db|
@@ -153,4 +209,4 @@ module CsvImportAnalyzer
     end
   end
-end
+end

data/lib/csv-import-analyzer/helpers/common_functions.rb CHANGED Viewed

@@ -1,5 +1,9 @@
 module CsvImportAnalyzer
   module Helper
+    ###
+    # Determines whether a given field in the dataset is null-like
+    # returns a boolean indicating whether it is null or not
+    ###
     def null_like?(value)
       if ["NULL", "Null", "NUll", "NULl", "null", nil, "", "NAN", "\\N"].include?(value)
         true

data/lib/csv-import-analyzer/helpers/datatype_validation.rb CHANGED Viewed

@@ -1,17 +1,16 @@
-require 'pry'
 module CsvImportAnalyzer
   module DatatypeValidator
     def validate_field(content)
-      return get_datatype(content)
+      get_datatype(content)
     end
     private
     ###
-    # Date.parse("12/31/20145234", "%m/%d/%Y") => true which is not supposed to be true (although technically its true)
+    # Date.parse("12/31/20145234", "%m/%d/%Y") => true, which is not supposed to be true (although technically its true)
     # Validate year part has only 4 numbers in it
     ###
     def validate_year_date(field)
       date = nil
       formats = ["%d/%m/%Y","%d-%m-%Y","%d %m %Y","%m/%d/%Y","%m-%d-%Y","%m %d %Y"]
@@ -54,7 +53,8 @@ module CsvImportAnalyzer
     end
     ###
-    #To determine the data-type of an input field
+    # To determine the data-type of an input field
+    # Returns the field's type: int, float, string, date or datetime
     ###
     def get_datatype(field)
       #Remove if field has any comma's for int and float rep
@@ -82,4 +82,4 @@ module CsvImportAnalyzer
       return "string"
     end
   end
-end
+end

data/lib/csv-import-analyzer/helpers/string_class_extensions.rb CHANGED Viewed

@@ -1,8 +1,14 @@
-require 'pry'
 class String
-  #Extending string class to return the count of substr inside a string
+  ###
+  # Monkey patch string class to find the count of needle in haystack
+  # haystack is self => string in itself
+  # needle could be anything
+  # E.g.
+  # "hello, how, are, you".substr_count(",") => 3
+  ###
   def substr_count(needle)
     needle = "\\#{needle}" if(needle == '|') # To escape inside regex
     self.scan(/(#{needle})/).size
   end
-end
+end

data/lib/csv-import-analyzer/query_builder/mysql_query_helper.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 require_relative "../helpers/errors"
-require 'pry'
 module CsvImportAnalyzer
   module MysqlQueryHelper
@@ -28,4 +28,4 @@ module CsvImportAnalyzer
     end
   end
-end
+end

data/lib/csv-import-analyzer/query_builder/pg_query_helper.rb CHANGED Viewed

@@ -1,5 +1,4 @@
 require_relative "../helpers/errors"
-require "pry"
 module CsvImportAnalyzer
   module PgQueryHelper
@@ -24,4 +23,4 @@ module CsvImportAnalyzer
     end
   end
-end
+end

data/lib/csv-import-analyzer/query_builder/query_helper.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 require_relative "../helpers/errors"
-require "pry"
 module CsvImportAnalyzer
   module QueryHelper
@@ -24,4 +24,4 @@ module CsvImportAnalyzer
     end
   end
-end
+end

data/lib/csv-import-analyzer/sql_query_builder.rb CHANGED Viewed

@@ -1,14 +1,15 @@
-require 'pry'
 require_relative "query_builder/mysql_query_helper"
 require_relative "query_builder/pg_query_helper"
 require_relative "export/metadata_analysis"
 module CsvImportAnalyzer
   class SqlQueryBuilder
-    # include CsvImportAnalyzer::mysql_query_helper
     attr_accessor :create_query, :import_query, :csv_column_datatypes, :min_max_bounds, :nullable, :sql_helper_options
+    ###
     # Since Building SQL is dependent on multiple things,
     # decided to go with an arguments hash that gets passed when creating an object for the class
+    ###
     def initialize(args)
       @options = args
       @create_query = {}
@@ -33,8 +34,8 @@ module CsvImportAnalyzer
     end
     def tablename
-      # May be optimize this, not run all three operations everytime filename method is called
-      # May be creating filename as instance variable and using a double pipe will relive it from running everytime doesn't it?
+      # Maybe optimize this, so that all three operations are not run every time the filename method is called ??
+      # Maybe creating filename as an instance variable and using a double pipe would relieve it from running every time, wouldn't it??
       tablename = File.basename(options[:filename])
       tablename.gsub!(" ", "_")
       tablename.downcase!
@@ -53,6 +54,12 @@ module CsvImportAnalyzer
       @pg_helper
     end
+    ###
+    # Goes through each of the columns datatypes and prepares SQL statements for
+    #         1. Importing CSV files to database
+    #         2. Create table schema for the files
+    # Makes a function call to return the metadata analysis of the file
+    ###
     def generate_query
       databases.each do |db|
         create_query[db] = ["create table #{tablename} ("]
@@ -70,6 +77,10 @@ module CsvImportAnalyzer
     private
+    ###
+    # Based on the database type set in options
+    # returns query part for the header (column name)
+    ###
     def build_query_for_datatype(header, datatype)
       query = {}
       databases.each do |db|
@@ -87,6 +98,10 @@ module CsvImportAnalyzer
       return query
     end
+    ###
+    # based on database type set in options
+    # returns import query for the database
+    ###
     def prepare_import_csv
       databases.each do |db|
         if db == :mysql
@@ -97,6 +112,9 @@ module CsvImportAnalyzer
       end
     end
+    ###
+    # prepares sql statements based on the query for each header formed earlier
+    ###
     def prepare_sql_statements
       databases.each do |db|
         create_query[db][0] = create_query[db].first + " " + create_query[db][1]
@@ -106,6 +124,11 @@ module CsvImportAnalyzer
       end
     end
+    ###
+    # sets the create query and import queries in options
+    # these fields will be added to the metadata later
+    # instantiates MetadataAnalysis and passes options hash
+    ###
     def print_metadata_analysis
       options[:create_query] = create_query
       options[:import_query] = import_query
@@ -115,11 +138,3 @@ module CsvImportAnalyzer
   end
 end
-#Testing
-# args = {}
-# args[:options] = {:delimiter => ",", :chunk => 20, :filename => "/home/avinash/Desktop/csv-import-analyzer/lib/csv-import-analyzer/sampleTab"}
-# args[:column_datatypes] = {:year_id=>:int, :make_id=>:string, :model_id=>:string, :description_id=>:string, :price_id=>:float}
-# args[:nullable] = [:description_id]
-# query = CsvImportAnalyzer::SqlQueryBuilder.new(args)
-# puts query.generate_query

data/lib/csv-import-analyzer/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 module CsvImportAnalyzer
   module Version
-    VERSION = "0.0.3"
+    VERSION = "0.0.4"
   end
 end

data/spec/csv-import-analyzer/analyzer/csv_check_bounds_spec.rb CHANGED Viewed

@@ -2,38 +2,38 @@
 describe CsvImportAnalyzer::CsvCheckBounds do
-  describe '#get_min_max_values' do
-    context 'when not initialized right' do
+  describe "#get_min_max_values" do
+    context "when not initialized right" do
       let(:options) {Hash[filename: "sample", chunk_size: 200, delimiter: ",", unique: 2]}
-      it 'will fail gracefully if filename is nil' do
+      it "will fail gracefully if filename is nil" do
         @csv_check_bounds = CsvImportAnalyzer::CsvCheckBounds.new
         expect(@csv_check_bounds.get_min_max_values).to be_instance_of(MissingRequiredArguments)
       end
-      it 'returns FileNotFound error if file is not found' do
+      it "returns FileNotFound error if file is not found" do
         @csv_check_bounds = CsvImportAnalyzer::CsvCheckBounds.new(options)
         expect(@csv_check_bounds.get_min_max_values).to be_instance_of(FileNotFound)
       end
     end
-    context 'when initialized right' do
+    context "when initialized right" do
       let(:options) {Hash[filename: $sample_csv_path, chunk_size: 200, delimiter: ",", unique: 2, csv_column_datatypes: {:year_id => :int, :make_id => :string, :model_id => :string, :description_id => :string, :price_id => :float}]}
       before(:each) do
         @csv_check_bounds = CsvImportAnalyzer::CsvCheckBounds.new(options)
       end
-      it 'returns a Hash' do
+      it "returns a Hash" do
         expect(@csv_check_bounds.get_min_max_values).to be_an_instance_of(Hash)
       end
-      it 'returns correct min & max values for integer type' do
+      it "returns correct min & max values for integer type" do
         result = @csv_check_bounds.get_min_max_values
         expect(result[:min_max][:year_id][:min]).to eq(1996)
         expect(result[:min_max][:year_id][:max]).to eq(1999)
       end
-      it 'returns correct min & max lengths for string type' do
+      it "returns correct min & max lengths for string type" do
         result = @csv_check_bounds.get_min_max_values
         expect(result[:min_max][:make_id][:min]).to eq(4)
         expect(result[:min_max][:make_id][:max]).to eq(7)

data/spec/csv-import-analyzer/analyzer/delimiter_identifier_spec.rb CHANGED Viewed

@@ -3,57 +3,57 @@
 class DummyClass
 end
-describe '#identify_delimiter' do
+describe "#identify_delimiter" do
   before(:each) do
     @dummy_class = DummyClass.new
     @dummy_class.extend(CsvImportAnalyzer::DelimiterIdentifier)
   end
-  context 'unable to determine the delimiter' do
+  context "unable to determine the delimiter" do
-    it 'return invalid input when the input is neither string nor array' do
+    it "return invalid input when the input is neither string nor array" do
       expect(@dummy_class.identify_delimiter(3)).to be_instance_of(InvalidInput)
     end
-    it 'returns file not found when the input string is not a valid file' do
+    it "returns file not found when the input string is not a valid file" do
       expect(@dummy_class.identify_delimiter("test")).to be_instance_of(FileNotFound)
     end
   end
-  context 'finds the delimiter when the input is a file' do
+  context "finds the delimiter when the input is a file" do
-    it 'returns a comma as the delimiter for sample_csv file' do
+    it "returns a comma as the delimiter for sample_csv file" do
       expect(@dummy_class.identify_delimiter($sample_csv_path)).to eq(",")
     end
-    it 'returns a semicolon as the the delimiter for sample_ssv file' do
+    it "returns a semicolon as the the delimiter for sample_ssv file" do
       expect(@dummy_class.identify_delimiter($sample_ssv_path)).to eq(";")
     end
   end
-  context 'finds the delimiter when the input is an array' do
+  context "finds the delimiter when the input is an array" do
     let(:sample) {['1999;Chevy;"Venture ""Extended Edition""";"";4900.00','1999;\'Chevy\';"Venture ""Extended Edition; Very Large""";;5000.00']}
-    it 'returns a semicolon as the delimiter for sample array input' do
+    it "returns a semicolon as the delimiter for sample array input" do
       expect(@dummy_class.identify_delimiter(sample)).to eq(";")
     end
   end
 end
-describe '#return_plausible_delimiter' do
+describe "#return_plausible_delimiter" do
   before(:each) do
     @dummy_class = DummyClass.new
     @dummy_class.extend(CsvImportAnalyzer::DelimiterIdentifier)
   end
-  context 'identifies delimiter' do
-    it 'returns comma as the delimiter by default' do
+  context "identifies delimiter" do
+    it "returns comma as the delimiter by default" do
       expect(@dummy_class.return_plausible_delimiter).to eq(",")
     end
-    it 'returns semicolon as the delimiter for sample delimiter_count' do
+    it "returns semicolon as the delimiter for sample delimiter_count" do
       @dummy_class.stub(:delimiter_count).and_return(Hash[","=>15, ";"=>16, "\t"=>0, "|"=>0])
       expect(@dummy_class.return_plausible_delimiter).to eq(";")
     end