csv-import-analyzer 0.0.3 → 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -0
  3. data/README.md +8 -1
  4. data/csv-import-analyzer.gemspec +1 -1
  5. data/lib/csv-import-analyzer.rb +6 -4
  6. data/lib/csv-import-analyzer/analyzer/csv_check_bounds.rb +30 -19
  7. data/lib/csv-import-analyzer/analyzer/delimiter_identifier.rb +44 -24
  8. data/lib/csv-import-analyzer/analyzer/file_type_assertion.rb +1 -5
  9. data/lib/csv-import-analyzer/csv_datatype_analysis.rb +25 -9
  10. data/lib/csv-import-analyzer/csv_sanitizer.rb +67 -17
  11. data/lib/csv-import-analyzer/export/metadata_analysis.rb +63 -7
  12. data/lib/csv-import-analyzer/helpers/common_functions.rb +4 -0
  13. data/lib/csv-import-analyzer/helpers/datatype_validation.rb +6 -6
  14. data/lib/csv-import-analyzer/helpers/string_class_extensions.rb +9 -3
  15. data/lib/csv-import-analyzer/query_builder/mysql_query_helper.rb +2 -2
  16. data/lib/csv-import-analyzer/query_builder/pg_query_helper.rb +1 -2
  17. data/lib/csv-import-analyzer/query_builder/query_helper.rb +2 -2
  18. data/lib/csv-import-analyzer/sql_query_builder.rb +27 -12
  19. data/lib/csv-import-analyzer/version.rb +1 -1
  20. data/spec/csv-import-analyzer/analyzer/csv_check_bounds_spec.rb +8 -8
  21. data/spec/csv-import-analyzer/analyzer/delimiter_identifier_spec.rb +13 -13
  22. data/spec/csv-import-analyzer/csv_sanitizer_spec.rb +10 -7
  23. data/spec/csv-import-analyzer/helpers/common_functions_spec.rb +20 -19
  24. data/spec/csv-import-analyzer/helpers/datatype_validation_spec.rb +28 -28
  25. data/spec/csv-import-analyzer/helpers/string_class_extension_spec.rb +6 -6
  26. data/spec/csv-import-analyzer/query_builder/mysql_query_helper_spec.rb +13 -13
  27. data/spec/csv-import-analyzer/query_builder/pg_query_helper_spec.rb +16 -16
  28. data/spec/csv-import-analyzer_spec.rb +3 -6
  29. data/spec/fixtures/sample.csv +2 -2
  30. data/spec/spec_helper.rb +3 -0
  31. metadata +17 -6
  32. data/lib/csv-import-analyzer/sampleTab.csv +0 -5
  33. data/samples/metadata_output.json +0 -70
  34. data/spec/csv-import-analyzer/csv_datatype_analysis_spec.rb +0 -1
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: 5342c73b7ed62a280eadd0bffbb914c9d782d815
- data.tar.gz: fdf0c897afaac56c4e558d5c7ee195914d7ffce7
+ metadata.gz: 7b62ce806e6c9ce5dbd5bb625cec61e9f62e861d
+ data.tar.gz: 097ec0f8a105b92a0adf7869f4e01d5330d62abc
  SHA512:
- metadata.gz: e683e160c6aeb7073027322837bb9af40a2fc898e739c6c549699b830ef53e8387ef10d507f5a099a7e0662aab872f5a896642002fab8298ee3d8b59cc2ae2b6
- data.tar.gz: dc2b67f259b8524e1bf155e8939511de5e2fb0e6099aa8800e876fa36d7b73c362ca2bbd0ef6adcff788264e5bf39f00d14ec1fc2ad969ce061b28039b04bcaf
+ metadata.gz: a0e7aff88f46560a7c263bee87f6840f2b918acd5003e0a3cd710a37910392d1452468d5fce3e8d0ee3aece44ab79f17c08da1aabab226ae70ef9bd58e9cf45c
+ data.tar.gz: 9778e7a00b0c972aaa45a6260a4a03b7ea699612c68740ab8acd10bbfc1369e4fc6f9e5c167a2b4936877c87c99a22df9965865d95cc760763541c6524f21a2f
data/.gitignore CHANGED
@@ -13,3 +13,4 @@
  *.a
  mkmf.log
  *.gem
+ /samples
data/README.md CHANGED
@@ -34,10 +34,17 @@ Calling process on a filename would generate a metadata_output.json which has th
 
  ## TODO:
  <ul>
+ <li> Handle control of processed input file to user </li>
  <li> Return the analysis as Json object.</li>
- <li> Structuring the analysis outputted to csv better </li>
+ <li> Better - Structuring the analysis outputted to csv</li>
+ <li> Add support to convert and import xlsx files to csv </li>
  </ul>
 
+ ## Additional Information
+
+ ### Dependencies
+ <ul><li><a href="https://github.com/tilo/smarter_csv">smarter_csv</a> - For processing the csv in chunks</li></ul>
+
  ## Contributing
 
  1. Fork it ( https://github.com/avinash-vllbh/csv-import-analyzer/fork )
data/csv-import-analyzer.gemspec CHANGED
@@ -24,5 +24,5 @@ Gem::Specification.new do |spec|
  spec.add_development_dependency "rspec", "~> 3.0"
  spec.add_development_dependency "simplecov", "~> 0.9"
 
- spec.add_runtime_dependency "smarter_csv", "~> 1.0.17"
+ spec.add_runtime_dependency "smarter_csv", "~> 1.0", ">= 1.0.17"
  end
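The loosened runtime dependency keeps 1.0.17 as the floor but now accepts any 1.x release of smarter_csv instead of only 1.0.x patch releases. A quick sketch of the difference using RubyGems' requirement API (the version being tested is illustrative):

    require "rubygems"

    old_req = Gem::Requirement.new("~> 1.0.17")            # >= 1.0.17, < 1.1
    new_req = Gem::Requirement.new("~> 1.0", ">= 1.0.17")  # >= 1.0.17, < 2.0

    puts old_req.satisfied_by?(Gem::Version.new("1.1.4"))  # => false
    puts new_req.satisfied_by?(Gem::Version.new("1.1.4"))  # => true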
data/lib/csv-import-analyzer.rb CHANGED
@@ -1,10 +1,15 @@
- require 'pry'
  require_relative "csv-import-analyzer/csv_sanitizer"
  require_relative "csv-import-analyzer/helpers/errors"
  module CsvImportAnalyzer
  # To identify the methods in the module as class methods
  extend self
 
+ ###
+ # main public interface to the library
+ # makes sure that the file exists and
+ # passes the file and any additional options given to CsvSanitizer
+ # returns FileNotFound if given file is invalid
+ ###
  def process(filename, options = {})
  if File::exist?(filename)
  CsvImportAnalyzer::CsvSanitizer.new().process(File.absolute_path(filename), options)
@@ -13,6 +18,3 @@ module CsvImportAnalyzer
  end
  end
  end
-
- CsvImportAnalyzer.process("sampleTab.csv", {:metadata_output => true, :unique => 2})
- # CsvImportAnalyzer.process("sampleTab.csv", {:metadata_output => true, :out_format => :csv})
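With the hard-coded sample call removed from the library file, the entry point is only invoked by callers now. A minimal usage sketch (the file path and option values are illustrative; the full option list lives in the defaults hash of csv_sanitizer.rb below):

    require "csv-import-analyzer"

    # Analyze a CSV file; :unique sets the distinct-value threshold and
    # :metadata_output asks for the analysis to be written to a file.
    CsvImportAnalyzer.process("data/customers.csv", :metadata_output => true, :unique => 5)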
data/lib/csv-import-analyzer/analyzer/csv_check_bounds.rb CHANGED
@@ -1,5 +1,4 @@
  require 'smarter_csv'
- require 'pry'
  require_relative "../helpers/common_functions"
  require_relative "../helpers/errors"
 
@@ -15,7 +14,6 @@ module CsvImportAnalyzer
  @min_max_bounds = {}
  @distinct_values = {}
  @nullable = options[:nullable] || []
-
  end
 
  def filename
@@ -31,10 +29,14 @@ module CsvImportAnalyzer
  @max_distinct_values ||= Integer(options[:unique]) + 1
  end
 
- # Public interface that is called - Processes the CSV file for min & max values for each column
+ # Public interface for CsvCheckBounds
+ # Processes the CSV file for min & max values and distinct values for each column
  def get_min_max_values
  unless filename.nil?
  if File.exist?(filename)
+ # Using SmarterCSV gem to retrieve the csv records in chunks
+ # Chunk size can be set by the user
+ # E.g. :chunk_size => 200 would retrieve 200 rows each time
  SmarterCSV.process(filename, {:col_sep => delimiter, :chunk_size => chunk_size,
  :remove_empty_values => false, :remove_zero_values => false}) do |chunk|
  chunk.each do |row|
@@ -59,9 +61,10 @@
 
  private
 
- ##
- #If the key is of String type then we find the max length of it
- ##
+ ###
+ # If the key is of String type then we find the max length of it
+ # Any other datatype would have a min and max ranges
+ ###
  def process_min_max_for_column(key, value)
  if min_max_bounds[key].nil?
  unless csv_column_datatypes[key] == :string
@@ -73,22 +76,30 @@
  add_bounds(key, value)
  end
 
- ##
- #Method which decides on the min max values for each key and according to the passsed in value
- ##
+ ###
+ # Method to decide on the min max values for each key
+ # Checks for length if key is of String format
+ # Check for values if key is of Numeric or Datetime format
+ ###
  def add_bounds(key, value)
- if csv_column_datatypes[key] == :string
- min_max_bounds[key][:min] = value.length if value.length < min_max_bounds[key][:min]
- min_max_bounds[key][:max] = value.length if value.length > min_max_bounds[key][:max]
- else
- min_max_bounds[key][:min] = value if value < min_max_bounds[key][:min]
- min_max_bounds[key][:max] = value if value > min_max_bounds[key][:max]
+ begin
+ if csv_column_datatypes[key] == :string
+ min_max_bounds[key][:min] = value.length if value.length < min_max_bounds[key][:min]
+ min_max_bounds[key][:max] = value.length if value.length > min_max_bounds[key][:max]
+ else
+ min_max_bounds[key][:min] = value if value < min_max_bounds[key][:min]
+ min_max_bounds[key][:max] = value if value > min_max_bounds[key][:max]
+ end
+ rescue ArgumentError, NoMethodError => e
+ ###
+ # TODO: Handle csv parse coversions of datatypes
+ ###
  end
  end
 
- ##
- #Processes the max number of distinct values set for each column
- ##
+ ###
+ # Processes the max number of distinct values set for each column
+ ###
  def process_distinct_values(key, value)
  if distinct_values[key].nil?
  distinct_values[key] = [value]
@@ -101,4 +112,4 @@
  end
 
  end
- end
+ end
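The new comments document the chunked read that get_min_max_values performs. For reference, a standalone sketch of the same SmarterCSV pattern, assuming a comma-delimited file at an illustrative path:

    require "smarter_csv"

    SmarterCSV.process("data/customers.csv",
                       :col_sep             => ",",
                       :chunk_size          => 200,
                       :remove_empty_values => false,
                       :remove_zero_values  => false) do |chunk|
      # Each chunk is an array of up to 200 rows; each row is a Hash keyed by
      # symbolized column headers, so memory use stays bounded on large files.
      chunk.each do |row|
        row.each { |column, value| puts "#{column}: #{value}" }
      end
    end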
data/lib/csv-import-analyzer/analyzer/delimiter_identifier.rb CHANGED
@@ -1,39 +1,31 @@
  require_relative "../helpers/string_class_extensions"
  require 'pry'
-
  module CsvImportAnalyzer
  module DelimiterIdentifier
 
- # attr_accessor :delimiter, :delimiter_count
-
+ ###
+ # Types of delimiters that the gem has to lookout for.
+ # Could be changed in future or to custom delimiters
+ # returns a @delimiter instance variable array
+ ###
  def delimiter
  @delimiter ||= [",", ";", "\t", "|"]
  end
 
+ ###
+ # Routine to intialize the delimiter_count hash with the delimiters defined above with a base count of 0
+ # Returns @delimiter_count instance variable
+ ###
  def delimiter_count
  @delimiter_count ||= Hash[delimiter.map {|v| [v,0]}]
  @delimiter_count
  end
 
- def getting_contents_of_quoted_values(input)
- #return a join of all the strings inside quotes inside a line
- input.scan(/".*?"/).join
- end
-
- def count_occurances_delimiter(line)
- delimiter_count.keys.each do |key|
- #Count the occurances of delimiter in a line
- total_count_delimiter = line.substr_count(key)
- #count the occurances of delimiter between quotes inside a line to disregard them
- quoted_delimiter_count = getting_contents_of_quoted_values(line).substr_count(key)
- delimiter_count[key] += total_count_delimiter - quoted_delimiter_count
- end
- end
-
- def return_plausible_delimiter
- return delimiter_count.key(delimiter_count.values.max)
- end
-
+ ###
+ # Method to analyze input data and determine delimiter
+ # Input can be either a csv file or even a array of strings
+ # returns delimiter
+ ###
  def identify_delimiter(filename_or_sample)
  #filename_or_sample input can be either a File or an Array or a string - Return delimiter for File or an Array of strings (if found)
  if filename_or_sample.class == String
@@ -60,7 +52,35 @@ module CsvImportAnalyzer
  InvalidInput.new
  end
  end
+
+ private
+
+ def getting_contents_of_quoted_values(input)
+ #return a join of all the strings inside quotes inside a line
+ input.scan(/".*?"/).join
+ end
+
+ ###
+ # Find the count of delimiter occurances in a line
+ # CSV files can have delimiters escaped between quotes
+ # valid count = total_count - delimiters inside quotes
+ ###
+ def count_occurances_delimiter(line)
+ delimiter_count.keys.each do |key|
+ #Count the occurances of delimiter in a line
+ total_count_delimiter = line.substr_count(key)
+ #count the occurances of delimiter between quotes inside a line to disregard them
+ quoted_delimiter_count = getting_contents_of_quoted_values(line).substr_count(key)
+ delimiter_count[key] += total_count_delimiter - quoted_delimiter_count
+ end
+ end
+
+ ###
+ # Plausible delimiter would be the one i.e. of most occurance of the set of rows
+ ###
+ def return_plausible_delimiter
+ return delimiter_count.key(delimiter_count.values.max)
+ end
+
  end
  end
-
- # puts CsvImportAnalyzer::DelimiterIdentifier.identify_delimiter("/home/avinash/Desktop/csv-import-analyzer/spec/fixtures/sample.csv")
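The counting rule in the new comments ("valid count = total_count - delimiters inside quotes") can be shown with plain String methods instead of the gem's substr_count extension; the sample line is made up:

    line = '101,"Doe, Jane",NY'

    total  = line.count(",")                     # 3 commas in the raw line
    quoted = line.scan(/".*?"/).join.count(",")  # 1 comma sits inside quotes
    valid  = total - quoted                      # 2 commas actually separate fields

    puts valid # => 2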
data/lib/csv-import-analyzer/analyzer/file_type_assertion.rb CHANGED
@@ -16,11 +16,7 @@ module CsvImportAnalyzer
  end
  end
 
- def self.convert_excel_to_csv
-
- end
-
- def csv_clean
+ def convert_excel_to_csv
 
  end
 
data/lib/csv-import-analyzer/csv_datatype_analysis.rb CHANGED
@@ -1,10 +1,10 @@
  require "smarter_csv"
  require "tempfile"
- require "pry"
  require_relative "helpers/datatype_validation"
  require_relative "analyzer/csv_check_bounds"
  require_relative "helpers/common_functions"
  require_relative "sql_query_builder"
+ require "pry"
 
  module CsvImportAnalyzer
  class CsvDatatypeAnalysis
@@ -27,8 +27,12 @@ module CsvImportAnalyzer
  @options[:filename]
  end
 
-
+ ###
  # Process a chunk of csv file for all possible datatypes towards each column in the row
+ # This datatype analysis is used for analyzing,
+ # Min - Max values of each column
+ # Distinct values of each column
+ # Enumeration eligibility
  def datatype_analysis
  SmarterCSV.process(filename, {:col_sep => delimiter, :chunk_size => chunk_size,
  :remove_empty_values => false, :remove_zero_values => false}) do |chunk|
@@ -61,14 +65,18 @@
  return options[:chunk]
  end
 
- #Call DatatypeValidator in helper module to process the possible datatype for the value
- #Is this the right way to hide dependency on the external classes or objects
- #May be a static would do ? Should I create an object and call method on the object each time rather than instantiate a new object each time ??
+ ###
+ # Call DatatypeValidator in helper module to process the possible datatype for the value
+ # Is this the right way to hide dependency on the external classes or objects
+ # May be a static would do ? Should I create an object and call method on the object each time rather than instantiate a new object each time ??
+ ###
  def determine_dataype(value)
  return validate_field(value)
  end
 
+ ###
  # Build the hash of hashes which hold the count of different possible datatypes for each row
+ ###
  def add_to_datatype(key, datatype)
  if csv_column_datatypes[key].nil?
  csv_column_datatypes[key] = {datatype => 1}
@@ -81,8 +89,11 @@
  end
  end
 
- #Finalize the datatype for each column, A column datatype would be set to varchar or string if atleast of it's values tend to be string
- #If the column doesn't have any possible strings then assign the datatype to column with maximum count of identified possibilites
+ ###
+ # Finalize the datatype for each column.
+ # A column datatype would be set to varchar or string if even one of it's values tend to be string
+ # If the column doesn't have any possible strings then assign the datatype to column with maximum count of identified possibilites
+ ###
  def finalize_datatypes_for_csv
  csv_column_datatypes.map { |column_name, possible_datatypes|
  #If there is string type even atleast 1 there is no other option but to set the datatype to string => varchar
@@ -95,7 +106,12 @@
  }
  end
 
- #Decide if simple datatype analysis is enough or proced further
+ ###
+ # Decide if simple datatype analysis is enough or proced further
+ # Proceed further would be to
+ # Identify min and max bounds for each column
+ # Identify if the number distinct values are less than set threshold
+ ###
  def take_further_actions
  if options[:check_bounds]
  min_max_bounds = CsvImportAnalyzer::CsvCheckBounds.new(options)
@@ -107,4 +123,4 @@
  query.generate_query
  end
  end
- end
+ end
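The comments on add_to_datatype and finalize_datatypes_for_csv describe a tally of candidate datatypes per column, resolved to string if any value looked like a string and otherwise to the most frequent candidate. A self-contained sketch of that resolution step, using a hypothetical tally hash rather than the class's instance state:

    csv_column_datatypes = {
      :zipcode => { :int => 198, :string => 2 },   # a couple of malformed rows
      :salary  => { :int => 150, :float => 50 }
    }

    final = csv_column_datatypes.map do |column_name, possible_datatypes|
      if possible_datatypes.key?(:string)
        [column_name, :string]                     # any string forces string/varchar
      else
        [column_name, possible_datatypes.max_by { |_, count| count }.first]
      end
    end.to_h

    puts final.inspect  # => {:zipcode=>:string, :salary=>:int}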
data/lib/csv-import-analyzer/csv_sanitizer.rb CHANGED
@@ -1,5 +1,6 @@
  require "smarter_csv"
  require "tempfile"
+ require "pry"
  require_relative "analyzer/delimiter_identifier"
  require_relative "helpers/string_class_extensions"
  require_relative "helpers/common_functions"
@@ -11,27 +12,46 @@ module CsvImportAnalyzer
  include CsvImportAnalyzer::Helper
  include CsvImportAnalyzer::DelimiterIdentifier
 
+ ###
+ # Public interface for the entire library
+ # What id does?
+ # Sets "options" varaible by merging default values and passed values
+ # Finds the delimiter by analyzing a sample
+ # Sanitizes or preprocesses the csv file by creating a temporary processed file
+ # Replacing null and empty values with NULL
+ # Replace single quotes with double quotes if needed
+ # Handle CSVMalformedError by logging the error to error report
+ # Passes the options to DatatypeAnalysis
+ ###
  def process(filename, options)
-
  options = defaults.merge(options)
  if File.exist?(filename)
- options[:filename] = filename
- #first thing to do - find the delimiter of the file.
  delimiter = identify_delimiter(filename)
  options[:delimiter] = delimiter
+ # create tempfiles to update any changes being made
+ temp_file, processed_file = create_tempfiles(filename, options)
+ options[:temp_file] = temp_file.path
+ line_count = 1
  File.foreach(filename) do |line|
- #Check if the line is empty - no point in processing empty lines
- if line.length > 1
+ if line.length > 1 #Check if the line is empty - no point in processing empty lines
  line = replace_line_single_quotes(line,delimiter)
  begin
  line = CSV.parse_line(line, {:col_sep => delimiter})
- rescue CSV::MalformedCSVError => error
- line = "#{line}\""
+ rescue CSV::MalformedCSVError
+ # MalformedCSVError is due to illegal quoting or unclosed quotes
+ # Try to add a quote at the end and resume processing
+ # Log the changes to report
+ temp_file.write("MalformedCSVError at line #{line_count}")
+ line = line.insert(-2, "\"")
  line = CSV.parse_line(line, {:col_sep => delimiter})
  end
  line = replace_null_values(line)
+ processed_file.write(line.to_csv({:col_sep => delimiter, :converters => :numeric}))
  end
+ line_count += 1
  end
+ temp_file.close
+ processed_file.close
  # Cleaned the file - Now analyze for datatypes
  CsvImportAnalyzer::CsvDatatypeAnalysis.new(options).datatype_analysis
  else
@@ -41,21 +61,30 @@
 
  private
 
+ ###
+ # Hash of default values that would be merged with user passed in values
+ # returns [Hash] defaults
+ ###
  def defaults
  {
- :metadata_output => nil,
- :processed_input => nil,
- :unique => 10,
- :check_bounds => true,
- :datatype_analysis => 200,
- :chunk => 20,
- :database => [:pg, :mysql],
- :quote_convert => true,
- :replace_nulls => true,
- :out_format => :json
+ :metadata_output => nil, # To be set if metadata needs to be printed to a file
+ :processed_input => nil, # To be set if processed input is needed
+ :unique => 10, # Threshold for number of defaults values that needs to identified
+ :check_bounds => true, # Option to check for min - max bounds for each column [true => find the bounds]
+ :datatype_analysis => 200, # Number of rows to be sampled for datatype analysis
+ :chunk => 200, # Chunk size (no of rows) that needs to processed in-memory [Important not to load entire file into memory]
+ :database => [:pg, :mysql], # Databases for which schema needs to be generated
+ :quote_convert => true, # Convert any single quotes to double quotes
+ :replace_nulls => true, # Replace nulls, empty's, nils, Null's with NULL
+ :out_format => :json # Set what type of output do you need as analysis
  }
  end
 
+ ###
+ # Replaces single quotes with doubles in each line
+ # Escapes the double quotes if it's between two single quotes before
+ # returns [String] result
+ ###
  def replace_line_single_quotes(line, delimiter)
  delimiter = "\\|" if delimiter == "|"
  pattern = "#{delimiter}'.*?'#{delimiter}" # set the pattern to opening and closing single quote found between delimiters
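The defaults above are merged with whatever the caller passes, and the caller's values win. A small sketch of that merge with a trimmed-down defaults hash and illustrative caller options:

    defaults = { :unique => 10, :chunk => 200, :out_format => :json }
    options  = defaults.merge(:unique => 2, :metadata_output => true)

    puts options.inspect
    # => {:unique=>2, :chunk=>200, :out_format=>:json, :metadata_output=>true}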
@@ -82,5 +111,26 @@
  end
  return line
  end
+
+ ###
+ # Uses ruby tempfile to create temp files for
+ # 1. Store processed file
+ # 2. Error reporting
+ # Returns the file handler for a temp file.
+ # This tempfile holds any modifications being done to the file.
+ ###
+ def create_tempfiles(filename, options)
+ options[:original_filename] = filename
+ filename = File.basename(filename)
+ processed_filename = File.join(Dir.tmpdir, "processed_"+filename)
+ options[:filename] = processed_filename
+ # filename += Time.now.strftime("%Y%m%d%H%M%S")
+ # temp_file = Tempfile.new(filename)
+ # temp_file = File.open(File.join(Dir.tmpdir, filename), "w+")
+ temp_file = File.join(Dir.tmpdir, "error_report_"+filename)
+ temp_file = File.open(temp_file, "w+")
+ processed_file = File.open(processed_filename, "w+")
+ return temp_file, processed_file
+ end
  end
  end
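The rescue branch added to process appends a closing double quote and re-parses when a line fails with illegal or unclosed quoting. A standalone sketch of that repair, with a made-up sample line:

    require "csv"

    line = "3,\"O'Brien,42\n"              # unclosed double quote fails to parse

    begin
      fields = CSV.parse_line(line, :col_sep => ",")
    rescue CSV::MalformedCSVError
      line = line.insert(-2, "\"")         # slip a closing quote in before the newline
      fields = CSV.parse_line(line, :col_sep => ",")
    end

    puts fields.inspect                    # => ["3", "O'Brien,42"]
    puts fields.to_csv(:col_sep => ",")    # write the repaired row back out as CSV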