csv-import-analyzer 0.0.3 → 0.0.4

Files changed (34)
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -0
  3. data/README.md +8 -1
  4. data/csv-import-analyzer.gemspec +1 -1
  5. data/lib/csv-import-analyzer.rb +6 -4
  6. data/lib/csv-import-analyzer/analyzer/csv_check_bounds.rb +30 -19
  7. data/lib/csv-import-analyzer/analyzer/delimiter_identifier.rb +44 -24
  8. data/lib/csv-import-analyzer/analyzer/file_type_assertion.rb +1 -5
  9. data/lib/csv-import-analyzer/csv_datatype_analysis.rb +25 -9
  10. data/lib/csv-import-analyzer/csv_sanitizer.rb +67 -17
  11. data/lib/csv-import-analyzer/export/metadata_analysis.rb +63 -7
  12. data/lib/csv-import-analyzer/helpers/common_functions.rb +4 -0
  13. data/lib/csv-import-analyzer/helpers/datatype_validation.rb +6 -6
  14. data/lib/csv-import-analyzer/helpers/string_class_extensions.rb +9 -3
  15. data/lib/csv-import-analyzer/query_builder/mysql_query_helper.rb +2 -2
  16. data/lib/csv-import-analyzer/query_builder/pg_query_helper.rb +1 -2
  17. data/lib/csv-import-analyzer/query_builder/query_helper.rb +2 -2
  18. data/lib/csv-import-analyzer/sql_query_builder.rb +27 -12
  19. data/lib/csv-import-analyzer/version.rb +1 -1
  20. data/spec/csv-import-analyzer/analyzer/csv_check_bounds_spec.rb +8 -8
  21. data/spec/csv-import-analyzer/analyzer/delimiter_identifier_spec.rb +13 -13
  22. data/spec/csv-import-analyzer/csv_sanitizer_spec.rb +10 -7
  23. data/spec/csv-import-analyzer/helpers/common_functions_spec.rb +20 -19
  24. data/spec/csv-import-analyzer/helpers/datatype_validation_spec.rb +28 -28
  25. data/spec/csv-import-analyzer/helpers/string_class_extension_spec.rb +6 -6
  26. data/spec/csv-import-analyzer/query_builder/mysql_query_helper_spec.rb +13 -13
  27. data/spec/csv-import-analyzer/query_builder/pg_query_helper_spec.rb +16 -16
  28. data/spec/csv-import-analyzer_spec.rb +3 -6
  29. data/spec/fixtures/sample.csv +2 -2
  30. data/spec/spec_helper.rb +3 -0
  31. metadata +17 -6
  32. data/lib/csv-import-analyzer/sampleTab.csv +0 -5
  33. data/samples/metadata_output.json +0 -70
  34. data/spec/csv-import-analyzer/csv_datatype_analysis_spec.rb +0 -1
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: 5342c73b7ed62a280eadd0bffbb914c9d782d815
- data.tar.gz: fdf0c897afaac56c4e558d5c7ee195914d7ffce7
+ metadata.gz: 7b62ce806e6c9ce5dbd5bb625cec61e9f62e861d
+ data.tar.gz: 097ec0f8a105b92a0adf7869f4e01d5330d62abc
  SHA512:
- metadata.gz: e683e160c6aeb7073027322837bb9af40a2fc898e739c6c549699b830ef53e8387ef10d507f5a099a7e0662aab872f5a896642002fab8298ee3d8b59cc2ae2b6
- data.tar.gz: dc2b67f259b8524e1bf155e8939511de5e2fb0e6099aa8800e876fa36d7b73c362ca2bbd0ef6adcff788264e5bf39f00d14ec1fc2ad969ce061b28039b04bcaf
+ metadata.gz: a0e7aff88f46560a7c263bee87f6840f2b918acd5003e0a3cd710a37910392d1452468d5fce3e8d0ee3aece44ab79f17c08da1aabab226ae70ef9bd58e9cf45c
+ data.tar.gz: 9778e7a00b0c972aaa45a6260a4a03b7ea699612c68740ab8acd10bbfc1369e4fc6f9e5c167a2b4936877c87c99a22df9965865d95cc760763541c6524f21a2f
data/.gitignore CHANGED
@@ -13,3 +13,4 @@
  *.a
  mkmf.log
  *.gem
+ /samples
data/README.md CHANGED
@@ -34,10 +34,17 @@ Calling process on a filename would generate a metadata_output.json which has th
 
  ## TODO:
  <ul>
+ <li> Handle control of processed input file to user </li>
  <li> Return the analysis as Json object.</li>
- <li> Structuring the analysis outputted to csv better </li>
+ <li> Better - Structuring the analysis outputted to csv</li>
+ <li> Add support to convert and import xlsx files to csv </li>
  </ul>
 
+ ## Additional Information
+
+ ### Dependencies
+ <ul><li><a href="https://github.com/tilo/smarter_csv">smarter_csv</a> - For processing the csv in chunks</li></ul>
+
  ## Contributing
 
  1. Fork it ( https://github.com/avinash-vllbh/csv-import-analyzer/fork )
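The smarter_csv dependency called out in the new README section is what lets the gem stream a CSV in fixed-size chunks instead of loading the whole file. A minimal illustrative sketch of that pattern (the file path and chunk size here are made up for the example, not taken from the gem):

```ruby
require "smarter_csv"

# Stream the file 100 rows at a time; each chunk is an array of row hashes
# keyed by the CSV header, so memory use stays bounded for large files.
SmarterCSV.process("data/orders.csv", chunk_size: 100) do |chunk|
  chunk.each { |row| puts row.inspect }
end
```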
data/csv-import-analyzer.gemspec CHANGED
@@ -24,5 +24,5 @@ Gem::Specification.new do |spec|
  spec.add_development_dependency "rspec", "~> 3.0"
  spec.add_development_dependency "simplecov", "~> 0.9"
 
- spec.add_runtime_dependency "smarter_csv", "~> 1.0.17"
+ spec.add_runtime_dependency "smarter_csv", "~> 1.0", ">= 1.0.17"
  end
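The loosened runtime constraint changes which smarter_csv releases the gem will accept. A brief note on the standard RubyGems pessimistic-constraint semantics, shown with Gemfile syntax purely for illustration:

```ruby
# "~> 1.0.17"            => only 1.0.x releases, and at least 1.0.17
# "~> 1.0", ">= 1.0.17"  => any 1.x release, as long as it is 1.0.17 or newer
gem "smarter_csv", "~> 1.0", ">= 1.0.17"
```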
data/lib/csv-import-analyzer.rb CHANGED
@@ -1,10 +1,15 @@
- require 'pry'
  require_relative "csv-import-analyzer/csv_sanitizer"
  require_relative "csv-import-analyzer/helpers/errors"
  module CsvImportAnalyzer
  # To identify the methods in the module as class methods
  extend self
 
+ ###
+ # main public interface to the library
+ # makes sure that the file exists and
+ # passes the file and any additional options given to CsvSanitizer
+ # returns FileNotFound if given file is invalid
+ ###
  def process(filename, options = {})
  if File::exist?(filename)
  CsvImportAnalyzer::CsvSanitizer.new().process(File.absolute_path(filename), options)
@@ -13,6 +18,3 @@ module CsvImportAnalyzer
  end
  end
  end
-
- CsvImportAnalyzer.process("sampleTab.csv", {:metadata_output => true, :unique => 2})
- # CsvImportAnalyzer.process("sampleTab.csv", {:metadata_output => true, :out_format => :csv})
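For reference, a minimal usage sketch of the public interface documented in the new comment block, mirroring the example calls that were removed from the library file (the filename and option values are illustrative):

```ruby
require "csv-import-analyzer"

# Analyze a CSV, write the metadata analysis to a file, and treat columns
# with at most 2 distinct values as enumeration candidates.
CsvImportAnalyzer.process("sample.csv", :metadata_output => true, :unique => 2)

# Per the comment above, a path that does not exist returns a FileNotFound
# error object rather than raising.
CsvImportAnalyzer.process("does_not_exist.csv")
```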
data/lib/csv-import-analyzer/analyzer/csv_check_bounds.rb CHANGED
@@ -1,5 +1,4 @@
  require 'smarter_csv'
- require 'pry'
  require_relative "../helpers/common_functions"
  require_relative "../helpers/errors"
 
@@ -15,7 +14,6 @@ module CsvImportAnalyzer
  @min_max_bounds = {}
  @distinct_values = {}
  @nullable = options[:nullable] || []
-
  end
 
  def filename
@@ -31,10 +29,14 @@
  @max_distinct_values ||= Integer(options[:unique]) + 1
  end
 
- # Public interface that is called - Processes the CSV file for min & max values for each column
+ # Public interface for CsvCheckBounds
+ # Processes the CSV file for min & max values and distinct values for each column
  def get_min_max_values
  unless filename.nil?
  if File.exist?(filename)
+ # Using SmarterCSV gem to retrieve the csv records in chunks
+ # Chunk size can be set by the user
+ # E.g. :chunk_size => 200 would retrieve 200 rows each time
  SmarterCSV.process(filename, {:col_sep => delimiter, :chunk_size => chunk_size,
  :remove_empty_values => false, :remove_zero_values => false}) do |chunk|
  chunk.each do |row|
@@ -59,9 +61,10 @@
 
  private
 
- ##
- #If the key is of String type then we find the max length of it
- ##
+ ###
+ # If the key is of String type then we find the max length of it
+ # Any other datatype would have a min and max ranges
+ ###
  def process_min_max_for_column(key, value)
  if min_max_bounds[key].nil?
  unless csv_column_datatypes[key] == :string
@@ -73,22 +76,30 @@
  add_bounds(key, value)
  end
 
- ##
- #Method which decides on the min max values for each key and according to the passsed in value
- ##
+ ###
+ # Method to decide on the min max values for each key
+ # Checks for length if key is of String format
+ # Check for values if key is of Numeric or Datetime format
+ ###
  def add_bounds(key, value)
- if csv_column_datatypes[key] == :string
- min_max_bounds[key][:min] = value.length if value.length < min_max_bounds[key][:min]
- min_max_bounds[key][:max] = value.length if value.length > min_max_bounds[key][:max]
- else
- min_max_bounds[key][:min] = value if value < min_max_bounds[key][:min]
- min_max_bounds[key][:max] = value if value > min_max_bounds[key][:max]
+ begin
+ if csv_column_datatypes[key] == :string
+ min_max_bounds[key][:min] = value.length if value.length < min_max_bounds[key][:min]
+ min_max_bounds[key][:max] = value.length if value.length > min_max_bounds[key][:max]
+ else
+ min_max_bounds[key][:min] = value if value < min_max_bounds[key][:min]
+ min_max_bounds[key][:max] = value if value > min_max_bounds[key][:max]
+ end
+ rescue ArgumentError, NoMethodError => e
+ ###
+ # TODO: Handle csv parse coversions of datatypes
+ ###
  end
  end
 
- ##
- #Processes the max number of distinct values set for each column
- ##
+ ###
+ # Processes the max number of distinct values set for each column
+ ###
  def process_distinct_values(key, value)
  if distinct_values[key].nil?
  distinct_values[key] = [value]
@@ -101,4 +112,4 @@
  end
 
  end
- end
+ end
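To make the bounds logic above concrete, here is a small self-contained sketch (hypothetical column data, not the gem's own API) of the tracking the comments describe: string columns are bounded by value length, other columns by the values themselves.

```ruby
csv_column_datatypes = { :name => :string, :amount => :float }
min_max_bounds = {}

update_bounds = lambda do |key, value|
  # Strings are measured by length; numeric values are compared directly.
  measure = csv_column_datatypes[key] == :string ? value.length : value
  min_max_bounds[key] ||= { :min => measure, :max => measure }
  min_max_bounds[key][:min] = measure if measure < min_max_bounds[key][:min]
  min_max_bounds[key][:max] = measure if measure > min_max_bounds[key][:max]
end

[["alice", 12.5], ["bob", 3.0], ["charlotte", 99.9]].each do |name, amount|
  update_bounds.call(:name, name)
  update_bounds.call(:amount, amount)
end

p min_max_bounds
# => {:name=>{:min=>3, :max=>9}, :amount=>{:min=>3.0, :max=>99.9}}
```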
data/lib/csv-import-analyzer/analyzer/delimiter_identifier.rb CHANGED
@@ -1,39 +1,31 @@
  require_relative "../helpers/string_class_extensions"
  require 'pry'
-
  module CsvImportAnalyzer
  module DelimiterIdentifier
 
- # attr_accessor :delimiter, :delimiter_count
-
+ ###
+ # Types of delimiters that the gem has to lookout for.
+ # Could be changed in future or to custom delimiters
+ # returns a @delimiter instance variable array
+ ###
  def delimiter
  @delimiter ||= [",", ";", "\t", "|"]
  end
 
+ ###
+ # Routine to intialize the delimiter_count hash with the delimiters defined above with a base count of 0
+ # Returns @delimiter_count instance variable
+ ###
  def delimiter_count
  @delimiter_count ||= Hash[delimiter.map {|v| [v,0]}]
  @delimiter_count
  end
 
- def getting_contents_of_quoted_values(input)
- #return a join of all the strings inside quotes inside a line
- input.scan(/".*?"/).join
- end
-
- def count_occurances_delimiter(line)
- delimiter_count.keys.each do |key|
- #Count the occurances of delimiter in a line
- total_count_delimiter = line.substr_count(key)
- #count the occurances of delimiter between quotes inside a line to disregard them
- quoted_delimiter_count = getting_contents_of_quoted_values(line).substr_count(key)
- delimiter_count[key] += total_count_delimiter - quoted_delimiter_count
- end
- end
-
- def return_plausible_delimiter
- return delimiter_count.key(delimiter_count.values.max)
- end
-
+ ###
+ # Method to analyze input data and determine delimiter
+ # Input can be either a csv file or even a array of strings
+ # returns delimiter
+ ###
  def identify_delimiter(filename_or_sample)
  #filename_or_sample input can be either a File or an Array or a string - Return delimiter for File or an Array of strings (if found)
  if filename_or_sample.class == String
@@ -60,7 +52,35 @@ module CsvImportAnalyzer
  InvalidInput.new
  end
  end
+
+ private
+
+ def getting_contents_of_quoted_values(input)
+ #return a join of all the strings inside quotes inside a line
+ input.scan(/".*?"/).join
+ end
+
+ ###
+ # Find the count of delimiter occurances in a line
+ # CSV files can have delimiters escaped between quotes
+ # valid count = total_count - delimiters inside quotes
+ ###
+ def count_occurances_delimiter(line)
+ delimiter_count.keys.each do |key|
+ #Count the occurances of delimiter in a line
+ total_count_delimiter = line.substr_count(key)
+ #count the occurances of delimiter between quotes inside a line to disregard them
+ quoted_delimiter_count = getting_contents_of_quoted_values(line).substr_count(key)
+ delimiter_count[key] += total_count_delimiter - quoted_delimiter_count
+ end
+ end
+
+ ###
+ # Plausible delimiter would be the one i.e. of most occurance of the set of rows
+ ###
+ def return_plausible_delimiter
+ return delimiter_count.key(delimiter_count.values.max)
+ end
+
  end
  end
-
- # puts CsvImportAnalyzer::DelimiterIdentifier.identify_delimiter("/home/avinash/Desktop/csv-import-analyzer/spec/fixtures/sample.csv")
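The delimiter heuristic above counts each candidate separator per line, ignores any occurrences sitting inside double quotes, and picks the candidate with the highest remaining count. A standalone sketch of that idea, using plain String#count rather than the gem's own substr_count extension, with a made-up sample line:

```ruby
line       = 'id,"Doe, Jane",42;note'
candidates = [",", ";", "\t", "|"]

quoted = line.scan(/".*?"/).join        # text inside double quotes => "\"Doe, Jane\""
counts = candidates.to_h do |d|
  [d, line.count(d) - quoted.count(d)]  # count only delimiters outside quotes
end
# counts => {","=>2, ";"=>1, "\t"=>0, "|"=>0}

plausible_delimiter = counts.key(counts.values.max)  # => ","
```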
data/lib/csv-import-analyzer/analyzer/file_type_assertion.rb CHANGED
@@ -16,11 +16,7 @@ module CsvImportAnalyzer
  end
  end
 
- def self.convert_excel_to_csv
-
- end
-
- def csv_clean
+ def convert_excel_to_csv
 
  end
 
data/lib/csv-import-analyzer/csv_datatype_analysis.rb CHANGED
@@ -1,10 +1,10 @@
  require "smarter_csv"
  require "tempfile"
- require "pry"
  require_relative "helpers/datatype_validation"
  require_relative "analyzer/csv_check_bounds"
  require_relative "helpers/common_functions"
  require_relative "sql_query_builder"
+ require "pry"
 
  module CsvImportAnalyzer
  class CsvDatatypeAnalysis
@@ -27,8 +27,12 @@ module CsvImportAnalyzer
  @options[:filename]
  end
 
-
+ ###
  # Process a chunk of csv file for all possible datatypes towards each column in the row
+ # This datatype analysis is used for analyzing,
+ # Min - Max values of each column
+ # Distinct values of each column
+ # Enumeration eligibility
  def datatype_analysis
  SmarterCSV.process(filename, {:col_sep => delimiter, :chunk_size => chunk_size,
  :remove_empty_values => false, :remove_zero_values => false}) do |chunk|
@@ -61,14 +65,18 @@ module CsvImportAnalyzer
  return options[:chunk]
  end
 
- #Call DatatypeValidator in helper module to process the possible datatype for the value
- #Is this the right way to hide dependency on the external classes or objects
- #May be a static would do ? Should I create an object and call method on the object each time rather than instantiate a new object each time ??
+ ###
+ # Call DatatypeValidator in helper module to process the possible datatype for the value
+ # Is this the right way to hide dependency on the external classes or objects
+ # May be a static would do ? Should I create an object and call method on the object each time rather than instantiate a new object each time ??
+ ###
  def determine_dataype(value)
  return validate_field(value)
  end
 
+ ###
  # Build the hash of hashes which hold the count of different possible datatypes for each row
+ ###
  def add_to_datatype(key, datatype)
  if csv_column_datatypes[key].nil?
  csv_column_datatypes[key] = {datatype => 1}
@@ -81,8 +89,11 @@ module CsvImportAnalyzer
  end
  end
 
- #Finalize the datatype for each column, A column datatype would be set to varchar or string if atleast of it's values tend to be string
- #If the column doesn't have any possible strings then assign the datatype to column with maximum count of identified possibilites
+ ###
+ # Finalize the datatype for each column.
+ # A column datatype would be set to varchar or string if even one of it's values tend to be string
+ # If the column doesn't have any possible strings then assign the datatype to column with maximum count of identified possibilites
+ ###
  def finalize_datatypes_for_csv
  csv_column_datatypes.map { |column_name, possible_datatypes|
  #If there is string type even atleast 1 there is no other option but to set the datatype to string => varchar
@@ -95,7 +106,12 @@ module CsvImportAnalyzer
  }
  end
 
- #Decide if simple datatype analysis is enough or proced further
+ ###
+ # Decide if simple datatype analysis is enough or proced further
+ # Proceed further would be to
+ # Identify min and max bounds for each column
+ # Identify if the number distinct values are less than set threshold
+ ###
  def take_further_actions
  if options[:check_bounds]
  min_max_bounds = CsvImportAnalyzer::CsvCheckBounds.new(options)
@@ -107,4 +123,4 @@ module CsvImportAnalyzer
  query.generate_query
  end
  end
- end
+ end
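An illustrative sketch of the "hash of hashes" tally and the finalization rule described in the comments above (the column names and counts are hypothetical, not output from the gem):

```ruby
# Per-column datatype counts after sampling some rows.
csv_column_datatypes = {
  :id     => { :int => 200 },
  :price  => { :int => 120, :float => 80 },
  :status => { :string => 3, :int => 197 }
}

# Even one :string observation forces the column to string/varchar;
# otherwise the datatype with the highest count wins.
finalized = csv_column_datatypes.map do |column, counts|
  type = counts.key?(:string) ? :string : counts.max_by { |_, n| n }.first
  [column, type]
end.to_h

p finalized  # => {:id=>:int, :price=>:int, :status=>:string}
```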
data/lib/csv-import-analyzer/csv_sanitizer.rb CHANGED
@@ -1,5 +1,6 @@
  require "smarter_csv"
  require "tempfile"
+ require "pry"
  require_relative "analyzer/delimiter_identifier"
  require_relative "helpers/string_class_extensions"
  require_relative "helpers/common_functions"
@@ -11,27 +12,46 @@ module CsvImportAnalyzer
  include CsvImportAnalyzer::Helper
  include CsvImportAnalyzer::DelimiterIdentifier
 
+ ###
+ # Public interface for the entire library
+ # What id does?
+ # Sets "options" varaible by merging default values and passed values
+ # Finds the delimiter by analyzing a sample
+ # Sanitizes or preprocesses the csv file by creating a temporary processed file
+ # Replacing null and empty values with NULL
+ # Replace single quotes with double quotes if needed
+ # Handle CSVMalformedError by logging the error to error report
+ # Passes the options to DatatypeAnalysis
+ ###
  def process(filename, options)
-
  options = defaults.merge(options)
  if File.exist?(filename)
- options[:filename] = filename
- #first thing to do - find the delimiter of the file.
  delimiter = identify_delimiter(filename)
  options[:delimiter] = delimiter
+ # create tempfiles to update any changes being made
+ temp_file, processed_file = create_tempfiles(filename, options)
+ options[:temp_file] = temp_file.path
+ line_count = 1
  File.foreach(filename) do |line|
- #Check if the line is empty - no point in processing empty lines
- if line.length > 1
+ if line.length > 1 #Check if the line is empty - no point in processing empty lines
  line = replace_line_single_quotes(line,delimiter)
  begin
  line = CSV.parse_line(line, {:col_sep => delimiter})
- rescue CSV::MalformedCSVError => error
- line = "#{line}\""
+ rescue CSV::MalformedCSVError
+ # MalformedCSVError is due to illegal quoting or unclosed quotes
+ # Try to add a quote at the end and resume processing
+ # Log the changes to report
+ temp_file.write("MalformedCSVError at line #{line_count}")
+ line = line.insert(-2, "\"")
  line = CSV.parse_line(line, {:col_sep => delimiter})
  end
  line = replace_null_values(line)
+ processed_file.write(line.to_csv({:col_sep => delimiter, :converters => :numeric}))
  end
+ line_count += 1
  end
+ temp_file.close
+ processed_file.close
  # Cleaned the file - Now analyze for datatypes
  CsvImportAnalyzer::CsvDatatypeAnalysis.new(options).datatype_analysis
  else
@@ -41,21 +61,30 @@ module CsvImportAnalyzer
 
  private
 
+ ###
+ # Hash of default values that would be merged with user passed in values
+ # returns [Hash] defaults
+ ###
  def defaults
  {
- :metadata_output => nil,
- :processed_input => nil,
- :unique => 10,
- :check_bounds => true,
- :datatype_analysis => 200,
- :chunk => 20,
- :database => [:pg, :mysql],
- :quote_convert => true,
- :replace_nulls => true,
- :out_format => :json
+ :metadata_output => nil, # To be set if metadata needs to be printed to a file
+ :processed_input => nil, # To be set if processed input is needed
+ :unique => 10, # Threshold for number of defaults values that needs to identified
+ :check_bounds => true, # Option to check for min - max bounds for each column [true => find the bounds]
+ :datatype_analysis => 200, # Number of rows to be sampled for datatype analysis
+ :chunk => 200, # Chunk size (no of rows) that needs to processed in-memory [Important not to load entire file into memory]
+ :database => [:pg, :mysql], # Databases for which schema needs to be generated
+ :quote_convert => true, # Convert any single quotes to double quotes
+ :replace_nulls => true, # Replace nulls, empty's, nils, Null's with NULL
+ :out_format => :json # Set what type of output do you need as analysis
  }
  end
 
+ ###
+ # Replaces single quotes with doubles in each line
+ # Escapes the double quotes if it's between two single quotes before
+ # returns [String] result
+ ###
  def replace_line_single_quotes(line, delimiter)
  delimiter = "\\|" if delimiter == "|"
  pattern = "#{delimiter}'.*?'#{delimiter}" # set the pattern to opening and closing single quote found between delimiters
@@ -82,5 +111,26 @@ module CsvImportAnalyzer
  end
  return line
  end
+
+ ###
+ # Uses ruby tempfile to create temp files for
+ # 1. Store processed file
+ # 2. Error reporting
+ # Returns the file handler for a temp file.
+ # This tempfile holds any modifications being done to the file.
+ ###
+ def create_tempfiles(filename, options)
+ options[:original_filename] = filename
+ filename = File.basename(filename)
+ processed_filename = File.join(Dir.tmpdir, "processed_"+filename)
+ options[:filename] = processed_filename
+ # filename += Time.now.strftime("%Y%m%d%H%M%S")
+ # temp_file = Tempfile.new(filename)
+ # temp_file = File.open(File.join(Dir.tmpdir, filename), "w+")
+ temp_file = File.join(Dir.tmpdir, "error_report_"+filename)
+ temp_file = File.open(temp_file, "w+")
+ processed_file = File.open(processed_filename, "w+")
+ return temp_file, processed_file
+ end
  end
  end
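Finally, a hedged sketch of how the annotated defaults above combine with caller-supplied options: anything the caller passes overrides the corresponding default, and everything else keeps the value shown in the diff (the option values here are illustrative):

```ruby
require "csv-import-analyzer"

CsvImportAnalyzer.process(
  "data/orders.csv",
  :chunk           => 500,   # override the in-memory chunk size
  :out_format      => :csv,  # emit the analysis as csv instead of the json default
  :metadata_output => true   # write the metadata analysis to a file
)
# Unspecified options such as :check_bounds and :database keep their defaults.
```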