csv-import-analyzer 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +14 -0
  3. data/.rspec +3 -0
  4. data/Gemfile +4 -0
  5. data/LICENSE.txt +22 -0
  6. data/README.md +35 -0
  7. data/Rakefile +2 -0
  8. data/csv-import-analyzer.gemspec +29 -0
  9. data/lib/csv-import-analyzer.rb +18 -0
  10. data/lib/csv-import-analyzer/analyzer/csv_check_bounds.rb +104 -0
  11. data/lib/csv-import-analyzer/analyzer/delimiter_identifier.rb +66 -0
  12. data/lib/csv-import-analyzer/analyzer/file_type_assertion.rb +29 -0
  13. data/lib/csv-import-analyzer/csv_datatype_analysis.rb +110 -0
  14. data/lib/csv-import-analyzer/csv_sanitizer.rb +86 -0
  15. data/lib/csv-import-analyzer/export/metadata_analysis.rb +156 -0
  16. data/lib/csv-import-analyzer/helpers/common_functions.rb +11 -0
  17. data/lib/csv-import-analyzer/helpers/datatype_validation.rb +85 -0
  18. data/lib/csv-import-analyzer/helpers/errors.rb +3 -0
  19. data/lib/csv-import-analyzer/helpers/string_class_extensions.rb +8 -0
  20. data/lib/csv-import-analyzer/query_builder/mysql_query_helper.rb +31 -0
  21. data/lib/csv-import-analyzer/query_builder/pg_query_helper.rb +27 -0
  22. data/lib/csv-import-analyzer/query_builder/query_helper.rb +27 -0
  23. data/lib/csv-import-analyzer/sampleTab.csv +5 -0
  24. data/lib/csv-import-analyzer/sql_query_builder.rb +125 -0
  25. data/lib/csv-import-analyzer/version.rb +5 -0
  26. data/lib/metadata_output.json +70 -0
  27. data/lib/sampleTab.csv +5 -0
  28. data/spec/csv-import-analyzer/analyzer/csv_check_bounds_spec.rb +43 -0
  29. data/spec/csv-import-analyzer/analyzer/delimiter_identifier_spec.rb +61 -0
  30. data/spec/csv-import-analyzer/analyzer/file_type_assertion_spec.rb +0 -0
  31. data/spec/csv-import-analyzer/csv_datatype_analysis_spec.rb +1 -0
  32. data/spec/csv-import-analyzer/csv_sanitizer_spec.rb +24 -0
  33. data/spec/csv-import-analyzer/export/metadata_analysis_spec.rb +0 -0
  34. data/spec/csv-import-analyzer/helpers/common_functions_spec.rb +31 -0
  35. data/spec/csv-import-analyzer/helpers/csv_check_bounds_spec.rb +3 -0
  36. data/spec/csv-import-analyzer/helpers/datatype_validation_spec.rb +75 -0
  37. data/spec/csv-import-analyzer/helpers/mysql_query_helper_spec.rb +0 -0
  38. data/spec/csv-import-analyzer/helpers/pq_query_helper_spec.rb +0 -0
  39. data/spec/csv-import-analyzer/helpers/string_class_extension_spec.rb +18 -0
  40. data/spec/csv-import-analyzer/query_builder/mysql_query_helper_spec.rb +54 -0
  41. data/spec/csv-import-analyzer/query_builder/pg_query_helper_spec.rb +55 -0
  42. data/spec/csv-import-analyzer_spec.rb +14 -0
  43. data/spec/fixtures/sample.csv +5 -0
  44. data/spec/fixtures/sample_options.yml +11 -0
  45. data/spec/fixtures/semicolon-sample.csv +5 -0
  46. data/spec/spec_helper.rb +84 -0
  47. metadata +208 -0
@@ -0,0 +1,86 @@
1
+ require "smarter_csv"
2
+ require "tempfile"
3
+ require_relative "analyzer/delimiter_identifier"
4
+ require_relative "helpers/string_class_extensions"
5
+ require_relative "helpers/common_functions"
6
+ require_relative "helpers/errors"
7
+ require_relative "csv_datatype_analysis"
8
+
9
+ module CsvImportAnalyzer
10
+ class CsvSanitizer
11
+ include CsvImportAnalyzer::Helper
12
+ include CsvImportAnalyzer::DelimiterIdentifier
13
+
14
# Runs the sanitising pass over the CSV at +filename+ and then hands the
# merged options to CsvDatatypeAnalysis.
#
# filename - path of the CSV file to read.
# options  - Hash of caller options; merged over #defaults. :filename and
#            :delimiter are added to the merged hash before analysis.
#
# Returns the result of CsvDatatypeAnalysis#datatype_analysis, or a
# FileNotFound instance (returned, not raised) when the file does not exist.
def process(filename, options)
  options = defaults.merge(options)
  # Existing contract: the error object is handed back to the caller.
  return FileNotFound.new unless File.exist?(filename)

  options[:filename] = filename
  # First thing to do - find the delimiter of the file.
  delimiter = identify_delimiter(filename)
  options[:delimiter] = delimiter

  File.foreach(filename) do |line|
    # No point in processing empty lines (a bare "\n" has length 1).
    next unless line.length > 1

    line = replace_line_single_quotes(line, delimiter)
    begin
      # Keyword options: a positional options hash is rejected by CSV on Ruby 3.
      fields = CSV.parse_line(line, col_sep: delimiter)
    rescue CSV::MalformedCSVError
      # Retry once with a closing double quote appended to the line.
      fields = CSV.parse_line("#{line}\"", col_sep: delimiter)
    end
    # NOTE(review): the sanitised row is not stored or written anywhere in
    # this method - confirm whether it should feed :processed_input.
    replace_null_values(fields)
  end

  # Sanitising pass done - now analyze for datatypes.
  CsvImportAnalyzer::CsvDatatypeAnalysis.new(options).datatype_analysis
end
41
+
42
+ private
43
+
44
# Baseline option values. #process merges the caller's options over this
# hash, so any key supplied by the caller wins.
def defaults
  {
    metadata_output: nil,
    processed_input: nil,
    unique: 10,
    check_bounds: true,
    datatype_analysis: 200,
    chunk: 20,
    database: %i[pg mysql],
    quote_convert: true,
    replace_nulls: true,
    out_format: :json
  }
end
58
+
59
# Converts single-quoted fields in a raw CSV line to double-quoted fields.
#
# A field counts as single-quoted when its opening quote directly follows a
# delimiter (or the start of the line) and its closing quote directly
# precedes a delimiter (or the end of the line). Afterwards every doubled
# single quote ('') in the line is collapsed to one.
#
# line      - String, one raw line of the file (may end with a line break).
# delimiter - String, the column separator; it is matched literally.
#
# Returns a new String; +line+ is not modified.
def replace_line_single_quotes(line, delimiter)
  sep = Regexp.escape(delimiter)
  # The closing delimiter is only looked at (lookahead), never consumed, so
  # it can still open the next field: a,'b','c',d converts both b and c.
  quoted_field = /(?<lead>\A|#{sep})'(?<body>.*?)'(?=#{sep}|\r?\n?\z)/
  converted = line.gsub(quoted_field) do
    match = Regexp.last_match
    "#{match[:lead]}\"#{match[:body]}\""
  end
  # '' was the escape for a literal single quote inside a single-quoted field.
  converted.gsub("''", "'")
end
74
+
75
# Replaces every null-like value (as decided by null_like?) in +line+ with
# the string "NULL", for consistency during database import.
#
# line - Array of field values; it is modified in place.
#
# Returns the same Array object.
def replace_null_values(line)
  # map! rewrites each slot directly: no index lookup per element and no
  # mutation of the array from inside an each loop.
  line.map! { |value| null_like?(value) ? "NULL" : value }
end
85
+ end
86
+ end
@@ -0,0 +1,156 @@
1
+ require 'pry'
2
+ require 'json'
3
+ module CsvImportAnalyzer
4
+ class MetadataAnalysis
5
+ attr_accessor :metadata, :max_distinct_values
6
+ def initialize(options)
7
+ @options = options
8
+ @metadata = {}
9
+ end
10
+
11
+ def options
12
+ @options
13
+ end
14
+
15
+ def metadata
16
+ @metadata
17
+ end
18
+
19
+ def header_datatypes
20
+ @options[:csv_column_datatypes]
21
+ end
22
+
23
+ def header_datatype_analysis
24
+ @options[:csv_datatype_analysis]
25
+ end
26
+
27
+ def nullable_columns
28
+ @options[:nullable]
29
+ end
30
+
31
+ def databases
32
+ @options[:database]
33
+ end
34
+
35
+ def create_queries
36
+ @options[:create_query]
37
+ end
38
+
39
+ def import_queries
40
+ @options[:import_query]
41
+ end
42
+
43
+ def unique_values
44
+ @options[:uniques]
45
+ end
46
+
47
+ def max_distinct_values
48
+ @max_distinct_values ||= Integer(options[:unique]) + 1
49
+ end
50
+
51
+ def metadata_print
52
+ build_metadata_output
53
+ if options[:metadata_output]
54
+ if options[:out_format] == :json
55
+ json_print_to_file
56
+ end
57
+ if options[:out_format] == :csv
58
+ csv_print_to_file
59
+ end
60
+ end
61
+ return JSON.pretty_generate(metadata)
62
+ end
63
+
64
+
65
+ private
66
+
67
+ def json_print_to_file
68
+ outfile = File.open("metadata_output.json", "w")
69
+ outfile << JSON.pretty_generate(metadata)
70
+ outfile.close
71
+ end
72
+
73
+ # Priniting that csv from json is a mess - How to make pretty print ?
74
+ def csv_print_to_file
75
+ CSV.open("metadata_output.csv", "w") do |csv|
76
+ binding.pry
77
+ metadata.each do |key, value|
78
+ if value.class == Hash
79
+ csv << [key]
80
+ print_hash_to_csv(value, csv)
81
+ else
82
+ csv << [key, value]
83
+ end
84
+ end
85
+ end
86
+ end
87
+
88
+ def print_hash_to_csv(hash, csv_handler)
89
+ if hash.class == Hash
90
+ hash.each do |key, value|
91
+ csv_handler << [key]
92
+ print_hash_to_csv(value, csv_handler)
93
+ end
94
+ else
95
+ csv_handler << [hash]
96
+ end
97
+ end
98
+
99
+ def build_metadata_output
100
+ metadata[:csv_file] = add_file_metadata
101
+ metadata[:data_manipulations] = add_data_manipulations
102
+ metadata[:csv_headers] = add_header_metadata
103
+ metadata[:sql] = add_sql_data
104
+ end
105
+
106
+ def add_file_metadata
107
+ file_data = {}
108
+ file_data[:filename] = File.basename(options[:filename])
109
+ file_data[:file_size] = File.size(options[:filename])
110
+ # file_data[:rows] = options[:rows]
111
+ # file_data[:columns] = options[:columns]
112
+ file_data[:record_delimiter] = options[:delimiter]
113
+ return file_data
114
+ end
115
+
116
+ def add_data_manipulations
117
+ data_manipulations = {}
118
+ data_manipulations[:replace_nulls] = options[:replace_nulls]
119
+ data_manipulations[:replace_quotes] = options[:quote_convert]
120
+ return data_manipulations
121
+ end
122
+
123
+ def add_header_metadata
124
+ columns = {}
125
+ header_datatypes.keys.each do |column_name|
126
+ begin
127
+ columns[column_name] = {}
128
+ columns[column_name][:datatype] = header_datatypes[column_name]
129
+ columns[column_name][:datatype_analysis] = header_datatype_analysis[column_name]
130
+ if unique_values[column_name].size > max_distinct_values
131
+ columns[column_name][:distinct_values] = "#{max_distinct_values}+"
132
+ else
133
+ columns[column_name][:distinct_values] = unique_values[column_name]
134
+ end
135
+ if nullable_columns.include?(column_name)
136
+ columns[column_name][:nullable] = true
137
+ end
138
+ rescue Exception => e
139
+ puts e
140
+ end
141
+ end
142
+ return columns
143
+ end
144
+
145
+ def add_sql_data
146
+ sql = {}
147
+ databases.each do |db|
148
+ sql[db] = {}
149
+ sql[db][:create_query] = create_queries[db]
150
+ sql[db][:import_query] = import_queries[db]
151
+ end
152
+ return sql
153
+ end
154
+
155
+ end
156
+ end
@@ -0,0 +1,11 @@
1
module CsvImportAnalyzer
  module Helper
    # Tells whether +value+ should be treated as a missing value: nil, the
    # empty string, or one of a fixed list of literal spellings. The match is
    # exact - spellings not in the list (for example "nUlL") are not null-like.
    #
    # Returns true or false.
    def null_like?(value)
      null_spellings = ["NULL", "Null", "NUll", "NULl", "null", "", "NAN", "\\N"]
      value.nil? || null_spellings.include?(value)
    end
  end
end
@@ -0,0 +1,85 @@
1
+ require 'pry'
2
module CsvImportAnalyzer
  module DatatypeValidator
    # Classifies +content+ and returns one of the strings "int", "float",
    # "date", "datetime" or "string".
    def validate_field(content)
      get_datatype(content)
    end

    private

    # Date.strptime accepts years with more than four digits (for example
    # "12/31/20145234"). This check re-parses +field+ against a fixed list of
    # day/month/year layouts and only accepts it when the leading run of
    # digits in the parsed date's string form is exactly four characters long.
    #
    # Returns true or false.
    def validate_year_date(field)
      layouts = ["%d/%m/%Y", "%d-%m-%Y", "%d %m %Y", "%m/%d/%Y", "%m-%d-%Y", "%m %d %Y"]
      parsed = nil
      layouts.each do |layout|
        parsed = (Date.strptime(field, layout) rescue nil)
        break if parsed
      end
      return false if parsed.nil?

      # First element of the scan is the leading digit run of the date string.
      parsed.to_s.scan(/\d*/)[0].length == 4
    end

    # Date.parse is lenient, so after a successful parse the raw text must
    # also look like a date: exactly two digit+"/" pairs, or two digit+"-"
    # pairs, or two digit+space pairs, or a three-letter word between digits.
    # The year-length check is applied on top.
    #
    # Returns true or false.
    def datetime_pattern(field)
      separators = [/[0-9]\//, /[0-9]\-/, /[0-9] /]
      month_word = /[0-9] [A-Z][a-z][a-z] [0-9]|[0-9]-[A-Z][a-z][a-z]-[0-9]|[0-9] [a-z][a-z][a-z] [0-9]|[0-9]-[a-z][a-z][a-z]-[0-9]/
      looks_like_date = separators.any? { |separator| field.scan(separator).size == 2 } ||
                        field.scan(month_word).size != 0
      return false unless looks_like_date

      validate_year_date(field) ? true : false
    end

    # True when Date.parse or any of the three month-first strptime layouts
    # accepts +field+ without raising.
    def date_parseable?(field)
      attempts = [
        -> { Date.parse(field) },
        -> { Date.strptime(field, '%m/%d/%Y') },
        -> { Date.strptime(field, '%m-%d-%Y') },
        -> { Date.strptime(field, '%m %d %Y') }
      ]
      attempts.any? { |attempt| attempt.call rescue false }
    end

    # Determines the data type of an input field. Order of checks:
    # integer, float, date/datetime, and finally "string" as the fallback.
    def get_datatype(field)
      # Commas are dropped from strings so "1,234" can be read as a number.
      num = field.instance_of?(String) ? field.gsub(/,/, '') : field

      if (Integer(num) rescue false)
        # Integer() also accepts Float objects (by truncation); report those as floats.
        return num.instance_of?(Float) ? "float" : "int"
      end
      return "float" if (Float(num) rescue false)

      if date_parseable?(field) && datetime_pattern(field)
        # A colon is taken as the sign of an hours:minutes part.
        return field =~ /:/ ? "datetime" : "date"
      end

      "string"
    end
  end
end
@@ -0,0 +1,3 @@
1
# Signals that the CSV file given for processing does not exist.
class FileNotFound < StandardError
end

# Error type for invalid input.
class InvalidInput < StandardError
end

# Signals that a helper was called without one of its required arguments.
class MissingRequiredArguments < StandardError
end
@@ -0,0 +1,8 @@
1
+ require 'pry'
2
class String
  # Counts the non-overlapping occurrences of +needle+ inside the receiver.
  #
  # needle - String (matched literally, so characters such as "|", "." or
  #          "*" need no escaping by the caller) or a Regexp (used as given).
  #
  # Returns an Integer.
  def substr_count(needle)
    pattern = needle.is_a?(Regexp) ? needle : Regexp.new(Regexp.escape(needle.to_s))
    scan(pattern).size
  end
end
@@ -0,0 +1,31 @@
1
+ require_relative "../helpers/errors"
2
+ require 'pry'
3
module CsvImportAnalyzer
  # Builds MySQL statement fragments for a CSV import.
  module MysqlQueryHelper
    # Builds one column definition for a CREATE TABLE statement.
    #
    # args - Hash with :header (column name) and :datatype. The datatype
    #        "string" (Symbol or String) becomes varchar(255); any other
    #        datatype is emitted as given.
    #
    # Returns the definition String, or a MissingRequiredArguments instance
    # (returned, not raised) when :header or :datatype is nil.
    def form_query_for_datatype(args)
      if args[:datatype].nil? || args[:header].nil?
        return MissingRequiredArguments.new("Required arguments missing for form_query_for_datatype")
      end

      # to_s so that both :string and "string" map to varchar(255).
      column_type = args[:datatype].to_s == "string" ? "varchar(255)" : args[:datatype].to_s
      "#{args[:header]} #{column_type}"
    end

    # Builds the LOAD DATA INFILE statement for the file.
    #
    # args - Hash with :tablename, :filename and :delimiter.
    #
    # Returns the statement String, or a MissingRequiredArguments instance
    # (returned, not raised) when any of the three values is nil.
    def import_csv(args)
      if args[:tablename].nil? || args[:filename].nil? || args[:delimiter].nil?
        return MissingRequiredArguments.new("Required arguments missing for import_csv")
      end

      # The file name has to be a quoted string literal in LOAD DATA INFILE.
      "LOAD DATA INFILE '#{args[:filename]}' INTO TABLE #{args[:tablename]} " \
        "FIELDS TERMINATED BY '#{args[:delimiter]}' " \
        "ENCLOSED BY '\"' " \
        "LINES TERMINATED BY '\\n' " \
        "IGNORE 1 LINES;"
    end
  end
end
@@ -0,0 +1,27 @@
1
+ require_relative "../helpers/errors"
2
+ require "pry"
3
module CsvImportAnalyzer
  # Builds PostgreSQL statement fragments for a CSV import.
  module PgQueryHelper
    # Builds one column definition for a CREATE TABLE statement.
    #
    # args - Hash with :header (column name) and :datatype. The datatype
    #        "string" (Symbol or String) becomes varchar(255); any other
    #        datatype is emitted as given.
    #
    # Returns the definition String, or a MissingRequiredArguments instance
    # (returned, not raised) when :header or :datatype is nil.
    def form_query_for_datatype(args)
      if args[:datatype].nil? || args[:header].nil?
        return MissingRequiredArguments.new("Required arguments missing for form_query_for_datatype")
      end

      # to_s so that both :string and "string" map to varchar(255).
      column_type = args[:datatype].to_s == "string" ? "varchar(255)" : args[:datatype].to_s
      "#{args[:header]} #{column_type}"
    end

    # Builds the COPY ... FROM statement for the file.
    #
    # args - Hash with :tablename, :filename and :delimiter.
    #
    # Returns the statement String, or a MissingRequiredArguments instance
    # (returned, not raised) when any of the three values is nil.
    def import_csv(args)
      if args[:tablename].nil? || args[:filename].nil? || args[:delimiter].nil?
        return MissingRequiredArguments.new("Required arguments missing for import_csv")
      end

      "COPY #{args[:tablename]} FROM '#{args[:filename]}' HEADER DELIMITER '#{args[:delimiter]}' CSV NULL AS 'NULL';"
    end
  end
end
+ end
@@ -0,0 +1,27 @@
1
+ require_relative "../helpers/errors"
2
+ require "pry"
3
module CsvImportAnalyzer
  # Generic statement builders; the import statement uses COPY ... FROM syntax.
  module QueryHelper
    # Builds one column definition for a CREATE TABLE statement.
    #
    # args - Hash with :header (column name) and :datatype. The datatype
    #        "string" (Symbol or String) becomes varchar(255); any other
    #        datatype is emitted as given.
    #
    # Returns the definition String, or a MissingRequiredArguments instance
    # (returned, not raised) when :header or :datatype is nil.
    def form_query_for_datatype(args)
      if args[:datatype].nil? || args[:header].nil?
        # MissingRequiredArguments is the class defined in helpers/errors.
        return MissingRequiredArguments.new("Required arguments missing for form_query_for_datatype")
      end

      # to_s so that both :string and "string" map to varchar(255).
      column_type = args[:datatype].to_s == "string" ? "varchar(255)" : args[:datatype].to_s
      "#{args[:header]} #{column_type}"
    end

    # Builds the COPY ... FROM statement for the file.
    #
    # args - Hash with :tablename, :filename and :delimiter.
    #
    # Returns the statement String, or a MissingRequiredArguments instance
    # (returned, not raised) when any of the three values is nil.
    def import_csv(args)
      if args[:tablename].nil? || args[:filename].nil? || args[:delimiter].nil?
        return MissingRequiredArguments.new("Required arguments missing for import_csv")
      end

      "COPY #{args[:tablename]} FROM '#{args[:filename]}' HEADER DELIMITER '#{args[:delimiter]}' CSV NULL AS 'NULL';"
    end
  end
end