csv-import-analyzer 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (47) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +14 -0
  3. data/.rspec +3 -0
  4. data/Gemfile +4 -0
  5. data/LICENSE.txt +22 -0
  6. data/README.md +35 -0
  7. data/Rakefile +2 -0
  8. data/csv-import-analyzer.gemspec +29 -0
  9. data/lib/csv-import-analyzer.rb +18 -0
  10. data/lib/csv-import-analyzer/analyzer/csv_check_bounds.rb +104 -0
  11. data/lib/csv-import-analyzer/analyzer/delimiter_identifier.rb +66 -0
  12. data/lib/csv-import-analyzer/analyzer/file_type_assertion.rb +29 -0
  13. data/lib/csv-import-analyzer/csv_datatype_analysis.rb +110 -0
  14. data/lib/csv-import-analyzer/csv_sanitizer.rb +86 -0
  15. data/lib/csv-import-analyzer/export/metadata_analysis.rb +156 -0
  16. data/lib/csv-import-analyzer/helpers/common_functions.rb +11 -0
  17. data/lib/csv-import-analyzer/helpers/datatype_validation.rb +85 -0
  18. data/lib/csv-import-analyzer/helpers/errors.rb +3 -0
  19. data/lib/csv-import-analyzer/helpers/string_class_extensions.rb +8 -0
  20. data/lib/csv-import-analyzer/query_builder/mysql_query_helper.rb +31 -0
  21. data/lib/csv-import-analyzer/query_builder/pg_query_helper.rb +27 -0
  22. data/lib/csv-import-analyzer/query_builder/query_helper.rb +27 -0
  23. data/lib/csv-import-analyzer/sampleTab.csv +5 -0
  24. data/lib/csv-import-analyzer/sql_query_builder.rb +125 -0
  25. data/lib/csv-import-analyzer/version.rb +5 -0
  26. data/lib/metadata_output.json +70 -0
  27. data/lib/sampleTab.csv +5 -0
  28. data/spec/csv-import-analyzer/analyzer/csv_check_bounds_spec.rb +43 -0
  29. data/spec/csv-import-analyzer/analyzer/delimiter_identifier_spec.rb +61 -0
  30. data/spec/csv-import-analyzer/analyzer/file_type_assertion_spec.rb +0 -0
  31. data/spec/csv-import-analyzer/csv_datatype_analysis_spec.rb +1 -0
  32. data/spec/csv-import-analyzer/csv_sanitizer_spec.rb +24 -0
  33. data/spec/csv-import-analyzer/export/metadata_analysis_spec.rb +0 -0
  34. data/spec/csv-import-analyzer/helpers/common_functions_spec.rb +31 -0
  35. data/spec/csv-import-analyzer/helpers/csv_check_bounds_spec.rb +3 -0
  36. data/spec/csv-import-analyzer/helpers/datatype_validation_spec.rb +75 -0
  37. data/spec/csv-import-analyzer/helpers/mysql_query_helper_spec.rb +0 -0
  38. data/spec/csv-import-analyzer/helpers/pq_query_helper_spec.rb +0 -0
  39. data/spec/csv-import-analyzer/helpers/string_class_extension_spec.rb +18 -0
  40. data/spec/csv-import-analyzer/query_builder/mysql_query_helper_spec.rb +54 -0
  41. data/spec/csv-import-analyzer/query_builder/pg_query_helper_spec.rb +55 -0
  42. data/spec/csv-import-analyzer_spec.rb +14 -0
  43. data/spec/fixtures/sample.csv +5 -0
  44. data/spec/fixtures/sample_options.yml +11 -0
  45. data/spec/fixtures/semicolon-sample.csv +5 -0
  46. data/spec/spec_helper.rb +84 -0
  47. metadata +208 -0
@@ -0,0 +1,86 @@
1
+ require "smarter_csv"
2
+ require "tempfile"
3
+ require_relative "analyzer/delimiter_identifier"
4
+ require_relative "helpers/string_class_extensions"
5
+ require_relative "helpers/common_functions"
6
+ require_relative "helpers/errors"
7
+ require_relative "csv_datatype_analysis"
8
+
9
module CsvImportAnalyzer
  # Pre-flight pass over a CSV file: detects the delimiter, checks that every
  # non-empty line can be parsed as CSV, then starts the datatype analysis.
  class CsvSanitizer
    include CsvImportAnalyzer::Helper
    include CsvImportAnalyzer::DelimiterIdentifier

    # Sanitizes +filename+ and runs the datatype analysis on it.
    #
    # @param filename [String] path of the CSV file to process
    # @param options [Hash] overrides for the values returned by #defaults
    # @return [Object] whatever CsvDatatypeAnalysis#datatype_analysis returns,
    #   or a FileNotFound instance when +filename+ does not exist
    # @raise [CSV::MalformedCSVError] when a line is still malformed after the
    #   closing-quote repair attempt
    def process(filename, options)
      options = defaults.merge(options)
      # NOTE(review): the error object is returned, not raised. Kept as-is so
      # callers that inspect the return value keep working.
      return FileNotFound.new unless File.exist?(filename)

      options[:filename] = filename
      # First thing to do - find the delimiter of the file.
      delimiter = identify_delimiter(filename)
      options[:delimiter] = delimiter

      # NOTE(review): the sanitized rows are not stored or written anywhere in
      # this method, so this loop only verifies that each line is parseable.
      File.foreach(filename) do |line|
        # A line of length <= 1 is empty (at most a newline) - nothing to do.
        next unless line.length > 1

        sanitize_line(line, delimiter)
      end

      # Checked the file - now analyze for datatypes.
      CsvImportAnalyzer::CsvDatatypeAnalysis.new(options).datatype_analysis
    end

    private

    # Default option values; anything passed to #process overrides these.
    def defaults
      {
        :metadata_output => nil,
        :processed_input => nil,
        :unique => 10,
        :check_bounds => true,
        :datatype_analysis => 200,
        :chunk => 20,
        :database => [:pg, :mysql],
        :quote_convert => true,
        :replace_nulls => true,
        :out_format => :json
      }
    end

    # Converts quoting, parses the line and normalizes null-like fields.
    #
    # @return [Array<String>] the parsed and normalized fields
    def sanitize_line(line, delimiter)
      converted = replace_line_single_quotes(line, delimiter)
      replace_null_values(parse_csv_line(converted, delimiter))
    end

    # Parses one CSV line. If the line is malformed, a closing double quote is
    # appended and the parse is attempted once more.
    #
    # Options are passed as keywords: a positional options hash raises
    # ArgumentError on Ruby 3.
    def parse_csv_line(line, delimiter)
      CSV.parse_line(line, :col_sep => delimiter)
    rescue CSV::MalformedCSVError
      CSV.parse_line("#{line}\"", :col_sep => delimiter)
    end

    # Rewrites fields wrapped in single quotes so they are wrapped in double
    # quotes, then collapses doubled single quotes ('') into one.
    #
    # Only fields with a delimiter on both sides are matched, so a quoted
    # first or last field on the line is left untouched.
    #
    # @param line [String] raw line from the file
    # @param delimiter [String] column separator; treated literally
    # @return [String] the converted line
    def replace_line_single_quotes(line, delimiter)
      escaped = Regexp.escape(delimiter)
      # The closing delimiter is a lookahead so it is not consumed and can
      # also open the next quoted field (e.g. ,'a','b',).
      quoted_field = /#{escaped}'(.*?)'(?=#{escaped})/
      converted = line.gsub(quoted_field) { "#{delimiter}\"#{Regexp.last_match(1)}\"" }
      # Replace any single quote that was doubled to escape a single quote.
      converted.gsub("''", "'")
    end

    # Replaces every null-like field (see Helper#null_like?) with "NULL" for
    # consistency during database import. Mutates and returns +line+.
    #
    # @param line [Array] parsed CSV fields
    # @return [Array] the same array with null-like fields replaced
    def replace_null_values(line)
      line.map! { |value| null_like?(value) ? "NULL" : value }
    end
  end
end
@@ -0,0 +1,156 @@
1
+ require 'pry'
2
+ require 'json'
3
module CsvImportAnalyzer
  # Assembles the results of an analysis run (file facts, applied data
  # manipulations, per-column details and generated SQL) into one metadata
  # hash and optionally writes it to disk as JSON or CSV.
  class MetadataAnalysis
    attr_reader :options
    attr_accessor :metadata
    attr_writer :max_distinct_values

    # @param options [Hash] options and analysis results. Keys read by this
    #   class: :filename, :delimiter, :unique, :replace_nulls, :quote_convert,
    #   :metadata_output, :out_format, :database, :csv_column_datatypes,
    #   :csv_datatype_analysis, :nullable, :uniques, :create_query,
    #   :import_query
    def initialize(options)
      @options = options
      @metadata = {}
    end

    # @return [Hash] column name => datatype
    def header_datatypes
      options[:csv_column_datatypes]
    end

    # @return [Hash] column name => datatype analysis for that column
    def header_datatype_analysis
      options[:csv_datatype_analysis]
    end

    # @return [Object] collection queried with include?(column_name)
    def nullable_columns
      options[:nullable]
    end

    # @return [Object] collection of database identifiers to report SQL for
    def databases
      options[:database]
    end

    # @return [Hash] database identifier => create query
    def create_queries
      options[:create_query]
    end

    # @return [Hash] database identifier => import query
    def import_queries
      options[:import_query]
    end

    # @return [Hash] column name => collection of distinct values
    def unique_values
      options[:uniques]
    end

    # One more than options[:unique]; memoized, and overridable through
    # #max_distinct_values=.
    #
    # @return [Integer]
    def max_distinct_values
      @max_distinct_values ||= Integer(options[:unique]) + 1
    end

    # Builds the metadata hash, writes it to a file when
    # options[:metadata_output] is set, and returns it as pretty-printed JSON.
    #
    # @return [String] JSON representation of #metadata
    def metadata_print
      build_metadata_output
      write_metadata_file if options[:metadata_output]
      JSON.pretty_generate(metadata)
    end

    private

    # Picks the writer matching options[:out_format]; other formats are ignored.
    def write_metadata_file
      case options[:out_format]
      when :json then json_print_to_file
      when :csv then csv_print_to_file
      end
    end

    # Writes the metadata as JSON to metadata_output.json in the working
    # directory. The block form closes the file even if writing raises.
    def json_print_to_file
      File.open("metadata_output.json", "w") do |outfile|
        outfile << JSON.pretty_generate(metadata)
      end
    end

    # Writes the metadata to metadata_output.csv in the working directory.
    # Nested hashes are flattened into one key or value per row.
    #
    # NOTE(review): relies on CSV already being loaded; the requires visible
    # at the top of this file are only 'pry' and 'json'.
    def csv_print_to_file
      CSV.open("metadata_output.csv", "w") do |csv|
        metadata.each do |key, value|
          if value.is_a?(Hash)
            csv << [key]
            print_hash_to_csv(value, csv)
          else
            csv << [key, value]
          end
        end
      end
    end

    # Recursively writes +hash+ to +csv_handler+: each key on its own row,
    # followed by the rows of its value. A non-hash value is one single row.
    def print_hash_to_csv(hash, csv_handler)
      unless hash.is_a?(Hash)
        csv_handler << [hash]
        return
      end

      hash.each do |key, value|
        csv_handler << [key]
        print_hash_to_csv(value, csv_handler)
      end
    end

    # Fills #metadata with its four top-level sections.
    def build_metadata_output
      metadata[:csv_file] = add_file_metadata
      metadata[:data_manipulations] = add_data_manipulations
      metadata[:csv_headers] = add_header_metadata
      metadata[:sql] = add_sql_data
    end

    # @return [Hash] base name, size in bytes and delimiter of the input file
    def add_file_metadata
      {
        :filename => File.basename(options[:filename]),
        :file_size => File.size(options[:filename]),
        :record_delimiter => options[:delimiter]
      }
    end

    # @return [Hash] which clean-up options were requested for this run
    def add_data_manipulations
      {
        :replace_nulls => options[:replace_nulls],
        :replace_quotes => options[:quote_convert]
      }
    end

    # Builds the per-column section. Best effort: an error while describing a
    # column is printed and the column keeps whatever was filled in so far.
    #
    # @return [Hash] column name => details hash
    def add_header_metadata
      columns = {}
      header_datatypes.each_key do |column_name|
        begin
          details = columns[column_name] = {}
          details[:datatype] = header_datatypes[column_name]
          details[:datatype_analysis] = header_datatype_analysis[column_name]
          distinct = unique_values[column_name]
          # Too many distinct values to list: report "<limit>+" instead.
          details[:distinct_values] =
            distinct.size > max_distinct_values ? "#{max_distinct_values}+" : distinct
          details[:nullable] = true if nullable_columns.include?(column_name)
        rescue StandardError => e
          puts e
        end
      end
      columns
    end

    # @return [Hash] database identifier => { :create_query, :import_query }
    def add_sql_data
      sql = {}
      databases.each do |db|
        sql[db] = {
          :create_query => create_queries[db],
          :import_query => import_queries[db]
        }
      end
      sql
    end
  end
end
@@ -0,0 +1,11 @@
1
module CsvImportAnalyzer
  # Small predicates shared by the analyzer classes.
  module Helper
    # Values treated as "no value". Matching is exact: casings not listed
    # here (e.g. "nan") are not null-like.
    NULL_LIKE_VALUES = ["NULL", "Null", "NUll", "NULl", "null", nil, "", "NAN", "\\N"].freeze

    # Tells whether +value+ stands for a missing value.
    #
    # @param value [Object] a parsed CSV field
    # @return [Boolean] true when +value+ is one of NULL_LIKE_VALUES
    def null_like?(value)
      NULL_LIKE_VALUES.include?(value)
    end
  end
end
@@ -0,0 +1,85 @@
1
+ require 'pry'
2
module CsvImportAnalyzer
  # Guesses the datatype ("int", "float", "date", "datetime" or "string") of a
  # single CSV field.
  #
  # NOTE(review): uses Date, but the only require visible at the top of this
  # file is 'pry'. If Date is not loaded, the resulting NameError is swallowed
  # by the rescues below and date fields are reported as "string".
  module DatatypeValidator
    # @param content [Object] a field value, normally a String or nil
    # @return [String] "int", "float", "date", "datetime" or "string"
    def validate_field(content)
      get_datatype(content)
    end

    private

    ###
    # Date.strptime("12/31/20145234", "%m/%d/%Y") succeeds, which is not what
    # we want. Validate that the year part has exactly 4 digits.
    ###
    def validate_year_date(field)
      formats = ["%d/%m/%Y", "%d-%m-%Y", "%d %m %Y", "%m/%d/%Y", "%m-%d-%Y", "%m %d %Y"]
      date = nil
      # Stop at the first format that parses.
      formats.find { |format| date = parse_date_with(field, format) }
      return false if date.nil?

      # The first run of digits in the ISO string is the year.
      date.to_s.scan(/\d*/).first.length == 4
    end

    ###
    # Checks that the field looks like a date once date parsing has succeeded.
    # Date parsing alone is too permissive for plain numbers such as "3000".
    ###
    def datetime_pattern(field)
      slashes = field.scan(/[0-9]\//)
      dashes = field.scan(/[0-9]\-/)
      spaces = field.scan(/[0-9] /)
      month_names = field.scan(/[0-9] [A-Z][a-z][a-z] [0-9]|[0-9]-[A-Z][a-z][a-z]-[0-9]|[0-9] [a-z][a-z][a-z] [0-9]|[0-9]-[a-z][a-z][a-z]-[0-9]/)
      separated = slashes.size == 2 || dashes.size == 2 || spaces.size == 2 || !month_names.empty?
      return false unless separated

      validate_year_date(field) ? true : false
    end

    ###
    # To determine the data-type of an input field
    ###
    def get_datatype(field)
      # Drop thousands separators before trying the numeric conversions.
      num = field.is_a?(String) ? field.delete(",") : field

      if integer_like?(num)
        # Integer() truncates a Float argument instead of rejecting it.
        return num.is_a?(Float) ? "float" : "int"
      end
      return "float" if float_like?(num)
      return "string" unless date_like?(field) && datetime_pattern(field)

      # A colon signals an hours:minutes part.
      field =~ /:/ ? "datetime" : "date"
    end

    # True when Integer() accepts +value+, or when +value+ is a String that is
    # a valid base-10 integer. The second check covers zero-padded decimals
    # such as "09", which Integer() rejects as invalid octal and which would
    # otherwise be reported as "float".
    def integer_like?(value)
      Integer(value)
      true
    rescue StandardError
      decimal_integer_string?(value)
    end

    # True when +value+ is a String that parses as a base-10 integer.
    def decimal_integer_string?(value)
      return false unless value.is_a?(String)

      Integer(value, 10)
      true
    rescue StandardError
      false
    end

    # True when Float() accepts +value+.
    def float_like?(value)
      Float(value)
      true
    rescue StandardError
      false
    end

    # True when Date.parse accepts +field+ or, failing that, when it matches
    # one of the month-first formats.
    def date_like?(field)
      Date.parse(field)
      true
    rescue StandardError
      ["%m/%d/%Y", "%m-%d-%Y", "%m %d %Y"].any? { |format| parse_date_with(field, format) }
    end

    # @return [Date, nil] the parsed date, or nil when +field+ does not match
    def parse_date_with(field, format)
      Date.strptime(field, format)
    rescue StandardError
      nil
    end
  end
end
@@ -0,0 +1,3 @@
1
# Signals that the input file does not exist.
class FileNotFound < StandardError
end

# Signals that the supplied input is not acceptable.
class InvalidInput < StandardError
end

# Signals that a method was called without all of its required arguments.
class MissingRequiredArguments < StandardError
end
@@ -0,0 +1,8 @@
1
+ require 'pry'
2
class String
  # Extends String to return how many times +needle+ occurs in the string.
  #
  # +needle+ is matched literally, so characters that are special in a regex
  # (such as "|" or ".") are counted as themselves.
  #
  # @param needle [#to_s] the substring to count
  # @return [Integer] number of non-overlapping occurrences
  def substr_count(needle)
    # String#scan with a String pattern does a literal search - no escaping needed.
    scan(needle.to_s).size
  end
end
@@ -0,0 +1,31 @@
1
+ require_relative "../helpers/errors"
2
+ require 'pry'
3
module CsvImportAnalyzer
  # Builds MySQL statement fragments for creating and loading a table.
  module MysqlQueryHelper
    # Builds the column definition for one header.
    #
    # @param args [Hash] :header (column name) and :datatype (Symbol or String)
    # @return [String, MissingRequiredArguments] "<header> <type>", where the
    #   "string" datatype becomes varchar(255). When :header or :datatype is
    #   nil, a MissingRequiredArguments instance is returned (not raised).
    def form_query_for_datatype(args)
      if args[:datatype].nil? || args[:header].nil?
        return MissingRequiredArguments.new("Required arguments missing for form_query_for_datatype")
      end

      # Compare as text so both :string and "string" are recognized.
      datatype = args[:datatype].to_s
      datatype = "varchar(255)" if datatype == "string"
      "#{args[:header]} #{datatype}"
    end

    # Builds the LOAD DATA statement that imports the CSV file.
    #
    # @param args [Hash] :tablename, :filename and :delimiter
    # @return [String, MissingRequiredArguments] the statement, or a
    #   MissingRequiredArguments instance (returned, not raised) when any of
    #   the three values is nil
    def import_csv(args)
      if args[:tablename].nil? || args[:filename].nil? || args[:delimiter].nil?
        return MissingRequiredArguments.new("Required arguments missing for import_csv")
      end

      [
        # The file name must be a quoted string literal in LOAD DATA.
        "LOAD DATA INFILE '#{args[:filename]}' INTO TABLE #{args[:tablename]}",
        "FIELDS TERMINATED BY '#{args[:delimiter]}'",
        "ENCLOSED BY '\"'",
        "LINES TERMINATED BY '\\n'",
        "IGNORE 1 LINES;"
      ].join(" ")
    end
  end
end
@@ -0,0 +1,27 @@
1
+ require_relative "../helpers/errors"
2
+ require "pry"
3
module CsvImportAnalyzer
  # Builds PostgreSQL statement fragments for creating and loading a table.
  module PgQueryHelper
    # Analyzer datatype names that need a different PostgreSQL type name.
    # PostgreSQL has no "datetime" type; "timestamp" is its equivalent.
    PG_TYPE_NAMES = { "string" => "varchar(255)", "datetime" => "timestamp" }.freeze

    # Builds the column definition for one header.
    #
    # @param args [Hash] :header (column name) and :datatype (Symbol or String)
    # @return [String, MissingRequiredArguments] "<header> <type>", with the
    #   type translated through PG_TYPE_NAMES where listed. When :header or
    #   :datatype is nil, a MissingRequiredArguments instance is returned
    #   (not raised).
    def form_query_for_datatype(args)
      if args[:datatype].nil? || args[:header].nil?
        return MissingRequiredArguments.new("Required arguments missing for form_query_for_datatype")
      end

      # Look up by text so both Symbol and String datatypes are recognized.
      datatype = args[:datatype].to_s
      "#{args[:header]} #{PG_TYPE_NAMES.fetch(datatype, datatype)}"
    end

    # Builds the COPY statement that imports the CSV file.
    #
    # @param args [Hash] :tablename, :filename and :delimiter
    # @return [String, MissingRequiredArguments] the statement, or a
    #   MissingRequiredArguments instance (returned, not raised) when any of
    #   the three values is nil
    def import_csv(args)
      if args[:tablename].nil? || args[:filename].nil? || args[:delimiter].nil?
        return MissingRequiredArguments.new("Required arguments missing for import_csv")
      end

      "COPY #{args[:tablename]} FROM '#{args[:filename]}' HEADER DELIMITER '#{args[:delimiter]}' CSV NULL AS 'NULL';"
    end
  end
end
@@ -0,0 +1,27 @@
1
+ require_relative "../helpers/errors"
2
+ require "pry"
3
module CsvImportAnalyzer
  # Builds generic statement fragments for creating and loading a table.
  module QueryHelper
    # Builds the column definition for one header.
    #
    # @param args [Hash] :header (column name) and :datatype (Symbol or String)
    # @return [String, MissingRequiredArguments] "<header> <type>", where the
    #   "string" datatype becomes varchar(255). When :header or :datatype is
    #   nil, a MissingRequiredArguments instance is returned (not raised).
    def form_query_for_datatype(args)
      if args[:datatype].nil? || args[:header].nil?
        # The error class defined in helpers/errors is MissingRequiredArguments.
        return MissingRequiredArguments.new("Required arguments missing for form_query_for_datatype")
      end

      # Compare as text so both :string and "string" are recognized.
      datatype = args[:datatype].to_s
      datatype = "varchar(255)" if datatype == "string"
      "#{args[:header]} #{datatype}"
    end

    # Builds the COPY statement that imports the CSV file.
    #
    # @param args [Hash] :tablename, :filename and :delimiter
    # @return [String, MissingRequiredArguments] the statement, or a
    #   MissingRequiredArguments instance (returned, not raised) when any of
    #   the three values is nil
    def import_csv(args)
      if args[:tablename].nil? || args[:filename].nil? || args[:delimiter].nil?
        return MissingRequiredArguments.new("Required arguments missing for import_csv")
      end

      "COPY #{args[:tablename]} FROM '#{args[:filename]}' HEADER DELIMITER '#{args[:delimiter]}' CSV NULL AS 'NULL';"
    end
  end
end