csv-import-analyzer 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +14 -0
- data/.rspec +3 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +35 -0
- data/Rakefile +2 -0
- data/csv-import-analyzer.gemspec +29 -0
- data/lib/csv-import-analyzer.rb +18 -0
- data/lib/csv-import-analyzer/analyzer/csv_check_bounds.rb +104 -0
- data/lib/csv-import-analyzer/analyzer/delimiter_identifier.rb +66 -0
- data/lib/csv-import-analyzer/analyzer/file_type_assertion.rb +29 -0
- data/lib/csv-import-analyzer/csv_datatype_analysis.rb +110 -0
- data/lib/csv-import-analyzer/csv_sanitizer.rb +86 -0
- data/lib/csv-import-analyzer/export/metadata_analysis.rb +156 -0
- data/lib/csv-import-analyzer/helpers/common_functions.rb +11 -0
- data/lib/csv-import-analyzer/helpers/datatype_validation.rb +85 -0
- data/lib/csv-import-analyzer/helpers/errors.rb +3 -0
- data/lib/csv-import-analyzer/helpers/string_class_extensions.rb +8 -0
- data/lib/csv-import-analyzer/query_builder/mysql_query_helper.rb +31 -0
- data/lib/csv-import-analyzer/query_builder/pg_query_helper.rb +27 -0
- data/lib/csv-import-analyzer/query_builder/query_helper.rb +27 -0
- data/lib/csv-import-analyzer/sampleTab.csv +5 -0
- data/lib/csv-import-analyzer/sql_query_builder.rb +125 -0
- data/lib/csv-import-analyzer/version.rb +5 -0
- data/lib/metadata_output.json +70 -0
- data/lib/sampleTab.csv +5 -0
- data/spec/csv-import-analyzer/analyzer/csv_check_bounds_spec.rb +43 -0
- data/spec/csv-import-analyzer/analyzer/delimiter_identifier_spec.rb +61 -0
- data/spec/csv-import-analyzer/analyzer/file_type_assertion_spec.rb +0 -0
- data/spec/csv-import-analyzer/csv_datatype_analysis_spec.rb +1 -0
- data/spec/csv-import-analyzer/csv_sanitizer_spec.rb +24 -0
- data/spec/csv-import-analyzer/export/metadata_analysis_spec.rb +0 -0
- data/spec/csv-import-analyzer/helpers/common_functions_spec.rb +31 -0
- data/spec/csv-import-analyzer/helpers/csv_check_bounds_spec.rb +3 -0
- data/spec/csv-import-analyzer/helpers/datatype_validation_spec.rb +75 -0
- data/spec/csv-import-analyzer/helpers/mysql_query_helper_spec.rb +0 -0
- data/spec/csv-import-analyzer/helpers/pq_query_helper_spec.rb +0 -0
- data/spec/csv-import-analyzer/helpers/string_class_extension_spec.rb +18 -0
- data/spec/csv-import-analyzer/query_builder/mysql_query_helper_spec.rb +54 -0
- data/spec/csv-import-analyzer/query_builder/pg_query_helper_spec.rb +55 -0
- data/spec/csv-import-analyzer_spec.rb +14 -0
- data/spec/fixtures/sample.csv +5 -0
- data/spec/fixtures/sample_options.yml +11 -0
- data/spec/fixtures/semicolon-sample.csv +5 -0
- data/spec/spec_helper.rb +84 -0
- metadata +208 -0
@@ -0,0 +1,86 @@
|
|
1
|
+
require "smarter_csv"
|
2
|
+
require "tempfile"
|
3
|
+
require_relative "analyzer/delimiter_identifier"
|
4
|
+
require_relative "helpers/string_class_extensions"
|
5
|
+
require_relative "helpers/common_functions"
|
6
|
+
require_relative "helpers/errors"
|
7
|
+
require_relative "csv_datatype_analysis"
|
8
|
+
|
9
|
+
module CsvImportAnalyzer
  # Runs a sanity pass over a CSV file (quote conversion, CSV parsing, null
  # replacement on every non-empty line) and then starts the datatype analysis.
  #
  # NOTE(review): the sanitized lines are computed but never written anywhere
  # in this class; the loop currently only proves that each line can be
  # processed. Confirm whether the cleaned rows were meant to be persisted
  # (e.g. via options[:processed_input]).
  class CsvSanitizer
    include CsvImportAnalyzer::Helper
    include CsvImportAnalyzer::DelimiterIdentifier

    # Sanitizes +filename+ line by line and kicks off the datatype analysis.
    #
    # filename - path of the CSV file to inspect.
    # options  - Hash of user options; merged over #defaults.
    #
    # Returns whatever CsvDatatypeAnalysis#datatype_analysis returns, or a
    # FileNotFound instance when the file does not exist.
    # NOTE(review): the FileNotFound object is returned, not raised; kept
    # as-is because callers may rely on the return value.
    def process(filename, options)
      options = defaults.merge(options)
      return FileNotFound.new unless File.exist?(filename)

      options[:filename] = filename
      # First thing to do - find the delimiter of the file.
      delimiter = identify_delimiter(filename)
      options[:delimiter] = delimiter

      File.foreach(filename) do |line|
        # Skip blank lines. chomp (rather than a length check) keeps a
        # one-character last line that has no trailing newline.
        next if line.chomp.empty?

        line = replace_line_single_quotes(line, delimiter)
        fields = parse_csv_line(line, delimiter)
        # CSV.parse_line may yield nil for input it considers empty.
        replace_null_values(fields) if fields
      end

      # Cleaned the file - now analyze for datatypes.
      CsvImportAnalyzer::CsvDatatypeAnalysis.new(options).datatype_analysis
    end

    private

    # Default option values; anything supplied by the caller overrides these.
    def defaults
      {
        :metadata_output => nil,
        :processed_input => nil,
        :unique => 10,
        :check_bounds => true,
        :datatype_analysis => 200,
        :chunk => 20,
        :database => [:pg, :mysql],
        :quote_convert => true,
        :replace_nulls => true,
        :out_format => :json
      }
    end

    # Parses one line into an array of fields. If the line is malformed, a
    # closing double quote is appended and the parse is retried once; a second
    # CSV::MalformedCSVError propagates to the caller.
    #
    # Options are passed as keywords: handing CSV.parse_line a positional Hash
    # raises ArgumentError on Ruby 3.
    def parse_csv_line(line, delimiter)
      CSV.parse_line(line, col_sep: delimiter)
    rescue CSV::MalformedCSVError
      CSV.parse_line("#{line}\"", col_sep: delimiter)
    end

    # Converts fields wrapped in single quotes and surrounded by delimiters on
    # both sides (e.g. ,'abc',) into double-quoted fields (,"abc",), then
    # collapses every doubled single quote ('') in the line into one.
    #
    # The delimiter is regex-escaped so that delimiters such as "|" or "."
    # are matched literally.
    def replace_line_single_quotes(line, delimiter)
      separator = Regexp.escape(delimiter)
      edge = delimiter.length + 1 # delimiter plus the adjacent single quote
      converted = line.gsub(/#{separator}'.*?'#{separator}/) do |match|
        inner = match[edge...(match.length - edge)]
        "#{delimiter}\"#{inner}\"#{delimiter}"
      end
      # Replace any single quote that was doubled to escape it.
      converted.gsub(/''/, '\'')
    end

    # Replaces every value for which null_like? is true with the string "NULL"
    # for consistency during database import. Mutates and returns +line+.
    # null_like? is not defined in this class; presumably it comes from the
    # included Helper module.
    def replace_null_values(line)
      line.map! { |value| null_like?(value) ? "NULL" : value }
    end
  end
end
|
@@ -0,0 +1,156 @@
|
|
1
|
+
require 'pry'
|
2
|
+
require 'json'
|
3
|
+
module CsvImportAnalyzer
  # Assembles a metadata report (file info, applied data manipulations,
  # per-column details and SQL statements) from the options hash built up by
  # the earlier analysis steps, and optionally writes it to disk.
  class MetadataAnalysis
    attr_reader :options
    attr_accessor :metadata
    attr_writer :max_distinct_values

    # options - Hash carrying both user options and analysis results; the
    #           keys read by this class are visible in the reader methods below.
    def initialize(options)
      @options = options
      @metadata = {}
    end

    def header_datatypes
      @options[:csv_column_datatypes]
    end

    def header_datatype_analysis
      @options[:csv_datatype_analysis]
    end

    def nullable_columns
      @options[:nullable]
    end

    def databases
      @options[:database]
    end

    def create_queries
      @options[:create_query]
    end

    def import_queries
      @options[:import_query]
    end

    def unique_values
      @options[:uniques]
    end

    # Threshold used when reporting distinct values: options[:unique] + 1,
    # unless a value was assigned through the writer.
    def max_distinct_values
      @max_distinct_values ||= Integer(options[:unique]) + 1
    end

    # Builds the metadata hash, writes it to a file when
    # options[:metadata_output] is truthy (format chosen by
    # options[:out_format], :json or :csv) and returns the metadata as a
    # pretty-printed JSON string.
    def metadata_print
      build_metadata_output
      if options[:metadata_output]
        json_print_to_file if options[:out_format] == :json
        csv_print_to_file if options[:out_format] == :csv
      end
      JSON.pretty_generate(metadata)
    end

    private

    # Writes the metadata as JSON to metadata_output.json in the current
    # working directory. The block form closes the file even on error.
    def json_print_to_file
      File.open("metadata_output.json", "w") do |outfile|
        outfile << JSON.pretty_generate(metadata)
      end
    end

    # Writes the metadata to metadata_output.csv in the current working
    # directory. Nested hashes are flattened one key/value per row, so the
    # output is not pretty.
    def csv_print_to_file
      CSV.open("metadata_output.csv", "w") do |csv|
        metadata.each do |key, value|
          if value.is_a?(Hash)
            csv << [key]
            print_hash_to_csv(value, csv)
          else
            csv << [key, value]
          end
        end
      end
    end

    # Recursively emits a hash: each key on its own row followed by its value
    # (or, for nested hashes, the nested rows).
    def print_hash_to_csv(hash, csv_handler)
      if hash.is_a?(Hash)
        hash.each do |key, value|
          csv_handler << [key]
          print_hash_to_csv(value, csv_handler)
        end
      else
        csv_handler << [hash]
      end
    end

    def build_metadata_output
      metadata[:csv_file] = add_file_metadata
      metadata[:data_manipulations] = add_data_manipulations
      metadata[:csv_headers] = add_header_metadata
      metadata[:sql] = add_sql_data
    end

    def add_file_metadata
      {
        :filename => File.basename(options[:filename]),
        :file_size => File.size(options[:filename]),
        :record_delimiter => options[:delimiter]
      }
    end

    def add_data_manipulations
      {
        :replace_nulls => options[:replace_nulls],
        :replace_quotes => options[:quote_convert]
      }
    end

    # Builds the per-column section. A failure while describing one column is
    # printed and that column is left partially filled; remaining columns are
    # still processed. Only StandardError is rescued so that signals and
    # exit requests are not swallowed.
    def add_header_metadata
      columns = {}
      header_datatypes.keys.each do |column_name|
        begin
          column = columns[column_name] = {}
          column[:datatype] = header_datatypes[column_name]
          column[:datatype_analysis] = header_datatype_analysis[column_name]
          column[:distinct_values] =
            if unique_values[column_name].size > max_distinct_values
              "#{max_distinct_values}+"
            else
              unique_values[column_name]
            end
          column[:nullable] = true if nullable_columns.include?(column_name)
        rescue StandardError => e
          puts e
        end
      end
      columns
    end

    def add_sql_data
      sql = {}
      databases.each do |db|
        sql[db] = {
          :create_query => create_queries[db],
          :import_query => import_queries[db]
        }
      end
      sql
    end
  end
end
|
@@ -0,0 +1,85 @@
|
|
1
|
+
require 'pry'
|
2
|
+
module CsvImportAnalyzer
  # Mixin that classifies a single CSV field as "int", "float", "date",
  # "datetime" or "string".
  module DatatypeValidator

    # Returns the datatype name (a String) detected for +content+.
    def validate_field(content)
      get_datatype(content)
    end

    private

    ###
    # Date.strptime("12/31/20145234", "%m/%d/%Y") succeeds, which is not
    # useful here (although technically valid), so the parsed year must be
    # exactly 4 digits long.
    #
    # Returns true when +field+ parses with one of the supported day/month/year
    # layouts and the resulting year has 4 digits, false otherwise.
    ###
    def validate_year_date(field)
      formats = ["%d/%m/%Y", "%d-%m-%Y", "%d %m %Y", "%m/%d/%Y", "%m-%d-%Y", "%m %d %Y"]
      date = nil
      formats.each do |format|
        begin
          date = Date.strptime(field, format)
          break
        rescue StandardError
          next
        end
      end
      return false if date.nil?

      # Date#to_s starts with the year; the first run of digits is the year.
      date.to_s.scan(/\d*/)[0].length == 4
    end

    ###
    # Checks the shape of the text after a date parse succeeded, because the
    # parsers accept inputs (e.g. "3000") that should not count as dates.
    # The field must contain exactly two digit+"/" pairs, two digit+"-" pairs,
    # two digit+space pairs, or a day-month-name-year fragment, and must also
    # pass validate_year_date.
    ###
    def datetime_pattern(field)
      slashes = field.scan(/[0-9]\//)
      dashes = field.scan(/[0-9]\-/)
      spaces = field.scan(/[0-9] /)
      month_names = field.scan(/[0-9] [A-Z][a-z][a-z] [0-9]|[0-9]-[A-Z][a-z][a-z]-[0-9]|[0-9] [a-z][a-z][a-z] [0-9]|[0-9]-[a-z][a-z][a-z]-[0-9]/)
      shaped_like_date = slashes.size == 2 || dashes.size == 2 || spaces.size == 2 || month_names.size != 0
      shaped_like_date && validate_year_date(field) ? true : false
    end

    # True when +value+ converts to an Integer. Strings are parsed in base 10
    # so that zero-padded numbers such as "08" count as integers instead of
    # failing octal parsing (and then being reported as floats).
    def integer_value?(value)
      value.is_a?(String) ? Integer(value, 10) : Integer(value)
      true
    rescue StandardError
      false
    end

    # True when +value+ converts to a Float.
    def float_value?(value)
      Float(value)
      true
    rescue StandardError
      false
    end

    # True when +field+ is accepted by Date.parse or by one of the
    # month-first strptime layouts.
    def date_candidate?(field)
      attempts = [
        lambda { Date.parse(field) },
        lambda { Date.strptime(field, '%m/%d/%Y') },
        lambda { Date.strptime(field, '%m-%d-%Y') },
        lambda { Date.strptime(field, '%m %d %Y') }
      ]
      attempts.any? do |attempt|
        begin
          attempt.call
          true
        rescue StandardError
          false
        end
      end
    end

    ###
    # Determines the datatype of an input field.
    # Commas are stripped from strings before the numeric checks so that
    # "1,234" is treated as a number.
    ###
    def get_datatype(field)
      num = field.is_a?(String) ? field.gsub(/,/, '') : field

      if integer_value?(num)
        # Integer() also accepts Float objects (by truncation); keep those as floats.
        return num.is_a?(Float) ? "float" : "int"
      end
      return "float" if float_value?(num)

      if date_candidate?(field) && datetime_pattern(field)
        # A colon indicates an Hours:minutes part.
        return field =~ /:/ ? "datetime" : "date"
      end
      "string"
    end
  end
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
require_relative "../helpers/errors"
|
2
|
+
require 'pry'
|
3
|
+
module CsvImportAnalyzer
  # Builds MySQL-flavoured SQL fragments for table creation and CSV import.
  module MysqlQueryHelper

    # Returns the column definition "<header> <type>" for a CREATE TABLE
    # statement; the "string" datatype is mapped to varchar(255).
    #
    # args - Hash with :header (column name) and :datatype. The datatype may
    #        be a Symbol or a String; both :string and "string" are mapped.
    #
    # NOTE(review): when an argument is missing, a MissingRequiredArguments
    # object is returned rather than raised; kept as-is for compatibility.
    def form_query_for_datatype(args)
      if args[:datatype].nil? || args[:header].nil?
        return MissingRequiredArguments.new("Required arguments missing for form_query_for_datatype")
      end

      datatype = args[:datatype].to_s
      column_type = datatype == "string" ? "varchar(255)" : datatype
      "#{args[:header]} #{column_type}"
    end

    # Returns a LOAD DATA INFILE statement importing :filename into
    # :tablename with :delimiter as the field separator, skipping the header
    # line.
    #
    # The file name is emitted as a quoted string literal, as the MySQL
    # grammar requires.
    # NOTE(review): values are interpolated verbatim; a quote character inside
    # any argument is not escaped.
    #
    # Returns a MissingRequiredArguments object (not raised) when an argument
    # is missing.
    def import_csv(args)
      if args[:tablename].nil? || args[:filename].nil? || args[:delimiter].nil?
        return MissingRequiredArguments.new("Required arguments missing for import_csv")
      end

      "LOAD DATA INFILE '#{args[:filename]}' INTO TABLE #{args[:tablename]} " \
        "FIELDS TERMINATED BY '#{args[:delimiter]}' " \
        "ENCLOSED BY '\"' " \
        "LINES TERMINATED BY '\\n' " \
        "IGNORE 1 LINES;"
    end
  end
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
require_relative "../helpers/errors"
|
2
|
+
require "pry"
|
3
|
+
module CsvImportAnalyzer
  # Builds PostgreSQL-flavoured SQL fragments for table creation and CSV import.
  module PgQueryHelper

    # Returns the column definition "<header> <type>" for a CREATE TABLE
    # statement; the "string" datatype is mapped to varchar(255).
    #
    # args - Hash with :header (column name) and :datatype. The datatype may
    #        be a Symbol or a String; both :string and "string" are mapped.
    #
    # NOTE(review): when an argument is missing, a MissingRequiredArguments
    # object is returned rather than raised; kept as-is for compatibility.
    def form_query_for_datatype(args)
      if args[:datatype].nil? || args[:header].nil?
        return MissingRequiredArguments.new("Required arguments missing for form_query_for_datatype")
      end

      datatype = args[:datatype].to_s
      column_type = datatype == "string" ? "varchar(255)" : datatype
      "#{args[:header]} #{column_type}"
    end

    # Returns a COPY ... FROM statement importing :filename into :tablename
    # with :delimiter as the field separator, treating the first line as a
    # header and the text NULL as SQL NULL.
    # NOTE(review): values are interpolated verbatim; a quote character inside
    # any argument is not escaped.
    #
    # Returns a MissingRequiredArguments object (not raised) when an argument
    # is missing.
    def import_csv(args)
      if args[:tablename].nil? || args[:filename].nil? || args[:delimiter].nil?
        return MissingRequiredArguments.new("Required arguments missing for import_csv")
      end

      "COPY #{args[:tablename]} FROM '#{args[:filename]}' HEADER DELIMITER '#{args[:delimiter]}' CSV NULL AS 'NULL';"
    end
  end
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
require_relative "../helpers/errors"
|
2
|
+
require "pry"
|
3
|
+
module CsvImportAnalyzer
  # Generic SQL fragment builder; the statements it produces use the
  # PostgreSQL COPY syntax.
  #
  # NOTE(review): this module refers to MissingRequiredArgumentsError while
  # MysqlQueryHelper and PgQueryHelper refer to MissingRequiredArguments.
  # helpers/errors.rb is not visible here, so confirm which constant exists.
  module QueryHelper

    # Returns the column definition "<header> <type>" for a CREATE TABLE
    # statement; the "string" datatype is mapped to varchar(255).
    #
    # args - Hash with :header (column name) and :datatype. The datatype may
    #        be a Symbol or a String; both :string and "string" are mapped.
    #
    # When an argument is missing, an error object is returned rather than
    # raised; kept as-is for compatibility.
    def form_query_for_datatype(args)
      if args[:datatype].nil? || args[:header].nil?
        return MissingRequiredArgumentsError.new("Required arguments missing for form_query_for_datatype")
      end

      datatype = args[:datatype].to_s
      column_type = datatype == "string" ? "varchar(255)" : datatype
      "#{args[:header]} #{column_type}"
    end

    # Returns a COPY ... FROM statement importing :filename into :tablename
    # with :delimiter as the field separator, treating the first line as a
    # header and the text NULL as SQL NULL.
    # NOTE(review): values are interpolated verbatim; a quote character inside
    # any argument is not escaped.
    #
    # Returns an error object (not raised) when an argument is missing.
    def import_csv(args)
      if args[:tablename].nil? || args[:filename].nil? || args[:delimiter].nil?
        return MissingRequiredArgumentsError.new("Required arguments missing for import_csv")
      end

      "COPY #{args[:tablename]} FROM '#{args[:filename]}' HEADER DELIMITER '#{args[:delimiter]}' CSV NULL AS 'NULL';"
    end
  end
end
|