csv-import-analyzer 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +14 -0
- data/.rspec +3 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +35 -0
- data/Rakefile +2 -0
- data/csv-import-analyzer.gemspec +29 -0
- data/lib/csv-import-analyzer.rb +18 -0
- data/lib/csv-import-analyzer/analyzer/csv_check_bounds.rb +104 -0
- data/lib/csv-import-analyzer/analyzer/delimiter_identifier.rb +66 -0
- data/lib/csv-import-analyzer/analyzer/file_type_assertion.rb +29 -0
- data/lib/csv-import-analyzer/csv_datatype_analysis.rb +110 -0
- data/lib/csv-import-analyzer/csv_sanitizer.rb +86 -0
- data/lib/csv-import-analyzer/export/metadata_analysis.rb +156 -0
- data/lib/csv-import-analyzer/helpers/common_functions.rb +11 -0
- data/lib/csv-import-analyzer/helpers/datatype_validation.rb +85 -0
- data/lib/csv-import-analyzer/helpers/errors.rb +3 -0
- data/lib/csv-import-analyzer/helpers/string_class_extensions.rb +8 -0
- data/lib/csv-import-analyzer/query_builder/mysql_query_helper.rb +31 -0
- data/lib/csv-import-analyzer/query_builder/pg_query_helper.rb +27 -0
- data/lib/csv-import-analyzer/query_builder/query_helper.rb +27 -0
- data/lib/csv-import-analyzer/sampleTab.csv +5 -0
- data/lib/csv-import-analyzer/sql_query_builder.rb +125 -0
- data/lib/csv-import-analyzer/version.rb +5 -0
- data/lib/metadata_output.json +70 -0
- data/lib/sampleTab.csv +5 -0
- data/spec/csv-import-analyzer/analyzer/csv_check_bounds_spec.rb +43 -0
- data/spec/csv-import-analyzer/analyzer/delimiter_identifier_spec.rb +61 -0
- data/spec/csv-import-analyzer/analyzer/file_type_assertion_spec.rb +0 -0
- data/spec/csv-import-analyzer/csv_datatype_analysis_spec.rb +1 -0
- data/spec/csv-import-analyzer/csv_sanitizer_spec.rb +24 -0
- data/spec/csv-import-analyzer/export/metadata_analysis_spec.rb +0 -0
- data/spec/csv-import-analyzer/helpers/common_functions_spec.rb +31 -0
- data/spec/csv-import-analyzer/helpers/csv_check_bounds_spec.rb +3 -0
- data/spec/csv-import-analyzer/helpers/datatype_validation_spec.rb +75 -0
- data/spec/csv-import-analyzer/helpers/mysql_query_helper_spec.rb +0 -0
- data/spec/csv-import-analyzer/helpers/pq_query_helper_spec.rb +0 -0
- data/spec/csv-import-analyzer/helpers/string_class_extension_spec.rb +18 -0
- data/spec/csv-import-analyzer/query_builder/mysql_query_helper_spec.rb +54 -0
- data/spec/csv-import-analyzer/query_builder/pg_query_helper_spec.rb +55 -0
- data/spec/csv-import-analyzer_spec.rb +14 -0
- data/spec/fixtures/sample.csv +5 -0
- data/spec/fixtures/sample_options.yml +11 -0
- data/spec/fixtures/semicolon-sample.csv +5 -0
- data/spec/spec_helper.rb +84 -0
- metadata +208 -0
require "smarter_csv"
require "tempfile"
require_relative "analyzer/delimiter_identifier"
require_relative "helpers/string_class_extensions"
require_relative "helpers/common_functions"
require_relative "helpers/errors"
require_relative "csv_datatype_analysis"

module CsvImportAnalyzer
  # Pre-processes a CSV file before datatype analysis: detects the field
  # delimiter, converts single-quoted fields to double-quoted ones, and
  # normalizes null-like values to the literal "NULL".
  class CsvSanitizer
    include CsvImportAnalyzer::Helper
    include CsvImportAnalyzer::DelimiterIdentifier

    # Sanitizes +filename+ line by line, then hands off to datatype analysis.
    #
    # filename - path of the CSV file on disk.
    # options  - user option Hash, merged over #defaults (user values win).
    #
    # Returns the result of CsvDatatypeAnalysis#datatype_analysis, or a
    # FileNotFound instance when the file is missing.
    # NOTE(review): the FileNotFound error is returned, not raised — confirm
    # callers expect an error object as the return value.
    def process(filename, options)

      options = defaults.merge(options)
      if File.exist?(filename)
        options[:filename] = filename
        #first thing to do - find the delimiter of the file.
        delimiter = identify_delimiter(filename)
        options[:delimiter] = delimiter
        File.foreach(filename) do |line|
          #Check if the line is empty - no point in processing empty lines
          if line.length > 1
            line = replace_line_single_quotes(line,delimiter)
            begin
              # NOTE(review): the CSV constant comes from the csv stdlib,
              # presumably loaded transitively by smarter_csv — confirm, or
              # add an explicit require "csv" to this file.
              line = CSV.parse_line(line, {:col_sep => delimiter})
            rescue CSV::MalformedCSVError => error
              # Retry once assuming the failure was an unterminated quote;
              # if appending a closing quote still fails, the second
              # MalformedCSVError propagates uncaught.
              line = "#{line}\""
              line = CSV.parse_line(line, {:col_sep => delimiter})
            end
            # NOTE(review): the sanitized row is computed but never written
            # anywhere — the local `line` is discarded at the end of each
            # iteration, so this loop only validates; verify whether a
            # processed output file was intended here.
            line = replace_null_values(line)
          end
        end
        # Cleaned the file - Now analyze for datatypes
        CsvImportAnalyzer::CsvDatatypeAnalysis.new(options).datatype_analysis
      else
        FileNotFound.new
      end
    end

    private

    # Default option values used when the caller does not supply them.
    def defaults
      {
        :metadata_output => nil,      # where/whether to export the metadata report
        :processed_input => nil,
        :unique => 10,                # max distinct values tracked per column
        :check_bounds => true,
        :datatype_analysis => 200,    # rows sampled during datatype detection
        :chunk => 20,
        :database => [:pg, :mysql],   # target databases for query generation
        :quote_convert => true,
        :replace_nulls => true,
        :out_format => :json
      }
    end

    # Converts single-quoted fields ('foo') that sit between delimiters into
    # double-quoted fields ("foo") so the standard CSV parser accepts them.
    def replace_line_single_quotes(line, delimiter)
      # '|' is a regexp metacharacter, so escape it before interpolation.
      delimiter = "\\|" if delimiter == "|"
      pattern = "#{delimiter}'.*?'#{delimiter}" # set the pattern to opening and closing single quote found between delimiters
      # String#gsub without a block returns an Enumerator; calling #each with
      # a block then performs the substitution, replacing every match with
      # the block's return value (the rewritten match).
      res = line.gsub(/#{pattern}/)
      result = res.each { |match|
        replace = "#{delimiter}\""
        replace = "\|\"" if delimiter == "\\|"
        match = match.gsub(/^#{delimiter}'/,replace)
        replace = "\"#{delimiter}"
        replace = "\"\|" if delimiter == "\\|"
        match = match.gsub(/'#{delimiter}$/,replace)
      }
      result = result.gsub(/''/,'\'') #replace any single quote that might have been used twice to escape single quote before
      return result
    end

    # Replace all nil, "NAN", empty values with NULL for maintaining consistency during database import
    # line - the Array of fields returned by CSV.parse_line.
    # Returns the same Array with null-like slots replaced by "NULL".
    def replace_null_values(line)
      line.each do |value|
        if null_like?(value)
          # Array#index finds the first occurrence of the value; repeated
          # null-like values are each replaced on successive iterations.
          replace_index = line.index(value)
          line[replace_index] = "NULL"
        end
      end
      return line
    end
  end
end
require 'json'
require 'csv'

module CsvImportAnalyzer
  # Collects the results of the CSV analysis (file facts, applied clean-ups,
  # per-column datatypes/distinct values, generated SQL) into a single report
  # hash, optionally writes it to disk, and renders it as pretty JSON.
  #
  # Fixes: removed a leftover `binding.pry` (which halted csv_print_to_file
  # waiting for a debugger) and the production `require 'pry'`; added the
  # missing `require 'csv'`; `rescue Exception` narrowed to StandardError.
  class MetadataAnalysis
    # @metadata accumulates the report; writers kept for API compatibility.
    attr_accessor :metadata, :max_distinct_values
    attr_reader :options

    # options - the fully merged option hash produced upstream by the
    #           sanitizer/datatype-analysis steps.
    def initialize(options)
      @options = options
      @metadata = {}
    end

    # -- thin readers over the options hash ---------------------------------

    # Per-column datatype decisions, e.g. {"age" => :int}.
    def header_datatypes
      @options[:csv_column_datatypes]
    end

    # Per-column datatype tallies from the sampling pass.
    def header_datatype_analysis
      @options[:csv_datatype_analysis]
    end

    # Column names that contained null-like values.
    def nullable_columns
      @options[:nullable]
    end

    # Target databases, e.g. [:pg, :mysql].
    def databases
      @options[:database]
    end

    # Generated CREATE TABLE statements keyed by database.
    def create_queries
      @options[:create_query]
    end

    # Generated import statements keyed by database.
    def import_queries
      @options[:import_query]
    end

    # Distinct values seen per column. Note the key is :uniques (data),
    # distinct from :unique (the configured cap used below).
    def unique_values
      @options[:uniques]
    end

    # One more than the configured distinct-value cap; columns exceeding this
    # are summarized as "<cap>+" instead of listing every value.
    def max_distinct_values
      @max_distinct_values ||= Integer(options[:unique]) + 1
    end

    # Builds the report, writes it to disk when options[:metadata_output] is
    # set (format chosen by options[:out_format]), and returns the report as
    # a pretty-printed JSON string.
    def metadata_print
      build_metadata_output
      if options[:metadata_output]
        json_print_to_file if options[:out_format] == :json
        csv_print_to_file if options[:out_format] == :csv
      end
      JSON.pretty_generate(metadata)
    end

    private

    # Writes the report as JSON; block form guarantees the handle is closed.
    def json_print_to_file
      File.open("metadata_output.json", "w") do |outfile|
        outfile << JSON.pretty_generate(metadata)
      end
    end

    # Writes the report as (crude) CSV: top-level keys on their own row,
    # nested hashes flattened recursively by #print_hash_to_csv.
    def csv_print_to_file
      CSV.open("metadata_output.csv", "w") do |csv|
        metadata.each do |key, value|
          if value.is_a?(Hash)
            csv << [key]
            print_hash_to_csv(value, csv)
          else
            csv << [key, value]
          end
        end
      end
    end

    # Recursively emits nested hash content, one row per key or leaf value.
    def print_hash_to_csv(hash, csv_handler)
      if hash.is_a?(Hash)
        hash.each do |key, value|
          csv_handler << [key]
          print_hash_to_csv(value, csv_handler)
        end
      else
        csv_handler << [hash]
      end
    end

    # Assembles the four top-level report sections.
    def build_metadata_output
      metadata[:csv_file] = add_file_metadata
      metadata[:data_manipulations] = add_data_manipulations
      metadata[:csv_headers] = add_header_metadata
      metadata[:sql] = add_sql_data
    end

    # Basic facts about the analyzed file.
    def add_file_metadata
      {
        :filename => File.basename(options[:filename]),
        :file_size => File.size(options[:filename]),
        :record_delimiter => options[:delimiter]
      }
    end

    # Which clean-up transformations were applied to the input.
    def add_data_manipulations
      {
        :replace_nulls => options[:replace_nulls],
        :replace_quotes => options[:quote_convert]
      }
    end

    # Per-column section: datatype, analysis tallies, distinct values
    # (capped at max_distinct_values) and nullability.
    def add_header_metadata
      columns = {}
      header_datatypes.keys.each do |column_name|
        begin
          columns[column_name] = {}
          columns[column_name][:datatype] = header_datatypes[column_name]
          columns[column_name][:datatype_analysis] = header_datatype_analysis[column_name]
          if unique_values[column_name].size > max_distinct_values
            columns[column_name][:distinct_values] = "#{max_distinct_values}+"
          else
            columns[column_name][:distinct_values] = unique_values[column_name]
          end
          columns[column_name][:nullable] = true if nullable_columns.include?(column_name)
        rescue StandardError => e
          # One malformed column should not sink the whole report.
          puts e
        end
      end
      columns
    end

    # SQL section: create/import statements per target database.
    def add_sql_data
      sql = {}
      databases.each do |db|
        sql[db] = {
          :create_query => create_queries[db],
          :import_query => import_queries[db]
        }
      end
      sql
    end

  end
end
require "date"

module CsvImportAnalyzer
  # Infers a coarse SQL-ish datatype for a single CSV field value:
  # "int", "float", "date", "datetime" or "string".
  #
  # Fixes: added the missing `require "date"` (Date was previously only
  # available via transitive loading) and removed the production
  # `require 'pry'`; the unreadable chained `rescue`-modifier date check is
  # extracted into #parseable_date? with identical behavior.
  module DatatypeValidator

    # Public entry point: returns the datatype name for +content+ as a String.
    def validate_field(content)
      get_datatype(content)
    end

    private

    ###
    # Date.parse("12/31/20145234", "%m/%d/%Y") => true which is not supposed
    # to be true (although technically its true).
    # Validate that the year part has exactly 4 digits.
    ###
    def validate_year_date(field)
      formats = ["%d/%m/%Y", "%d-%m-%Y", "%d %m %Y", "%m/%d/%Y", "%m-%d-%Y", "%m %d %Y"]
      date = nil
      formats.each do |format|
        if (Date.strptime(field, format) rescue false)
          date = Date.strptime(field, format)
          break
        end
      end
      return false if date.nil?
      # Date#to_s is ISO-8601 (YYYY-MM-DD); the first run of digits is the
      # year, which must be 4 characters long.
      date.to_s.scan(/\d*/).first.length == 4
    end

    ###
    # To check for a Date-like pattern after parsing succeeds:
    # Date.parse("3000") => true which is not supposed to be true.
    ###
    def datetime_pattern(field)
      slash_pairs = field.scan(%r{[0-9]/})
      dash_pairs = field.scan(/[0-9]-/)
      space_pairs = field.scan(/[0-9] /)
      # e.g. "12 Dec 2014" / "12-Dec-2014" style month-name dates.
      month_names = field.scan(/[0-9] [A-Z][a-z][a-z] [0-9]|[0-9]-[A-Z][a-z][a-z]-[0-9]|[0-9] [a-z][a-z][a-z] [0-9]|[0-9]-[a-z][a-z][a-z]-[0-9]/)
      if slash_pairs.size == 2 || dash_pairs.size == 2 || space_pairs.size == 2 || !month_names.empty?
        validate_year_date(field)
      else
        false
      end
    end

    # True when the field parses as a date via Date.parse or one of the
    # common US-style formats. Replaces a fragile chain of `rescue` modifiers
    # with the same net behavior.
    def parseable_date?(field)
      return true if (Date.parse(field) rescue false)
      ["%m/%d/%Y", "%m-%d-%Y", "%m %d %Y"].any? do |fmt|
        (Date.strptime(field, fmt) rescue false)
      end
    end

    ###
    # To determine the data-type of an input field.
    ###
    def get_datatype(field)
      # Strip thousands separators so "1,234" parses as a number.
      num = (field.is_a?(String) ? field.gsub(/,/, "") : field)
      if (Integer(num) rescue false)
        # A Float object (non-String input) also coerces via Integer();
        # report it as float, not int.
        return num.is_a?(Float) ? "float" : "int"
      elsif (Float(num) rescue false)
        return "float"
      elsif parseable_date?(field)
        if datetime_pattern(field)
          # A ":" suggests an hours:minutes component.
          return field =~ /:/ ? "datetime" : "date"
        end
      end
      "string"
    end
  end
end
require_relative "../helpers/errors"
module CsvImportAnalyzer
  # Builds MySQL-flavoured column definitions and LOAD DATA import
  # statements. (Removed a leftover production `require 'pry'`.)
  module MysqlQueryHelper

    # Builds a "<header> <sql type>" column-definition fragment; :string maps
    # to varchar(255).
    # Returns the fragment, or a MissingRequiredArguments instance when the
    # required keys are absent (returned, not raised — matching this gem's
    # other query helpers; NOTE(review): confirm callers expect that).
    def form_query_for_datatype(args)
      if args[:datatype].nil? || args[:header].nil?
        return MissingRequiredArguments.new("Required arguments missing for form_query_for_datatype")
      end
      return "#{args[:header]} varchar(255)" if args[:datatype] == :string
      "#{args[:header]} #{args[:datatype]}"
    end

    # Builds a LOAD DATA INFILE statement for importing the sanitized CSV.
    # Returns the SQL string, or a MissingRequiredArguments instance when
    # required keys are absent.
    def import_csv(args)
      if args[:tablename].nil? || args[:filename].nil? || args[:delimiter].nil?
        return MissingRequiredArguments.new("Required arguments missing for import_csv")
      end
      # Bug fix: MySQL requires the file path to be a quoted string literal;
      # it was previously interpolated bare, producing invalid SQL.
      "LOAD DATA INFILE '#{args[:filename]}' INTO TABLE #{args[:tablename]} " \
        "FIELDS TERMINATED BY '#{args[:delimiter]}' " \
        "ENCLOSED BY '\"' " \
        "LINES TERMINATED BY '\\n' " \
        "IGNORE 1 LINES;"
    end

  end
end
require_relative "../helpers/errors"
require "pry"
module CsvImportAnalyzer
  # Query-fragment helpers for PostgreSQL: column definitions and COPY
  # import statements.
  module PgQueryHelper

    # Maps a column header and datatype to a "<header> <sql type>" fragment;
    # :string maps to varchar(255). When required keys are missing, a
    # MissingRequiredArguments instance is returned instead.
    def form_query_for_datatype(args)
      if args[:datatype].nil? || args[:header].nil?
        return MissingRequiredArguments.new("Required arguments missing for form_query_for_datatype")
      end
      column_type = args[:datatype] == :string ? "varchar(255)" : args[:datatype].to_s
      "#{args[:header]} #{column_type}"
    end

    # Builds a Postgres COPY statement for importing the sanitized CSV.
    # When required keys are missing, a MissingRequiredArguments instance is
    # returned instead.
    def import_csv(args)
      missing = args[:tablename].nil? || args[:filename].nil? || args[:delimiter].nil?
      return MissingRequiredArguments.new("Required arguments missing for import_csv") if missing
      "COPY #{args[:tablename]} FROM '#{args[:filename]}' HEADER DELIMITER '#{args[:delimiter]}' CSV NULL AS 'NULL';"
    end

  end
end
require_relative "../helpers/errors"
module CsvImportAnalyzer
  # Generic query-building helpers.
  # NOTE(review): this module duplicates PgQueryHelper almost verbatim
  # (including the Postgres COPY statement) — consider extracting the shared
  # code. (Removed a leftover production `require "pry"`.)
  module QueryHelper

    # Builds a "<header> <sql type>" column-definition fragment; :string maps
    # to varchar(255).
    # Returns the fragment, or a MissingRequiredArguments instance when the
    # required keys are absent.
    def form_query_for_datatype(args)
      if args[:datatype].nil? || args[:header].nil?
        # Consistency fix: both sibling helpers use MissingRequiredArguments;
        # the previous MissingRequiredArgumentsError constant is used nowhere
        # else — verify against helpers/errors.rb.
        return MissingRequiredArguments.new("Required arguments missing for form_query_for_datatype")
      end
      return "#{args[:header]} varchar(255)" if args[:datatype] == :string
      "#{args[:header]} #{args[:datatype]}"
    end

    # Builds a Postgres COPY statement for importing the sanitized CSV.
    # Returns the SQL string, or a MissingRequiredArguments instance when
    # required keys are absent.
    def import_csv(args)
      if args[:tablename].nil? || args[:filename].nil? || args[:delimiter].nil?
        return MissingRequiredArguments.new("Required arguments missing for import_csv")
      end
      "COPY #{args[:tablename]} FROM '#{args[:filename]}' HEADER DELIMITER '#{args[:delimiter]}' CSV NULL AS 'NULL';"
    end

  end
end