csv-import-analyzer 0.0.3 → 0.0.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/README.md +8 -1
- data/csv-import-analyzer.gemspec +1 -1
- data/lib/csv-import-analyzer.rb +6 -4
- data/lib/csv-import-analyzer/analyzer/csv_check_bounds.rb +30 -19
- data/lib/csv-import-analyzer/analyzer/delimiter_identifier.rb +44 -24
- data/lib/csv-import-analyzer/analyzer/file_type_assertion.rb +1 -5
- data/lib/csv-import-analyzer/csv_datatype_analysis.rb +25 -9
- data/lib/csv-import-analyzer/csv_sanitizer.rb +67 -17
- data/lib/csv-import-analyzer/export/metadata_analysis.rb +63 -7
- data/lib/csv-import-analyzer/helpers/common_functions.rb +4 -0
- data/lib/csv-import-analyzer/helpers/datatype_validation.rb +6 -6
- data/lib/csv-import-analyzer/helpers/string_class_extensions.rb +9 -3
- data/lib/csv-import-analyzer/query_builder/mysql_query_helper.rb +2 -2
- data/lib/csv-import-analyzer/query_builder/pg_query_helper.rb +1 -2
- data/lib/csv-import-analyzer/query_builder/query_helper.rb +2 -2
- data/lib/csv-import-analyzer/sql_query_builder.rb +27 -12
- data/lib/csv-import-analyzer/version.rb +1 -1
- data/spec/csv-import-analyzer/analyzer/csv_check_bounds_spec.rb +8 -8
- data/spec/csv-import-analyzer/analyzer/delimiter_identifier_spec.rb +13 -13
- data/spec/csv-import-analyzer/csv_sanitizer_spec.rb +10 -7
- data/spec/csv-import-analyzer/helpers/common_functions_spec.rb +20 -19
- data/spec/csv-import-analyzer/helpers/datatype_validation_spec.rb +28 -28
- data/spec/csv-import-analyzer/helpers/string_class_extension_spec.rb +6 -6
- data/spec/csv-import-analyzer/query_builder/mysql_query_helper_spec.rb +13 -13
- data/spec/csv-import-analyzer/query_builder/pg_query_helper_spec.rb +16 -16
- data/spec/csv-import-analyzer_spec.rb +3 -6
- data/spec/fixtures/sample.csv +2 -2
- data/spec/spec_helper.rb +3 -0
- metadata +17 -6
- data/lib/csv-import-analyzer/sampleTab.csv +0 -5
- data/samples/metadata_output.json +0 -70
- data/spec/csv-import-analyzer/csv_datatype_analysis_spec.rb +0 -1
@@ -1,4 +1,3 @@
|
|
1
|
-
require 'pry'
|
2
1
|
require 'json'
|
3
2
|
module CsvImportAnalyzer
|
4
3
|
class MetadataAnalysis
|
@@ -48,6 +47,11 @@ module CsvImportAnalyzer
|
|
48
47
|
@max_distinct_values ||= Integer(options[:unique]) + 1
|
49
48
|
end
|
50
49
|
|
50
|
+
###
|
51
|
+
# Builds the metadata from the analysis done so far
|
52
|
+
# Creates a new json file with the analysis added to it if options[:metadata_output] is set
|
53
|
+
# returns JSON object of the metadata
|
54
|
+
###
|
51
55
|
def metadata_print
|
52
56
|
build_metadata_output
|
53
57
|
if options[:metadata_output]
|
@@ -64,16 +68,22 @@ module CsvImportAnalyzer
|
|
64
68
|
|
65
69
|
private
|
66
70
|
|
71
|
+
###
|
72
|
+
# Create or overwrite the metadata_output.json file if it already exists
|
73
|
+
# Write the metadata to the file and close it
|
74
|
+
###
|
67
75
|
def json_print_to_file
|
68
76
|
outfile = File.open("metadata_output.json", "w")
|
69
77
|
outfile << JSON.pretty_generate(metadata)
|
70
78
|
outfile.close
|
71
79
|
end
|
72
80
|
|
73
|
-
|
81
|
+
###
|
82
|
+
# Printing the metadata to csv - How to make sense of the csv print??
|
83
|
+
# TODO: ADD support for returning data analysis as csv file
|
84
|
+
###
|
74
85
|
def csv_print_to_file
|
75
86
|
CSV.open("metadata_output.csv", "w") do |csv|
|
76
|
-
binding.pry
|
77
87
|
metadata.each do |key, value|
|
78
88
|
if value.class == Hash
|
79
89
|
csv << [key]
|
@@ -85,6 +95,10 @@ module CsvImportAnalyzer
|
|
85
95
|
end
|
86
96
|
end
|
87
97
|
|
98
|
+
###
|
99
|
+
# Handle the key => value pairs to be printed as CSV files
|
100
|
+
# Recursively prints the key and value
|
101
|
+
###
|
88
102
|
def print_hash_to_csv(hash, csv_handler)
|
89
103
|
if hash.class == Hash
|
90
104
|
hash.each do |key, value|
|
@@ -96,6 +110,11 @@ module CsvImportAnalyzer
|
|
96
110
|
end
|
97
111
|
end
|
98
112
|
|
113
|
+
###
|
114
|
+
# Build the metadata hash with needed key value pairs
|
115
|
+
# Add the analysis data to @metadata instance variable
|
116
|
+
# E.g. metadata[:csv_file] means the metadata for csv file
|
117
|
+
###
|
99
118
|
def build_metadata_output
|
100
119
|
metadata[:csv_file] = add_file_metadata
|
101
120
|
metadata[:data_manipulations] = add_data_manipulations
|
@@ -103,16 +122,33 @@ module CsvImportAnalyzer
|
|
103
122
|
metadata[:sql] = add_sql_data
|
104
123
|
end
|
105
124
|
|
125
|
+
###
|
126
|
+
# Metadata of the file
|
127
|
+
# adds the filename, file_path, record delimiter of the file along with processed file metadata
|
128
|
+
# Returns a hash of file data
|
129
|
+
###
|
106
130
|
def add_file_metadata
|
107
131
|
file_data = {}
|
108
|
-
file_data[:filename] = File.basename(options[:
|
109
|
-
file_data[:file_size] = File.size(options[:
|
132
|
+
file_data[:filename] = File.basename(options[:original_filename])
|
133
|
+
file_data[:file_size] = File.size(options[:original_filename])
|
134
|
+
file_data[:record_delimiter] = options[:delimiter]
|
135
|
+
|
136
|
+
file_data[:processed_filename] = File.basename(options[:filename])
|
137
|
+
file_data[:processed_file_path] = options[:filename]
|
138
|
+
file_data[:processed_file_size] = File.size(options[:filename])
|
139
|
+
file_data[:error_report] = options[:temp_file]
|
110
140
|
# file_data[:rows] = options[:rows]
|
111
141
|
# file_data[:columns] = options[:columns]
|
112
|
-
file_data[:record_delimiter] = options[:delimiter]
|
113
142
|
return file_data
|
114
143
|
end
|
115
144
|
|
145
|
+
###
|
146
|
+
# Add the data manipulations done to the processed file
|
147
|
+
# Currently only two types of manipulations
|
148
|
+
# replace all the nulls and empty values with NULL
|
149
|
+
# replace single quotes with double quotes
|
150
|
+
# returns hash of data_manipulations
|
151
|
+
###
|
116
152
|
def add_data_manipulations
|
117
153
|
data_manipulations = {}
|
118
154
|
data_manipulations[:replace_nulls] = options[:replace_nulls]
|
@@ -120,6 +156,22 @@ module CsvImportAnalyzer
|
|
120
156
|
return data_manipulations
|
121
157
|
end
|
122
158
|
|
159
|
+
###
|
160
|
+
# builds a columns hash with metadata of each column
|
161
|
+
# E.g
|
162
|
+
# "photo_id": {
|
163
|
+
# "datatype": "int", => Tells the datatype is int
|
164
|
+
# "datatype_analysis": { => gives the results of datatype analysis done
|
165
|
+
# even though the column is determined to be int
|
166
|
+
# in reality it could have "int": 20, "float": "5"
|
167
|
+
# This would help the analyst to get a sense of data later on
|
168
|
+
# "int": 20
|
169
|
+
# },
|
170
|
+
# "distinct_values": "11+" => Contains an array of distinct values,
|
171
|
+
# if they are less than the threshold set
|
172
|
+
# or
|
173
|
+
# [1, 2, 3]
|
174
|
+
# },
|
123
175
|
def add_header_metadata
|
124
176
|
columns = {}
|
125
177
|
header_datatypes.keys.each do |column_name|
|
@@ -142,6 +194,10 @@ module CsvImportAnalyzer
|
|
142
194
|
return columns
|
143
195
|
end
|
144
196
|
|
197
|
+
###
|
198
|
+
# Add the queries for each database type specified
|
199
|
+
# build an sql hash with both create and import statements
|
200
|
+
###
|
145
201
|
def add_sql_data
|
146
202
|
sql = {}
|
147
203
|
databases.each do |db|
|
@@ -153,4 +209,4 @@ module CsvImportAnalyzer
|
|
153
209
|
end
|
154
210
|
|
155
211
|
end
|
156
|
-
end
|
212
|
+
end
|
@@ -1,5 +1,9 @@
|
|
1
1
|
module CsvImportAnalyzer
|
2
2
|
module Helper
|
3
|
+
###
|
4
|
+
# To determine if a certain field in the dataset is of null type
|
5
|
+
# returns a boolean of whether it's null or not
|
6
|
+
###
|
3
7
|
def null_like?(value)
|
4
8
|
if ["NULL", "Null", "NUll", "NULl", "null", nil, "", "NAN", "\\N"].include?(value)
|
5
9
|
true
|
@@ -1,17 +1,16 @@
|
|
1
|
-
require 'pry'
|
2
1
|
module CsvImportAnalyzer
|
3
2
|
module DatatypeValidator
|
4
3
|
|
5
4
|
def validate_field(content)
|
6
|
-
|
5
|
+
get_datatype(content)
|
7
6
|
end
|
8
7
|
|
9
8
|
private
|
9
|
+
|
10
10
|
###
|
11
|
-
# Date.parse("12/31/20145234", "%m/%d/%Y") => true which is not supposed to be true (although technically its true)
|
11
|
+
# Date.parse("12/31/20145234", "%m/%d/%Y") => true, which is not supposed to be true (although technically its true)
|
12
12
|
# Validate year part has only 4 numbers in it
|
13
13
|
###
|
14
|
-
|
15
14
|
def validate_year_date(field)
|
16
15
|
date = nil
|
17
16
|
formats = ["%d/%m/%Y","%d-%m-%Y","%d %m %Y","%m/%d/%Y","%m-%d-%Y","%m %d %Y"]
|
@@ -54,7 +53,8 @@ module CsvImportAnalyzer
|
|
54
53
|
end
|
55
54
|
|
56
55
|
###
|
57
|
-
#To determine the data-type of an input field
|
56
|
+
# To determine the data-type of an input field
|
57
|
+
# Returns a field is either int, float, string, date, datetime type
|
58
58
|
###
|
59
59
|
def get_datatype(field)
|
60
60
|
#Remove if field has any comma's for int and float rep
|
@@ -82,4 +82,4 @@ module CsvImportAnalyzer
|
|
82
82
|
return "string"
|
83
83
|
end
|
84
84
|
end
|
85
|
-
end
|
85
|
+
end
|
@@ -1,8 +1,14 @@
|
|
1
|
-
require 'pry'
|
2
1
|
class String
|
3
|
-
|
2
|
+
|
3
|
+
###
|
4
|
+
# Monkey patch string class to find the count of needle in haystack
|
5
|
+
# haystack is self => string in itself
|
6
|
+
# needle could be anything
|
7
|
+
# E.g.
|
8
|
+
# "hello, how, are, you".substr_count(",") => 3
|
9
|
+
###
|
4
10
|
def substr_count(needle)
|
5
11
|
needle = "\\#{needle}" if(needle == '|') # To escape inside regex
|
6
12
|
self.scan(/(#{needle})/).size
|
7
13
|
end
|
8
|
-
end
|
14
|
+
end
|
@@ -1,14 +1,15 @@
|
|
1
|
-
require 'pry'
|
2
1
|
require_relative "query_builder/mysql_query_helper"
|
3
2
|
require_relative "query_builder/pg_query_helper"
|
4
3
|
require_relative "export/metadata_analysis"
|
5
4
|
module CsvImportAnalyzer
|
6
5
|
class SqlQueryBuilder
|
7
|
-
|
6
|
+
|
8
7
|
attr_accessor :create_query, :import_query, :csv_column_datatypes, :min_max_bounds, :nullable, :sql_helper_options
|
9
8
|
|
9
|
+
###
|
10
10
|
# Since Building SQL is dependent on multiple things,
|
11
11
|
# decided to go with an arguments hash that gets passed when creating an object for the class
|
12
|
+
###
|
12
13
|
def initialize(args)
|
13
14
|
@options = args
|
14
15
|
@create_query = {}
|
@@ -33,8 +34,8 @@ module CsvImportAnalyzer
|
|
33
34
|
end
|
34
35
|
|
35
36
|
def tablename
|
36
|
-
# May be optimize this, not run all three operations everytime filename method is called
|
37
|
-
# May be creating filename as instance variable and using a double pipe will relive it from running everytime doesn't it
|
37
|
+
# Maybe optimize this, to not run all three operations every time the filename method is called ??
|
38
|
+
# Maybe creating filename as an instance variable and using a double pipe will relieve it from running every time, won't it??
|
38
39
|
tablename = File.basename(options[:filename])
|
39
40
|
tablename.gsub!(" ", "_")
|
40
41
|
tablename.downcase!
|
@@ -53,6 +54,12 @@ module CsvImportAnalyzer
|
|
53
54
|
@pg_helper
|
54
55
|
end
|
55
56
|
|
57
|
+
###
|
58
|
+
# Goes through each of the columns datatypes and prepares SQL statements for
|
59
|
+
# 1. Importing CSV files to database
|
60
|
+
# 2. Create table schema for the files
|
61
|
+
# Makes a function call to return the metadata analysis of the file
|
62
|
+
###
|
56
63
|
def generate_query
|
57
64
|
databases.each do |db|
|
58
65
|
create_query[db] = ["create table #{tablename} ("]
|
@@ -70,6 +77,10 @@ module CsvImportAnalyzer
|
|
70
77
|
|
71
78
|
private
|
72
79
|
|
80
|
+
###
|
81
|
+
# Based on the database type set in options
|
82
|
+
# returns query part for the header (column name)
|
83
|
+
###
|
73
84
|
def build_query_for_datatype(header, datatype)
|
74
85
|
query = {}
|
75
86
|
databases.each do |db|
|
@@ -87,6 +98,10 @@ module CsvImportAnalyzer
|
|
87
98
|
return query
|
88
99
|
end
|
89
100
|
|
101
|
+
###
|
102
|
+
# based on database type set in options
|
103
|
+
# returns import query for the database
|
104
|
+
###
|
90
105
|
def prepare_import_csv
|
91
106
|
databases.each do |db|
|
92
107
|
if db == :mysql
|
@@ -97,6 +112,9 @@ module CsvImportAnalyzer
|
|
97
112
|
end
|
98
113
|
end
|
99
114
|
|
115
|
+
###
|
116
|
+
# prepares sql statements based on the query for each header formed earlier
|
117
|
+
###
|
100
118
|
def prepare_sql_statements
|
101
119
|
databases.each do |db|
|
102
120
|
create_query[db][0] = create_query[db].first + " " + create_query[db][1]
|
@@ -106,6 +124,11 @@ module CsvImportAnalyzer
|
|
106
124
|
end
|
107
125
|
end
|
108
126
|
|
127
|
+
###
|
128
|
+
# sets the create query and import queries in options
|
129
|
+
# these fields will be added to the metadata later
|
130
|
+
# instantiates MetadataAnalysis and passes options hash
|
131
|
+
###
|
109
132
|
def print_metadata_analysis
|
110
133
|
options[:create_query] = create_query
|
111
134
|
options[:import_query] = import_query
|
@@ -115,11 +138,3 @@ module CsvImportAnalyzer
|
|
115
138
|
|
116
139
|
end
|
117
140
|
end
|
118
|
-
|
119
|
-
#Testing
|
120
|
-
# args = {}
|
121
|
-
# args[:options] = {:delimiter => ",", :chunk => 20, :filename => "/home/avinash/Desktop/csv-import-analyzer/lib/csv-import-analyzer/sampleTab"}
|
122
|
-
# args[:column_datatypes] = {:year_id=>:int, :make_id=>:string, :model_id=>:string, :description_id=>:string, :price_id=>:float}
|
123
|
-
# args[:nullable] = [:description_id]
|
124
|
-
# query = CsvImportAnalyzer::SqlQueryBuilder.new(args)
|
125
|
-
# puts query.generate_query
|
@@ -2,38 +2,38 @@
|
|
2
2
|
|
3
3
|
describe CsvImportAnalyzer::CsvCheckBounds do
|
4
4
|
|
5
|
-
describe
|
6
|
-
context
|
5
|
+
describe "#get_min_max_values" do
|
6
|
+
context "when not initialized right" do
|
7
7
|
let(:options) {Hash[filename: "sample", chunk_size: 200, delimiter: ",", unique: 2]}
|
8
8
|
|
9
|
-
it
|
9
|
+
it "will fail gracefully if filename is nil" do
|
10
10
|
@csv_check_bounds = CsvImportAnalyzer::CsvCheckBounds.new
|
11
11
|
expect(@csv_check_bounds.get_min_max_values).to be_instance_of(MissingRequiredArguments)
|
12
12
|
end
|
13
13
|
|
14
|
-
it
|
14
|
+
it "returns FileNotFound error if file is not found" do
|
15
15
|
@csv_check_bounds = CsvImportAnalyzer::CsvCheckBounds.new(options)
|
16
16
|
expect(@csv_check_bounds.get_min_max_values).to be_instance_of(FileNotFound)
|
17
17
|
end
|
18
18
|
end
|
19
19
|
|
20
|
-
context
|
20
|
+
context "when initialized right" do
|
21
21
|
let(:options) {Hash[filename: $sample_csv_path, chunk_size: 200, delimiter: ",", unique: 2, csv_column_datatypes: {:year_id => :int, :make_id => :string, :model_id => :string, :description_id => :string, :price_id => :float}]}
|
22
22
|
before(:each) do
|
23
23
|
@csv_check_bounds = CsvImportAnalyzer::CsvCheckBounds.new(options)
|
24
24
|
end
|
25
25
|
|
26
|
-
it
|
26
|
+
it "returns a Hash" do
|
27
27
|
expect(@csv_check_bounds.get_min_max_values).to be_an_instance_of(Hash)
|
28
28
|
end
|
29
29
|
|
30
|
-
it
|
30
|
+
it "returns correct min & max values for integer type" do
|
31
31
|
result = @csv_check_bounds.get_min_max_values
|
32
32
|
expect(result[:min_max][:year_id][:min]).to eq(1996)
|
33
33
|
expect(result[:min_max][:year_id][:max]).to eq(1999)
|
34
34
|
end
|
35
35
|
|
36
|
-
it
|
36
|
+
it "returns correct min & max lengths for string type" do
|
37
37
|
result = @csv_check_bounds.get_min_max_values
|
38
38
|
expect(result[:min_max][:make_id][:min]).to eq(4)
|
39
39
|
expect(result[:min_max][:make_id][:max]).to eq(7)
|
@@ -3,57 +3,57 @@
|
|
3
3
|
class DummyClass
|
4
4
|
end
|
5
5
|
|
6
|
-
describe
|
6
|
+
describe "#identify_delimiter" do
|
7
7
|
|
8
8
|
before(:each) do
|
9
9
|
@dummy_class = DummyClass.new
|
10
10
|
@dummy_class.extend(CsvImportAnalyzer::DelimiterIdentifier)
|
11
11
|
end
|
12
12
|
|
13
|
-
context
|
13
|
+
context "unable to determine the delimiter" do
|
14
14
|
|
15
|
-
it
|
15
|
+
it "return invalid input when the input is neither string nor array" do
|
16
16
|
expect(@dummy_class.identify_delimiter(3)).to be_instance_of(InvalidInput)
|
17
17
|
end
|
18
18
|
|
19
|
-
it
|
19
|
+
it "returns file not found when the input string is not a valid file" do
|
20
20
|
expect(@dummy_class.identify_delimiter("test")).to be_instance_of(FileNotFound)
|
21
21
|
end
|
22
22
|
|
23
23
|
end
|
24
24
|
|
25
|
-
context
|
25
|
+
context "finds the delimiter when the input is a file" do
|
26
26
|
|
27
|
-
it
|
27
|
+
it "returns a comma as the delimiter for sample_csv file" do
|
28
28
|
expect(@dummy_class.identify_delimiter($sample_csv_path)).to eq(",")
|
29
29
|
end
|
30
30
|
|
31
|
-
it
|
31
|
+
it "returns a semicolon as the delimiter for sample_ssv file" do
|
32
32
|
expect(@dummy_class.identify_delimiter($sample_ssv_path)).to eq(";")
|
33
33
|
end
|
34
34
|
|
35
35
|
end
|
36
36
|
|
37
|
-
context
|
37
|
+
context "finds the delimiter when the input is an array" do
|
38
38
|
let(:sample) {['1999;Chevy;"Venture ""Extended Edition""";"";4900.00','1999;\'Chevy\';"Venture ""Extended Edition; Very Large""";;5000.00']}
|
39
|
-
it
|
39
|
+
it "returns a semicolon as the delimiter for sample array input" do
|
40
40
|
expect(@dummy_class.identify_delimiter(sample)).to eq(";")
|
41
41
|
end
|
42
42
|
end
|
43
43
|
end
|
44
44
|
|
45
|
-
describe
|
45
|
+
describe "#return_plausible_delimiter" do
|
46
46
|
before(:each) do
|
47
47
|
@dummy_class = DummyClass.new
|
48
48
|
@dummy_class.extend(CsvImportAnalyzer::DelimiterIdentifier)
|
49
49
|
end
|
50
50
|
|
51
|
-
context
|
52
|
-
it
|
51
|
+
context "identifies delimiter" do
|
52
|
+
it "returns comma as the delimiter by default" do
|
53
53
|
expect(@dummy_class.return_plausible_delimiter).to eq(",")
|
54
54
|
end
|
55
55
|
|
56
|
-
it
|
56
|
+
it "returns semicolon as the delimiter for sample delimiter_count" do
|
57
57
|
@dummy_class.stub(:delimiter_count).and_return(Hash[","=>15, ";"=>16, "\t"=>0, "|"=>0])
|
58
58
|
expect(@dummy_class.return_plausible_delimiter).to eq(";")
|
59
59
|
end
|