csv-import-analyzer 0.0.3 → 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/README.md +8 -1
- data/csv-import-analyzer.gemspec +1 -1
- data/lib/csv-import-analyzer.rb +6 -4
- data/lib/csv-import-analyzer/analyzer/csv_check_bounds.rb +30 -19
- data/lib/csv-import-analyzer/analyzer/delimiter_identifier.rb +44 -24
- data/lib/csv-import-analyzer/analyzer/file_type_assertion.rb +1 -5
- data/lib/csv-import-analyzer/csv_datatype_analysis.rb +25 -9
- data/lib/csv-import-analyzer/csv_sanitizer.rb +67 -17
- data/lib/csv-import-analyzer/export/metadata_analysis.rb +63 -7
- data/lib/csv-import-analyzer/helpers/common_functions.rb +4 -0
- data/lib/csv-import-analyzer/helpers/datatype_validation.rb +6 -6
- data/lib/csv-import-analyzer/helpers/string_class_extensions.rb +9 -3
- data/lib/csv-import-analyzer/query_builder/mysql_query_helper.rb +2 -2
- data/lib/csv-import-analyzer/query_builder/pg_query_helper.rb +1 -2
- data/lib/csv-import-analyzer/query_builder/query_helper.rb +2 -2
- data/lib/csv-import-analyzer/sql_query_builder.rb +27 -12
- data/lib/csv-import-analyzer/version.rb +1 -1
- data/spec/csv-import-analyzer/analyzer/csv_check_bounds_spec.rb +8 -8
- data/spec/csv-import-analyzer/analyzer/delimiter_identifier_spec.rb +13 -13
- data/spec/csv-import-analyzer/csv_sanitizer_spec.rb +10 -7
- data/spec/csv-import-analyzer/helpers/common_functions_spec.rb +20 -19
- data/spec/csv-import-analyzer/helpers/datatype_validation_spec.rb +28 -28
- data/spec/csv-import-analyzer/helpers/string_class_extension_spec.rb +6 -6
- data/spec/csv-import-analyzer/query_builder/mysql_query_helper_spec.rb +13 -13
- data/spec/csv-import-analyzer/query_builder/pg_query_helper_spec.rb +16 -16
- data/spec/csv-import-analyzer_spec.rb +3 -6
- data/spec/fixtures/sample.csv +2 -2
- data/spec/spec_helper.rb +3 -0
- metadata +17 -6
- data/lib/csv-import-analyzer/sampleTab.csv +0 -5
- data/samples/metadata_output.json +0 -70
- data/spec/csv-import-analyzer/csv_datatype_analysis_spec.rb +0 -1
@@ -1,4 +1,3 @@
|
|
1
|
-
require 'pry'
|
2
1
|
require 'json'
|
3
2
|
module CsvImportAnalyzer
|
4
3
|
class MetadataAnalysis
|
@@ -48,6 +47,11 @@ module CsvImportAnalyzer
|
|
48
47
|
@max_distinct_values ||= Integer(options[:unique]) + 1
|
49
48
|
end
|
50
49
|
|
50
|
+
###
|
51
|
+
# Builds the metadata from the analysis done so far
|
52
|
+
# Creates a new json file with the analysis added to it if options[:metadata_output] is set
|
53
|
+
# returns JSON object of the metadata
|
54
|
+
###
|
51
55
|
def metadata_print
|
52
56
|
build_metadata_output
|
53
57
|
if options[:metadata_output]
|
@@ -64,16 +68,22 @@ module CsvImportAnalyzer
|
|
64
68
|
|
65
69
|
private
|
66
70
|
|
71
|
+
###
|
72
|
+
# Create or overwrite the metadata_output.json file if it already exists
|
73
|
+
# Write the metadata to the file and close it
|
74
|
+
###
|
67
75
|
def json_print_to_file
|
68
76
|
outfile = File.open("metadata_output.json", "w")
|
69
77
|
outfile << JSON.pretty_generate(metadata)
|
70
78
|
outfile.close
|
71
79
|
end
|
72
80
|
|
73
|
-
|
81
|
+
###
|
82
|
+
# Printing the metadata to csv - How to make sense of the csv print??
|
83
|
+
# TODO: ADD support for returning data analysis as csv file
|
84
|
+
###
|
74
85
|
def csv_print_to_file
|
75
86
|
CSV.open("metadata_output.csv", "w") do |csv|
|
76
|
-
binding.pry
|
77
87
|
metadata.each do |key, value|
|
78
88
|
if value.class == Hash
|
79
89
|
csv << [key]
|
@@ -85,6 +95,10 @@ module CsvImportAnalyzer
|
|
85
95
|
end
|
86
96
|
end
|
87
97
|
|
98
|
+
###
|
99
|
+
# Handle the key => value pairs to be printed as CSV files
|
100
|
+
# Recursively prints the key and value
|
101
|
+
###
|
88
102
|
def print_hash_to_csv(hash, csv_handler)
|
89
103
|
if hash.class == Hash
|
90
104
|
hash.each do |key, value|
|
@@ -96,6 +110,11 @@ module CsvImportAnalyzer
|
|
96
110
|
end
|
97
111
|
end
|
98
112
|
|
113
|
+
###
|
114
|
+
# Build the metadata hash with the needed key value pairs
|
115
|
+
# Add the analysis data to @metadata instance variable
|
116
|
+
# E.g. metadata[:csv_file] means the metadata for csv file
|
117
|
+
###
|
99
118
|
def build_metadata_output
|
100
119
|
metadata[:csv_file] = add_file_metadata
|
101
120
|
metadata[:data_manipulations] = add_data_manipulations
|
@@ -103,16 +122,33 @@ module CsvImportAnalyzer
|
|
103
122
|
metadata[:sql] = add_sql_data
|
104
123
|
end
|
105
124
|
|
125
|
+
###
|
126
|
+
# Metadata of the file
|
127
|
+
# adds the filename, file_path, record delimiter of the file along with processed file metadata
|
128
|
+
# Returns a hash of file data
|
129
|
+
###
|
106
130
|
def add_file_metadata
|
107
131
|
file_data = {}
|
108
|
-
file_data[:filename] = File.basename(options[:
|
109
|
-
file_data[:file_size] = File.size(options[:
|
132
|
+
file_data[:filename] = File.basename(options[:original_filename])
|
133
|
+
file_data[:file_size] = File.size(options[:original_filename])
|
134
|
+
file_data[:record_delimiter] = options[:delimiter]
|
135
|
+
|
136
|
+
file_data[:processed_filename] = File.basename(options[:filename])
|
137
|
+
file_data[:processed_file_path] = options[:filename]
|
138
|
+
file_data[:processed_file_size] = File.size(options[:filename])
|
139
|
+
file_data[:error_report] = options[:temp_file]
|
110
140
|
# file_data[:rows] = options[:rows]
|
111
141
|
# file_data[:columns] = options[:columns]
|
112
|
-
file_data[:record_delimiter] = options[:delimiter]
|
113
142
|
return file_data
|
114
143
|
end
|
115
144
|
|
145
|
+
###
|
146
|
+
# Add the data manipulations done to the processed file
|
147
|
+
# Currently only two types of manipulations
|
148
|
+
# replace all the nulls and empty values with NULL
|
149
|
+
# replace single quotes with double quotes
|
150
|
+
# returns hash of data_manipulations
|
151
|
+
###
|
116
152
|
def add_data_manipulations
|
117
153
|
data_manipulations = {}
|
118
154
|
data_manipulations[:replace_nulls] = options[:replace_nulls]
|
@@ -120,6 +156,22 @@ module CsvImportAnalyzer
|
|
120
156
|
return data_manipulations
|
121
157
|
end
|
122
158
|
|
159
|
+
###
|
160
|
+
# builds a columns hash with metadata of each column
|
161
|
+
# E.g
|
162
|
+
# "photo_id": {
|
163
|
+
# "datatype": "int", => Tells the datatype is int
|
164
|
+
# "datatype_analysis": { => gives the results of datatypes analysis done
|
165
|
+
# even though the column is determined to be int
|
166
|
+
# in reality it could have "int": 20, "float": "5"
|
167
|
+
# This would help the analyst to get a sense of data later on
|
168
|
+
# "int": 20
|
169
|
+
# },
|
170
|
+
# "distinct_values": "11+" => Contains an array of distinct values,
|
171
|
+
# if they are less than the threshold set
|
172
|
+
# or
|
173
|
+
# [1, 2, 3]
|
174
|
+
# },
|
123
175
|
def add_header_metadata
|
124
176
|
columns = {}
|
125
177
|
header_datatypes.keys.each do |column_name|
|
@@ -142,6 +194,10 @@ module CsvImportAnalyzer
|
|
142
194
|
return columns
|
143
195
|
end
|
144
196
|
|
197
|
+
###
|
198
|
+
# Add the queries for each database type specified
|
199
|
+
# build an sql hash with both create and import statements
|
200
|
+
###
|
145
201
|
def add_sql_data
|
146
202
|
sql = {}
|
147
203
|
databases.each do |db|
|
@@ -153,4 +209,4 @@ module CsvImportAnalyzer
|
|
153
209
|
end
|
154
210
|
|
155
211
|
end
|
156
|
-
end
|
212
|
+
end
|
@@ -1,5 +1,9 @@
|
|
1
1
|
module CsvImportAnalyzer
|
2
2
|
module Helper
|
3
|
+
###
|
4
|
+
# To determine if a certain field in the dataset is of null type
|
5
|
+
# returns a boolean indicating whether it is null or not
|
6
|
+
###
|
3
7
|
def null_like?(value)
|
4
8
|
if ["NULL", "Null", "NUll", "NULl", "null", nil, "", "NAN", "\\N"].include?(value)
|
5
9
|
true
|
@@ -1,17 +1,16 @@
|
|
1
|
-
require 'pry'
|
2
1
|
module CsvImportAnalyzer
|
3
2
|
module DatatypeValidator
|
4
3
|
|
5
4
|
def validate_field(content)
|
6
|
-
|
5
|
+
get_datatype(content)
|
7
6
|
end
|
8
7
|
|
9
8
|
private
|
9
|
+
|
10
10
|
###
|
11
|
-
# Date.parse("12/31/20145234", "%m/%d/%Y") => true which is not supposed to be true (although technically its true)
|
11
|
+
# Date.parse("12/31/20145234", "%m/%d/%Y") => true, which is not supposed to be true (although technically it's true)
|
12
12
|
# Validate year part has only 4 numbers in it
|
13
13
|
###
|
14
|
-
|
15
14
|
def validate_year_date(field)
|
16
15
|
date = nil
|
17
16
|
formats = ["%d/%m/%Y","%d-%m-%Y","%d %m %Y","%m/%d/%Y","%m-%d-%Y","%m %d %Y"]
|
@@ -54,7 +53,8 @@ module CsvImportAnalyzer
|
|
54
53
|
end
|
55
54
|
|
56
55
|
###
|
57
|
-
#To determine the data-type of an input field
|
56
|
+
# To determine the data-type of an input field
|
57
|
+
# Returns the type of the field: one of int, float, string, date or datetime
|
58
58
|
###
|
59
59
|
def get_datatype(field)
|
60
60
|
#Remove if field has any comma's for int and float rep
|
@@ -82,4 +82,4 @@ module CsvImportAnalyzer
|
|
82
82
|
return "string"
|
83
83
|
end
|
84
84
|
end
|
85
|
-
end
|
85
|
+
end
|
@@ -1,8 +1,14 @@
|
|
1
|
-
require 'pry'
|
2
1
|
class String
|
3
|
-
|
2
|
+
|
3
|
+
###
|
4
|
+
# Monkey patch string class to find the count of needle in haystack
|
5
|
+
# haystack is self => string in itself
|
6
|
+
# needle could be anything
|
7
|
+
# E.g.
|
8
|
+
# "hello, how, are, you".substr_count(",") => 3
|
9
|
+
###
|
4
10
|
def substr_count(needle)
|
5
11
|
needle = "\\#{needle}" if(needle == '|') # To escape inside regex
|
6
12
|
self.scan(/(#{needle})/).size
|
7
13
|
end
|
8
|
-
end
|
14
|
+
end
|
@@ -1,14 +1,15 @@
|
|
1
|
-
require 'pry'
|
2
1
|
require_relative "query_builder/mysql_query_helper"
|
3
2
|
require_relative "query_builder/pg_query_helper"
|
4
3
|
require_relative "export/metadata_analysis"
|
5
4
|
module CsvImportAnalyzer
|
6
5
|
class SqlQueryBuilder
|
7
|
-
|
6
|
+
|
8
7
|
attr_accessor :create_query, :import_query, :csv_column_datatypes, :min_max_bounds, :nullable, :sql_helper_options
|
9
8
|
|
9
|
+
###
|
10
10
|
# Since Building SQL is dependent on multiple things,
|
11
11
|
# decided to go with an arguments hash that gets passed when creating an object for the class
|
12
|
+
###
|
12
13
|
def initialize(args)
|
13
14
|
@options = args
|
14
15
|
@create_query = {}
|
@@ -33,8 +34,8 @@ module CsvImportAnalyzer
|
|
33
34
|
end
|
34
35
|
|
35
36
|
def tablename
|
36
|
-
# May be optimize this, not run all three operations everytime filename method is called
|
37
|
-
# May be creating filename as instance variable and using a double pipe will relive it from running everytime doesn't it
|
37
|
+
# May be optimize this, not run all three operations everytime filename method is called ??
|
38
|
+
# Maybe creating filename as an instance variable and using a double pipe will relieve it from running every time, won't it??
|
38
39
|
tablename = File.basename(options[:filename])
|
39
40
|
tablename.gsub!(" ", "_")
|
40
41
|
tablename.downcase!
|
@@ -53,6 +54,12 @@ module CsvImportAnalyzer
|
|
53
54
|
@pg_helper
|
54
55
|
end
|
55
56
|
|
57
|
+
###
|
58
|
+
# Goes through each of the columns datatypes and prepares SQL statements for
|
59
|
+
# 1. Importing CSV files to database
|
60
|
+
# 2. Create table schema for the files
|
61
|
+
# Makes a function call to return the metadata analysis of the file
|
62
|
+
###
|
56
63
|
def generate_query
|
57
64
|
databases.each do |db|
|
58
65
|
create_query[db] = ["create table #{tablename} ("]
|
@@ -70,6 +77,10 @@ module CsvImportAnalyzer
|
|
70
77
|
|
71
78
|
private
|
72
79
|
|
80
|
+
###
|
81
|
+
# Based on the database type set in options
|
82
|
+
# returns query part for the header (column name)
|
83
|
+
###
|
73
84
|
def build_query_for_datatype(header, datatype)
|
74
85
|
query = {}
|
75
86
|
databases.each do |db|
|
@@ -87,6 +98,10 @@ module CsvImportAnalyzer
|
|
87
98
|
return query
|
88
99
|
end
|
89
100
|
|
101
|
+
###
|
102
|
+
# based on database type set in options
|
103
|
+
# returns import query for the database
|
104
|
+
###
|
90
105
|
def prepare_import_csv
|
91
106
|
databases.each do |db|
|
92
107
|
if db == :mysql
|
@@ -97,6 +112,9 @@ module CsvImportAnalyzer
|
|
97
112
|
end
|
98
113
|
end
|
99
114
|
|
115
|
+
###
|
116
|
+
# prepares sql statements based on the query for each header formed earlier
|
117
|
+
###
|
100
118
|
def prepare_sql_statements
|
101
119
|
databases.each do |db|
|
102
120
|
create_query[db][0] = create_query[db].first + " " + create_query[db][1]
|
@@ -106,6 +124,11 @@ module CsvImportAnalyzer
|
|
106
124
|
end
|
107
125
|
end
|
108
126
|
|
127
|
+
###
|
128
|
+
# sets the create query and import queries in options
|
129
|
+
# these fields will be added to the metadata later
|
130
|
+
# instantiates MetadataAnalysis and passes options hash
|
131
|
+
###
|
109
132
|
def print_metadata_analysis
|
110
133
|
options[:create_query] = create_query
|
111
134
|
options[:import_query] = import_query
|
@@ -115,11 +138,3 @@ module CsvImportAnalyzer
|
|
115
138
|
|
116
139
|
end
|
117
140
|
end
|
118
|
-
|
119
|
-
#Testing
|
120
|
-
# args = {}
|
121
|
-
# args[:options] = {:delimiter => ",", :chunk => 20, :filename => "/home/avinash/Desktop/csv-import-analyzer/lib/csv-import-analyzer/sampleTab"}
|
122
|
-
# args[:column_datatypes] = {:year_id=>:int, :make_id=>:string, :model_id=>:string, :description_id=>:string, :price_id=>:float}
|
123
|
-
# args[:nullable] = [:description_id]
|
124
|
-
# query = CsvImportAnalyzer::SqlQueryBuilder.new(args)
|
125
|
-
# puts query.generate_query
|
@@ -2,38 +2,38 @@
|
|
2
2
|
|
3
3
|
describe CsvImportAnalyzer::CsvCheckBounds do
|
4
4
|
|
5
|
-
describe
|
6
|
-
context
|
5
|
+
describe "#get_min_max_values" do
|
6
|
+
context "when not initialized right" do
|
7
7
|
let(:options) {Hash[filename: "sample", chunk_size: 200, delimiter: ",", unique: 2]}
|
8
8
|
|
9
|
-
it
|
9
|
+
it "will fail gracefully if filename is nil" do
|
10
10
|
@csv_check_bounds = CsvImportAnalyzer::CsvCheckBounds.new
|
11
11
|
expect(@csv_check_bounds.get_min_max_values).to be_instance_of(MissingRequiredArguments)
|
12
12
|
end
|
13
13
|
|
14
|
-
it
|
14
|
+
it "returns FileNotFound error if file is not found" do
|
15
15
|
@csv_check_bounds = CsvImportAnalyzer::CsvCheckBounds.new(options)
|
16
16
|
expect(@csv_check_bounds.get_min_max_values).to be_instance_of(FileNotFound)
|
17
17
|
end
|
18
18
|
end
|
19
19
|
|
20
|
-
context
|
20
|
+
context "when initialized right" do
|
21
21
|
let(:options) {Hash[filename: $sample_csv_path, chunk_size: 200, delimiter: ",", unique: 2, csv_column_datatypes: {:year_id => :int, :make_id => :string, :model_id => :string, :description_id => :string, :price_id => :float}]}
|
22
22
|
before(:each) do
|
23
23
|
@csv_check_bounds = CsvImportAnalyzer::CsvCheckBounds.new(options)
|
24
24
|
end
|
25
25
|
|
26
|
-
it
|
26
|
+
it "returns a Hash" do
|
27
27
|
expect(@csv_check_bounds.get_min_max_values).to be_an_instance_of(Hash)
|
28
28
|
end
|
29
29
|
|
30
|
-
it
|
30
|
+
it "returns correct min & max values for integer type" do
|
31
31
|
result = @csv_check_bounds.get_min_max_values
|
32
32
|
expect(result[:min_max][:year_id][:min]).to eq(1996)
|
33
33
|
expect(result[:min_max][:year_id][:max]).to eq(1999)
|
34
34
|
end
|
35
35
|
|
36
|
-
it
|
36
|
+
it "returns correct min & max lengths for string type" do
|
37
37
|
result = @csv_check_bounds.get_min_max_values
|
38
38
|
expect(result[:min_max][:make_id][:min]).to eq(4)
|
39
39
|
expect(result[:min_max][:make_id][:max]).to eq(7)
|
@@ -3,57 +3,57 @@
|
|
3
3
|
class DummyClass
|
4
4
|
end
|
5
5
|
|
6
|
-
describe
|
6
|
+
describe "#identify_delimiter" do
|
7
7
|
|
8
8
|
before(:each) do
|
9
9
|
@dummy_class = DummyClass.new
|
10
10
|
@dummy_class.extend(CsvImportAnalyzer::DelimiterIdentifier)
|
11
11
|
end
|
12
12
|
|
13
|
-
context
|
13
|
+
context "unable to determine the delimiter" do
|
14
14
|
|
15
|
-
it
|
15
|
+
it "return invalid input when the input is neither string nor array" do
|
16
16
|
expect(@dummy_class.identify_delimiter(3)).to be_instance_of(InvalidInput)
|
17
17
|
end
|
18
18
|
|
19
|
-
it
|
19
|
+
it "returns file not found when the input string is not a valid file" do
|
20
20
|
expect(@dummy_class.identify_delimiter("test")).to be_instance_of(FileNotFound)
|
21
21
|
end
|
22
22
|
|
23
23
|
end
|
24
24
|
|
25
|
-
context
|
25
|
+
context "finds the delimiter when the input is a file" do
|
26
26
|
|
27
|
-
it
|
27
|
+
it "returns a comma as the delimiter for sample_csv file" do
|
28
28
|
expect(@dummy_class.identify_delimiter($sample_csv_path)).to eq(",")
|
29
29
|
end
|
30
30
|
|
31
|
-
it
|
31
|
+
it "returns a semicolon as the the delimiter for sample_ssv file" do
|
32
32
|
expect(@dummy_class.identify_delimiter($sample_ssv_path)).to eq(";")
|
33
33
|
end
|
34
34
|
|
35
35
|
end
|
36
36
|
|
37
|
-
context
|
37
|
+
context "finds the delimiter when the input is an array" do
|
38
38
|
let(:sample) {['1999;Chevy;"Venture ""Extended Edition""";"";4900.00','1999;\'Chevy\';"Venture ""Extended Edition; Very Large""";;5000.00']}
|
39
|
-
it
|
39
|
+
it "returns a semicolon as the delimiter for sample array input" do
|
40
40
|
expect(@dummy_class.identify_delimiter(sample)).to eq(";")
|
41
41
|
end
|
42
42
|
end
|
43
43
|
end
|
44
44
|
|
45
|
-
describe
|
45
|
+
describe "#return_plausible_delimiter" do
|
46
46
|
before(:each) do
|
47
47
|
@dummy_class = DummyClass.new
|
48
48
|
@dummy_class.extend(CsvImportAnalyzer::DelimiterIdentifier)
|
49
49
|
end
|
50
50
|
|
51
|
-
context
|
52
|
-
it
|
51
|
+
context "identifies delimiter" do
|
52
|
+
it "returns comma as the delimiter by default" do
|
53
53
|
expect(@dummy_class.return_plausible_delimiter).to eq(",")
|
54
54
|
end
|
55
55
|
|
56
|
-
it
|
56
|
+
it "returns semicolon as the delimiter for sample delimiter_count" do
|
57
57
|
@dummy_class.stub(:delimiter_count).and_return(Hash[","=>15, ";"=>16, "\t"=>0, "|"=>0])
|
58
58
|
expect(@dummy_class.return_plausible_delimiter).to eq(";")
|
59
59
|
end
|