csv-import-analyzer 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (47) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +14 -0
  3. data/.rspec +3 -0
  4. data/Gemfile +4 -0
  5. data/LICENSE.txt +22 -0
  6. data/README.md +35 -0
  7. data/Rakefile +2 -0
  8. data/csv-import-analyzer.gemspec +29 -0
  9. data/lib/csv-import-analyzer.rb +18 -0
  10. data/lib/csv-import-analyzer/analyzer/csv_check_bounds.rb +104 -0
  11. data/lib/csv-import-analyzer/analyzer/delimiter_identifier.rb +66 -0
  12. data/lib/csv-import-analyzer/analyzer/file_type_assertion.rb +29 -0
  13. data/lib/csv-import-analyzer/csv_datatype_analysis.rb +110 -0
  14. data/lib/csv-import-analyzer/csv_sanitizer.rb +86 -0
  15. data/lib/csv-import-analyzer/export/metadata_analysis.rb +156 -0
  16. data/lib/csv-import-analyzer/helpers/common_functions.rb +11 -0
  17. data/lib/csv-import-analyzer/helpers/datatype_validation.rb +85 -0
  18. data/lib/csv-import-analyzer/helpers/errors.rb +3 -0
  19. data/lib/csv-import-analyzer/helpers/string_class_extensions.rb +8 -0
  20. data/lib/csv-import-analyzer/query_builder/mysql_query_helper.rb +31 -0
  21. data/lib/csv-import-analyzer/query_builder/pg_query_helper.rb +27 -0
  22. data/lib/csv-import-analyzer/query_builder/query_helper.rb +27 -0
  23. data/lib/csv-import-analyzer/sampleTab.csv +5 -0
  24. data/lib/csv-import-analyzer/sql_query_builder.rb +125 -0
  25. data/lib/csv-import-analyzer/version.rb +5 -0
  26. data/lib/metadata_output.json +70 -0
  27. data/lib/sampleTab.csv +5 -0
  28. data/spec/csv-import-analyzer/analyzer/csv_check_bounds_spec.rb +43 -0
  29. data/spec/csv-import-analyzer/analyzer/delimiter_identifier_spec.rb +61 -0
  30. data/spec/csv-import-analyzer/analyzer/file_type_assertion_spec.rb +0 -0
  31. data/spec/csv-import-analyzer/csv_datatype_analysis_spec.rb +1 -0
  32. data/spec/csv-import-analyzer/csv_sanitizer_spec.rb +24 -0
  33. data/spec/csv-import-analyzer/export/metadata_analysis_spec.rb +0 -0
  34. data/spec/csv-import-analyzer/helpers/common_functions_spec.rb +31 -0
  35. data/spec/csv-import-analyzer/helpers/csv_check_bounds_spec.rb +3 -0
  36. data/spec/csv-import-analyzer/helpers/datatype_validation_spec.rb +75 -0
  37. data/spec/csv-import-analyzer/helpers/mysql_query_helper_spec.rb +0 -0
  38. data/spec/csv-import-analyzer/helpers/pq_query_helper_spec.rb +0 -0
  39. data/spec/csv-import-analyzer/helpers/string_class_extension_spec.rb +18 -0
  40. data/spec/csv-import-analyzer/query_builder/mysql_query_helper_spec.rb +54 -0
  41. data/spec/csv-import-analyzer/query_builder/pg_query_helper_spec.rb +55 -0
  42. data/spec/csv-import-analyzer_spec.rb +14 -0
  43. data/spec/fixtures/sample.csv +5 -0
  44. data/spec/fixtures/sample_options.yml +11 -0
  45. data/spec/fixtures/semicolon-sample.csv +5 -0
  46. data/spec/spec_helper.rb +84 -0
  47. metadata +208 -0
@@ -0,0 +1,5 @@
1
+ Year ID,Make ID,Model ID,Description ID,Price ID
2
+ 1997,Ford,,"ac, abs, moon","3000"
3
+ 1999,Chevy,"Venture ""Extended Edition""","",4900.00
4
+ 1999,"Chevy","Venture ""Extended Edition, Very Large""",,5000.00
5
+ 1996,Jeep,Grand Che'rokee,"MUST SELL!air, moon roof, loaded",4799.00
@@ -0,0 +1,125 @@
1
+ require 'pry'
2
+ require_relative "query_builder/mysql_query_helper"
3
+ require_relative "query_builder/pg_query_helper"
4
+ require_relative "export/metadata_analysis"
5
module CsvImportAnalyzer
  # Builds the "create table" and CSV import statements for every database
  # listed in the options, then hands them to MetadataAnalysis for export.
  #
  # Because building SQL depends on several inputs, the class takes a single
  # arguments hash. Keys read by this class:
  #   :filename             - path of the CSV file (also used to derive the table name)
  #   :delimiter            - field delimiter passed on to the import helpers
  #   :database             - database identifier(s) to generate SQL for (e.g. [:pg, :mysql])
  #   :csv_column_datatypes - Hash of column header => datatype
  #   :nullable             - list of headers that may be null
  class SqlQueryBuilder
    attr_accessor :create_query, :import_query, :csv_column_datatypes, :min_max_bounds, :nullable, :sql_helper_options
    attr_reader :options, :mysql_helper, :pg_helper

    # @param args [Hash] see the class comment for the keys that are read
    def initialize(args)
      @options = args
      @create_query = {}
      @import_query = {}
      @csv_column_datatypes = args[:csv_column_datatypes]
      @nullable = args[:nullable]
      @sql_helper_options = {:tablename => tablename, :filename => @options[:filename], :delimiter => @options[:delimiter]}
      # Each helper module is mixed into its own host object. Calling #extend
      # on an unassigned (nil) instance variable would mix both modules into
      # NilClass, where the module mixed in last shadows the other one.
      @mysql_helper = Object.new.extend(CsvImportAnalyzer::MysqlQueryHelper)
      @pg_helper = Object.new.extend(CsvImportAnalyzer::PgQueryHelper)
    end

    # @return [Object] the raw :database option, exactly as supplied by the caller
    def databases
      options[:database]
    end

    # @return [String] the CSV path, exactly as supplied by the caller
    def filename
      options[:filename]
    end

    # Table name derived from the CSV file name: directory and extension are
    # dropped, every non-word character becomes an underscore and the result is
    # lower-cased, so "/tmp/Sample Tab.csv" becomes "sample_tab".
    # The value is computed once and cached.
    #
    # @return [String]
    def tablename
      @tablename ||= begin
        path = options[:filename]
        File.basename(path, File.extname(path)).gsub(/\W/, "_").downcase
      end
    end

    # @return [String] the field delimiter supplied by the caller
    def delimiter
      options[:delimiter]
    end

    # Builds the create and import statements for every requested database,
    # stores them in #create_query / #import_query (keyed by database) and
    # passes them to MetadataAnalysis.
    #
    # @return whatever MetadataAnalysis#metadata_print returns
    def generate_query
      target_databases.each { |db| create_query[db] = [] }
      (csv_column_datatypes || {}).each do |header, datatype|
        build_query_for_datatype(header, datatype).each do |db, column_sql|
          create_query[db] << column_sql
        end
      end
      prepare_sql_statements
      prepare_import_csv
      print_metadata_analysis
    end

    private

    # Requested databases as a duplicate-free array; tolerates a nil or scalar
    # :database option.
    def target_databases
      Array(databases).uniq
    end

    # Helper object for a database. :mysql gets the MySQL helper; anything else
    # falls back to the PostgreSQL helper, for create and import statements alike.
    def helper_for(db)
      db == :mysql ? mysql_helper : pg_helper
    end

    # Column definition for one header, per database. " not null" is appended
    # unless the header is listed as nullable.
    #
    # @return [Hash] database => column definition String
    def build_query_for_datatype(header, datatype)
      not_null = Array(nullable).include?(header) ? "" : " not null"
      target_databases.each_with_object({}) do |db, query|
        column_sql = helper_for(db).form_query_for_datatype(header: header, datatype: datatype)
        # Build a new string instead of appending to the helper's return value.
        query[db] = "#{column_sql}#{not_null}"
      end
    end

    # Fills #import_query with one import statement per database.
    def prepare_import_csv
      target_databases.each do |db|
        import_query[db] = helper_for(db).import_csv(tablename: tablename, filename: filename, delimiter: delimiter)
      end
    end

    # Turns the collected column definitions into one statement per database:
    #   create table <name> ( <col>, <col>, ...);
    def prepare_sql_statements
      target_databases.each do |db|
        create_query[db] = "create table #{tablename} ( #{create_query[db].join(', ')});"
      end
    end

    # Adds the generated statements to the options hash and exports it.
    def print_metadata_analysis
      options[:create_query] = create_query
      options[:import_query] = import_query
      export = CsvImportAnalyzer::MetadataAnalysis.new(options)
      export.metadata_print
    end
  end
end
118
+
119
+ #Testing
120
+ # args = {:delimiter => ",", :chunk => 20, :database => [:pg, :mysql], :filename => "/home/avinash/Desktop/csv-import-analyzer/lib/csv-import-analyzer/sampleTab.csv"}
121
+ # (the builder reads :filename, :delimiter and :database from the top level of args, not from args[:options])
122
+ # args[:csv_column_datatypes] = {:year_id=>:int, :make_id=>:string, :model_id=>:string, :description_id=>:string, :price_id=>:float}
123
+ # args[:nullable] = [:description_id]
124
+ # query = CsvImportAnalyzer::SqlQueryBuilder.new(args)
125
+ # puts query.generate_query
@@ -0,0 +1,5 @@
1
module CsvImportAnalyzer
  # Holds the gem's version number.
  module Version
    # Frozen so the version string cannot be mutated in place at runtime.
    VERSION = "0.0.1".freeze
  end
end
@@ -0,0 +1,70 @@
1
+ {
2
+ "csv_file": {
3
+ "filename": "sampleTab.csv",
4
+ "file_size": 276,
5
+ "record_delimiter": ","
6
+ },
7
+ "data_manipulations": {
8
+ "replace_nulls": true,
9
+ "replace_quotes": true
10
+ },
11
+ "csv_headers": {
12
+ "year_id": {
13
+ "datatype": "int",
14
+ "datatype_analysis": {
15
+ "int": 4
16
+ },
17
+ "distinct_values": [
18
+ 1997,
19
+ 1999,
20
+ 1996
21
+ ]
22
+ },
23
+ "make_id": {
24
+ "datatype": "string",
25
+ "datatype_analysis": {
26
+ "string": 4
27
+ },
28
+ "distinct_values": [
29
+ "Ford",
30
+ "Chevy",
31
+ "Jeep"
32
+ ]
33
+ },
34
+ "model_id": {
35
+ "datatype": "string",
36
+ "datatype_analysis": {
37
+ "string": 4
38
+ },
39
+ "distinct_values": "3+"
40
+ },
41
+ "description_id": {
42
+ "datatype": "string",
43
+ "datatype_analysis": {
44
+ "string": 2
45
+ },
46
+ "distinct_values": [
47
+ "ac, abs, moon",
48
+ "MUST SELL!air, moon roof, loaded"
49
+ ],
50
+ "nullable": true
51
+ },
52
+ "price_id": {
53
+ "datatype": "float",
54
+ "datatype_analysis": {
55
+ "float": 4
56
+ },
57
+ "distinct_values": "3+"
58
+ }
59
+ },
60
+ "sql": {
61
+ "pg": {
62
+ "create_query": "create table sampletab.csv ( year_id int not null, make_id varchar(255) not null, model_id varchar(255) not null, description_id varchar(255), price_id float not null);",
63
+ "import_query": "COPY sampletab.csv FROM '/home/avinash/Desktop/csv-import-analyzer/lib/sampleTab.csv' HEADER DELIMITER ',' CSV NULL AS 'NULL';"
64
+ },
65
+ "mysql": {
66
+ "create_query": "create table sampletab.csv ( year_id int not null, make_id varchar(255) not null, model_id varchar(255) not null, description_id varchar(255), price_id float not null);",
67
+ "import_query": "COPY sampletab.csv FROM '/home/avinash/Desktop/csv-import-analyzer/lib/sampleTab.csv' HEADER DELIMITER ',' CSV NULL AS 'NULL';"
68
+ }
69
+ }
70
+ }
data/lib/sampleTab.csv ADDED
@@ -0,0 +1,5 @@
1
+ Year ID,Make ID,Model ID,Description ID,Price ID
2
+ 1997,Ford,E350,"ac, abs, moon","3000.00"
3
+ 1999,Chevy,"Venture ""Extended Edition""",,4900.00
4
+ 1999,"Chevy","Venture ""Extended Edition, Very Large""","",5000.00
5
+ 1996,Jeep,Grand Che'rokee,"MUST SELL!air, moon roof, loaded",4799.00
@@ -0,0 +1,43 @@
1
+ #require spec_helper.rb
2
+
3
# Specs for CsvImportAnalyzer::CsvCheckBounds#get_min_max_values.
describe CsvImportAnalyzer::CsvCheckBounds do
  describe '#get_min_max_values' do
    context 'when not initialized right' do
      # "sample" is not the path of an existing file.
      let(:options) { { filename: "sample", chunk_size: 200, delimiter: ",", unique: 2 } }

      it 'will fail gracefully if filename is nil' do
        checker = CsvImportAnalyzer::CsvCheckBounds.new
        expect(checker.get_min_max_values).to be_instance_of(MissingRequiredArguments)
      end

      it 'returns FileNotFound error if file is not found' do
        checker = CsvImportAnalyzer::CsvCheckBounds.new(options)
        expect(checker.get_min_max_values).to be_instance_of(FileNotFound)
      end
    end

    context 'when initialized right' do
      let(:column_datatypes) do
        { :year_id => :int, :make_id => :string, :model_id => :string, :description_id => :string, :price_id => :float }
      end
      # $sample_csv_path is not defined in this file; presumably set by the spec helper.
      let(:options) do
        { filename: $sample_csv_path, chunk_size: 200, delimiter: ",", unique: 2, csv_column_datatypes: column_datatypes }
      end
      let(:checker) { CsvImportAnalyzer::CsvCheckBounds.new(options) }

      it 'returns a Hash' do
        expect(checker.get_min_max_values).to be_an_instance_of(Hash)
      end

      it 'returns correct min & max values for integer type' do
        year_bounds = checker.get_min_max_values[:min_max][:year_id]
        expect(year_bounds[:min]).to eq(1996)
        expect(year_bounds[:max]).to eq(1999)
      end

      it 'returns correct min & max lengths for string type' do
        make_bounds = checker.get_min_max_values[:min_max][:make_id]
        expect(make_bounds[:min]).to eq(4)
        expect(make_bounds[:max]).to eq(7)
      end
    end
  end
end
@@ -0,0 +1,61 @@
1
+ # require 'spec_helper'
2
+
3
# Bare host class; every example mixes CsvImportAnalyzer::DelimiterIdentifier
# into a fresh instance of it.
class DummyClass
end

describe '#identify_delimiter' do
  let(:identifier) { DummyClass.new.extend(CsvImportAnalyzer::DelimiterIdentifier) }

  context 'unable to determine the delimiter' do
    it 'returns invalid input when the input is neither string nor array' do
      expect(identifier.identify_delimiter(3)).to be_instance_of(InvalidInput)
    end

    it 'returns file not found when the input string is not a valid file' do
      expect(identifier.identify_delimiter("test")).to be_instance_of(FileNotFound)
    end
  end

  context 'finds the delimiter when the input is a file' do
    # $sample_csv_path / $sample_ssv_path are not defined in this file;
    # presumably set by the spec helper.
    it 'returns a comma as the delimiter for sample_csv file' do
      expect(identifier.identify_delimiter($sample_csv_path)).to eq(",")
    end

    it 'returns a semicolon as the delimiter for sample_ssv file' do
      expect(identifier.identify_delimiter($sample_ssv_path)).to eq(";")
    end
  end

  context 'finds the delimiter when the input is an array' do
    let(:sample) {['1999;Chevy;"Venture ""Extended Edition""";"";4900.00','1999;\'Chevy\';"Venture ""Extended Edition; Very Large""";;5000.00']}

    it 'returns a semicolon as the delimiter for sample array input' do
      expect(identifier.identify_delimiter(sample)).to eq(";")
    end
  end
end

describe '#return_plausible_delimiter' do
  let(:identifier) { DummyClass.new.extend(CsvImportAnalyzer::DelimiterIdentifier) }

  context 'identifies delimiter' do
    it 'returns comma as the delimiter by default' do
      expect(identifier.return_plausible_delimiter).to eq(",")
    end

    it 'returns semicolon as the delimiter for sample delimiter_count' do
      # allow/receive replaces the deprecated `stub` monkey-patch syntax.
      allow(identifier).to receive(:delimiter_count).and_return({ "," => 15, ";" => 16, "\t" => 0, "|" => 0 })
      expect(identifier.return_plausible_delimiter).to eq(";")
    end
  end
end
@@ -0,0 +1 @@
1
+ # CsvImportAnalyzer::CsvDatatypeAnalysis.new({:delimiter => ",", :chunk => 20, :filename => "sampleTab.csv"}).datatype_analysis
@@ -0,0 +1,24 @@
1
+ # require 'spec_helper'
2
+
3
+ require 'pry'
4
# Specs for CsvImportAnalyzer::CsvSanitizer.
describe CsvImportAnalyzer::CsvSanitizer do
  let(:csv_sanitizer) { CsvImportAnalyzer::CsvSanitizer.new }

  it 'should handle file not found issue' do
    expect(csv_sanitizer.process("sample.csv", {})).to be_instance_of(FileNotFound)
  end

  # Testing private methods (through #send), although one should not really
  # have to test private methods.
  context 'testing private methods' do
    # Each helper gets its own context so its fixtures cannot override the
    # other's; two `let`s with the same name in one group silently shadow.
    context 'replace_line_single_quotes' do
      let(:line) { "\"t1\", 't2', \"t3\"" }
      let(:expected_line) { "\"t1\", \"t2\", \"t3\"" }

      # Kept pending (xit), as before.
      xit 'should replace single quotes to double' do
        expect(csv_sanitizer.send(:replace_line_single_quotes, line, ",")).to eq(expected_line)
      end
    end

    context 'replace_null_values' do
      let(:row) { ["t1", "t2", "", nil, "t3"] }
      let(:expected_row) { ["t1", "t2", "NULL", "NULL", "t3"] }

      it 'should replace null values' do
        expect(csv_sanitizer.send(:replace_null_values, row)).to eq(expected_row)
      end
    end
  end
end
@@ -0,0 +1,31 @@
1
+ # require 'spec_helper'
2
+
3
# Bare host class that CsvImportAnalyzer::Helper is mixed into for each example.
class DummyClass
end

describe 'null_like?' do
  let(:helper) { DummyClass.new.extend(CsvImportAnalyzer::Helper) }

  context 'when called on null like objects' do
    it 'returns NULL as null type' do
      outcome = helper.null_like?('NULL')
      expect(outcome).to eq(true)
    end

    it 'returns \\N as null type' do
      outcome = helper.null_like?('\N')
      expect(outcome).to eq(true)
    end
  end

  context 'when called on non-null objects' do
    it 'returns hello as not null' do
      outcome = helper.null_like?('Hello')
      expect(outcome).to eq(false)
    end

    it 'returns Fixnum(3) as not null' do
      outcome = helper.null_like?(3)
      expect(outcome).to eq(false)
    end
  end
end
@@ -0,0 +1,3 @@
1
+ # check_bounds = CsvImportAnalyzer::Helper::CsvCheckBounds.new({:year_id=>:int, :make_id=>:string, :model_id=>:string, :description_id=>:string, :price_id=>:float},
2
+ # {:delimiter => ",", :chunk => 20, :filename => "/home/avinash/Desktop/csv-import-analyzer/lib/csv-import-analyzer/sampleTab.csv"})
3
+ # check_bounds.get_min_max_values
@@ -0,0 +1,75 @@
1
+ # require 'spec_helper'
2
+ require 'date'
3
# Bare host class that CsvImportAnalyzer::DatatypeValidator is mixed into for each example.
class DummyClass
end

# #validate_field is expected to answer with the datatype name as a String:
# "int", "float", "date" or "string".
describe '#validate_field' do
  let(:validator) { DummyClass.new.extend(CsvImportAnalyzer::DatatypeValidator) }

  context 'knows what an integer looks like' do
    it 'returns Fixnum type as integer' do
      expect(validator.validate_field(10)).to eq("int")
    end

    it 'returns Fixnum type with spaces as integer' do
      expect(validator.validate_field(' 10 ')).to eq("int")
    end

    it 'returns Fixnum type with comma as integer' do
      expect(validator.validate_field('1,000')).to eq("int")
    end

    it 'returns Fixnum type negative number as integer' do
      expect(validator.validate_field(-3)).to eq("int")
    end
  end

  context 'knows what an Float looks like' do
    it 'returns Float type as float' do
      expect(validator.validate_field(10.0)).to eq("float")
    end

    it 'returns Float type with spaces as float' do
      expect(validator.validate_field(' 10.01 ')).to eq("float")
    end

    it 'returns Float type with comma as float' do
      expect(validator.validate_field('1,000.01')).to eq("float")
    end

    it 'returns Float type negative number as float' do
      expect(validator.validate_field(-3.3)).to eq("float")
    end
  end

  context 'it knows what a date looks like' do
    it 'return true for a valid date type - dd/mm/yyyy' do
      expect(validator.validate_field('31/12/2014')).to eq("date")
    end

    it 'return true for a valid date type - mm/dd/yyyy' do
      expect(validator.validate_field('12/31/2014')).to eq("date")
    end

    it 'return true for a valid date type - mm-dd-yyyy' do
      expect(validator.validate_field('12-31-2014')).to eq("date")
    end

    it 'return true for a valid date type - mm dd yyyy' do
      expect(validator.validate_field('12 31 2014')).to eq("date")
    end
  end

  context 'it knows what a String looks like' do
    it 'default to String type' do
      expect(validator.validate_field("100 testingNow:)")).to eq("string")
    end

    it 'returns String type as string' do
      expect(validator.validate_field("Hello")).to eq("string")
    end

    # Date-like prefixes followed by extra digits must not be taken for dates.
    it 'returns String type of dates as string' do
      expect(validator.validate_field("12 31 2014312")).to eq("string")
      expect(validator.validate_field("12-31-2014312")).to eq("string")
      expect(validator.validate_field("12/31/2014312")).to eq("string")
    end
  end
end