csv-import-analyzer 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +14 -0
  3. data/.rspec +3 -0
  4. data/Gemfile +4 -0
  5. data/LICENSE.txt +22 -0
  6. data/README.md +35 -0
  7. data/Rakefile +2 -0
  8. data/csv-import-analyzer.gemspec +29 -0
  9. data/lib/csv-import-analyzer.rb +18 -0
  10. data/lib/csv-import-analyzer/analyzer/csv_check_bounds.rb +104 -0
  11. data/lib/csv-import-analyzer/analyzer/delimiter_identifier.rb +66 -0
  12. data/lib/csv-import-analyzer/analyzer/file_type_assertion.rb +29 -0
  13. data/lib/csv-import-analyzer/csv_datatype_analysis.rb +110 -0
  14. data/lib/csv-import-analyzer/csv_sanitizer.rb +86 -0
  15. data/lib/csv-import-analyzer/export/metadata_analysis.rb +156 -0
  16. data/lib/csv-import-analyzer/helpers/common_functions.rb +11 -0
  17. data/lib/csv-import-analyzer/helpers/datatype_validation.rb +85 -0
  18. data/lib/csv-import-analyzer/helpers/errors.rb +3 -0
  19. data/lib/csv-import-analyzer/helpers/string_class_extensions.rb +8 -0
  20. data/lib/csv-import-analyzer/query_builder/mysql_query_helper.rb +31 -0
  21. data/lib/csv-import-analyzer/query_builder/pg_query_helper.rb +27 -0
  22. data/lib/csv-import-analyzer/query_builder/query_helper.rb +27 -0
  23. data/lib/csv-import-analyzer/sampleTab.csv +5 -0
  24. data/lib/csv-import-analyzer/sql_query_builder.rb +125 -0
  25. data/lib/csv-import-analyzer/version.rb +5 -0
  26. data/lib/metadata_output.json +70 -0
  27. data/lib/sampleTab.csv +5 -0
  28. data/spec/csv-import-analyzer/analyzer/csv_check_bounds_spec.rb +43 -0
  29. data/spec/csv-import-analyzer/analyzer/delimiter_identifier_spec.rb +61 -0
  30. data/spec/csv-import-analyzer/analyzer/file_type_assertion_spec.rb +0 -0
  31. data/spec/csv-import-analyzer/csv_datatype_analysis_spec.rb +1 -0
  32. data/spec/csv-import-analyzer/csv_sanitizer_spec.rb +24 -0
  33. data/spec/csv-import-analyzer/export/metadata_analysis_spec.rb +0 -0
  34. data/spec/csv-import-analyzer/helpers/common_functions_spec.rb +31 -0
  35. data/spec/csv-import-analyzer/helpers/csv_check_bounds_spec.rb +3 -0
  36. data/spec/csv-import-analyzer/helpers/datatype_validation_spec.rb +75 -0
  37. data/spec/csv-import-analyzer/helpers/mysql_query_helper_spec.rb +0 -0
  38. data/spec/csv-import-analyzer/helpers/pq_query_helper_spec.rb +0 -0
  39. data/spec/csv-import-analyzer/helpers/string_class_extension_spec.rb +18 -0
  40. data/spec/csv-import-analyzer/query_builder/mysql_query_helper_spec.rb +54 -0
  41. data/spec/csv-import-analyzer/query_builder/pg_query_helper_spec.rb +55 -0
  42. data/spec/csv-import-analyzer_spec.rb +14 -0
  43. data/spec/fixtures/sample.csv +5 -0
  44. data/spec/fixtures/sample_options.yml +11 -0
  45. data/spec/fixtures/semicolon-sample.csv +5 -0
  46. data/spec/spec_helper.rb +84 -0
  47. metadata +208 -0
@@ -0,0 +1,5 @@
1
+ Year ID,Make ID,Model ID,Description ID,Price ID
2
+ 1997,Ford,,"ac, abs, moon","3000"
3
+ 1999,Chevy,"Venture ""Extended Edition""","",4900.00
4
+ 1999,"Chevy","Venture ""Extended Edition, Very Large""",,5000.00
5
+ 1996,Jeep,Grand Che'rokee,"MUST SELL!air, moon roof, loaded",4799.00
@@ -0,0 +1,125 @@
1
+ require 'pry'
2
+ require_relative "query_builder/mysql_query_helper"
3
+ require_relative "query_builder/pg_query_helper"
4
+ require_relative "export/metadata_analysis"
5
module CsvImportAnalyzer
  # Builds the "create table" statement and the CSV import statement for every
  # requested database, then hands both to MetadataAnalysis for output.
  class SqlQueryBuilder
    attr_accessor :create_query, :import_query, :csv_column_datatypes, :min_max_bounds, :nullable, :sql_helper_options
    attr_reader :options, :mysql_helper, :pg_helper

    # Building SQL depends on several inputs, so they arrive as one arguments hash.
    #
    # @param args [Hash] reads :filename, :delimiter, :database,
    #   :csv_column_datatypes (header => datatype) and :nullable (headers that
    #   may be null). The same hash is later extended with the generated
    #   queries and passed to MetadataAnalysis.
    def initialize(args)
      @options = args
      @create_query = {}
      @import_query = {}
      @csv_column_datatypes = args[:csv_column_datatypes]
      @nullable = args[:nullable]
      @sql_helper_options = {:tablename => tablename, :filename => @options[:filename], :delimiter => @options[:delimiter]}
      # Each helper module needs its own host object. Extending the unassigned
      # (nil) instance variables mixed both modules into NilClass, so the
      # helper extended last answered for every database.
      @mysql_helper = Object.new.extend(CsvImportAnalyzer::MysqlQueryHelper)
      @pg_helper = Object.new.extend(CsvImportAnalyzer::PgQueryHelper)
    end

    # @return [Array] the databases to generate SQL for; a single value or nil
    #   under :database is wrapped so callers can always iterate.
    def databases
      Array(options[:database])
    end

    # @return [String] path of the CSV file as given in the options
    def filename
      options[:filename]
    end

    # Derives the table name from the CSV file name: directory and file
    # extension are dropped, spaces become underscores, letters are downcased.
    # The extension is removed because "name.csv" is not a usable table name.
    #
    # @return [String]
    def tablename
      File.basename(options[:filename], ".*").tr(" ", "_").downcase
    end

    # @return [String] field delimiter as given in the options
    def delimiter
      options[:delimiter]
    end

    # Generates the create and import statements for every database and
    # forwards them to the metadata export.
    #
    # @return [Object] whatever MetadataAnalysis#metadata_print returns
    def generate_query
      databases.each do |db|
        create_query[db] = ["create table #{tablename} ("]
      end
      csv_column_datatypes.each do |header, datatype|
        build_query_for_datatype(header, datatype).each do |db, column_clause|
          create_query[db].push(column_clause)
        end
      end
      prepare_sql_statements
      prepare_import_csv
      print_metadata_analysis
    end

    private

    # Builds the column definition of one header for every database.
    # :mysql uses the MySQL helper; any other database uses the Postgres helper.
    #
    # @return [Hash] database => column definition
    def build_query_for_datatype(header, datatype)
      # A missing :nullable list means no column is nullable.
      suffix = (nullable || []).include?(header) ? "" : " not null"
      databases.each_with_object({}) do |db, query|
        helper = db == :mysql ? mysql_helper : pg_helper
        column = helper.form_query_for_datatype(header: header, datatype: datatype)
        # Build a new string instead of appending to the helper's return value.
        query[db] = "#{column}#{suffix}"
      end
    end

    # Fills import_query for the :mysql and :pg databases; other databases
    # get no import statement.
    def prepare_import_csv
      databases.each do |db|
        helper = { :mysql => mysql_helper, :pg => pg_helper }[db]
        next unless helper
        import_query[db] = helper.import_csv(tablename: tablename, filename: filename, delimiter: delimiter)
      end
    end

    # Collapses each database's list of fragments into one statement:
    # "<create table ... (> <col>, <col>);". Works with zero columns as well.
    def prepare_sql_statements
      databases.each do |db|
        table_clause, *column_clauses = create_query[db]
        create_query[db] = "#{table_clause} #{column_clauses.join(', ')});"
      end
    end

    # Stores the generated queries in the options hash and prints the metadata.
    def print_metadata_analysis
      options[:create_query] = create_query
      options[:import_query] = import_query
      export = CsvImportAnalyzer::MetadataAnalysis.new(options)
      export.metadata_print
    end

  end
end
118
+
119
+ #Testing
120
+ # args = {}
121
+ # args.merge!(:delimiter => ",", :chunk => 20, :filename => "/home/avinash/Desktop/csv-import-analyzer/lib/csv-import-analyzer/sampleTab")
122
+ # args[:csv_column_datatypes] = {:year_id=>:int, :make_id=>:string, :model_id=>:string, :description_id=>:string, :price_id=>:float}
123
+ # args[:nullable] = [:description_id]
124
+ # query = CsvImportAnalyzer::SqlQueryBuilder.new(args)
125
+ # puts query.generate_query
@@ -0,0 +1,5 @@
1
module CsvImportAnalyzer
  # Namespace holding the gem's release number.
  module Version
    # Current gem version. Frozen so the shared string cannot be mutated in place.
    VERSION = "0.0.1".freeze
  end
end
@@ -0,0 +1,70 @@
1
+ {
2
+ "csv_file": {
3
+ "filename": "sampleTab.csv",
4
+ "file_size": 276,
5
+ "record_delimiter": ","
6
+ },
7
+ "data_manipulations": {
8
+ "replace_nulls": true,
9
+ "replace_quotes": true
10
+ },
11
+ "csv_headers": {
12
+ "year_id": {
13
+ "datatype": "int",
14
+ "datatype_analysis": {
15
+ "int": 4
16
+ },
17
+ "distinct_values": [
18
+ 1997,
19
+ 1999,
20
+ 1996
21
+ ]
22
+ },
23
+ "make_id": {
24
+ "datatype": "string",
25
+ "datatype_analysis": {
26
+ "string": 4
27
+ },
28
+ "distinct_values": [
29
+ "Ford",
30
+ "Chevy",
31
+ "Jeep"
32
+ ]
33
+ },
34
+ "model_id": {
35
+ "datatype": "string",
36
+ "datatype_analysis": {
37
+ "string": 4
38
+ },
39
+ "distinct_values": "3+"
40
+ },
41
+ "description_id": {
42
+ "datatype": "string",
43
+ "datatype_analysis": {
44
+ "string": 2
45
+ },
46
+ "distinct_values": [
47
+ "ac, abs, moon",
48
+ "MUST SELL!air, moon roof, loaded"
49
+ ],
50
+ "nullable": true
51
+ },
52
+ "price_id": {
53
+ "datatype": "float",
54
+ "datatype_analysis": {
55
+ "float": 4
56
+ },
57
+ "distinct_values": "3+"
58
+ }
59
+ },
60
+ "sql": {
61
+ "pg": {
62
+ "create_query": "create table sampletab.csv ( year_id int not null, make_id varchar(255) not null, model_id varchar(255) not null, description_id varchar(255), price_id float not null);",
63
+ "import_query": "COPY sampletab.csv FROM '/home/avinash/Desktop/csv-import-analyzer/lib/sampleTab.csv' HEADER DELIMITER ',' CSV NULL AS 'NULL';"
64
+ },
65
+ "mysql": {
66
+ "create_query": "create table sampletab.csv ( year_id int not null, make_id varchar(255) not null, model_id varchar(255) not null, description_id varchar(255), price_id float not null);",
67
+ "import_query": "COPY sampletab.csv FROM '/home/avinash/Desktop/csv-import-analyzer/lib/sampleTab.csv' HEADER DELIMITER ',' CSV NULL AS 'NULL';"
68
+ }
69
+ }
70
+ }
data/lib/sampleTab.csv ADDED
@@ -0,0 +1,5 @@
1
+ Year ID,Make ID,Model ID,Description ID,Price ID
2
+ 1997,Ford,E350,"ac, abs, moon","3000.00"
3
+ 1999,Chevy,"Venture ""Extended Edition""",,4900.00
4
+ 1999,"Chevy","Venture ""Extended Edition, Very Large""","",5000.00
5
+ 1996,Jeep,Grand Che'rokee,"MUST SELL!air, moon roof, loaded",4799.00
@@ -0,0 +1,43 @@
1
+ #require spec_helper.rb
2
+
3
# Specs for CsvCheckBounds#get_min_max_values: error objects returned for bad
# construction arguments, and the min/max bounds reported for the sample CSV.
describe CsvImportAnalyzer::CsvCheckBounds do

  describe '#get_min_max_values' do
    context 'when not initialized right' do
      # "sample" does not point at an existing file.
      let(:options) { { filename: "sample", chunk_size: 200, delimiter: ",", unique: 2 } }

      it 'will fail gracefully if filename is nil' do
        bounds_checker = CsvImportAnalyzer::CsvCheckBounds.new
        expect(bounds_checker.get_min_max_values).to be_instance_of(MissingRequiredArguments)
      end

      it 'returns FileNotFound error if file is not found' do
        bounds_checker = CsvImportAnalyzer::CsvCheckBounds.new(options)
        expect(bounds_checker.get_min_max_values).to be_instance_of(FileNotFound)
      end
    end

    context 'when initialized right' do
      let(:column_datatypes) do
        {
          :year_id => :int,
          :make_id => :string,
          :model_id => :string,
          :description_id => :string,
          :price_id => :float
        }
      end
      let(:options) do
        {
          filename: $sample_csv_path,
          chunk_size: 200,
          delimiter: ",",
          unique: 2,
          csv_column_datatypes: column_datatypes
        }
      end
      let(:bounds_checker) { CsvImportAnalyzer::CsvCheckBounds.new(options) }
      let(:year_bounds) { bounds_checker.get_min_max_values[:min_max][:year_id] }
      let(:make_bounds) { bounds_checker.get_min_max_values[:min_max][:make_id] }

      it 'returns a Hash' do
        expect(bounds_checker.get_min_max_values).to be_an_instance_of(Hash)
      end

      it 'returns correct min & max values for integer type' do
        expect(year_bounds[:min]).to eq(1996)
        expect(year_bounds[:max]).to eq(1999)
      end

      it 'returns correct min & max lengths for string type' do
        expect(make_bounds[:min]).to eq(4)
        expect(make_bounds[:max]).to eq(7)
      end
    end
  end
end
@@ -0,0 +1,61 @@
1
+ # require 'spec_helper'
2
+
3
# Bare host object that the DelimiterIdentifier module is mixed into.
class DummyClass
end

# identify_delimiter accepts a file path or an array of lines and returns
# either the delimiter or an error object.
describe '#identify_delimiter' do

  let(:identifier) { DummyClass.new.extend(CsvImportAnalyzer::DelimiterIdentifier) }

  context 'unable to determine the delimiter' do

    it 'return invalid input when the input is neither string nor array' do
      expect(identifier.identify_delimiter(3)).to be_instance_of(InvalidInput)
    end

    it 'returns file not found when the input string is not a valid file' do
      expect(identifier.identify_delimiter("test")).to be_instance_of(FileNotFound)
    end

  end

  context 'finds the delimiter when the input is a file' do

    it 'returns a comma as the delimiter for sample_csv file' do
      expect(identifier.identify_delimiter($sample_csv_path)).to eq(",")
    end

    it 'returns a semicolon as the the delimiter for sample_ssv file' do
      expect(identifier.identify_delimiter($sample_ssv_path)).to eq(";")
    end

  end

  context 'finds the delimiter when the input is an array' do
    let(:sample) {['1999;Chevy;"Venture ""Extended Edition""";"";4900.00','1999;\'Chevy\';"Venture ""Extended Edition; Very Large""";;5000.00']}
    it 'returns a semicolon as the delimiter for sample array input' do
      expect(identifier.identify_delimiter(sample)).to eq(";")
    end
  end
end

describe '#return_plausible_delimiter' do
  let(:identifier) { DummyClass.new.extend(CsvImportAnalyzer::DelimiterIdentifier) }

  context 'identifies delimiter' do
    it 'returns comma as the delimiter by default' do
      expect(identifier.return_plausible_delimiter).to eq(",")
    end

    it 'returns semicolon as the delimiter for sample delimiter_count' do
      # allow/receive replaces the deprecated should-style `stub` call.
      allow(identifier).to receive(:delimiter_count).and_return({ "," => 15, ";" => 16, "\t" => 0, "|" => 0 })
      expect(identifier.return_plausible_delimiter).to eq(";")
    end
  end
end
@@ -0,0 +1 @@
1
+ # CsvImportAnalyzer::CsvDatatypeAnalysis.new({:delimiter => ",", :chunk => 20, :filename => "sampleTab.csv"}).datatype_analysis
@@ -0,0 +1,24 @@
1
+ # require 'spec_helper'
2
+
3
+ require 'pry'
4
# Specs for CsvSanitizer: the missing-file error and two private helpers.
describe CsvImportAnalyzer::CsvSanitizer do
  let(:csv_sanitizer) { CsvImportAnalyzer::CsvSanitizer.new }

  it 'should handle file not found issue' do
    expect(csv_sanitizer.process("sample.csv", {})).to be_instance_of(FileNotFound)
  end

  # Private methods are exercised through `send`. Each example gets its own
  # input and expected value; previously both examples shared the names
  # `test`/`res`, so the later definitions overrode the earlier ones.
  context 'testing private methods' do
    let(:quoted_line) { "\"t1\", 't2', \"t3\"" }
    let(:double_quoted_line) { "\"t1\", \"t2\", \"t3\"" }
    let(:row_with_blanks) { ["t1", "t2", "", nil, "t3"] }
    let(:row_with_nulls) { ["t1", "t2", "NULL", "NULL", "t3"] }

    # Left pending, as in the original spec.
    xit 'should replace single quotes to double' do
      expect(csv_sanitizer.send(:replace_line_single_quotes, quoted_line, ",")).to eq(double_quoted_line)
    end

    it 'should replace null values' do
      expect(csv_sanitizer.send(:replace_null_values, row_with_blanks)).to eq(row_with_nulls)
    end
  end

end
@@ -0,0 +1,31 @@
1
+ # require 'spec_helper'
2
+
3
# Bare host object that the Helper module is mixed into.
class DummyClass
end

# null_like? is expected to be true for null markers and false otherwise.
describe 'null_like?' do

  let(:helper_host) { DummyClass.new.extend(CsvImportAnalyzer::Helper) }

  context 'when called on null like objects' do
    it 'returns NULL as null type' do
      outcome = helper_host.null_like?('NULL')
      expect(outcome).to eq(true)
    end

    it 'returns \\N as null type' do
      outcome = helper_host.null_like?('\N')
      expect(outcome).to eq(true)
    end
  end

  context 'when called on non-null objects' do
    it 'returns hello as not null' do
      outcome = helper_host.null_like?('Hello')
      expect(outcome).to eq(false)
    end
    it 'returns Fixnum(3) as not null' do
      outcome = helper_host.null_like?(3)
      expect(outcome).to eq(false)
    end
  end
end
@@ -0,0 +1,3 @@
1
+ # check_bounds = CsvImportAnalyzer::Helper::CsvCheckBounds.new({:year_id=>:int, :make_id=>:string, :model_id=>:string, :description_id=>:string, :price_id=>:float},
2
+ # {:delimiter => ",", :chunk => 20, :filename => "/home/avinash/Desktop/csv-import-analyzer/lib/csv-import-analyzer/sampleTab.csv"})
3
+ # check_bounds.get_min_max_values
@@ -0,0 +1,75 @@
1
+ # require 'spec_helper'
2
+ require 'date'
3
# Bare host object that the DatatypeValidator module is mixed into.
class DummyClass
end

# validate_field is expected to name the datatype of a single field value
# as "int", "float", "date" or "string".
describe '#validate_field' do

  let(:validator) { DummyClass.new.extend(CsvImportAnalyzer::DatatypeValidator) }

  context 'knows what an integer looks like' do
    {
      'returns Fixnum type as integer' => 10,
      'returns Fixnum type with spaces as integer' => ' 10 ',
      'returns Fixnum type with comma as integer' => '1,000',
      'returns Fixnum type negative number as integer' => -3
    }.each do |description, field|
      it description do
        expect(validator.validate_field(field)).to eq("int")
      end
    end
  end

  context 'knows what an Float looks like' do
    {
      'returns Float type as float' => 10.0,
      'returns Float type with spaces as float' => ' 10.01 ',
      'returns Float type with comma as float' => '1,000.01',
      'returns Float type negative number as float' => -3.3
    }.each do |description, field|
      it description do
        expect(validator.validate_field(field)).to eq("float")
      end
    end
  end

  context 'it knows what a date looks like' do
    {
      'return true for a valid date type - dd/mm/yyyy' => '31/12/2014',
      'return true for a valid date type - mm/dd/yyyy' => '12/31/2014',
      'return true for a valid date type - mm-dd-yyyy' => '12-31-2014',
      'return true for a valid date type - mm dd yyyy' => '12 31 2014'
    }.each do |description, field|
      it description do
        expect(validator.validate_field(field)).to eq("date")
      end
    end
  end

  context 'it knows what a String looks like' do
    it 'default to String type' do
      expect(validator.validate_field("100 testingNow:)")).to eq("string")
    end
    it 'returns String type as string' do
      expect(validator.validate_field("Hello")).to eq("string")
    end
    # Date-like prefixes followed by extra digits must not be taken for dates.
    it 'returns String type of dates as string' do
      ["12 31 2014312", "12-31-2014312", "12/31/2014312"].each do |field|
        expect(validator.validate_field(field)).to eq("string")
      end
    end
  end
end