csv-import-analyzer 0.0.3 → 0.0.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (34) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -0
  3. data/README.md +8 -1
  4. data/csv-import-analyzer.gemspec +1 -1
  5. data/lib/csv-import-analyzer.rb +6 -4
  6. data/lib/csv-import-analyzer/analyzer/csv_check_bounds.rb +30 -19
  7. data/lib/csv-import-analyzer/analyzer/delimiter_identifier.rb +44 -24
  8. data/lib/csv-import-analyzer/analyzer/file_type_assertion.rb +1 -5
  9. data/lib/csv-import-analyzer/csv_datatype_analysis.rb +25 -9
  10. data/lib/csv-import-analyzer/csv_sanitizer.rb +67 -17
  11. data/lib/csv-import-analyzer/export/metadata_analysis.rb +63 -7
  12. data/lib/csv-import-analyzer/helpers/common_functions.rb +4 -0
  13. data/lib/csv-import-analyzer/helpers/datatype_validation.rb +6 -6
  14. data/lib/csv-import-analyzer/helpers/string_class_extensions.rb +9 -3
  15. data/lib/csv-import-analyzer/query_builder/mysql_query_helper.rb +2 -2
  16. data/lib/csv-import-analyzer/query_builder/pg_query_helper.rb +1 -2
  17. data/lib/csv-import-analyzer/query_builder/query_helper.rb +2 -2
  18. data/lib/csv-import-analyzer/sql_query_builder.rb +27 -12
  19. data/lib/csv-import-analyzer/version.rb +1 -1
  20. data/spec/csv-import-analyzer/analyzer/csv_check_bounds_spec.rb +8 -8
  21. data/spec/csv-import-analyzer/analyzer/delimiter_identifier_spec.rb +13 -13
  22. data/spec/csv-import-analyzer/csv_sanitizer_spec.rb +10 -7
  23. data/spec/csv-import-analyzer/helpers/common_functions_spec.rb +20 -19
  24. data/spec/csv-import-analyzer/helpers/datatype_validation_spec.rb +28 -28
  25. data/spec/csv-import-analyzer/helpers/string_class_extension_spec.rb +6 -6
  26. data/spec/csv-import-analyzer/query_builder/mysql_query_helper_spec.rb +13 -13
  27. data/spec/csv-import-analyzer/query_builder/pg_query_helper_spec.rb +16 -16
  28. data/spec/csv-import-analyzer_spec.rb +3 -6
  29. data/spec/fixtures/sample.csv +2 -2
  30. data/spec/spec_helper.rb +3 -0
  31. metadata +17 -6
  32. data/lib/csv-import-analyzer/sampleTab.csv +0 -5
  33. data/samples/metadata_output.json +0 -70
  34. data/spec/csv-import-analyzer/csv_datatype_analysis_spec.rb +0 -1
@@ -1,4 +1,3 @@
1
- require 'pry'
2
1
  require 'json'
3
2
  module CsvImportAnalyzer
4
3
  class MetadataAnalysis
@@ -48,6 +47,11 @@ module CsvImportAnalyzer
48
47
  @max_distinct_values ||= Integer(options[:unique]) + 1
49
48
  end
50
49
 
50
+ ###
51
+ # Builds the metadata from the analysis done so far
52
+ # Creates a new json file with the analysis added to it if options[:metadata_output] is set
53
+ # returns JSON object of the metadata
54
+ ###
51
55
  def metadata_print
52
56
  build_metadata_output
53
57
  if options[:metadata_output]
@@ -64,16 +68,22 @@ module CsvImportAnalyzer
64
68
 
65
69
  private
66
70
 
71
+ ###
72
+ # Create or overwrite the metadata_output.json file if it already exists
73
+ # Write the metadata to the file and close it
74
+ ###
67
75
  def json_print_to_file
68
76
  outfile = File.open("metadata_output.json", "w")
69
77
  outfile << JSON.pretty_generate(metadata)
70
78
  outfile.close
71
79
  end
72
80
 
73
- # Priniting that csv from json is a mess - How to make pretty print ?
81
+ ###
82
+ # Printing the metadata to csv - How to make sense of the csv print??
83
+ # TODO: ADD support for returning data analysis as csv file
84
+ ###
74
85
  def csv_print_to_file
75
86
  CSV.open("metadata_output.csv", "w") do |csv|
76
- binding.pry
77
87
  metadata.each do |key, value|
78
88
  if value.class == Hash
79
89
  csv << [key]
@@ -85,6 +95,10 @@ module CsvImportAnalyzer
85
95
  end
86
96
  end
87
97
 
98
+ ###
99
+ # Handle the key => value pairs to be printed as CSV files
100
+ # Recursively prints the key and value
101
+ ###
88
102
  def print_hash_to_csv(hash, csv_handler)
89
103
  if hash.class == Hash
90
104
  hash.each do |key, value|
@@ -96,6 +110,11 @@ module CsvImportAnalyzer
96
110
  end
97
111
  end
98
112
 
113
+ ###
114
+ # Build the metadata hash with the needed key value pairs
115
+ # Add the analysis data to @metadata instance variable
116
+ # E.g. metadata[:csv_file] means the metadata for csv file
117
+ ###
99
118
  def build_metadata_output
100
119
  metadata[:csv_file] = add_file_metadata
101
120
  metadata[:data_manipulations] = add_data_manipulations
@@ -103,16 +122,33 @@ module CsvImportAnalyzer
103
122
  metadata[:sql] = add_sql_data
104
123
  end
105
124
 
125
+ ###
126
+ # Metadata of the file
127
+ # adds the filename, file_path, record delimiter of the file along with processed file metadata
128
+ # Returns a hash of file data
129
+ ###
106
130
  def add_file_metadata
107
131
  file_data = {}
108
- file_data[:filename] = File.basename(options[:filename])
109
- file_data[:file_size] = File.size(options[:filename])
132
+ file_data[:filename] = File.basename(options[:original_filename])
133
+ file_data[:file_size] = File.size(options[:original_filename])
134
+ file_data[:record_delimiter] = options[:delimiter]
135
+
136
+ file_data[:processed_filename] = File.basename(options[:filename])
137
+ file_data[:processed_file_path] = options[:filename]
138
+ file_data[:processed_file_size] = File.size(options[:filename])
139
+ file_data[:error_report] = options[:temp_file]
110
140
  # file_data[:rows] = options[:rows]
111
141
  # file_data[:columns] = options[:columns]
112
- file_data[:record_delimiter] = options[:delimiter]
113
142
  return file_data
114
143
  end
115
144
 
145
+ ###
146
+ # Add the data manipulations done to the processed file
147
+ # Currently only two types of manipulations
148
+ # replace all the nulls and empty values with NULL
149
+ # replace single quotes with double quotes
150
+ # returns hash of data_manipulations
151
+ ###
116
152
  def add_data_manipulations
117
153
  data_manipulations = {}
118
154
  data_manipulations[:replace_nulls] = options[:replace_nulls]
@@ -120,6 +156,22 @@ module CsvImportAnalyzer
120
156
  return data_manipulations
121
157
  end
122
158
 
159
+ ###
160
+ # builds a columns hash with metadata of each column
161
+ # E.g
162
+ # "photo_id": {
163
+ # "datatype": "int", => Tells the datatype is int
164
+ # "datatype_analysis": { => gives the results of the datatype analysis done
165
+ # even though the column is determined to be int
166
+ # in reality it could have "int": 20, "float": "5"
167
+ # This would help the analyst to get a sense of data later on
168
+ # "int": 20
169
+ # },
170
+ # "distinct_values": "11+" => Contains an array of distinct values,
171
+ # if they are less than the threshold set
172
+ # or
173
+ # [1, 2, 3]
174
+ # },
123
175
  def add_header_metadata
124
176
  columns = {}
125
177
  header_datatypes.keys.each do |column_name|
@@ -142,6 +194,10 @@ module CsvImportAnalyzer
142
194
  return columns
143
195
  end
144
196
 
197
+ ###
198
+ # Add the queries for each database type specified
199
+ # build an sql hash with both create and import statements
200
+ ###
145
201
  def add_sql_data
146
202
  sql = {}
147
203
  databases.each do |db|
@@ -153,4 +209,4 @@ module CsvImportAnalyzer
153
209
  end
154
210
 
155
211
  end
156
- end
212
+ end
@@ -1,5 +1,9 @@
1
1
  module CsvImportAnalyzer
2
2
  module Helper
3
+ ###
4
+ # To determine if a certain field in the dataset is of null type
5
+ # returns a boolean indicating whether it's null or not
6
+ ###
3
7
  def null_like?(value)
4
8
  if ["NULL", "Null", "NUll", "NULl", "null", nil, "", "NAN", "\\N"].include?(value)
5
9
  true
@@ -1,17 +1,16 @@
1
- require 'pry'
2
1
  module CsvImportAnalyzer
3
2
  module DatatypeValidator
4
3
 
5
4
  def validate_field(content)
6
- return get_datatype(content)
5
+ get_datatype(content)
7
6
  end
8
7
 
9
8
  private
9
+
10
10
  ###
11
- # Date.parse("12/31/20145234", "%m/%d/%Y") => true which is not supposed to be true (although technically its true)
11
+ # Date.parse("12/31/20145234", "%m/%d/%Y") => true, which is not supposed to be true (although technically it's true)
12
12
  # Validate year part has only 4 numbers in it
13
13
  ###
14
-
15
14
  def validate_year_date(field)
16
15
  date = nil
17
16
  formats = ["%d/%m/%Y","%d-%m-%Y","%d %m %Y","%m/%d/%Y","%m-%d-%Y","%m %d %Y"]
@@ -54,7 +53,8 @@ module CsvImportAnalyzer
54
53
  end
55
54
 
56
55
  ###
57
- #To determine the data-type of an input field
56
+ # To determine the data-type of an input field
57
+ # Returns the type of the field: int, float, string, date or datetime
58
58
  ###
59
59
  def get_datatype(field)
60
60
  #Remove if field has any comma's for int and float rep
@@ -82,4 +82,4 @@ module CsvImportAnalyzer
82
82
  return "string"
83
83
  end
84
84
  end
85
- end
85
+ end
@@ -1,8 +1,14 @@
1
- require 'pry'
2
1
  class String
3
- #Extending string class to return the count of substr inside a string
2
+
3
+ ###
4
+ # Monkey patch string class to find the count of needle in haystack
5
+ # haystack is self => string in itself
6
+ # needle could be anything
7
+ # E.g.
8
+ # "hello, how, are, you".substr_count(",") => 3
9
+ ###
4
10
  def substr_count(needle)
5
11
  needle = "\\#{needle}" if(needle == '|') # To escape inside regex
6
12
  self.scan(/(#{needle})/).size
7
13
  end
8
- end
14
+ end
@@ -1,5 +1,5 @@
1
1
  require_relative "../helpers/errors"
2
- require 'pry'
2
+
3
3
  module CsvImportAnalyzer
4
4
  module MysqlQueryHelper
5
5
 
@@ -28,4 +28,4 @@ module CsvImportAnalyzer
28
28
  end
29
29
 
30
30
  end
31
- end
31
+ end
@@ -1,5 +1,4 @@
1
1
  require_relative "../helpers/errors"
2
- require "pry"
3
2
  module CsvImportAnalyzer
4
3
  module PgQueryHelper
5
4
 
@@ -24,4 +23,4 @@ module CsvImportAnalyzer
24
23
  end
25
24
 
26
25
  end
27
- end
26
+ end
@@ -1,5 +1,5 @@
1
1
  require_relative "../helpers/errors"
2
- require "pry"
2
+
3
3
  module CsvImportAnalyzer
4
4
  module QueryHelper
5
5
 
@@ -24,4 +24,4 @@ module CsvImportAnalyzer
24
24
  end
25
25
 
26
26
  end
27
- end
27
+ end
@@ -1,14 +1,15 @@
1
- require 'pry'
2
1
  require_relative "query_builder/mysql_query_helper"
3
2
  require_relative "query_builder/pg_query_helper"
4
3
  require_relative "export/metadata_analysis"
5
4
  module CsvImportAnalyzer
6
5
  class SqlQueryBuilder
7
- # include CsvImportAnalyzer::mysql_query_helper
6
+
8
7
  attr_accessor :create_query, :import_query, :csv_column_datatypes, :min_max_bounds, :nullable, :sql_helper_options
9
8
 
9
+ ###
10
10
  # Since Building SQL is dependent on multiple things,
11
11
  # decided to go with an arguments hash that gets passed when creating an object for the class
12
+ ###
12
13
  def initialize(args)
13
14
  @options = args
14
15
  @create_query = {}
@@ -33,8 +34,8 @@ module CsvImportAnalyzer
33
34
  end
34
35
 
35
36
  def tablename
36
- # May be optimize this, not run all three operations everytime filename method is called
37
- # May be creating filename as instance variable and using a double pipe will relive it from running everytime doesn't it?
37
+ # May be optimize this, not run all three operations everytime filename method is called ??
38
+ # Maybe creating filename as an instance variable and using a double pipe will relieve it from running every time, doesn't it??
38
39
  tablename = File.basename(options[:filename])
39
40
  tablename.gsub!(" ", "_")
40
41
  tablename.downcase!
@@ -53,6 +54,12 @@ module CsvImportAnalyzer
53
54
  @pg_helper
54
55
  end
55
56
 
57
+ ###
58
+ # Goes through each of the columns datatypes and prepares SQL statements for
59
+ # 1. Importing CSV files to database
60
+ # 2. Create table schema for the files
61
+ # Makes a function call to return the metadata analysis of the file
62
+ ###
56
63
  def generate_query
57
64
  databases.each do |db|
58
65
  create_query[db] = ["create table #{tablename} ("]
@@ -70,6 +77,10 @@ module CsvImportAnalyzer
70
77
 
71
78
  private
72
79
 
80
+ ###
81
+ # Based on the database type set in options
82
+ # returns query part for the header (column name)
83
+ ###
73
84
  def build_query_for_datatype(header, datatype)
74
85
  query = {}
75
86
  databases.each do |db|
@@ -87,6 +98,10 @@ module CsvImportAnalyzer
87
98
  return query
88
99
  end
89
100
 
101
+ ###
102
+ # based on database type set in options
103
+ # returns import query for the database
104
+ ###
90
105
  def prepare_import_csv
91
106
  databases.each do |db|
92
107
  if db == :mysql
@@ -97,6 +112,9 @@ module CsvImportAnalyzer
97
112
  end
98
113
  end
99
114
 
115
+ ###
116
+ # prepares sql statements based on the query for each header formed earlier
117
+ ###
100
118
  def prepare_sql_statements
101
119
  databases.each do |db|
102
120
  create_query[db][0] = create_query[db].first + " " + create_query[db][1]
@@ -106,6 +124,11 @@ module CsvImportAnalyzer
106
124
  end
107
125
  end
108
126
 
127
+ ###
128
+ # sets the create query and import queries in options
129
+ # these fields will be added to the metadata later
130
+ # instantiates MetadataAnalysis and passes options hash
131
+ ###
109
132
  def print_metadata_analysis
110
133
  options[:create_query] = create_query
111
134
  options[:import_query] = import_query
@@ -115,11 +138,3 @@ module CsvImportAnalyzer
115
138
 
116
139
  end
117
140
  end
118
-
119
- #Testing
120
- # args = {}
121
- # args[:options] = {:delimiter => ",", :chunk => 20, :filename => "/home/avinash/Desktop/csv-import-analyzer/lib/csv-import-analyzer/sampleTab"}
122
- # args[:column_datatypes] = {:year_id=>:int, :make_id=>:string, :model_id=>:string, :description_id=>:string, :price_id=>:float}
123
- # args[:nullable] = [:description_id]
124
- # query = CsvImportAnalyzer::SqlQueryBuilder.new(args)
125
- # puts query.generate_query
@@ -1,5 +1,5 @@
1
1
  module CsvImportAnalyzer
2
2
  module Version
3
- VERSION = "0.0.3"
3
+ VERSION = "0.0.4"
4
4
  end
5
5
  end
@@ -2,38 +2,38 @@
2
2
 
3
3
  describe CsvImportAnalyzer::CsvCheckBounds do
4
4
 
5
- describe '#get_min_max_values' do
6
- context 'when not initialized right' do
5
+ describe "#get_min_max_values" do
6
+ context "when not initialized right" do
7
7
  let(:options) {Hash[filename: "sample", chunk_size: 200, delimiter: ",", unique: 2]}
8
8
 
9
- it 'will fail gracefully if filename is nil' do
9
+ it "will fail gracefully if filename is nil" do
10
10
  @csv_check_bounds = CsvImportAnalyzer::CsvCheckBounds.new
11
11
  expect(@csv_check_bounds.get_min_max_values).to be_instance_of(MissingRequiredArguments)
12
12
  end
13
13
 
14
- it 'returns FileNotFound error if file is not found' do
14
+ it "returns FileNotFound error if file is not found" do
15
15
  @csv_check_bounds = CsvImportAnalyzer::CsvCheckBounds.new(options)
16
16
  expect(@csv_check_bounds.get_min_max_values).to be_instance_of(FileNotFound)
17
17
  end
18
18
  end
19
19
 
20
- context 'when initialized right' do
20
+ context "when initialized right" do
21
21
  let(:options) {Hash[filename: $sample_csv_path, chunk_size: 200, delimiter: ",", unique: 2, csv_column_datatypes: {:year_id => :int, :make_id => :string, :model_id => :string, :description_id => :string, :price_id => :float}]}
22
22
  before(:each) do
23
23
  @csv_check_bounds = CsvImportAnalyzer::CsvCheckBounds.new(options)
24
24
  end
25
25
 
26
- it 'returns a Hash' do
26
+ it "returns a Hash" do
27
27
  expect(@csv_check_bounds.get_min_max_values).to be_an_instance_of(Hash)
28
28
  end
29
29
 
30
- it 'returns correct min & max values for integer type' do
30
+ it "returns correct min & max values for integer type" do
31
31
  result = @csv_check_bounds.get_min_max_values
32
32
  expect(result[:min_max][:year_id][:min]).to eq(1996)
33
33
  expect(result[:min_max][:year_id][:max]).to eq(1999)
34
34
  end
35
35
 
36
- it 'returns correct min & max lengths for string type' do
36
+ it "returns correct min & max lengths for string type" do
37
37
  result = @csv_check_bounds.get_min_max_values
38
38
  expect(result[:min_max][:make_id][:min]).to eq(4)
39
39
  expect(result[:min_max][:make_id][:max]).to eq(7)
@@ -3,57 +3,57 @@
3
3
  class DummyClass
4
4
  end
5
5
 
6
- describe '#identify_delimiter' do
6
+ describe "#identify_delimiter" do
7
7
 
8
8
  before(:each) do
9
9
  @dummy_class = DummyClass.new
10
10
  @dummy_class.extend(CsvImportAnalyzer::DelimiterIdentifier)
11
11
  end
12
12
 
13
- context 'unable to determine the delimiter' do
13
+ context "unable to determine the delimiter" do
14
14
 
15
- it 'return invalid input when the input is neither string nor array' do
15
+ it "return invalid input when the input is neither string nor array" do
16
16
  expect(@dummy_class.identify_delimiter(3)).to be_instance_of(InvalidInput)
17
17
  end
18
18
 
19
- it 'returns file not found when the input string is not a valid file' do
19
+ it "returns file not found when the input string is not a valid file" do
20
20
  expect(@dummy_class.identify_delimiter("test")).to be_instance_of(FileNotFound)
21
21
  end
22
22
 
23
23
  end
24
24
 
25
- context 'finds the delimiter when the input is a file' do
25
+ context "finds the delimiter when the input is a file" do
26
26
 
27
- it 'returns a comma as the delimiter for sample_csv file' do
27
+ it "returns a comma as the delimiter for sample_csv file" do
28
28
  expect(@dummy_class.identify_delimiter($sample_csv_path)).to eq(",")
29
29
  end
30
30
 
31
- it 'returns a semicolon as the the delimiter for sample_ssv file' do
31
+ it "returns a semicolon as the the delimiter for sample_ssv file" do
32
32
  expect(@dummy_class.identify_delimiter($sample_ssv_path)).to eq(";")
33
33
  end
34
34
 
35
35
  end
36
36
 
37
- context 'finds the delimiter when the input is an array' do
37
+ context "finds the delimiter when the input is an array" do
38
38
  let(:sample) {['1999;Chevy;"Venture ""Extended Edition""";"";4900.00','1999;\'Chevy\';"Venture ""Extended Edition; Very Large""";;5000.00']}
39
- it 'returns a semicolon as the delimiter for sample array input' do
39
+ it "returns a semicolon as the delimiter for sample array input" do
40
40
  expect(@dummy_class.identify_delimiter(sample)).to eq(";")
41
41
  end
42
42
  end
43
43
  end
44
44
 
45
- describe '#return_plausible_delimiter' do
45
+ describe "#return_plausible_delimiter" do
46
46
  before(:each) do
47
47
  @dummy_class = DummyClass.new
48
48
  @dummy_class.extend(CsvImportAnalyzer::DelimiterIdentifier)
49
49
  end
50
50
 
51
- context 'identifies delimiter' do
52
- it 'returns comma as the delimiter by default' do
51
+ context "identifies delimiter" do
52
+ it "returns comma as the delimiter by default" do
53
53
  expect(@dummy_class.return_plausible_delimiter).to eq(",")
54
54
  end
55
55
 
56
- it 'returns semicolon as the delimiter for sample delimiter_count' do
56
+ it "returns semicolon as the delimiter for sample delimiter_count" do
57
57
  @dummy_class.stub(:delimiter_count).and_return(Hash[","=>15, ";"=>16, "\t"=>0, "|"=>0])
58
58
  expect(@dummy_class.return_plausible_delimiter).to eq(";")
59
59
  end