csv-import-analyzer 0.0.3 → 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -0
  3. data/README.md +8 -1
  4. data/csv-import-analyzer.gemspec +1 -1
  5. data/lib/csv-import-analyzer.rb +6 -4
  6. data/lib/csv-import-analyzer/analyzer/csv_check_bounds.rb +30 -19
  7. data/lib/csv-import-analyzer/analyzer/delimiter_identifier.rb +44 -24
  8. data/lib/csv-import-analyzer/analyzer/file_type_assertion.rb +1 -5
  9. data/lib/csv-import-analyzer/csv_datatype_analysis.rb +25 -9
  10. data/lib/csv-import-analyzer/csv_sanitizer.rb +67 -17
  11. data/lib/csv-import-analyzer/export/metadata_analysis.rb +63 -7
  12. data/lib/csv-import-analyzer/helpers/common_functions.rb +4 -0
  13. data/lib/csv-import-analyzer/helpers/datatype_validation.rb +6 -6
  14. data/lib/csv-import-analyzer/helpers/string_class_extensions.rb +9 -3
  15. data/lib/csv-import-analyzer/query_builder/mysql_query_helper.rb +2 -2
  16. data/lib/csv-import-analyzer/query_builder/pg_query_helper.rb +1 -2
  17. data/lib/csv-import-analyzer/query_builder/query_helper.rb +2 -2
  18. data/lib/csv-import-analyzer/sql_query_builder.rb +27 -12
  19. data/lib/csv-import-analyzer/version.rb +1 -1
  20. data/spec/csv-import-analyzer/analyzer/csv_check_bounds_spec.rb +8 -8
  21. data/spec/csv-import-analyzer/analyzer/delimiter_identifier_spec.rb +13 -13
  22. data/spec/csv-import-analyzer/csv_sanitizer_spec.rb +10 -7
  23. data/spec/csv-import-analyzer/helpers/common_functions_spec.rb +20 -19
  24. data/spec/csv-import-analyzer/helpers/datatype_validation_spec.rb +28 -28
  25. data/spec/csv-import-analyzer/helpers/string_class_extension_spec.rb +6 -6
  26. data/spec/csv-import-analyzer/query_builder/mysql_query_helper_spec.rb +13 -13
  27. data/spec/csv-import-analyzer/query_builder/pg_query_helper_spec.rb +16 -16
  28. data/spec/csv-import-analyzer_spec.rb +3 -6
  29. data/spec/fixtures/sample.csv +2 -2
  30. data/spec/spec_helper.rb +3 -0
  31. metadata +17 -6
  32. data/lib/csv-import-analyzer/sampleTab.csv +0 -5
  33. data/samples/metadata_output.json +0 -70
  34. data/spec/csv-import-analyzer/csv_datatype_analysis_spec.rb +0 -1
@@ -1,4 +1,3 @@
1
- require 'pry'
2
1
  require 'json'
3
2
  module CsvImportAnalyzer
4
3
  class MetadataAnalysis
@@ -48,6 +47,11 @@ module CsvImportAnalyzer
48
47
  @max_distinct_values ||= Integer(options[:unique]) + 1
49
48
  end
50
49
 
50
+ ###
51
+ # Builds the metadata from the analysis done so far
52
+ # Creates a new json file with the analysis added to it if options[:metadata_output] is set
53
+ # returns JSON object of the metadata
54
+ ###
51
55
  def metadata_print
52
56
  build_metadata_output
53
57
  if options[:metadata_output]
@@ -64,16 +68,22 @@ module CsvImportAnalyzer
64
68
 
65
69
  private
66
70
 
71
+ ###
72
+ # Create or overwrite the metadata_output.json file if it already exists
73
+ # Write the metadata to the file and close it
74
+ ###
67
75
  def json_print_to_file
68
76
  outfile = File.open("metadata_output.json", "w")
69
77
  outfile << JSON.pretty_generate(metadata)
70
78
  outfile.close
71
79
  end
72
80
 
73
- # Priniting that csv from json is a mess - How to make pretty print ?
81
+ ###
82
+ # Printing the metadata to csv - How to make sense of the csv print??
83
+ # TODO: ADD support for returning data analysis as csv file
84
+ ###
74
85
  def csv_print_to_file
75
86
  CSV.open("metadata_output.csv", "w") do |csv|
76
- binding.pry
77
87
  metadata.each do |key, value|
78
88
  if value.class == Hash
79
89
  csv << [key]
@@ -85,6 +95,10 @@ module CsvImportAnalyzer
85
95
  end
86
96
  end
87
97
 
98
+ ###
99
+ # Handle the key => value pairs to be printed as CSV files
100
+ # Recursively prints the key and value
101
+ ###
88
102
  def print_hash_to_csv(hash, csv_handler)
89
103
  if hash.class == Hash
90
104
  hash.each do |key, value|
@@ -96,6 +110,11 @@ module CsvImportAnalyzer
96
110
  end
97
111
  end
98
112
 
113
+ ###
114
+ # Build the metadata hash with the needed key value pairs
115
+ # Add the analysis data to @metadata instance variable
116
+ # E.g. metadata[:csv_file] means the metadata for csv file
117
+ ###
99
118
  def build_metadata_output
100
119
  metadata[:csv_file] = add_file_metadata
101
120
  metadata[:data_manipulations] = add_data_manipulations
@@ -103,16 +122,33 @@ module CsvImportAnalyzer
103
122
  metadata[:sql] = add_sql_data
104
123
  end
105
124
 
125
+ ###
126
+ # Metadata of the file
127
+ # adds the filename, file_path, record delimiter of the file along with processed file metadata
128
+ # Returns a hash of file data
129
+ ###
106
130
  def add_file_metadata
107
131
  file_data = {}
108
- file_data[:filename] = File.basename(options[:filename])
109
- file_data[:file_size] = File.size(options[:filename])
132
+ file_data[:filename] = File.basename(options[:original_filename])
133
+ file_data[:file_size] = File.size(options[:original_filename])
134
+ file_data[:record_delimiter] = options[:delimiter]
135
+
136
+ file_data[:processed_filename] = File.basename(options[:filename])
137
+ file_data[:processed_file_path] = options[:filename]
138
+ file_data[:processed_file_size] = File.size(options[:filename])
139
+ file_data[:error_report] = options[:temp_file]
110
140
  # file_data[:rows] = options[:rows]
111
141
  # file_data[:columns] = options[:columns]
112
- file_data[:record_delimiter] = options[:delimiter]
113
142
  return file_data
114
143
  end
115
144
 
145
+ ###
146
+ # Add the data manipulations done to the processed file
147
+ # Currently only two types of manipulations
148
+ # replace all the nulls and empty values with NULL
149
+ # replace single quotes with double quotes
150
+ # returns hash of data_manipulations
151
+ ###
116
152
  def add_data_manipulations
117
153
  data_manipulations = {}
118
154
  data_manipulations[:replace_nulls] = options[:replace_nulls]
@@ -120,6 +156,22 @@ module CsvImportAnalyzer
120
156
  return data_manipulations
121
157
  end
122
158
 
159
+ ###
160
+ # builds a columns hash with metadata of each column
161
+ # E.g
162
+ # "photo_id": {
163
+ # "datatype": "int", => Tells the datatype is int
164
+ # "datatype_analysis": { => gives the results of the datatype analysis done
165
+ # even though the column is determined to be int
166
+ # in reality it could have "int": 20, "float": "5"
167
+ # This would help the analyst to get a sense of the data later on
168
+ # "int": 20
169
+ # },
170
+ # "distinct_values": "11+" => Contains an array of distinct values,
171
+ # if they are less than the threshold set
172
+ # or
173
+ # [1, 2, 3]
174
+ # },
123
175
  def add_header_metadata
124
176
  columns = {}
125
177
  header_datatypes.keys.each do |column_name|
@@ -142,6 +194,10 @@ module CsvImportAnalyzer
142
194
  return columns
143
195
  end
144
196
 
197
+ ###
198
+ # Add the queries for each database type specified
199
+ # build an sql hash with both create and import statements
200
+ ###
145
201
  def add_sql_data
146
202
  sql = {}
147
203
  databases.each do |db|
@@ -153,4 +209,4 @@ module CsvImportAnalyzer
153
209
  end
154
210
 
155
211
  end
156
- end
212
+ end
@@ -1,5 +1,9 @@
1
1
  module CsvImportAnalyzer
2
2
  module Helper
3
+ ###
4
+ # To determine if a certain field in the dataset is of null type
5
+ # returns a boolean indicating whether it is null or not
6
+ ###
3
7
  def null_like?(value)
4
8
  if ["NULL", "Null", "NUll", "NULl", "null", nil, "", "NAN", "\\N"].include?(value)
5
9
  true
@@ -1,17 +1,16 @@
1
- require 'pry'
2
1
  module CsvImportAnalyzer
3
2
  module DatatypeValidator
4
3
 
5
4
  def validate_field(content)
6
- return get_datatype(content)
5
+ get_datatype(content)
7
6
  end
8
7
 
9
8
  private
9
+
10
10
  ###
11
- # Date.parse("12/31/20145234", "%m/%d/%Y") => true which is not supposed to be true (although technically its true)
11
+ # Date.parse("12/31/20145234", "%m/%d/%Y") => true, which is not supposed to be true (although technically it's true)
12
12
  # Validate year part has only 4 numbers in it
13
13
  ###
14
-
15
14
  def validate_year_date(field)
16
15
  date = nil
17
16
  formats = ["%d/%m/%Y","%d-%m-%Y","%d %m %Y","%m/%d/%Y","%m-%d-%Y","%m %d %Y"]
@@ -54,7 +53,8 @@ module CsvImportAnalyzer
54
53
  end
55
54
 
56
55
  ###
57
- #To determine the data-type of an input field
56
+ # To determine the data-type of an input field
57
+ # Returns the type of the field: either int, float, string, date or datetime
58
58
  ###
59
59
  def get_datatype(field)
60
60
  #Remove if field has any comma's for int and float rep
@@ -82,4 +82,4 @@ module CsvImportAnalyzer
82
82
  return "string"
83
83
  end
84
84
  end
85
- end
85
+ end
@@ -1,8 +1,14 @@
1
- require 'pry'
2
1
  class String
3
- #Extending string class to return the count of substr inside a string
2
+
3
+ ###
4
+ # Monkey patch string class to find the count of needle in haystack
5
+ # haystack is self => string in itself
6
+ # needle could be anything
7
+ # E.g.
8
+ # "hello, how, are, you".substr_count(",") => 3
9
+ ###
4
10
  def substr_count(needle)
5
11
  needle = "\\#{needle}" if(needle == '|') # To escape inside regex
6
12
  self.scan(/(#{needle})/).size
7
13
  end
8
- end
14
+ end
@@ -1,5 +1,5 @@
1
1
  require_relative "../helpers/errors"
2
- require 'pry'
2
+
3
3
  module CsvImportAnalyzer
4
4
  module MysqlQueryHelper
5
5
 
@@ -28,4 +28,4 @@ module CsvImportAnalyzer
28
28
  end
29
29
 
30
30
  end
31
- end
31
+ end
@@ -1,5 +1,4 @@
1
1
  require_relative "../helpers/errors"
2
- require "pry"
3
2
  module CsvImportAnalyzer
4
3
  module PgQueryHelper
5
4
 
@@ -24,4 +23,4 @@ module CsvImportAnalyzer
24
23
  end
25
24
 
26
25
  end
27
- end
26
+ end
@@ -1,5 +1,5 @@
1
1
  require_relative "../helpers/errors"
2
- require "pry"
2
+
3
3
  module CsvImportAnalyzer
4
4
  module QueryHelper
5
5
 
@@ -24,4 +24,4 @@ module CsvImportAnalyzer
24
24
  end
25
25
 
26
26
  end
27
- end
27
+ end
@@ -1,14 +1,15 @@
1
- require 'pry'
2
1
  require_relative "query_builder/mysql_query_helper"
3
2
  require_relative "query_builder/pg_query_helper"
4
3
  require_relative "export/metadata_analysis"
5
4
  module CsvImportAnalyzer
6
5
  class SqlQueryBuilder
7
- # include CsvImportAnalyzer::mysql_query_helper
6
+
8
7
  attr_accessor :create_query, :import_query, :csv_column_datatypes, :min_max_bounds, :nullable, :sql_helper_options
9
8
 
9
+ ###
10
10
  # Since Building SQL is dependent on multiple things,
11
11
  # decided to go with an arguments hash that gets passed when creating an object for the class
12
+ ###
12
13
  def initialize(args)
13
14
  @options = args
14
15
  @create_query = {}
@@ -33,8 +34,8 @@ module CsvImportAnalyzer
33
34
  end
34
35
 
35
36
  def tablename
36
- # May be optimize this, not run all three operations everytime filename method is called
37
- # May be creating filename as instance variable and using a double pipe will relive it from running everytime doesn't it?
37
+ # Maybe optimize this, to not run all three operations every time filename method is called ??
38
+ # Maybe creating filename as an instance variable and using a double pipe will relieve it from running every time, won't it??
38
39
  tablename = File.basename(options[:filename])
39
40
  tablename.gsub!(" ", "_")
40
41
  tablename.downcase!
@@ -53,6 +54,12 @@ module CsvImportAnalyzer
53
54
  @pg_helper
54
55
  end
55
56
 
57
+ ###
58
+ # Goes through each of the columns datatypes and prepares SQL statements for
59
+ # 1. Importing CSV files to database
60
+ # 2. Create table schema for the files
61
+ # Makes a function call to return the metadata analysis of the file
62
+ ###
56
63
  def generate_query
57
64
  databases.each do |db|
58
65
  create_query[db] = ["create table #{tablename} ("]
@@ -70,6 +77,10 @@ module CsvImportAnalyzer
70
77
 
71
78
  private
72
79
 
80
+ ###
81
+ # Based on the database type set in options
82
+ # returns query part for the header (column name)
83
+ ###
73
84
  def build_query_for_datatype(header, datatype)
74
85
  query = {}
75
86
  databases.each do |db|
@@ -87,6 +98,10 @@ module CsvImportAnalyzer
87
98
  return query
88
99
  end
89
100
 
101
+ ###
102
+ # based on database type set in options
103
+ # returns import query for the database
104
+ ###
90
105
  def prepare_import_csv
91
106
  databases.each do |db|
92
107
  if db == :mysql
@@ -97,6 +112,9 @@ module CsvImportAnalyzer
97
112
  end
98
113
  end
99
114
 
115
+ ###
116
+ # prepares sql statements based on the query for each header formed earlier
117
+ ###
100
118
  def prepare_sql_statements
101
119
  databases.each do |db|
102
120
  create_query[db][0] = create_query[db].first + " " + create_query[db][1]
@@ -106,6 +124,11 @@ module CsvImportAnalyzer
106
124
  end
107
125
  end
108
126
 
127
+ ###
128
+ # sets the create query and import queries in options
129
+ # these fields will be added to the metadata later
130
+ # instantiates MetadataAnalysis and passes options hash
131
+ ###
109
132
  def print_metadata_analysis
110
133
  options[:create_query] = create_query
111
134
  options[:import_query] = import_query
@@ -115,11 +138,3 @@ module CsvImportAnalyzer
115
138
 
116
139
  end
117
140
  end
118
-
119
- #Testing
120
- # args = {}
121
- # args[:options] = {:delimiter => ",", :chunk => 20, :filename => "/home/avinash/Desktop/csv-import-analyzer/lib/csv-import-analyzer/sampleTab"}
122
- # args[:column_datatypes] = {:year_id=>:int, :make_id=>:string, :model_id=>:string, :description_id=>:string, :price_id=>:float}
123
- # args[:nullable] = [:description_id]
124
- # query = CsvImportAnalyzer::SqlQueryBuilder.new(args)
125
- # puts query.generate_query
@@ -1,5 +1,5 @@
1
1
  module CsvImportAnalyzer
2
2
  module Version
3
- VERSION = "0.0.3"
3
+ VERSION = "0.0.4"
4
4
  end
5
5
  end
@@ -2,38 +2,38 @@
2
2
 
3
3
  describe CsvImportAnalyzer::CsvCheckBounds do
4
4
 
5
- describe '#get_min_max_values' do
6
- context 'when not initialized right' do
5
+ describe "#get_min_max_values" do
6
+ context "when not initialized right" do
7
7
  let(:options) {Hash[filename: "sample", chunk_size: 200, delimiter: ",", unique: 2]}
8
8
 
9
- it 'will fail gracefully if filename is nil' do
9
+ it "will fail gracefully if filename is nil" do
10
10
  @csv_check_bounds = CsvImportAnalyzer::CsvCheckBounds.new
11
11
  expect(@csv_check_bounds.get_min_max_values).to be_instance_of(MissingRequiredArguments)
12
12
  end
13
13
 
14
- it 'returns FileNotFound error if file is not found' do
14
+ it "returns FileNotFound error if file is not found" do
15
15
  @csv_check_bounds = CsvImportAnalyzer::CsvCheckBounds.new(options)
16
16
  expect(@csv_check_bounds.get_min_max_values).to be_instance_of(FileNotFound)
17
17
  end
18
18
  end
19
19
 
20
- context 'when initialized right' do
20
+ context "when initialized right" do
21
21
  let(:options) {Hash[filename: $sample_csv_path, chunk_size: 200, delimiter: ",", unique: 2, csv_column_datatypes: {:year_id => :int, :make_id => :string, :model_id => :string, :description_id => :string, :price_id => :float}]}
22
22
  before(:each) do
23
23
  @csv_check_bounds = CsvImportAnalyzer::CsvCheckBounds.new(options)
24
24
  end
25
25
 
26
- it 'returns a Hash' do
26
+ it "returns a Hash" do
27
27
  expect(@csv_check_bounds.get_min_max_values).to be_an_instance_of(Hash)
28
28
  end
29
29
 
30
- it 'returns correct min & max values for integer type' do
30
+ it "returns correct min & max values for integer type" do
31
31
  result = @csv_check_bounds.get_min_max_values
32
32
  expect(result[:min_max][:year_id][:min]).to eq(1996)
33
33
  expect(result[:min_max][:year_id][:max]).to eq(1999)
34
34
  end
35
35
 
36
- it 'returns correct min & max lengths for string type' do
36
+ it "returns correct min & max lengths for string type" do
37
37
  result = @csv_check_bounds.get_min_max_values
38
38
  expect(result[:min_max][:make_id][:min]).to eq(4)
39
39
  expect(result[:min_max][:make_id][:max]).to eq(7)
@@ -3,57 +3,57 @@
3
3
  class DummyClass
4
4
  end
5
5
 
6
- describe '#identify_delimiter' do
6
+ describe "#identify_delimiter" do
7
7
 
8
8
  before(:each) do
9
9
  @dummy_class = DummyClass.new
10
10
  @dummy_class.extend(CsvImportAnalyzer::DelimiterIdentifier)
11
11
  end
12
12
 
13
- context 'unable to determine the delimiter' do
13
+ context "unable to determine the delimiter" do
14
14
 
15
- it 'return invalid input when the input is neither string nor array' do
15
+ it "return invalid input when the input is neither string nor array" do
16
16
  expect(@dummy_class.identify_delimiter(3)).to be_instance_of(InvalidInput)
17
17
  end
18
18
 
19
- it 'returns file not found when the input string is not a valid file' do
19
+ it "returns file not found when the input string is not a valid file" do
20
20
  expect(@dummy_class.identify_delimiter("test")).to be_instance_of(FileNotFound)
21
21
  end
22
22
 
23
23
  end
24
24
 
25
- context 'finds the delimiter when the input is a file' do
25
+ context "finds the delimiter when the input is a file" do
26
26
 
27
- it 'returns a comma as the delimiter for sample_csv file' do
27
+ it "returns a comma as the delimiter for sample_csv file" do
28
28
  expect(@dummy_class.identify_delimiter($sample_csv_path)).to eq(",")
29
29
  end
30
30
 
31
- it 'returns a semicolon as the the delimiter for sample_ssv file' do
31
+ it "returns a semicolon as the the delimiter for sample_ssv file" do
32
32
  expect(@dummy_class.identify_delimiter($sample_ssv_path)).to eq(";")
33
33
  end
34
34
 
35
35
  end
36
36
 
37
- context 'finds the delimiter when the input is an array' do
37
+ context "finds the delimiter when the input is an array" do
38
38
  let(:sample) {['1999;Chevy;"Venture ""Extended Edition""";"";4900.00','1999;\'Chevy\';"Venture ""Extended Edition; Very Large""";;5000.00']}
39
- it 'returns a semicolon as the delimiter for sample array input' do
39
+ it "returns a semicolon as the delimiter for sample array input" do
40
40
  expect(@dummy_class.identify_delimiter(sample)).to eq(";")
41
41
  end
42
42
  end
43
43
  end
44
44
 
45
- describe '#return_plausible_delimiter' do
45
+ describe "#return_plausible_delimiter" do
46
46
  before(:each) do
47
47
  @dummy_class = DummyClass.new
48
48
  @dummy_class.extend(CsvImportAnalyzer::DelimiterIdentifier)
49
49
  end
50
50
 
51
- context 'identifies delimiter' do
52
- it 'returns comma as the delimiter by default' do
51
+ context "identifies delimiter" do
52
+ it "returns comma as the delimiter by default" do
53
53
  expect(@dummy_class.return_plausible_delimiter).to eq(",")
54
54
  end
55
55
 
56
- it 'returns semicolon as the delimiter for sample delimiter_count' do
56
+ it "returns semicolon as the delimiter for sample delimiter_count" do
57
57
  @dummy_class.stub(:delimiter_count).and_return(Hash[","=>15, ";"=>16, "\t"=>0, "|"=>0])
58
58
  expect(@dummy_class.return_plausible_delimiter).to eq(";")
59
59
  end