csv-import-analyzer 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (47) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +14 -0
  3. data/.rspec +3 -0
  4. data/Gemfile +4 -0
  5. data/LICENSE.txt +22 -0
  6. data/README.md +35 -0
  7. data/Rakefile +2 -0
  8. data/csv-import-analyzer.gemspec +29 -0
  9. data/lib/csv-import-analyzer.rb +18 -0
  10. data/lib/csv-import-analyzer/analyzer/csv_check_bounds.rb +104 -0
  11. data/lib/csv-import-analyzer/analyzer/delimiter_identifier.rb +66 -0
  12. data/lib/csv-import-analyzer/analyzer/file_type_assertion.rb +29 -0
  13. data/lib/csv-import-analyzer/csv_datatype_analysis.rb +110 -0
  14. data/lib/csv-import-analyzer/csv_sanitizer.rb +86 -0
  15. data/lib/csv-import-analyzer/export/metadata_analysis.rb +156 -0
  16. data/lib/csv-import-analyzer/helpers/common_functions.rb +11 -0
  17. data/lib/csv-import-analyzer/helpers/datatype_validation.rb +85 -0
  18. data/lib/csv-import-analyzer/helpers/errors.rb +3 -0
  19. data/lib/csv-import-analyzer/helpers/string_class_extensions.rb +8 -0
  20. data/lib/csv-import-analyzer/query_builder/mysql_query_helper.rb +31 -0
  21. data/lib/csv-import-analyzer/query_builder/pg_query_helper.rb +27 -0
  22. data/lib/csv-import-analyzer/query_builder/query_helper.rb +27 -0
  23. data/lib/csv-import-analyzer/sampleTab.csv +5 -0
  24. data/lib/csv-import-analyzer/sql_query_builder.rb +125 -0
  25. data/lib/csv-import-analyzer/version.rb +5 -0
  26. data/lib/metadata_output.json +70 -0
  27. data/lib/sampleTab.csv +5 -0
  28. data/spec/csv-import-analyzer/analyzer/csv_check_bounds_spec.rb +43 -0
  29. data/spec/csv-import-analyzer/analyzer/delimiter_identifier_spec.rb +61 -0
  30. data/spec/csv-import-analyzer/analyzer/file_type_assertion_spec.rb +0 -0
  31. data/spec/csv-import-analyzer/csv_datatype_analysis_spec.rb +1 -0
  32. data/spec/csv-import-analyzer/csv_sanitizer_spec.rb +24 -0
  33. data/spec/csv-import-analyzer/export/metadata_analysis_spec.rb +0 -0
  34. data/spec/csv-import-analyzer/helpers/common_functions_spec.rb +31 -0
  35. data/spec/csv-import-analyzer/helpers/csv_check_bounds_spec.rb +3 -0
  36. data/spec/csv-import-analyzer/helpers/datatype_validation_spec.rb +75 -0
  37. data/spec/csv-import-analyzer/helpers/mysql_query_helper_spec.rb +0 -0
  38. data/spec/csv-import-analyzer/helpers/pq_query_helper_spec.rb +0 -0
  39. data/spec/csv-import-analyzer/helpers/string_class_extension_spec.rb +18 -0
  40. data/spec/csv-import-analyzer/query_builder/mysql_query_helper_spec.rb +54 -0
  41. data/spec/csv-import-analyzer/query_builder/pg_query_helper_spec.rb +55 -0
  42. data/spec/csv-import-analyzer_spec.rb +14 -0
  43. data/spec/fixtures/sample.csv +5 -0
  44. data/spec/fixtures/sample_options.yml +11 -0
  45. data/spec/fixtures/semicolon-sample.csv +5 -0
  46. data/spec/spec_helper.rb +84 -0
  47. metadata +208 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: d57394db99e42dd7154c4c1506d94fa7310311db
4
+ data.tar.gz: 3b07ff8c14728eb61ceca8ecf107b1be6dea3344
5
+ SHA512:
6
+ metadata.gz: 4942b31ef92123e2fbedbe1b73eb1aa2814ed459d2b875f4435fb607eb4c072d7f10164bae285ee4ed1af0404ccebab73194526eacb786f9422a3c3540e8f66b
7
+ data.tar.gz: ad717f58dac4d563f6cf1edfb89ec14ca6299c18e6714b45fa1ccf6f78d93f758f7fd66952b76fe52cab07c414405070523c16446f98a45644f7209b3087915f
data/.gitignore ADDED
@@ -0,0 +1,14 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /Gemfile.lock
4
+ /_yardoc/
5
+ /coverage/
6
+ /doc/
7
+ /pkg/
8
+ /spec/reports/
9
+ /tmp/
10
+ *.bundle
11
+ *.so
12
+ *.o
13
+ *.a
14
+ mkmf.log
data/.rspec ADDED
@@ -0,0 +1,3 @@
1
+ --color
2
+ --require spec_helper
3
+ --format documentation
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in csv-import-analyzer.gemspec
4
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2014 avinash vallabhaneni
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,35 @@
1
+ # Csv::Import::Analyzer
2
+
3
+ Perform datatype analysis on desired chunk
4
+ Calculate min-max bounds for each column
5
+ Determine which columns are nullable in the csv file
6
+
7
+ Note: This gem expects the first line of the csv file to be a definitive header row, i.e. the column names to use if the csv file is imported into a database.
8
+
9
+ ## Installation
10
+
11
+ Add this line to your application's Gemfile:
12
+
13
+ ```ruby
14
+ gem 'csv-import-analyzer'
15
+ ```
16
+
17
+ And then execute:
18
+
19
+ $ bundle
20
+
21
+ Or install it yourself as:
22
+
23
+ $ gem install csv-import-analyzer
24
+
25
+ ## Usage
26
+
27
+ TODO: Write usage instructions here
28
+
29
+ ## Contributing
30
+
31
+ 1. Fork it ( https://github.com/avinash-vllbh/csv-import-analyzer/fork )
32
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
33
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
34
+ 4. Push to the branch (`git push origin my-new-feature`)
35
+ 5. Create a new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,2 @@
1
+ require "bundler/gem_tasks"
2
+
@@ -0,0 +1,29 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'csv-import-analyzer/version'
5
+
6
# Gem specification for csv-import-analyzer: identity, packaged files and
# dependency declarations.
Gem::Specification.new do |gem|
  # --- Identity and descriptive metadata ---
  gem.name        = "csv-import-analyzer"
  gem.version     = CsvImportAnalyzer::Version::VERSION
  gem.authors     = ["avinash vallabhaneni"]
  gem.email       = ["avinash.vallab@gmail.com"]
  gem.description = %q{Santize large csv files and help in predicting datatypes including min max values for easy import to SQL}
  gem.summary     = %q{To process large csv files and predict valid datatypes of each column for easy import into SQL}
  gem.homepage    = "http://rubygems.org/gems/csv-import-analyzer"
  gem.license     = "MIT"

  # --- Packaged files: everything tracked by git (NUL-separated listing) ---
  gem.files         = `git ls-files -z`.split("\x0")
  gem.executables   = gem.files.grep(%r{^bin/}).map { |path| File.basename(path) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]

  # --- Dependencies needed only while developing the gem ---
  {
    "bundler"   => "~> 1.6",
    "rake"      => "~> 10",
    "pry"       => "~> 0.10",
    "rspec"     => "~> 3.0",
    "simplecov" => "~> 0.9"
  }.each do |dependency, requirement|
    gem.add_development_dependency dependency, requirement
  end

  # --- Dependencies required at runtime ---
  gem.add_runtime_dependency "smarter_csv", "~> 1.0.17"
  gem.add_runtime_dependency "roo", "~> 1.13"
end
@@ -0,0 +1,18 @@
1
+ require 'pry'
2
+ require_relative "csv-import-analyzer/csv_sanitizer"
3
+ require_relative "csv-import-analyzer/helpers/errors"
4
module CsvImportAnalyzer
  # Makes the instance methods below callable directly on the module,
  # e.g. CsvImportAnalyzer.process(...).
  extend self

  # Entry point: hands an existing file over to CsvSanitizer for processing.
  #
  # filename - path to the file to analyze
  # options  - Hash of processing options, passed through untouched
  #
  # Returns whatever CsvSanitizer#process returns, or a FileNotFound
  # instance (returned, not raised) when the path does not exist.
  def process(filename, options = {})
    return FileNotFound.new unless File.exist?(filename)

    sanitizer = CsvImportAnalyzer::CsvSanitizer.new
    sanitizer.process(File.absolute_path(filename), options)
  end
end
16
+
17
# Demo invocation against a bundled sample file. Guarded so that it only runs
# when this file is executed directly; merely requiring the library must not
# trigger an analysis of a hard-coded file.
if __FILE__ == $PROGRAM_NAME
  CsvImportAnalyzer.process("sampleTab.csv", {:metadata_output => true, :unique => 2})
  # CsvImportAnalyzer.process("sampleTab.csv", {:metadata_output => true, :out_format => :csv})
end
@@ -0,0 +1,104 @@
1
+ require 'smarter_csv'
2
+ require 'pry'
3
+ require_relative "../helpers/common_functions"
4
+ require_relative "../helpers/errors"
5
+
6
module CsvImportAnalyzer
  # Walks a CSV file and collects, per column:
  #   * min/max bounds (string length for :string columns, raw value otherwise)
  #   * a capped list of distinct values
  #   * which columns contain null-like values
  class CsvCheckBounds
    include CsvImportAnalyzer::Helper

    attr_accessor :min_max_bounds, :distinct_values, :csv_column_datatypes, :options, :nullable
    # The reader is defined explicitly below (lazy default), so only the
    # writer is generated here.
    attr_writer :max_distinct_values

    # options - Hash; keys read by this class are :csv_column_datatypes,
    #           :nullable, :filename, :chunk_size, :delimiter and :unique.
    def initialize(options = {})
      @csv_column_datatypes = options[:csv_column_datatypes]
      @options = options
      @min_max_bounds = {}
      @distinct_values = {}
      @nullable = options[:nullable] || []
    end

    # Path of the CSV file to scan (options[:filename]).
    def filename
      options[:filename]
    end

    # Chunk size handed to SmarterCSV (options[:chunk_size]).
    # NOTE(review): CsvDatatypeAnalysis reads options[:chunk] instead - confirm
    # which key the caller actually sets.
    def chunk_size
      options[:chunk_size]
    end

    # Column separator handed to SmarterCSV (options[:delimiter]).
    def delimiter
      options[:delimiter]
    end

    # Threshold used when collecting distinct values: options[:unique] + 1,
    # unless a value was assigned through the writer. Integer() raises when
    # options[:unique] is nil or not numeric.
    def max_distinct_values
      @max_distinct_values ||= Integer(options[:unique]) + 1
    end

    # Public interface - processes the CSV file for min & max values (and
    # distinct values) of each column.
    #
    # Returns {:min_max => Hash, :uniques => Hash} on success. When the
    # filename is missing or the file does not exist, an error object
    # (MissingRequiredArguments / FileNotFound) is returned rather than raised.
    def get_min_max_values
      return MissingRequiredArguments.new("valid filename is required to check bounds") if filename.nil?
      return FileNotFound.new unless File.exist?(filename)

      reader_options = {
        :col_sep => delimiter,
        :chunk_size => chunk_size,
        :remove_empty_values => false,
        :remove_zero_values => false
      }
      SmarterCSV.process(filename, reader_options) do |chunk|
        chunk.each do |row|
          row.each { |key, value| record_value(key, value) }
        end
      end
      { :min_max => min_max_bounds, :uniques => distinct_values }
    end

    private

    # Routes one cell: null-like values only flag the column as nullable,
    # everything else feeds the bounds and distinct-value collections.
    def record_value(key, value)
      if null_like?(value)
        nullable.push(key) unless nullable.include?(key)
      else
        process_min_max_for_column(key, value)
        process_distinct_values(key, value)
      end
    end

    # Quantity that is bounded for a column: the length of the value's string
    # form for :string columns, the value itself otherwise. to_s guards against
    # non-String cells in a string column (presumably numeric-looking cells
    # converted by the CSV reader), which have no #length.
    def bound_measure(key, value)
      csv_column_datatypes[key] == :string ? value.to_s.length : value
    end

    # Seeds the bounds of a column on its first value, then folds the value in.
    def process_min_max_for_column(key, value)
      if min_max_bounds[key].nil?
        seed = bound_measure(key, value)
        min_max_bounds[key] = { :min => seed, :max => seed }
      end
      add_bounds(key, value)
    end

    # Widens the stored min/max of the column when the value falls outside them.
    def add_bounds(key, value)
      measure = bound_measure(key, value)
      bounds = min_max_bounds[key]
      bounds[:min] = measure if measure < bounds[:min]
      bounds[:max] = measure if measure > bounds[:max]
    end

    # Collects distinct values per column; a column stops accepting new values
    # once its list has grown beyond max_distinct_values entries.
    def process_distinct_values(key, value)
      seen = distinct_values[key]
      if seen.nil?
        distinct_values[key] = [value]
      elsif seen.size <= max_distinct_values && !seen.include?(value)
        seen.push(value)
      end
    end
  end
end
@@ -0,0 +1,66 @@
1
+ require_relative "../helpers/string_class_extensions"
2
+ require 'pry'
3
+
4
module CsvImportAnalyzer
  # Guesses the column delimiter of CSV content by counting how often each
  # candidate delimiter occurs outside double-quoted values.
  module DelimiterIdentifier
    # Number of leading lines of a file that are sampled.
    SAMPLE_LINES = 4

    # Candidate delimiters. Their order doubles as the tie-break order because
    # return_plausible_delimiter picks the first key holding the maximum count.
    def delimiter
      @delimiter ||= [",", ";", "\t", "|"]
    end

    # Running tally: candidate delimiter => occurrences seen so far.
    def delimiter_count
      @delimiter_count ||= Hash[delimiter.map { |candidate| [candidate, 0] }]
    end

    # Returns all double-quoted segments of the line (quotes included) joined
    # into a single string.
    def getting_contents_of_quoted_values(input)
      input.scan(/".*?"/).join
    end

    # Adds the unquoted occurrences of every candidate delimiter in +line+ to
    # the running tally (total occurrences minus those inside quotes).
    # Relies on String#substr_count from the project's string extensions.
    def count_occurances_delimiter(line)
      quoted_part = getting_contents_of_quoted_values(line)
      delimiter_count.keys.each do |candidate|
        delimiter_count[candidate] += line.substr_count(candidate) - quoted_part.substr_count(candidate)
      end
    end

    # The candidate with the highest tally so far.
    def return_plausible_delimiter
      delimiter_count.key(delimiter_count.values.max)
    end

    # Identifies the delimiter of a file or of a sample of lines.
    #
    # filename_or_sample - String path to a file (its first SAMPLE_LINES lines
    #                      are inspected) or an Array of line strings.
    #
    # Returns the most frequent candidate delimiter, a FileNotFound instance
    # when a String path does not exist, or an InvalidInput instance for any
    # other input type (error objects are returned, not raised).
    def identify_delimiter(filename_or_sample)
      case filename_or_sample
      when String
        # File.exist? - the File.exists? alias was removed in Ruby 3.2.
        return FileNotFound.new unless File.exist?(filename_or_sample)

        reset_delimiter_count
        File.foreach(filename_or_sample).first(SAMPLE_LINES).each do |line|
          count_occurances_delimiter(line)
        end
        return_plausible_delimiter
      when Array
        reset_delimiter_count
        filename_or_sample.each { |line| count_occurances_delimiter(line) }
        return_plausible_delimiter
      else
        InvalidInput.new
      end
    end

    private

    # Start each identification from a clean tally so that an earlier call on
    # the same object cannot skew the result.
    def reset_delimiter_count
      @delimiter_count = nil
    end
  end
end
65
+
66
+ # puts CsvImportAnalyzer::DelimiterIdentifier.identify_delimiter("/home/avinash/Desktop/csv-import-analyzer/spec/fixtures/sample.csv")
@@ -0,0 +1,29 @@
1
+ # file_type_assertion.rb
2
+ require 'pry'
3
module CsvImportAnalyzer
  module Analyzer
    # Checks whether a file is of a type the analyzer can work with.
    # Only csv is recognized so far; the remaining methods are placeholders.
    class FileTypeAssertion
      # Inspects the extension of +filename+.
      #
      # Returns true for a ".csv" file and nil otherwise (an ".xlsx" file
      # additionally prints "xlsx").
      #
      # NOTE(review): the csv branch previously called
      # FileTypeAssertion.new("sampleTab.csv"), which raised ArgumentError
      # because this class defines no initializer taking arguments. Returning
      # true is an assumption about the intended result - confirm with callers.
      def check_file_type(filename)
        case File.extname(File.absolute_path(filename))
        when ".csv"
          true
        when ".xlsx"
          # Try adding support for non csv files - xlsx, xls in future
          puts "xlsx"
        else
          # return UnsupportedFileFormat.new
          nil
        end
      end

      # Placeholder - not implemented yet.
      def self.convert_excel_to_csv
      end

      # Placeholder - not implemented yet.
      def csv_clean
      end
    end
  end
end
@@ -0,0 +1,110 @@
1
+ require "smarter_csv"
2
+ require "tempfile"
3
+ require "pry"
4
+ require_relative "helpers/datatype_validation"
5
+ require_relative "analyzer/csv_check_bounds"
6
+ require_relative "helpers/common_functions"
7
+ require_relative "sql_query_builder"
8
+
9
module CsvImportAnalyzer
  # Tallies the possible datatypes of every column over the first chunk of a
  # CSV file, settles on one datatype per column and passes the enriched
  # options on to bounds checking and query generation.
  class CsvDatatypeAnalysis
    include CsvImportAnalyzer::Helper
    include CsvImportAnalyzer::DatatypeValidator

    attr_accessor :csv_column_datatypes, :nullable
    attr_reader :options

    # options - Hash; keys read here are :filename, :delimiter, :chunk and
    #           :check_bounds. Results are written back into the same Hash.
    def initialize(options)
      @options = options
      @csv_column_datatypes = {}
      @nullable = []
    end

    # Path of the CSV file being analyzed (options[:filename]).
    def filename
      options[:filename]
    end

    # Processes one chunk of the csv file, counting the possible datatypes of
    # each column, then finalizes a datatype per column and continues with
    # take_further_actions (whose result is returned).
    def datatype_analysis
      reader_settings = {
        :col_sep => delimiter,
        :chunk_size => chunk_size,
        :remove_empty_values => false,
        :remove_zero_values => false
      }
      SmarterCSV.process(filename, reader_settings) do |chunk|
        chunk.each do |row|
          row.each { |column, cell| inspect_cell(column, cell) }
        end
        break # only the first chunk is analyzed
      end

      # Keep a snapshot of the raw tallies; csv_column_datatypes is rewritten
      # by finalize_datatypes_for_csv right after.
      options[:csv_datatype_analysis] = csv_column_datatypes.clone
      finalize_datatypes_for_csv
      options[:csv_column_datatypes] = csv_column_datatypes
      options[:nullable] = nullable
      take_further_actions
    end

    private

    # Column separator handed to SmarterCSV (options[:delimiter]).
    def delimiter
      options[:delimiter]
    end

    # Chunk size handed to SmarterCSV (options[:chunk]).
    def chunk_size
      options[:chunk]
    end

    # Handles a single cell: null-like values mark the column as nullable,
    # any other value contributes one vote for its detected datatype.
    def inspect_cell(column, cell)
      if null_like?(cell)
        nullable.push(column) unless nullable.include?(column)
      else
        add_to_datatype(column, determine_dataype(cell).to_sym)
      end
    end

    # Thin wrapper around DatatypeValidator#validate_field so the dependency
    # on the validator mixin stays in one place.
    def determine_dataype(value)
      validate_field(value)
    end

    # Maintains the hash of hashes holding, per column, the number of times
    # each possible datatype has been seen.
    def add_to_datatype(key, datatype)
      tallies = (csv_column_datatypes[key] ||= {})
      tallies[datatype] = tallies.fetch(datatype, 0) + 1
    end

    # Settles the datatype of each column: :string as soon as at least one
    # value was a string, otherwise the datatype with the highest count.
    def finalize_datatypes_for_csv
      csv_column_datatypes.each do |column_name, possible_datatypes|
        csv_column_datatypes[column_name] =
          if possible_datatypes.has_key?(:string)
            :string
          else
            possible_datatypes.key(possible_datatypes.values.max)
          end
      end
    end

    # Runs the optional min/max bounds check (when options[:check_bounds] is
    # set) and then generates the query from the accumulated options.
    def take_further_actions
      if options[:check_bounds]
        bounds_report = CsvImportAnalyzer::CsvCheckBounds.new(options).get_min_max_values
        options[:min_max_bounds] = bounds_report[:min_max]
        options[:uniques] = bounds_report[:uniques]
      end
      CsvImportAnalyzer::SqlQueryBuilder.new(options).generate_query
    end
  end
end