csv-import-analyzer 0.0.6 → 0.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/CsvImportAnalyzer +59 -0
- data/csv-import-analyzer.gemspec +1 -1
- data/lib/csv-import-analyzer.rb +1 -0
- data/lib/csv-import-analyzer/csv_sanitizer.rb +1 -1
- data/lib/csv-import-analyzer/version.rb +1 -1
- metadata +5 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 7ff8951a5c2ea050bfbbebf39617d1d735a4f6ea
|
4
|
+
data.tar.gz: f00f91d6123a435c54a5585a7a3d014cae46c67b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 474adc7c9f7e0946c2c86a6c14395569c9574fd4865bc907f43614df39386336959f3cc1b3b71e51a7fa6d2844e8426bf8be22475345a5bceb153cf7473662e5
|
7
|
+
data.tar.gz: b87c638ac9a94fee9fcc357566c50517db1dd30e3bf3c3d158eab05d128d4e17f2965f1a1a37472e7846c807dd4030805e4ac3ea8f6bf8c543327ab02142d53a
|
@@ -0,0 +1,59 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'optparse'
|
3
|
+
require 'csv-import-analyzer'
|
4
|
+
|
5
|
+
# Default options for the executable
|
6
|
+
options = {:input => nil, :metadata_output => nil, :processed_input => nil, :unique => 5,
|
7
|
+
:chunk => 200, :database => [:mysql], :quote_convert => true, :replace_nulls => true, :check_bounds => true}
|
8
|
+
|
9
|
+
# Parse the options using optparse
|
10
|
+
# Parses the input given on the command line and sets the respective option
|
11
|
+
# E.g. CsvImportAnalyzer -i "test.csv"
|
12
|
+
# ==> options[:input] = "test.csv"
|
13
|
+
parser = OptionParser.new do |opts|
|
14
|
+
opts.banner = "Usage: CsvImportAnalyzer [options]"
|
15
|
+
|
16
|
+
opts.on('-i', '--input filename', 'Input file name') do |input|
|
17
|
+
options[:input] = input # todo: be able to handle files not in the current directory
|
18
|
+
end
|
19
|
+
opts.on('-m', '--output-structure filename', 'Output the metadata of file') do |metadata_output|
|
20
|
+
options[:metadata_output] = metadata_output
|
21
|
+
end
|
22
|
+
opts.on('-o', '--output-cleaned filename', 'Output the cleaned csv file name, defaults to current driectory proccessed_(filename).csv ') do |processed_input|
|
23
|
+
options[:processed_input] = processed_input
|
24
|
+
end
|
25
|
+
opts.on('-u', '--unique unique', 'No of Unique values you need, default: 10') do |unique|
|
26
|
+
options[:unique] = unique
|
27
|
+
end
|
28
|
+
opts.on('-c', '--chunk size', 'Chunk size for predecting datatypes, default: 200') do |chunk|
|
29
|
+
options[:chunk] = chunk
|
30
|
+
end
|
31
|
+
# opts.on('-s', '--skip lines', 'skip the number of lines at the top, default: 0') do |skip|
|
32
|
+
# options[:skip] = skip
|
33
|
+
# end
|
34
|
+
opts.on('-d', '--database type', 'MySQL or Postgres, Options: M or P, default: nil(print nothing)') do |database_type|
|
35
|
+
options[:database] = [database_type.upcase]
|
36
|
+
end
|
37
|
+
opts.on('-q', '--quotes conversion', 'Convert single quotes to double quotes, options: true or false, default: true') do |quote_convert|
|
38
|
+
options[:quote_convert] = quote_convert.upcase
|
39
|
+
end
|
40
|
+
opts.on('-r', '--replace nulls', 'replace empty, Null\'s, \N, NAN with NULL, options: true or false, default: true') do |replace_nulls|
|
41
|
+
options[:replace_nulls] = replace_nulls.upcase
|
42
|
+
end
|
43
|
+
opts.on('-h', '--help', 'Displays Help') do
|
44
|
+
puts opts
|
45
|
+
exit
|
46
|
+
end
|
47
|
+
end
|
48
|
+
parser.parse!
|
49
|
+
|
50
|
+
# Input validations
|
51
|
+
# Make sure a filename is given to the executable
|
52
|
+
filename = nil
|
53
|
+
if options[:input] == nil
|
54
|
+
print " Requires a valid input file name! \n"
|
55
|
+
puts parser
|
56
|
+
exit
|
57
|
+
end
|
58
|
+
|
59
|
+
puts CsvImportAnalyzer.process(options[:input], options)
|
data/csv-import-analyzer.gemspec
CHANGED
@@ -23,6 +23,6 @@ Gem::Specification.new do |spec|
|
|
23
23
|
spec.add_development_dependency "pry", "~> 0.10"
|
24
24
|
spec.add_development_dependency "rspec", "~> 3.0"
|
25
25
|
spec.add_development_dependency "simplecov", "~> 0.9"
|
26
|
-
|
26
|
+
|
27
27
|
spec.add_runtime_dependency "smarter_csv", "~> 1.0", ">= 1.0.17"
|
28
28
|
end
|
data/lib/csv-import-analyzer.rb
CHANGED
@@ -11,6 +11,7 @@ module CsvImportAnalyzer
|
|
11
11
|
# returns FileNotFound if given file is invalid
|
12
12
|
###
|
13
13
|
def process(filename, options = {})
|
14
|
+
return ArgumentError.new("A valid file needed to process") if filename.nil?
|
14
15
|
if File::exist?(filename)
|
15
16
|
CsvImportAnalyzer::CsvSanitizer.new().process(File.absolute_path(filename), options)
|
16
17
|
else
|
@@ -69,7 +69,7 @@ module CsvImportAnalyzer
|
|
69
69
|
{
|
70
70
|
:metadata_output => nil, # To be set if metadata needs to be printed to a file
|
71
71
|
:processed_input => nil, # To be set if processed input is needed
|
72
|
-
:unique =>
|
72
|
+
:unique => 5, # Threshold for the number of default values that need to be identified
|
73
73
|
:check_bounds => true, # Option to check for min - max bounds for each column [true => find the bounds]
|
74
74
|
:datatype_analysis => 200, # Number of rows to be sampled for datatype analysis
|
75
75
|
:chunk => 200, # Chunk size (no of rows) that needs to processed in-memory [Important not to load entire file into memory]
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: csv-import-analyzer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.6
|
4
|
+
version: 0.0.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Avinash Vallabhaneni
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-
|
11
|
+
date: 2014-11-04 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -104,7 +104,8 @@ description: Santize large csv files and help in predicting datatypes including
|
|
104
104
|
max values for easy import to SQL
|
105
105
|
email:
|
106
106
|
- avinash.vallab@gmail.com
|
107
|
-
executables: []
|
107
|
+
executables:
|
108
|
+
- CsvImportAnalyzer
|
108
109
|
extensions: []
|
109
110
|
extra_rdoc_files: []
|
110
111
|
files:
|
@@ -114,6 +115,7 @@ files:
|
|
114
115
|
- LICENSE.txt
|
115
116
|
- README.md
|
116
117
|
- Rakefile
|
118
|
+
- bin/CsvImportAnalyzer
|
117
119
|
- csv-import-analyzer.gemspec
|
118
120
|
- lib/csv-import-analyzer.rb
|
119
121
|
- lib/csv-import-analyzer/analyzer/csv_check_bounds.rb
|