data_kit 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +18 -0
- data/.rspec +2 -0
- data/.travis.yml +7 -0
- data/Gemfile +6 -0
- data/Gemfile.lock +44 -0
- data/LICENSE +20 -0
- data/README.md +22 -0
- data/Rakefile +6 -0
- data/data_kit.gemspec +31 -0
- data/lib/data_kit/converters/boolean.rb +16 -0
- data/lib/data_kit/converters/date_time.rb +21 -0
- data/lib/data_kit/converters/integer.rb +24 -0
- data/lib/data_kit/converters/number.rb +28 -0
- data/lib/data_kit/csv/analysis.rb +69 -0
- data/lib/data_kit/csv/analyzer.rb +52 -0
- data/lib/data_kit/csv/converter.rb +62 -0
- data/lib/data_kit/csv/parser.rb +55 -0
- data/lib/data_kit/dataset/field.rb +58 -0
- data/lib/data_kit/dataset/schema.rb +21 -0
- data/lib/data_kit/patches/rcsv.rb +121 -0
- data/lib/data_kit/version.rb +3 -0
- data/lib/data_kit.rb +20 -0
- data/spec/converters/boolean_spec.rb +18 -0
- data/spec/converters/date_time_spec.rb +30 -0
- data/spec/converters/integer_spec.rb +20 -0
- data/spec/converters/number_spec.rb +20 -0
- data/spec/csv/analysis_spec.rb +55 -0
- data/spec/csv/analyzer_spec.rb +56 -0
- data/spec/csv/converter_spec.rb +35 -0
- data/spec/csv/parser_spec.rb +50 -0
- data/spec/dataset/field_spec.rb +95 -0
- data/spec/dataset/schema_spec.rb +22 -0
- data/spec/fixtures/carriage_returns.csv +1 -0
- data/spec/fixtures/standard.csv +11 -0
- data/spec/spec_helper.rb +18 -0
- metadata +193 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 6cb35e25f3fbf1a5444fbd581b8ae9225c038653
|
4
|
+
data.tar.gz: 8933a43e911a6e8c36c92d7fcb9ea004875c0cb6
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: abf4c32f7bc1c7d001d2acd90dfd23c652382f652bb1066205df76642855c77bb94abbeea6cd1c27cd39a7784f846f58f3cdd7be08e87c2edc9541cdacf3edbf
|
7
|
+
data.tar.gz: f248370d7f60840a9a82229409bf4d8316e3d5b40340bd6cf7e6cbe0dc40f226ec05813107f9a6bdee8607c81a4253fe7381525a11719904d43858c9cd54cf0c
|
data/.gitignore
ADDED
data/.rspec
ADDED
data/.travis.yml
ADDED
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,44 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
data_kit (0.0.1)
|
5
|
+
rcsv
|
6
|
+
timeliness
|
7
|
+
|
8
|
+
GEM
|
9
|
+
remote: https://rubygems.org/
|
10
|
+
specs:
|
11
|
+
codeclimate-test-reporter (0.2.0)
|
12
|
+
simplecov (>= 0.7.1, < 1.0.0)
|
13
|
+
diff-lcs (1.2.5)
|
14
|
+
docile (1.1.1)
|
15
|
+
multi_json (1.8.2)
|
16
|
+
profile (0.3.3)
|
17
|
+
rake (10.1.0)
|
18
|
+
rcsv (0.1.1)
|
19
|
+
rspec (2.14.1)
|
20
|
+
rspec-core (~> 2.14.0)
|
21
|
+
rspec-expectations (~> 2.14.0)
|
22
|
+
rspec-mocks (~> 2.14.0)
|
23
|
+
rspec-core (2.14.7)
|
24
|
+
rspec-expectations (2.14.4)
|
25
|
+
diff-lcs (>= 1.1.3, < 2.0)
|
26
|
+
rspec-mocks (2.14.4)
|
27
|
+
simplecov (0.8.2)
|
28
|
+
docile (~> 1.1.0)
|
29
|
+
multi_json
|
30
|
+
simplecov-html (~> 0.8.0)
|
31
|
+
simplecov-html (0.8.0)
|
32
|
+
timeliness (0.3.7)
|
33
|
+
|
34
|
+
PLATFORMS
|
35
|
+
ruby
|
36
|
+
|
37
|
+
DEPENDENCIES
|
38
|
+
bundler (~> 1.3)
|
39
|
+
codeclimate-test-reporter
|
40
|
+
data_kit!
|
41
|
+
profile
|
42
|
+
rake
|
43
|
+
rspec
|
44
|
+
simplecov
|
data/LICENSE
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright (c) 2013 Mode
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of
|
6
|
+
this software and associated documentation files (the "Software"), to deal in
|
7
|
+
the Software without restriction, including without limitation the rights to
|
8
|
+
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
|
9
|
+
the Software, and to permit persons to whom the Software is furnished to do so,
|
10
|
+
subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
13
|
+
copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
|
17
|
+
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
|
18
|
+
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
|
19
|
+
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
20
|
+
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Data Kit
|
2
|
+
==========
|
3
|
+
|
4
|
+
Library for ingesting, analyzing, cleaning and normalizing datasets
|
5
|
+
|
6
|
+
## Installation
|
7
|
+
|
8
|
+
### Standalone
|
9
|
+
|
10
|
+
From your terminal run:
|
11
|
+
|
12
|
+
$ gem install data_kit
|
13
|
+
|
14
|
+
### Bundler
|
15
|
+
|
16
|
+
Add this line to your application's Gemfile:
|
17
|
+
|
18
|
+
gem 'data_kit'
|
19
|
+
|
20
|
+
And then execute:
|
21
|
+
|
22
|
+
$ bundle
|
data/Rakefile
ADDED
data/data_kit.gemspec
ADDED
@@ -0,0 +1,31 @@
|
|
1
|
+
# coding: utf-8
# Gem specification for data_kit. The version is read from
# lib/data_kit/version.rb, so lib/ is put on the load path first.
lib_dir = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
require 'data_kit/version'

Gem::Specification.new do |gem|
  # Identity
  gem.name          = "data_kit"
  gem.version       = DataKit::VERSION
  gem.authors       = ["Mode Analytics"]
  gem.email         = ["support@modeanalytics.com"]
  gem.description   = "Library for ingesting, analyzing and normalizing datasets in various formats"
  gem.summary       = "Provides parsers, analyzers and converters for datasets stored in various formats"
  gem.homepage      = "http://www.modeanalytics.com/"
  gem.license       = "MIT"

  # Packaged files are whatever git tracks; executables and test files are
  # picked out of that list by path prefix.
  gem.files         = `git ls-files`.split($/)
  gem.executables   = gem.files.grep(%r{^bin/}) { |path| File.basename(path) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]

  # Runtime Dependencies
  gem.add_runtime_dependency "rcsv"
  gem.add_runtime_dependency "timeliness"

  # Development Dependencies
  gem.add_development_dependency "bundler", "~> 1.3"
  gem.add_development_dependency "rake"
  gem.add_development_dependency "profile"
  gem.add_development_dependency "rspec"
  gem.add_development_dependency "simplecov"
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
module DataKit
  module Converters
    # Recognizes and converts textual booleans. The accepted spellings are
    # "true", "t", "false" and "f", compared without regard to case.
    class Boolean
      # Returns true when +value+ (a String) is "true" or "t" in any letter
      # case, and false for every other string.
      def self.convert(value)
        %w[true t].include?(value.downcase)
      end

      # Returns true only when the whole of +value+ is one of the accepted
      # boolean spellings.
      def self.match?(value)
        # The pattern is anchored with \A, so a successful match always
        # starts at index 0; "matched at all" is the same test.
        !(value =~ /\A(true|t|false|f)\z/i).nil?
      end
    end
  end
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
require 'timeliness'
|
2
|
+
|
3
|
+
module DataKit
  module Converters
    # Date/time recognition and parsing, delegated to the Timeliness gem.
    class DateTime
      # Additional Date/Time Formats
      # Registered once, when this class body is evaluated.
      Timeliness.add_formats(:datetime, "yyyy-m-dTh:nn:ss")
      Timeliness.add_formats(:datetime, "m/d/yy h:nn:ss.u ampm")

      # Parses +value+ with Timeliness as a :datetime in the :utc zone and
      # returns whatever Timeliness.parse returns.
      # NOTE(review): the result for unparseable input is decided by
      # Timeliness and is not visible here - confirm before relying on it.
      def self.convert(value)
        Timeliness.parse(value, :datetime, :zone => :utc)
      end

      # Returns true when +value+ matches the regexp of at least one
      # :datetime format set that Timeliness offers for it.
      def self.match?(value)
        candidates = Timeliness::Definitions.format_sets(:datetime, value)
        candidates.any? { |format_set| value =~ format_set.regexp }
      end
    end
  end
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
module DataKit
  module Converters
    # Recognizes and converts integer values found in dataset cells.
    class Integer
      class << self
        # Converts +value+ to an Integer.
        #
        # String input is always read as base 10, so zero-padded numerals
        # such as "010" or "08" yield 10 and 8 instead of being treated as
        # octal, and radix-prefixed text such as "0x1A" is rejected.
        # Non-String input is handed to Kernel#Integer unchanged.
        #
        # Raises ArgumentError or TypeError when +value+ is not convertible.
        def convert(value)
          # Kernel#Integer only accepts an explicit base for String input.
          value.is_a?(String) ? Integer(value, 10) : Integer(value)
        end

        # Returns true when #convert would succeed for +value+, else false.
        def match?(value)
          convert(value)
          true
        rescue StandardError
          false
        end

        # Returns a copy of the String +value+ with every "," and "$"
        # removed.
        def reformat(value)
          value.delete(',$')
        end
      end
    end
  end
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
module DataKit
  module Converters
    # Recognizes and converts floating point values found in dataset cells.
    class Number
      # Converts +value+ with Kernel#Float; raises when it is not numeric.
      def self.convert(value)
        Float(value)
      end

      # Returns true when Kernel#Float accepts +value+, false when it raises.
      def self.match?(value)
        Float(value)
        true
      rescue
        false
      end

      # For String input, returns a copy with currency symbols (Unicode
      # category Sc) and commas removed. Any other input is returned as is.
      def self.reformat(value)
        return value unless value.is_a?(String)

        value.gsub(/(\p{Sc}|\,)/, '')
      end
    end
  end
end
|
@@ -0,0 +1,69 @@
|
|
1
|
+
module DataKit
  module CSV
    # Accumulates, per field, how many observed values were detected as each
    # type in Dataset::Field::Types, together with row and sample totals.
    class Analysis
      attr_reader :fields, :types, :row_count, :sample_count

      # +fields+ is the list of field names; every field starts with a zero
      # counter for every known type.
      def initialize(fields)
        @fields = fields
        @row_count = 0
        @sample_count = 0
        @types = {}

        fields.each do |field_name|
          counters = {}
          Dataset::Field::Types.each { |type| counters[type] = 0 }
          @types[field_name] = counters
        end
      end

      # Counts one more row seen.
      def increment_total
        @row_count += 1
      end

      # Counts one more row sampled.
      def increment_sample
        @sample_count += 1
      end

      # Records +value+ against +field_name+ under the type that
      # Dataset::Field.type? detects for it.
      def insert(field_name, value)
        detected = Dataset::Field.type?(value)
        @types[field_name][detected] += 1
      end

      # Returns a Hash of field name => resolved type for every field.
      def field_types
        fields.each_with_object({}) do |field_name, resolved|
          resolved[field_name] = type?(field_name)
        end
      end

      # Resolves one type for +field+: the only observed type when there is
      # exactly one; :number when nothing outside :integer, :number and :null
      # was observed (which includes the case of no observations at all);
      # otherwise :string.
      def type?(field)
        return type_list(field).first if has_single_type?(field)
        return :number if has_only_numeric_types?(field)

        :string
      end

      # Number of times +type+ was observed for +field+ (0 when unset).
      def type_count(field, type)
        types[field][type] || 0
      end

      # Types observed at least once for +field+.
      def type_list(field)
        types[field].keys.select { |type| type_count(field, type) > 0 }
      end

      # True when exactly one type was observed for +field+.
      def has_single_type?(field)
        type_list(field).length == 1
      end

      # True when no type other than :integer, :number or :null was observed.
      def has_only_numeric_types?(field)
        non_numeric = type_list(field) - [:integer, :number, :null]
        non_numeric.length == 0
      end
    end
  end
end
|
@@ -0,0 +1,52 @@
|
|
1
|
+
module DataKit
  module CSV
    # Walks the rows of a parsed CSV, feeding a random sample of them into an
    # Analysis.
    class Analyzer
      attr_accessor :csv, :keys, :sample_rate

      # Builds an analyzer from +csv+ and the :keys / :sample_rate entries of
      # +options+, runs it, and returns the resulting Analysis.
      def self.analyze(csv, options = {})
        new(csv, :keys => options[:keys], :sample_rate => options[:sample_rate]).execute
      end

      # Suggests a sample rate for an input of +file_size+: 1.0 below
      # 1024 * 1024, otherwise 500 / sqrt(file_size) rounded to 4 places.
      def self.sample_rate(file_size)
        return 1.0 if file_size < (1024 * 1024)

        scale_factor = 500
        (scale_factor / Math.sqrt(file_size)).round(4)
      end

      # +csv+ must respond to #headers and #each_row. +options+ may carry
      # :keys (default []) and :sample_rate (default 0.1). The keys are only
      # stored; nothing in this class reads them.
      def initialize(csv, options = {})
        @csv = csv
        @keys = options[:keys] || []
        @sample_rate = options[:sample_rate] || 0.1
      end

      # Counts every row and, for each row whose random draw is at or below
      # the sample rate, records every field value. Returns the Analysis.
      def execute
        random = Random.new
        analysis = Analysis.new(csv.headers)

        csv.each_row do |row|
          analysis.increment_total
          next unless random.rand <= sample_rate

          analysis.increment_sample
          row.keys.each do |field_name|
            analysis.insert(field_name.to_s, row[field_name])
          end
        end

        analysis
      end
    end
  end
end
|
@@ -0,0 +1,62 @@
|
|
1
|
+
require 'csv'
|
2
|
+
|
3
|
+
module DataKit
  module CSV
    # Rewrites a parsed CSV to +output_path+, normalizing every cell according
    # to the type the analysis resolved for its column.
    class Converter

      attr_accessor :csv
      attr_accessor :analysis
      attr_accessor :output_path

      # +csv+ must respond to #headers and #each_row, +analysis+ to
      # #field_types. +output_path+ is expanded to an absolute path.
      def initialize(csv, analysis, output_path)
        @csv = csv
        @analysis = analysis
        @output_path = File.expand_path(output_path)
      end

      # Writes the header row followed by one converted row per input row.
      # Errors raised by the individual converters are not rescued here.
      def execute
        ::CSV.open(output_path, 'wb') do |writer|
          writer << csv.headers
          csv.each_row do |row|
            converted = csv.headers.collect do |field_name|
              convert(row[field_name], field_types[field_name])
            end
            writer << converted
          end
        end
      end

      # Field name => type mapping, read from the analysis once and memoized.
      def field_types
        @field_types ||= analysis.field_types
      end

      class << self
        # Builds a converter, runs it, and returns the converter.
        def convert(csv, analysis, output_path)
          converter = new(csv, analysis, output_path)
          converter.execute
          converter
        end
      end

      private

      # Converts a single cell +value+ for a column of +type+.
      #
      # nil becomes "". :integer and :number values are stripped of currency
      # symbols and commas before conversion. :boolean and :datetime are
      # converted from the raw value, datetimes being rendered as
      # "%Y-%m-%dT%H:%M:%SZ". Every other type - :string, :null, the legacy
      # :empty, or anything unrecognized - is passed through as a String, so
      # a value in a column typed :null (for instance because only empty
      # cells were sampled) is kept rather than silently replaced by nil.
      def convert(value, type)
        return value.to_s if value.nil?

        case type
        when :integer
          Converters::Integer.convert(Converters::Number.reformat(value))
        when :number
          Converters::Number.convert(Converters::Number.reformat(value))
        when :boolean
          Converters::Boolean.convert(value)
        when :datetime
          Converters::DateTime.convert(value).strftime("%Y-%m-%dT%H:%M:%SZ")
        else
          value.to_s
        end
      end
    end
  end
end
|
@@ -0,0 +1,55 @@
|
|
1
|
+
require 'rcsv'
|
2
|
+
|
3
|
+
module DataKit
  module CSV
    # Reads a CSV from a path or an already open IO using Rcsv, exposing the
    # header row and a row iterator.
    class Parser
      # Encode streams from BINARY into UTF-8
      InternalEnc = Encoding.find("UTF-8")
      ExternalEnc = Encoding.find("BINARY")

      attr_reader :path, :handle, :headers

      # +path+ is either an IO, which is used directly, or something
      # File.open accepts. The header row is read immediately.
      def initialize(path)
        @path = path

        set_handle
        set_headers
      end

      # Rewinds the handle and yields each data row (header skipped) as a
      # hash whose keys are the aliases built in #columns.
      def each_row(&block)
        handle.rewind
        Rcsv.parse(handle, :header => :skip, :columns => columns, :row_as_hash => true) { |row| yield row }
      end

      private

      # Column index => { :alias => header name }, built once from the
      # headers and memoized.
      def columns
        @columns ||= begin
          mapping = {}
          headers.each_with_index do |field_name, position|
            mapping[position] = { :alias => field_name }
          end
          mapping
        end
      end

      # Uses +path+ itself when it is an IO, otherwise opens it as a file,
      # then applies the external/internal encodings declared above.
      def set_handle
        @handle = path.is_a?(IO) ? path : File.open(path)

        @handle.set_encoding(ExternalEnc, InternalEnc)
      end

      # Stores the first parsed row as the headers and stops parsing.
      def set_headers
        handle.rewind
        Rcsv.parse(handle, :header => :none) do |row|
          @headers = row
          break
        end
      end
    end
  end
end
|
@@ -0,0 +1,58 @@
|
|
1
|
+
module DataKit
  module Dataset
    # A named dataset column with a type and a key flag, plus class-level
    # helpers for detecting and converting cell values.
    class Field

      Types = [:string, :integer, :number, :datetime, :boolean, :null].freeze

      attr_accessor :name, :key, :type

      # Detects the type of +value+. nil is :null; otherwise the first match
      # wins among integer, number (both tested on the value with currency
      # symbols and commas stripped), boolean and datetime, falling back to
      # :string.
      def self.type?(value)
        return :null if value.nil?

        numeric_text = Converters::Number.reformat(value)

        return :integer if Converters::Integer.match?(numeric_text)
        return :number if Converters::Number.match?(numeric_text)
        return :boolean if Converters::Boolean.match?(value)
        return :datetime if Converters::DateTime.match?(value)

        :string
      end

      # Converts +value+ to +type+. Returns nil for the :null type or a nil
      # value, and value.to_s for any type without a dedicated converter.
      def self.convert(value, type)
        return nil if type == :null || value.nil?

        numeric_text = Converters::Number.reformat(value)

        if type == :integer
          Converters::Integer.convert(numeric_text)
        elsif type == :number
          Converters::Number.convert(numeric_text)
        elsif type == :boolean
          Converters::Boolean.convert(value)
        elsif type == :datetime
          Converters::DateTime.convert(value)
        else
          value.to_s
        end
      end

      # +options+ may carry :key (default false) and :type (default :string,
      # coerced with to_sym).
      def initialize(name, options = {})
        @name = name
        @key = options[:key] || false
        @type = (options[:type] || :string).to_sym
      end

      # True only when the key flag is exactly true.
      def key?
        key == true
      end

      # String-keyed hash of the field's name, type (as a String) and key flag.
      def to_hash
        {
          'name' => name,
          'type' => type.to_s,
          'key' => key?
        }
      end
    end
  end
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
require 'yaml'
|
2
|
+
|
3
|
+
module DataKit
  module Dataset
    # An ordered collection of fields.
    class Schema
      attr_accessor :fields

      # Starts with an empty field list.
      def initialize
        @fields = []
      end

      # The fields whose key? predicate is truthy.
      def keys
        fields.select(&:key?)
      end

      # YAML for the list of each field's to_hash representation.
      def to_yaml
        field_hashes = fields.collect { |field| field.to_hash }
        field_hashes.to_yaml
      end
    end
  end
end
|