data_kit 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 6cb35e25f3fbf1a5444fbd581b8ae9225c038653
4
+ data.tar.gz: 8933a43e911a6e8c36c92d7fcb9ea004875c0cb6
5
+ SHA512:
6
+ metadata.gz: abf4c32f7bc1c7d001d2acd90dfd23c652382f652bb1066205df76642855c77bb94abbeea6cd1c27cd39a7784f846f58f3cdd7be08e87c2edc9541cdacf3edbf
7
+ data.tar.gz: f248370d7f60840a9a82229409bf4d8316e3d5b40340bd6cf7e6cbe0dc40f226ec05813107f9a6bdee8607c81a4253fe7381525a11719904d43858c9cd54cf0c
data/.gitignore ADDED
@@ -0,0 +1,18 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ coverage
6
+ InstalledFiles
7
+ lib/bundler/man
8
+ pkg
9
+ rdoc
10
+ spec/reports
11
+ test/tmp
12
+ test/version_tmp
13
+ tmp
14
+
15
+ # YARD artifacts
16
+ .yardoc
17
+ _yardoc
18
+ doc/
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --color
2
+ --format documentation
data/.travis.yml ADDED
@@ -0,0 +1,7 @@
1
+ language: ruby
2
+
3
+ rvm:
4
+ - 2.0.0
5
+
6
+ script:
7
+ - bundle exec rake
data/Gemfile ADDED
@@ -0,0 +1,6 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in data_kit.gemspec
4
+ gemspec
5
+
6
+ gem "codeclimate-test-reporter", group: :test, require: nil
data/Gemfile.lock ADDED
@@ -0,0 +1,44 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ data_kit (0.0.1)
5
+ rcsv
6
+ timeliness
7
+
8
+ GEM
9
+ remote: https://rubygems.org/
10
+ specs:
11
+ codeclimate-test-reporter (0.2.0)
12
+ simplecov (>= 0.7.1, < 1.0.0)
13
+ diff-lcs (1.2.5)
14
+ docile (1.1.1)
15
+ multi_json (1.8.2)
16
+ profile (0.3.3)
17
+ rake (10.1.0)
18
+ rcsv (0.1.1)
19
+ rspec (2.14.1)
20
+ rspec-core (~> 2.14.0)
21
+ rspec-expectations (~> 2.14.0)
22
+ rspec-mocks (~> 2.14.0)
23
+ rspec-core (2.14.7)
24
+ rspec-expectations (2.14.4)
25
+ diff-lcs (>= 1.1.3, < 2.0)
26
+ rspec-mocks (2.14.4)
27
+ simplecov (0.8.2)
28
+ docile (~> 1.1.0)
29
+ multi_json
30
+ simplecov-html (~> 0.8.0)
31
+ simplecov-html (0.8.0)
32
+ timeliness (0.3.7)
33
+
34
+ PLATFORMS
35
+ ruby
36
+
37
+ DEPENDENCIES
38
+ bundler (~> 1.3)
39
+ codeclimate-test-reporter
40
+ data_kit!
41
+ profile
42
+ rake
43
+ rspec
44
+ simplecov
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2013 Mode
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy of
6
+ this software and associated documentation files (the "Software"), to deal in
7
+ the Software without restriction, including without limitation the rights to
8
+ use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
9
+ the Software, and to permit persons to whom the Software is furnished to do so,
10
+ subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
17
+ FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
18
+ COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
19
+ IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
20
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,22 @@
1
+ Data Kit
2
+ ==========
3
+
4
+ Library for ingesting, analyzing, cleaning, and normalizing datasets
5
+
6
+ ## Installation
7
+
8
+ ### Standalone
9
+
10
+ From your terminal run:
11
+
12
+ $ gem install data_kit
13
+
14
+ ### Bundler
15
+
16
+ Add this line to your application's Gemfile:
17
+
18
+ gem 'data_kit'
19
+
20
+ And then execute:
21
+
22
+ $ bundle
data/Rakefile ADDED
@@ -0,0 +1,6 @@
1
+ require "bundler/gem_tasks"
2
+ require 'rspec/core/rake_task'
3
+
4
# Define the `spec` task using RSpec's rake integration.
RSpec::Core::RakeTask.new(:spec)

# A bare `rake` invocation runs the `spec` task.
task default: :spec
data/data_kit.gemspec ADDED
@@ -0,0 +1,31 @@
1
+ # coding: utf-8
2
# Put this gem's lib/ directory on the load path so the version constant can
# be required directly from the working tree.
lib_dir = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
require 'data_kit/version'

Gem::Specification.new do |spec|
  # Identity and metadata
  spec.name        = 'data_kit'
  spec.version     = DataKit::VERSION
  spec.authors     = ['Mode Analytics']
  spec.email       = ['support@modeanalytics.com']
  spec.description = 'Library for ingesting, analyzing and normalizing datasets in various formats'
  spec.summary     = 'Provides parsers, analyzers and converters for datasets stored in various formats'
  spec.homepage    = 'http://www.modeanalytics.com/'
  spec.license     = 'MIT'

  # Package contents: everything git tracks, split on the record separator.
  spec.files         = `git ls-files`.split($/)
  spec.executables   = spec.files.grep(%r{^bin/}) { |file| File.basename(file) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ['lib']

  # Runtime Dependencies
  %w[rcsv timeliness].each do |gem_name|
    spec.add_runtime_dependency gem_name
  end

  # Development Dependencies
  spec.add_development_dependency 'bundler', '~> 1.3'
  %w[rake profile rspec simplecov].each do |gem_name|
    spec.add_development_dependency gem_name
  end
end
@@ -0,0 +1,16 @@
1
module DataKit
  module Converters
    # Recognises and converts textual boolean values ("true"/"t"/"false"/"f",
    # case-insensitive).
    class Boolean
      class << self
        # Returns true when the downcased value is "true" or "t"; any other
        # string yields false.
        def convert(value)
          %w[true t].include?(value.downcase)
        end

        # Returns true only when the whole value is one of the recognised
        # boolean spellings, ignoring case.
        def match?(value)
          # The pattern is anchored with \A, so a successful match always
          # starts at index 0; a failed match yields nil.
          !(value =~ /\A(true|t|false|f)\z/i).nil?
        end
      end
    end
  end
end
@@ -0,0 +1,21 @@
1
+ require 'timeliness'
2
+
3
module DataKit
  module Converters
    # Detects and parses date/time text by delegating to the Timeliness gem.
    class DateTime
      # Additional Date/Time Formats, registered with Timeliness when this
      # class body is evaluated.
      Timeliness.add_formats(:datetime, "yyyy-m-dTh:nn:ss")
      Timeliness.add_formats(:datetime, "m/d/yy h:nn:ss.u ampm")

      class << self
        # Parses the value as a :datetime through Timeliness, passing
        # :zone => :utc. Returns whatever Timeliness.parse returns.
        def convert(value)
          Timeliness.parse(value, :datetime, zone: :utc)
        end

        # True when the regexp of at least one :datetime format set that
        # Timeliness offers for this value matches it.
        def match?(value)
          candidate_sets = Timeliness::Definitions.format_sets(:datetime, value)
          candidate_sets.any? { |format_set| value =~ format_set.regexp }
        end
      end
    end
  end
end
@@ -0,0 +1,24 @@
1
module DataKit
  module Converters
    # Recognises and converts integer values found in dataset text.
    class Integer
      class << self
        # Converts the value to an Integer.
        #
        # Strings are always read as base 10, so a zero-padded value such as
        # "010" becomes 10 (not octal 8) and "08" is accepted. Non-string
        # values are passed to Kernel#Integer unchanged.
        #
        # Raises ArgumentError/TypeError when the value is not an integer.
        def convert(value)
          parse(value)
        end

        # Returns true when #convert would succeed for the value, false when
        # it would raise.
        def match?(value)
          parse(value)
          true
        rescue StandardError
          false
        end

        # Strips thousands separators (",") and dollar signs from a string so
        # that values such as "$1,234" can be parsed.
        def reformat(value)
          value.delete(',$')
        end

        private

        # Kernel#Integer without a base honours radix prefixes ("0", "0x",
        # "0b"), which misreads zero-padded decimal text. An explicit base is
        # only permitted for String input, hence the type switch.
        def parse(value)
          value.is_a?(String) ? Integer(value, 10) : Integer(value)
        end
      end
    end
  end
end
@@ -0,0 +1,28 @@
1
module DataKit
  module Converters
    # Recognises and converts floating-point values found in dataset text.
    class Number
      class << self
        # Converts the value with Kernel#Float; raises when it is not numeric.
        def convert(value)
          Float(value)
        end

        # Returns true when Kernel#Float accepts the value, false when it
        # raises.
        def match?(value)
          Float(value)
          true
        rescue StandardError
          false
        end

        # Removes currency symbols (Unicode category Sc) and commas from
        # strings. Non-string values are returned untouched.
        def reformat(value)
          return value unless value.is_a?(String)

          value.gsub(/(\p{Sc}|\,)/, '')
        end
      end
    end
  end
end
@@ -0,0 +1,69 @@
1
module DataKit
  module CSV
    # Accumulates, for every field, how many inserted values were detected as
    # each of Dataset::Field::Types, and derives one overall type per field.
    class Analysis
      attr_reader :fields
      attr_reader :types
      attr_reader :row_count
      attr_reader :sample_count

      # fields - list of field names; each gets a zeroed counter for every
      #          entry in Dataset::Field::Types.
      def initialize(fields)
        @fields = fields
        @types = {}
        @row_count = 0
        @sample_count = 0

        fields.each do |field_name|
          @types[field_name] = {}
          Dataset::Field::Types.each do |type|
            @types[field_name][type] = 0
          end
        end
      end

      # Counts one more row seen.
      def increment_total
        @row_count += 1
      end

      # Counts one more row sampled.
      def increment_sample
        @sample_count += 1
      end

      # Records one value for the field under the type that
      # Dataset::Field.type? detects for it.
      def insert(field_name, value)
        @types[field_name][Dataset::Field.type?(value)] += 1
      end

      # Returns a hash of field name => overall type (see #type?).
      def field_types
        fields.each_with_object({}) do |field_name, result|
          result[field_name] = type?(field_name)
        end
      end

      # Overall type of a field:
      # - no values recorded            => :string
      # - exactly one type recorded     => that type
      # - only :integer/:number/:null   => :number
      # - anything else                 => :string
      def type?(field)
        observed = type_list(field)

        # With nothing recorded there is no evidence the field is numeric.
        # Without this guard the "only numeric types" check below is
        # vacuously true and the field would be reported as :number.
        return :string if observed.empty?
        return observed.first if observed.length == 1

        has_only_numeric_types?(field) ? :number : :string
      end

      # Number of values recorded for the field under the given type
      # (0 when the type has no counter).
      def type_count(field, type)
        types[field][type] || 0
      end

      # Types that have at least one recorded value for the field.
      def type_list(field)
        types[field].keys.select do |type|
          type_count(field, type) > 0
        end
      end

      # True when exactly one type has been recorded for the field.
      def has_single_type?(field)
        type_list(field).length == 1
      end

      # True when every recorded type is :integer, :number or :null. Also true
      # when nothing has been recorded.
      def has_only_numeric_types?(field)
        (type_list(field) - [:integer, :number, :null]).length == 0
      end
    end
  end
end
@@ -0,0 +1,52 @@
1
module DataKit
  module CSV
    # Walks every row of a CSV source and feeds a random sample of them into
    # an Analysis.
    class Analyzer
      attr_accessor :csv
      attr_accessor :keys
      attr_accessor :sample_rate

      # csv     - object exposing #headers and #each_row.
      # options - :keys (defaults to []) and :sample_rate (defaults to 0.1).
      def initialize(csv, options = {})
        @csv = csv
        @keys = options[:keys] || []
        @sample_rate = options[:sample_rate] || 0.1
      end

      # Counts every row and records the values of each row whose random draw
      # is <= sample_rate. Returns the populated Analysis.
      def execute
        sampler = Random.new
        report = Analysis.new(csv.headers)

        csv.each_row do |row|
          report.increment_total
          next unless sampler.rand <= sample_rate

          report.increment_sample
          row.keys.each do |field_name|
            report.insert(field_name.to_s, row[field_name])
          end
        end

        report
      end

      class << self
        # Builds an analyzer from the given options and runs it.
        def analyze(csv, options = {})
          new(csv, keys: options[:keys], sample_rate: options[:sample_rate]).execute
        end

        # Suggested sample rate for a file of the given size: 1.0 below
        # 1024 * 1024, otherwise 500 / sqrt(file_size) rounded to 4 places.
        def sample_rate(file_size)
          return 1.0 if file_size < (1024 * 1024)

          scale_factor = 500
          (scale_factor / Math.sqrt(file_size)).round(4)
        end
      end
    end
  end
end
@@ -0,0 +1,62 @@
1
+ require 'csv'
2
+
3
module DataKit
  module CSV
    # Rewrites a CSV source to a new file, converting each value according to
    # the field types reported by an analysis.
    class Converter

      attr_accessor :csv
      attr_accessor :analysis
      attr_accessor :output_path

      # csv         - object exposing #headers and #each_row.
      # analysis    - object exposing #field_types (field name => type).
      # output_path - destination file; expanded to an absolute path.
      def initialize(csv, analysis, output_path)
        @csv = csv
        @analysis = analysis
        @output_path = File.expand_path(output_path)
      end

      # Writes the header row followed by one converted row per source row.
      def execute
        ::CSV.open(output_path, 'wb') do |writer|
          writer << csv.headers
          csv.each_row do |row|
            writer << csv.headers.map do |field_name|
              convert(row[field_name], field_types[field_name])
            end
          end
        end
      end

      # Field name => type, computed once from the analysis.
      def field_types
        @field_types ||= analysis.field_types
      end

      class << self
        # Builds a converter, runs it, and returns the converter.
        def convert(csv, analysis, output_path)
          converter = new(csv, analysis, output_path)
          converter.execute
          converter
        end
      end

      private

      # Converts one value for output.
      #
      # nil values and non-convertible types are written as value.to_s. That
      # covers :string, :null (the symbol listed in Dataset::Field::Types),
      # the legacy :empty symbol this method used to test for, and any
      # unrecognised type. Previously a field typed :null fell through the
      # case statement, so a non-nil value was silently written as nil.
      #
      # Numeric types are stripped of currency symbols and commas first.
      # The underlying converters may raise for values that do not fit the
      # field's type.
      def convert(value, type)
        return value.to_s if value.nil?

        case type
        when :integer
          Converters::Integer.convert(Converters::Number.reformat(value))
        when :number
          Converters::Number.convert(Converters::Number.reformat(value))
        when :boolean
          Converters::Boolean.convert(value)
        when :datetime
          Converters::DateTime.convert(value).strftime("%Y-%m-%dT%H:%M:%SZ")
        else
          value.to_s
        end
      end
    end
  end
end
@@ -0,0 +1,55 @@
1
+ require 'rcsv'
2
+
3
module DataKit
  module CSV
    # Reads CSV rows from a path or an already-open IO using Rcsv.
    class Parser
      # Encode streams from BINARY into UTF-8
      InternalEnc = Encoding.find("UTF-8")
      ExternalEnc = Encoding.find("BINARY")

      attr_reader :path
      attr_reader :handle
      attr_reader :headers

      # path - a filesystem path, or an IO instance to read from directly.
      # Opens the handle and reads the header row immediately.
      def initialize(path)
        @path = path

        set_handle
        set_headers
      end

      # Rewinds the handle and yields every row after the header, parsed with
      # :row_as_hash and the column aliases built from the headers.
      def each_row(&block)
        handle.rewind

        parse_options = { :header => :skip, :columns => columns, :row_as_hash => true }
        Rcsv.parse(handle, parse_options) do |row|
          yield row
        end
      end

      private

      # Column position => { :alias => header name }, memoized.
      def columns
        @columns ||= headers.each_with_index.each_with_object({}) do |(field_name, position), mapping|
          mapping[position] = { :alias => field_name }
        end
      end

      # Uses the given IO as-is, otherwise opens the path; then applies the
      # external/internal encodings declared above.
      def set_handle
        @handle = path.is_a?(IO) ? path : File.open(path)
        @handle.set_encoding(ExternalEnc, InternalEnc)
      end

      # Captures the first parsed row as the headers and stops parsing.
      def set_headers
        handle.rewind
        Rcsv.parse(handle, :header => :none) do |row|
          @headers = row
          break
        end
      end
    end
  end
end
@@ -0,0 +1,58 @@
1
module DataKit
  module Dataset
    # A named dataset column with a type and a flag saying whether it is a key.
    # Also hosts the value-level type detection and conversion helpers.
    class Field

      Types = %i[string integer number datetime boolean null].freeze

      attr_accessor :name
      attr_accessor :key
      attr_accessor :type

      # name    - the field name.
      # options - :key (defaults to false) and :type (defaults to :string,
      #           coerced with to_sym).
      def initialize(name, options = {})
        @name = name
        @key  = options[:key] || false
        @type = (options[:type] || :string).to_sym
      end

      # True only when key is exactly true.
      def key?
        key == true
      end

      # Hash with string keys 'name', 'type' (as a string) and 'key'.
      def to_hash
        {
          'name' => name,
          'type' => type.to_s,
          'key'  => key?
        }
      end

      class << self
        # Detects the type of a single value. nil is :null; otherwise the
        # converters are tried in order integer, number, boolean, datetime,
        # with :string as the fallback. Numeric checks run on the value after
        # Converters::Number.reformat.
        def type?(value)
          return :null if value.nil?

          numeric_text = Converters::Number.reformat(value)

          return :integer  if Converters::Integer.match?(numeric_text)
          return :number   if Converters::Number.match?(numeric_text)
          return :boolean  if Converters::Boolean.match?(value)
          return :datetime if Converters::DateTime.match?(value)

          :string
        end

        # Converts a value to the given type. Returns nil for the :null type
        # or a nil value; unknown types yield value.to_s.
        def convert(value, type)
          return nil if type == :null || value.nil?

          numeric_text = Converters::Number.reformat(value)

          case type
          when :integer
            Converters::Integer.convert(numeric_text)
          when :number
            Converters::Number.convert(numeric_text)
          when :boolean
            Converters::Boolean.convert(value)
          when :datetime
            Converters::DateTime.convert(value)
          else
            value.to_s
          end
        end
      end
    end
  end
end
@@ -0,0 +1,21 @@
1
+ require 'yaml'
2
+
3
module DataKit
  module Dataset
    # An ordered collection of fields that can be serialised to YAML.
    class Schema
      attr_accessor :fields

      # Starts with an empty field list.
      def initialize
        @fields = []
      end

      # The fields whose key? is truthy.
      def keys
        fields.select(&:key?)
      end

      # YAML for the list of each field's to_hash.
      def to_yaml
        field_hashes = fields.map(&:to_hash)
        field_hashes.to_yaml
      end
    end
  end
end