data_kit 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,121 @@
1
+ require "rcsv/rcsv"
2
+ require "rcsv/version"
3
+
4
+ require "stringio"
5
+
6
+ #
7
+ # This is a temporary monkey patch to Rcsv.parse
8
+ # to silence warnings in Ruby 2 about #lines being deprecated
9
+ #
10
+
11
+ class Rcsv
12
+ def self.parse(csv_data, options = {}, &block)
13
+ options[:header] ||= :use
14
+ raw_options = {}
15
+
16
+ raw_options[:col_sep] = options[:column_separator] && options[:column_separator][0] || ','
17
+ raw_options[:offset_rows] = options[:offset_rows] || 0
18
+ raw_options[:nostrict] = options[:nostrict]
19
+ raw_options[:parse_empty_fields_as] = options[:parse_empty_fields_as]
20
+ raw_options[:buffer_size] = options[:buffer_size] || 1024 * 1024 # 1 MiB
21
+
22
+ if csv_data.is_a?(String)
23
+ csv_data = StringIO.new(csv_data)
24
+ elsif !(csv_data.respond_to?(:lines) && csv_data.respond_to?(:read))
25
+ inspected_csv_data = csv_data.inspect
26
+ raise ParseError.new("Supplied CSV object #{inspected_csv_data[0..127]}#{inspected_csv_data.size > 128 ? '...' : ''} is neither String nor looks like IO object.")
27
+ end
28
+
29
+ if csv_data.respond_to?(:external_encoding)
30
+ raw_options[:output_encoding] = csv_data.external_encoding.to_s
31
+ end
32
+
33
+ initial_position = csv_data.pos
34
+
35
+ case options[:header]
36
+ when :use
37
+ header = self.raw_parse(StringIO.new(csv_data.each_line.first), raw_options).first
38
+ raw_options[:offset_rows] += 1
39
+ when :skip
40
+ header = (0..(csv_data.each_line.first.split(raw_options[:col_sep]).count)).to_a
41
+ raw_options[:offset_rows] += 1
42
+ when :none
43
+ header = (0..(csv_data.each_line.first.split(raw_options[:col_sep]).count)).to_a
44
+ end
45
+
46
+ raw_options[:row_as_hash] = options[:row_as_hash] # Setting after header parsing
47
+
48
+ if options[:columns]
49
+ only_rows = []
50
+ except_rows = []
51
+ row_defaults = []
52
+ column_names = []
53
+ row_conversions = ''
54
+
55
+ header.each do |column_header|
56
+ column_options = options[:columns][column_header]
57
+ if column_options
58
+ if (options[:row_as_hash])
59
+ column_names << (column_options[:alias] || column_header)
60
+ end
61
+
62
+ row_defaults << column_options[:default] || nil
63
+
64
+ only_rows << case column_options[:match]
65
+ when Array
66
+ column_options[:match]
67
+ when nil
68
+ nil
69
+ else
70
+ [column_options[:match]]
71
+ end
72
+
73
+ except_rows << case column_options[:not_match]
74
+ when Array
75
+ column_options[:not_match]
76
+ when nil
77
+ nil
78
+ else
79
+ [column_options[:not_match]]
80
+ end
81
+
82
+ row_conversions << case column_options[:type]
83
+ when :int
84
+ 'i'
85
+ when :float
86
+ 'f'
87
+ when :string
88
+ 's'
89
+ when :bool
90
+ 'b'
91
+ when nil
92
+ 's' # strings by default
93
+ else
94
+ fail "Unknown column type #{column_options[:type].inspect}."
95
+ end
96
+ elsif options[:only_listed_columns]
97
+ column_names << nil
98
+ row_defaults << nil
99
+ only_rows << nil
100
+ except_rows << nil
101
+ row_conversions << ' '
102
+ else
103
+ column_names << column_header
104
+ row_defaults << nil
105
+ only_rows << nil
106
+ except_rows << nil
107
+ row_conversions << 's'
108
+ end
109
+ end
110
+
111
+ raw_options[:column_names] = column_names if options[:row_as_hash]
112
+ raw_options[:only_rows] = only_rows unless only_rows.compact.empty?
113
+ raw_options[:except_rows] = except_rows unless except_rows.compact.empty?
114
+ raw_options[:row_defaults] = row_defaults unless row_defaults.compact.empty?
115
+ raw_options[:row_conversions] = row_conversions
116
+ end
117
+
118
+ csv_data.pos = initial_position
119
+ return self.raw_parse(csv_data, raw_options, &block)
120
+ end
121
+ end
@@ -0,0 +1,3 @@
1
+ module DataKit
2
+ VERSION = "0.0.2" # version string of the data_kit gem
3
+ end
data/lib/data_kit.rb ADDED
@@ -0,0 +1,20 @@
1
+ require "data_kit/version"
2
+
3
+ # Data I/O
4
+ require 'data_kit/csv/parser'
5
+ require 'data_kit/csv/analyzer'
6
+ require 'data_kit/csv/analysis'
7
+ require 'data_kit/csv/converter'
8
+
9
+ # Data Conversion
10
+ require 'data_kit/converters/number'
11
+ require 'data_kit/converters/integer'
12
+ require 'data_kit/converters/boolean'
13
+ require 'data_kit/converters/date_time'
14
+
15
+ # Datasets
16
+ require 'data_kit/dataset/field'
17
+ require 'data_kit/dataset/schema'
18
+
19
+ # Patches / Fixes
20
+ require 'data_kit/patches/rcsv'
@@ -0,0 +1,18 @@
1
+ require 'spec_helper'
2
+
3
+ describe DataKit::Converters::Boolean do
4
+ it "should match values" do
5
+ ['true', 't', 'false', 'f'].each do |testcase|
6
+ DataKit::Converters::Boolean.match?(testcase).should == true
7
+ end
8
+ end
9
+
10
+ it "should convert value it can match" do
11
+ {
12
+ "true" => true, "t" => true,
13
+ "false" => false, "f" => false
14
+ }.each do |testcase, result|
15
+ DataKit::Converters::Boolean.convert(testcase).should == result
16
+ end
17
+ end
18
+ end
@@ -0,0 +1,30 @@
1
+ require 'spec_helper'
2
+
3
+ describe DataKit::Converters::DateTime do
4
+ it "should match a date" do
5
+ DataKit::Converters::DateTime.match?("1/1/00").should == true
6
+ DataKit::Converters::DateTime.match?("2010-01-01").should == true
7
+ DataKit::Converters::DateTime.match?("2010-01-01 12:00:00").should == true
8
+ DataKit::Converters::DateTime.match?("2000-01-01T00:00:00Z").should == true
9
+ DataKit::Converters::DateTime.match?("2000-01-01T00:00:00+00:00").should == true
10
+ DataKit::Converters::DateTime.match?("10/16/10 18:24").should == true
11
+ DataKit::Converters::DateTime.match?("10/16/10 1:24:15").should == true
12
+ end
13
+
14
+ it "should convert dates it can match" do
15
+ {
16
+ '1/1/00' => "2000-01-01 00:00:00",
17
+ '2010-01-01' => '2010-01-01 00:00:00',
18
+ '2010-01-01 12:00:00' => '2010-01-01 12:00:00',
19
+ '2000-01-01T00:00:00' => '2000-01-01 00:00:00',
20
+ '2000-01-01T00:00:00Z' => '2000-01-01 00:00:00',
21
+ '2000-02-01T00:00:00+00:00' => '2000-02-01 00:00:00',
22
+ '10/1/2012 10:27:45.000000 AM' => '2012-10-01 10:27:45',
23
+ '10/1/2012 1:27:45.000000 AM' => '2012-10-01 01:27:45',
24
+ '1/1/2012 1:27:45.000000 AM' => '2012-01-01 01:27:45',
25
+ "10/16/10 18:24" => '2010-10-16 18:24:00'
26
+ }.each do |testcase, result|
27
+ DataKit::Converters::DateTime.convert(testcase).strftime("%Y-%m-%d %H:%M:%S").should == result
28
+ end
29
+ end
30
+ end
@@ -0,0 +1,20 @@
1
+ require 'spec_helper'
2
+
3
+ describe DataKit::Converters::Integer do
4
+ it "should match values" do
5
+ ["100", "-100", "1,000", "$1,000"].each do |integer|
6
+ reformatted = DataKit::Converters::Integer.reformat(integer)
7
+ DataKit::Converters::Integer.match?(reformatted).should == true
8
+ end
9
+ end
10
+
11
+ it "should convert value it can match" do
12
+ {
13
+ "100" => 100, "-100" => -100,
14
+ "1,000" => 1000, "$1,000" => 1000
15
+ }.each do |testcase, result|
16
+ reformatted = DataKit::Converters::Integer.reformat(testcase)
17
+ DataKit::Converters::Integer.convert(reformatted).should == result
18
+ end
19
+ end
20
+ end
@@ -0,0 +1,20 @@
1
+ require 'spec_helper'
2
+
3
+ describe DataKit::Converters::Integer do
4
+ it "should match values" do
5
+ ["100.0", "-100.5", "-1,000.00", "5.6E11", "$1,000.21"].each do |number|
6
+ reformatted = DataKit::Converters::Number.reformat(number)
7
+ DataKit::Converters::Number.match?(reformatted).should == true
8
+ end
9
+ end
10
+
11
+ it "should convert value it can match" do
12
+ {
13
+ "100.0" => 100.0, "-100.5" => -100.5,
14
+ "-1,000.00" => -1000.00, "5.6E11" => 5.6E11, "$1,000.21" => 1000.21
15
+ }.each do |testcase, result|
16
+ reformatted = DataKit::Converters::Number.reformat(testcase)
17
+ DataKit::Converters::Number.convert(reformatted).should == result
18
+ end
19
+ end
20
+ end
@@ -0,0 +1,55 @@
1
+ require 'spec_helper'
2
+
3
+ describe DataKit::CSV::Analysis do
4
+ it "should insert a row for analysis" do
5
+ analysis = DataKit::CSV::Analysis.new(['field1'])
6
+
7
+ analysis.insert 'field1', '1.0'
8
+ analysis.insert 'field1', '2.0'
9
+
10
+ analysis.type_list('field1').should == [:number]
11
+ end
12
+
13
+ it "should know the number of rows with a particular type" do
14
+ analysis = DataKit::CSV::Analysis.new(['field1'])
15
+
16
+ analysis.insert 'field1', '1.0'
17
+ analysis.insert 'field1', '2.0'
18
+
19
+ analysis.type_count('field1', :number).should == 2
20
+ end
21
+
22
+ it "should determine the type of a field" do
23
+ analysis = DataKit::CSV::Analysis.new(['field1', 'field2'])
24
+
25
+ analysis.insert 'field1', '1.0'
26
+ analysis.insert 'field1', '2.0'
27
+ analysis.insert 'field2', 'str'
28
+ analysis.insert 'field2', 'str2'
29
+
30
+ analysis.type?('field1').should == :number
31
+ analysis.type?('field2').should == :string
32
+ end
33
+
34
+ it "should infer a string type if there non-numeric mixed types" do
35
+ analysis = DataKit::CSV::Analysis.new(['field1', 'field2'])
36
+
37
+ analysis.insert 'field1', '1.0'
38
+ analysis.insert 'field1', '2.0'
39
+ analysis.insert 'field2', '2.0'
40
+ analysis.insert 'field2', 'str2'
41
+
42
+ analysis.type?('field1').should == :number
43
+ analysis.type?('field2').should == :string
44
+ end
45
+
46
+ it "should infer a number type if there are mixed numeric types" do
47
+ analysis = DataKit::CSV::Analysis.new(['field1'])
48
+
49
+ analysis.insert 'field1', '1.0'
50
+ analysis.insert 'field1', '20'
51
+ analysis.insert 'field1', nil
52
+
53
+ analysis.type?('field1').should == :number
54
+ end
55
+ end
@@ -0,0 +1,56 @@
1
+ require 'spec_helper'
2
+
3
+ describe DataKit::CSV::Analyzer do
4
+ let(:path) {
5
+ data_path('standard.csv')
6
+ }
7
+
8
+ let(:csv) {
9
+ DataKit::CSV::Parser.new(path)
10
+ }
11
+
12
+ let(:iocsv) {
13
+ DataKit::CSV::Parser.new(File.open(path))
14
+ }
15
+
16
+ it "should initialize" do
17
+ analyzer = DataKit::CSV::Analyzer.new(csv)
18
+
19
+ analyzer.csv.should == csv
20
+ analyzer.keys.should == []
21
+ analyzer.sample_rate.should == 0.1
22
+ end
23
+
24
+ it "should initialize schema with an IO" do
25
+ analyzer = DataKit::CSV::Analyzer.new(iocsv)
26
+
27
+ analyzer.csv.should == iocsv
28
+ analyzer.keys.should == []
29
+ analyzer.sample_rate.should == 0.1
30
+ end
31
+
32
+ it "should execute an analysis" do
33
+ analysis = DataKit::CSV::Analyzer.new(csv, :sample_rate => 0.5).execute
34
+
35
+ puts analysis.inspect
36
+
37
+ analysis.type?('id').should == :integer
38
+ analysis.type?('first_name').should == :string
39
+ analysis.type?('last_name').should == :string
40
+ analysis.type?('email').should == :string
41
+ analysis.type?('country').should == :string
42
+ analysis.type?('ip_address').should == :string
43
+ analysis.type?('amount').should == :number
44
+ analysis.type?('active').should == :boolean
45
+ analysis.type?('activated_at').should == :datetime
46
+ analysis.type?('address').should == :string
47
+
48
+ analysis.row_count.should == 10
49
+ analysis.sample_count.should be < 10
50
+ end
51
+
52
+ it "should calculate a sample_rate" do
53
+ DataKit::CSV::Analyzer.sample_rate(1024).should == 1
54
+ DataKit::CSV::Analyzer.sample_rate(2048 * 2048).should be < 1
55
+ end
56
+ end
@@ -0,0 +1,35 @@
1
+ require 'spec_helper'
2
+
3
+ describe DataKit::CSV::Converter do
4
+ let(:tmpdir) {
5
+ Dir.mktmpdir
6
+ }
7
+
8
+ let(:target) {
9
+ File.join(tmpdir, 'data.csv')
10
+ }
11
+
12
+ let(:csv) {
13
+ DataKit::CSV::Parser.new(data_path('standard.csv'))
14
+ }
15
+
16
+ it "should initialize and execute" do
17
+ analysis = DataKit::CSV::Analyzer.analyze(csv, :sample_rate => 1)
18
+ converter = DataKit::CSV::Converter.new(csv, analysis, target)
19
+
20
+ converter.execute
21
+
22
+ row_count = 0
23
+ CSV.open(target).each { |row| row_count += 1 }
24
+ row_count.should == 11
25
+ end
26
+
27
+ it "should convert using the convience method" do
28
+ analysis = DataKit::CSV::Analyzer.analyze(csv, :sample_rate => 1)
29
+ converter = DataKit::CSV::Converter.convert(csv, analysis, target)
30
+
31
+ row_count = 0
32
+ CSV.open(target).each { |row| row_count += 1 }
33
+ row_count.should == 11
34
+ end
35
+ end
@@ -0,0 +1,50 @@
1
+ require 'spec_helper'
2
+
3
+ describe DataKit::CSV::Parser do
4
+ let(:path) {
5
+ data_path('standard.csv')
6
+ }
7
+
8
+ let(:crlf_path) {
9
+ data_path('carriage_returns.csv')
10
+ }
11
+
12
+ it "should initialize" do
13
+ csv = DataKit::CSV::Parser.new(path)
14
+
15
+ csv.path.should == path
16
+ end
17
+
18
+ it "should enumerate rows with a string path" do
19
+ csv = DataKit::CSV::Parser.new(path)
20
+
21
+ count = 0
22
+ csv.each_row do |row|
23
+ count += 1
24
+ end
25
+
26
+ count.should == 10
27
+ end
28
+
29
+ it "should enumerate rows with an IO path" do
30
+ csv = DataKit::CSV::Parser.new(File.open(path))
31
+
32
+ count = 0
33
+ csv.each_row do |row|
34
+ count += 1
35
+ end
36
+
37
+ count.should == 10
38
+ end
39
+
40
+ it "should enumerate rows for lines separated by CRLF" do
41
+ csv = DataKit::CSV::Parser.new(File.open(crlf_path))
42
+
43
+ count = 0
44
+ csv.each_row do |row|
45
+ count += 1
46
+ end
47
+
48
+ count.should == 10
49
+ end
50
+ end
@@ -0,0 +1,95 @@
1
+ require 'spec_helper'
2
+
3
+ describe DataKit::Dataset::Field do
4
+ it "should initialize" do
5
+ field = DataKit::Dataset::Field.new('field_name')
6
+
7
+ field.name.should == 'field_name'
8
+ field.key?.should == false
9
+ field.type.should == :string
10
+ end
11
+
12
+ it "should serialize" do
13
+ field = DataKit::Dataset::Field.new('field_name')
14
+
15
+ field.to_hash.should == {
16
+ 'name' => 'field_name', 'type' => 'string', 'key' => false
17
+ }
18
+ end
19
+
20
+ it "should infer nil types" do
21
+ DataKit::Dataset::Field.type?(nil).should == :null
22
+ end
23
+
24
+ it "should infer integer types" do
25
+ DataKit::Dataset::Field.type?("100").should == :integer
26
+ DataKit::Dataset::Field.type?("-100").should == :integer
27
+ DataKit::Dataset::Field.type?("1,000").should == :integer
28
+ DataKit::Dataset::Field.type?("$1,000").should == :integer
29
+ end
30
+
31
+ it "should infer numeric types" do
32
+ DataKit::Dataset::Field.type?("100.0").should == :number
33
+ DataKit::Dataset::Field.type?("-100.5").should == :number
34
+ DataKit::Dataset::Field.type?("5.6E11").should == :number
35
+ DataKit::Dataset::Field.type?("-1,000.0").should == :number
36
+ DataKit::Dataset::Field.type?("$1,000.0").should == :number
37
+ end
38
+
39
+ it "should infer date types" do
40
+ DataKit::Dataset::Field.type?("2010-01-01").should == :datetime
41
+
42
+ # Excel makes everyone sad
43
+ DataKit::Dataset::Field.type?("1/1/00").should == :datetime
44
+ end
45
+
46
+ it "should infer date/time types" do
47
+ DataKit::Dataset::Field.type?("2010-01-01 12:00:00").should == :datetime
48
+ end
49
+
50
+ it "should infer boolean types" do
51
+ DataKit::Dataset::Field.type?("true").should == :boolean
52
+ DataKit::Dataset::Field.type?("false").should == :boolean
53
+ end
54
+
55
+ it "should infer string types" do
56
+ DataKit::Dataset::Field.type?("true5").should == :string
57
+ DataKit::Dataset::Field.type?("my string").should == :string
58
+ end
59
+
60
+ it "should convert nil values" do
61
+ DataKit::Dataset::Field.convert(nil, :string).should == nil
62
+ end
63
+
64
+ it "should convert integer values" do
65
+ DataKit::Dataset::Field.convert("100", :integer).should == 100
66
+ DataKit::Dataset::Field.convert("-100", :integer).should == -100
67
+ DataKit::Dataset::Field.convert("1,000", :integer).should == 1_000
68
+ DataKit::Dataset::Field.convert("$1,000", :integer).should == 1_000
69
+ end
70
+
71
+ it "should convert numeric values" do
72
+ DataKit::Dataset::Field.convert("100.0", :number).should == 100.0
73
+ DataKit::Dataset::Field.convert("-100.0", :number).should == -100.0
74
+ DataKit::Dataset::Field.convert("-1,000.0", :number).should == -1_000.0
75
+ DataKit::Dataset::Field.convert("5E5", :number).should == 500000.0
76
+ DataKit::Dataset::Field.convert("$1,000.0", :number).should == 1000.0
77
+ end
78
+
79
+ it "should convert boolean values" do
80
+ DataKit::Dataset::Field.convert("true", :boolean).should == true
81
+ DataKit::Dataset::Field.convert("false", :boolean).should == false
82
+ end
83
+
84
+ it "should convert date values" do
85
+ DataKit::Dataset::Field.convert("2010-01-01", :datetime).strftime("%Y-%m-%d %H:%M:%S").should == '2010-01-01 00:00:00'
86
+ end
87
+
88
+ it "should convert date/time values" do
89
+ DataKit::Dataset::Field.convert("2010-01-01 12:00:00", :datetime).strftime("%Y-%m-%d %H:%M:%S").should == '2010-01-01 12:00:00'
90
+ end
91
+
92
+ it "should convert string values" do
93
+ DataKit::Dataset::Field.convert(500, :string).should == "500"
94
+ end
95
+ end
@@ -0,0 +1,22 @@
1
+ require 'spec_helper'
2
+
3
+ describe DataKit::Dataset::Schema do
4
+ it "should initialize" do
5
+ schema = DataKit::Dataset::Schema.new
6
+ schema.fields.should == []
7
+ end
8
+
9
+ it "should return a list of keys" do
10
+ schema = DataKit::Dataset::Schema.new
11
+ schema.fields << DataKit::Dataset::Field.new('field')
12
+ schema.fields << DataKit::Dataset::Field.new('field2', key: true)
13
+
14
+ schema.keys.length.should == 1
15
+ end
16
+
17
+ it "should serialize to yaml" do
18
+ schema = DataKit::Dataset::Schema.new
19
+ schema.fields << DataKit::Dataset::Field.new('field')
20
+ schema.to_yaml.should == schema.fields.collect(&:to_hash).to_yaml
21
+ end
22
+ end
@@ -0,0 +1 @@
1
+ draft_order,player,position,avg_draft_position,avg_bid_value
@@ -0,0 +1,11 @@
1
+ id,first_name,last_name,email,country,ip_address,amount,active,activated_at,address
2
+ 1,Todd,Hamilton,thamilton@plambee.edu,Norfolk Island,66.157.128.241,$7.72,true,10/3/1955,0582 Dwight Street
3
+ 2,Melissa,Kelly,mkelly@twinte.name,Singapore,204.221.167.233,$4.33,false,7/9/2013,06 Cardinal Crossing
4
+ 3,Donald,Wheeler,dwheeler@edgeify.mil,Madagascar,34.201.104.193,$2.92,true,12/12/1993,4 Del Sol Hill
5
+ 4,Ruby,Hall,rhall@cogilith.com,USSR,237.243.109.67,$8.27,false,12/15/1975,7 Ramsey Avenue
6
+ 5,Jessica,Cole,jcole@shuffletag.info,Cyprus,25.40.138.137,$8.16,false,6/2/1939,8142 Novick Hill
7
+ 6,Doris,Nelson,dnelson@zoombox.biz,Svalbard and Jan Mayen Islands,233.43.155.229,$6.26,false,5/23/1948,596 Veith Road
8
+ 7,Robert,Hansen,rhansen@miboo.edu,Ghana,41.194.33.211,$4.90,true,9/28/1999,529 Oak Pass
9
+ 8,Matthew,Freeman,mfreeman@midel.name,Sudan,53.186.162.65,$6.63,false,5/24/1996,70682 Declaration Center
10
+ 9,Julia,Nelson,jnelson@skajo.net,Vatican City State (Holy See),249.49.124.178,$9.80,true,1/26/1940,96 Hermina Lane
11
+ 10,Wanda,Palmer,wpalmer@ntags.biz,Indonesia,0.200.163.200,$5.89,false,1/1/1959,26837 Donald Trail
@@ -0,0 +1,18 @@
1
+ require "codeclimate-test-reporter"
2
+ CodeClimate::TestReporter.start
3
+
4
+ require 'rubygems'
5
+ require 'bundler/setup'
6
+
7
+ SimpleCov.start do
8
+ add_filter "/spec"
9
+ end
10
+
11
+ require 'tmpdir'
12
+ require 'data_kit'
13
+
14
+ RSpec.configure do |config|
15
+ def data_path(file)
16
+ File.join(File.dirname(__FILE__), 'fixtures', file)
17
+ end
18
+ end