pest 0.0.0 → 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +49 -47
- data/VERSION +1 -1
- data/lib/pest.rb +0 -1
- data/lib/pest/data_set.rb +45 -5
- data/lib/pest/data_set/hash.rb +70 -36
- data/lib/pest/data_set/narray.rb +104 -53
- data/lib/pest/estimator.rb +8 -28
- data/lib/pest/estimator/frequency.rb +9 -7
- data/lib/pest/function.rb +0 -7
- data/lib/pest/function/entropy.rb +5 -3
- data/lib/pest/function/probability.rb +45 -10
- data/pest.gemspec +15 -5
- data/spec/pest/data_set/hash_spec.rb +78 -63
- data/spec/pest/data_set/narray_spec.rb +107 -47
- data/spec/pest/data_set_spec.rb +36 -1
- data/spec/pest/estimator/frequency_spec.rb +31 -32
- data/spec/pest/estimator_spec.rb +13 -11
- data/spec/pest/function/entropy_spec.rb +11 -14
- data/spec/pest/function/probability_spec.rb +97 -28
- metadata +68 -6
- data/lib/pest/variable.rb +0 -34
- data/spec/pest/variable_spec.rb +0 -73
data/README.md
CHANGED
@@ -2,18 +2,10 @@
|
|
2
2
|
|
3
3
|
[![Build Status](https://secure.travis-ci.org/kerinin/pest.png)](http://travis-ci.org/kerinin/pest)
|
4
4
|
|
5
|
+
**A concise API focused on painless investigation of data sets**
|
5
6
|
|
6
|
-
Pest provides a
|
7
|
-
estimation models.
|
8
|
-
|
9
|
-
* Pest tries to be agnostic about the underlying data structures,
|
10
|
-
so changing libraries (GSL -> Hadoop) is as simple as using a different data source.
|
11
|
-
* Pest is designed to create estimators using subsets of larger data sources, and
|
12
|
-
transparently constructs estimators to facilitate dynamic querying
|
13
|
-
* Implementing custom estimation models is easy, and Pest implements some model
|
14
|
-
common ones for you.
|
15
|
-
|
16
|
-
Pest abstracts common statistical operations including:
|
7
|
+
Pest provides a framework for interacting with different probability
|
8
|
+
estimation models. Pest abstracts common statistical operations including:
|
17
9
|
|
18
10
|
* Marginal, Joint and Conditional point probability
|
19
11
|
* Interval and Cumulative probability
|
@@ -21,24 +13,34 @@ Pest abstracts common statstical operations including:
|
|
21
13
|
* Mean, Median, Mode, etc
|
22
14
|
|
23
15
|
|
24
|
-
|
16
|
+
**Scalability if you need it**
|
25
17
|
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
brew install gsl # Forcing gsl v1.4
|
18
|
+
Pest tries to be agnostic about the underlying data structures,
|
19
|
+
so changing libraries (NArray -> Hadoop) is as simple as using a different data source.
|
20
|
+
Pest is designed to create estimators using subsets of larger data sources, and
|
21
|
+
transparently constructs estimators to facilitate dynamic querying
|
31
22
|
|
32
|
-
|
33
|
-
|
23
|
+
|
24
|
+
**Code structure designed to be extended**
|
25
|
+
|
26
|
+
Implementing custom estimation models is easy, and Pest implements some of the more
|
27
|
+
common ones for you.
|
28
|
+
|
29
|
+
|
30
|
+
## Install
|
31
|
+
|
32
|
+
Add it to your Gemfile and bundle
|
33
|
+
|
34
|
+
gem "pest"
|
35
|
+
|
36
|
+
bundle install
|
34
37
|
|
35
38
|
## API
|
36
39
|
|
37
40
|
``` ruby
|
38
41
|
# Creating Datasets
|
39
|
-
test = Pest::DataSet::Hash.
|
40
|
-
|
41
|
-
train = Pest::DataSet::GSL.new file # Creates a GSL dataset from an IO instance
|
42
|
+
test = Pest::DataSet::Hash.from_hash hash # Creates a Hash dataset of observations from a hash
|
43
|
+
train = Pest::DataSet::NArray.from_hash hash # Creates a NArray dataset
|
42
44
|
|
43
45
|
# DataSet Variables
|
44
46
|
test.variables # hash of Variable instances detected in observation set
|
@@ -47,13 +49,14 @@ test.v[:foo] # a specific variable
|
|
47
49
|
test.v[:foo] = another_variable # explicit declaration
|
48
50
|
|
49
51
|
# Creating Estimators
|
50
|
-
e = Pest::Estimator::
|
51
|
-
e = Pest::Estimator::
|
52
|
+
e = Pest::Estimator::Frequency.new(data) # Frequentist estimator - values treated as unordered set
|
53
|
+
e = Pest::Estimator::Multinomial.new(data) # Multinomial estimator
|
54
|
+
e = Pest::Estimator::Gaussian.new(data) # Gaussian mean/variance ML estimator
|
52
55
|
|
53
56
|
# Descriptive Statistical Properties
|
54
|
-
e.mode(:foo) # Mode
|
55
|
-
e.mean(:foo) # Mean (discrete & continuous only)
|
56
|
-
e.median(:foo) # Median (discrete & continuous only)
|
57
|
+
#e.mode(:foo) # Mode
|
58
|
+
#e.mean(:foo) # Mean (discrete & continuous only)
|
59
|
+
#e.median(:foo) # Median (discrete & continuous only)
|
57
60
|
# quantile?
|
58
61
|
# variance?
|
59
62
|
# deviation?
|
@@ -65,26 +68,25 @@ e.h(:foo).given(:bar) # Cross entropy of 'foo' :
|
|
65
68
|
e.mutual_information(:foo, :bar) # Mutual information of 'foo' and 'bar'
|
66
69
|
e.i(:foo, :bar) # Alias
|
67
70
|
|
68
|
-
# Estimating Point Probability
|
69
|
-
e.probability(
|
70
|
-
e.p(:foo)
|
71
|
-
e.p(:foo)
|
72
|
-
e.p(:foo).given(:bar)
|
73
|
-
e.p(:foo, :bar).
|
74
|
-
|
75
|
-
|
76
|
-
e.
|
77
|
-
e.
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
e.
|
71
|
+
# Estimating Point Probability
|
72
|
+
e.probability(e.variables[:foo] => 1) # Estimate the probability that foo=1
|
73
|
+
e.p(:foo => 1) # Same as above, tries to find a variable named 'foo'
|
74
|
+
e.p(:foo => 1, :bar => 2) # Estimate the probability that foo=1 AND bar=2
|
75
|
+
e.p(:foo => 1).given(:bar => 2) # Estimate the probability that foo=1 given bar=2
|
76
|
+
e.p(:foo => 1, :bar => 2).given(:baz => 3, :qux => 4) # Moar
|
77
|
+
|
78
|
+
# Batch Point Probability Estimation
|
79
|
+
e.batch_probability(:foo).in(test) # Estimate the probability of each value in test
|
80
|
+
e.batch_p(:foo, :bar).in(test) # Joint probability
|
81
|
+
e.batch_p(:foo).given(:bar).in(test) # Conditional probability
|
82
|
+
e.batch_p(:foo, :bar).given(:baz, :qux).in(test) # Moar
|
83
|
+
|
84
|
+
# Estimating Cumulative & Interval Probability
|
85
|
+
#e.probability(:foo).greater_than(:bar).in(test)
|
86
|
+
#e.p(:foo).greater_than(:bar).less_than(:baz).in(test)
|
87
|
+
#e.p(:foo).gt(:bar).lt(:baz).given(:qux).in(test)
|
83
88
|
```
|
84
89
|
|
85
|
-
##
|
90
|
+
## TODO
|
86
91
|
|
87
|
-
|
88
|
-
variables named differently in different data sets to be equivalent. And how the
|
89
|
-
fuck do we handle variable type? I'm almost thinking we don't, and let the actual
|
90
|
-
estimators take care of type casting
|
92
|
+
The builders should validate the variables they're given and throw errors if they're not part of the estimator's data.
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.
|
1
|
+
0.1.0
|
data/lib/pest.rb
CHANGED
data/lib/pest/data_set.rb
CHANGED
@@ -3,13 +3,15 @@ module Pest::DataSet
|
|
3
3
|
base.extend(ClassMethods)
|
4
4
|
end
|
5
5
|
|
6
|
-
|
7
|
-
|
8
|
-
|
6
|
+
include Enumerable
|
7
|
+
|
8
|
+
attr_accessor :variables, :data
|
9
9
|
|
10
|
-
def
|
11
|
-
@variables
|
10
|
+
def initialize(variables = Set.new, data = nil)
|
11
|
+
@variables = variables
|
12
|
+
@data = data
|
12
13
|
end
|
14
|
+
alias :v :variables
|
13
15
|
|
14
16
|
def to_hash(*args)
|
15
17
|
raise NotImplementedError
|
@@ -27,6 +29,44 @@ module Pest::DataSet
|
|
27
29
|
raise NotImplementedError
|
28
30
|
end
|
29
31
|
|
32
|
+
def ==(other)
|
33
|
+
variables == other.variables and data == other.data
|
34
|
+
end
|
35
|
+
alias :eql? :==
|
36
|
+
|
37
|
+
def [](*args)
|
38
|
+
raise NotImplementedError
|
39
|
+
end
|
40
|
+
|
41
|
+
def except(start, finish)
|
42
|
+
left = start > 0 ? self[0..start] : nil
|
43
|
+
right = finish < length - 1 ? self[finish..-1] : nil
|
44
|
+
case [left.nil?, right.nil?]
|
45
|
+
when [true, false]
|
46
|
+
right
|
47
|
+
when [false, true]
|
48
|
+
left
|
49
|
+
when [false, false]
|
50
|
+
right + left
|
51
|
+
end
|
52
|
+
end
|
53
|
+
|
54
|
+
def +(other)
|
55
|
+
raise NotImplementedError
|
56
|
+
end
|
57
|
+
|
58
|
+
def pick(*args)
|
59
|
+
raise NotImplementedError
|
60
|
+
end
|
61
|
+
|
62
|
+
def each(&block)
|
63
|
+
raise NotImplementedError
|
64
|
+
end
|
65
|
+
|
66
|
+
def merge(other)
|
67
|
+
raise NotImplementedError
|
68
|
+
end
|
69
|
+
|
30
70
|
module ClassMethods
|
31
71
|
def from(data_source)
|
32
72
|
# Try to translate the data source directly
|
data/lib/pest/data_set/hash.rb
CHANGED
@@ -3,68 +3,102 @@ class Pest::DataSet::Hash
|
|
3
3
|
|
4
4
|
def self.translators
|
5
5
|
{
|
6
|
-
File => :from_file,
|
7
6
|
String => :from_file,
|
8
7
|
Symbol => :from_file
|
9
8
|
}
|
10
9
|
end
|
11
10
|
|
12
|
-
def self.
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
if object.kind_of?(::Hash)
|
18
|
-
self.new(object)
|
19
|
-
else
|
20
|
-
raise "File does not seem to contain valid data"
|
21
|
-
end
|
11
|
+
def self.from_hash(hash)
|
12
|
+
data_set = new
|
13
|
+
data_set.variables += hash.keys
|
14
|
+
data_set.instance_variable_set(:@hash, hash)
|
15
|
+
data_set
|
22
16
|
end
|
23
17
|
|
24
18
|
attr_reader :variables, :hash
|
25
19
|
|
26
|
-
def initialize(
|
27
|
-
|
28
|
-
@
|
29
|
-
hash.keys().each do |name|
|
30
|
-
@variables[name] = Pest::Variable.new(:name => name)
|
31
|
-
end
|
20
|
+
def initialize(*args)
|
21
|
+
super *args
|
22
|
+
@hash = {}
|
32
23
|
end
|
33
24
|
|
34
|
-
def
|
35
|
-
|
25
|
+
def data
|
26
|
+
hash.values
|
36
27
|
end
|
37
28
|
|
38
|
-
def
|
39
|
-
|
29
|
+
def to_hash
|
30
|
+
hash
|
40
31
|
end
|
41
32
|
|
42
33
|
def length
|
43
34
|
@hash.values.first.length
|
44
35
|
end
|
45
36
|
|
46
|
-
def
|
47
|
-
|
48
|
-
|
49
|
-
|
37
|
+
def [](*args)
|
38
|
+
unless args.any?
|
39
|
+
raise ArgumentError, "Indices not specified"
|
40
|
+
end
|
41
|
+
|
42
|
+
args.map do |arg|
|
43
|
+
subset = self.class.new
|
44
|
+
subset.variables = self.variables
|
45
|
+
variables.each do |var|
|
46
|
+
subset.hash[var.name] = hash[var.name][arg]
|
47
|
+
end
|
48
|
+
subset
|
49
|
+
end.inject(:+)
|
50
|
+
|
50
51
|
end
|
51
52
|
|
52
|
-
|
53
|
-
|
53
|
+
def +(other)
|
54
|
+
unless other.variables == variables
|
55
|
+
raise ArgumentError, "DataSets have different variables"
|
56
|
+
end
|
54
57
|
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
+
union = self.class.new
|
59
|
+
union.variables = variables
|
60
|
+
variables.each do |var|
|
61
|
+
union.hash[var.name] = hash[var.name] + other.hash[var.name]
|
58
62
|
end
|
63
|
+
union
|
64
|
+
end
|
59
65
|
|
60
|
-
|
61
|
-
|
66
|
+
def pick(*args)
|
67
|
+
unless args.any?
|
68
|
+
raise ArgumentError, "You didn't specify any variables to pick"
|
62
69
|
end
|
63
70
|
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
71
|
+
subset = self.class.new
|
72
|
+
subset.variables += args
|
73
|
+
args.each do |var|
|
74
|
+
raise ArgumentError, "Dataset doesn't include '#{var}'" unless hash.has_key?(var)
|
75
|
+
subset.hash[var] = hash[var]
|
68
76
|
end
|
77
|
+
subset
|
78
|
+
end
|
79
|
+
|
80
|
+
def each(&block)
|
81
|
+
(0..length-1).to_a.each do |i| yield variables.map {|var| hash[var][i]} end
|
82
|
+
end
|
83
|
+
|
84
|
+
def dup
|
85
|
+
instance = self.class.new
|
86
|
+
instance.variables = variables.dup
|
87
|
+
instance.instance_variable_set(:@hash, hash.dup)
|
88
|
+
instance
|
89
|
+
end
|
90
|
+
|
91
|
+
def merge(other)
|
92
|
+
dup.merge!(other)
|
93
|
+
end
|
94
|
+
|
95
|
+
def merge!(other)
|
96
|
+
other = self.class.from_hash(other) if other.kind_of?(::Hash)
|
97
|
+
raise ArgumentError, "Lengths must be the same" if other.length != length
|
98
|
+
|
99
|
+
@variables += other.variables
|
100
|
+
hash.merge! other.hash
|
101
|
+
|
102
|
+
self
|
69
103
|
end
|
70
104
|
end
|
data/lib/pest/data_set/narray.rb
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
require 'narray'
|
2
2
|
|
3
|
-
class Pest::DataSet::NArray
|
3
|
+
class Pest::DataSet::NArray
|
4
4
|
include Pest::DataSet
|
5
5
|
|
6
6
|
def self.translators
|
@@ -13,86 +13,137 @@ class Pest::DataSet::NArray < NMatrix
|
|
13
13
|
end
|
14
14
|
|
15
15
|
def self.from_hash(hash)
|
16
|
-
data_set =
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
end
|
16
|
+
data_set = new(
|
17
|
+
hash.keys.to_set,
|
18
|
+
NMatrix.to_na(hash.values)
|
19
|
+
)
|
20
|
+
data_set.instance_variable_set(:@variable_array, hash.keys)
|
22
21
|
data_set
|
23
22
|
end
|
24
23
|
|
25
24
|
def self.from_file(file)
|
26
|
-
|
27
|
-
|
28
|
-
begin
|
29
|
-
variables, matrix = Marshal.restore(file)
|
30
|
-
data_set = to_na(matrix)
|
31
|
-
data_set.variables = variables
|
32
|
-
data_set
|
33
|
-
rescue
|
34
|
-
raise "File does not seem to contain valid data"
|
35
|
-
end
|
25
|
+
from_csv(file)
|
36
26
|
end
|
37
27
|
|
38
28
|
def self.from_csv(file, args={})
|
39
|
-
args =
|
40
|
-
|
41
|
-
|
42
|
-
data_set
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
29
|
+
args = {:col_sep => "\t", :headers => true, :converters => :all}.merge args
|
30
|
+
csv_data = CSV.read(file, args).map(&:to_hash)
|
31
|
+
|
32
|
+
data_set = new(
|
33
|
+
csv_data.first.keys.to_set,
|
34
|
+
NMatrix.to_na(csv_data.map(&:values)).transpose
|
35
|
+
)
|
36
|
+
# Ensure the ordering matches what's in the CSV
|
37
|
+
data_set.instance_variable_set(:@variable_array, csv_data.first.keys)
|
47
38
|
data_set
|
48
39
|
end
|
49
40
|
|
50
|
-
|
41
|
+
attr_reader :variable_array
|
42
|
+
|
43
|
+
def initialize(*args)
|
44
|
+
super *args
|
45
|
+
@variable_array = variables.to_a.sort
|
46
|
+
end
|
51
47
|
|
52
48
|
def to_hash
|
53
49
|
hash = {}
|
54
|
-
|
55
|
-
hash[
|
50
|
+
variable_array.each_index do |i|
|
51
|
+
hash[variable_array[i]] = data[true,i].to_a[0]
|
56
52
|
end
|
57
53
|
hash
|
58
54
|
end
|
59
55
|
|
60
|
-
|
61
|
-
|
62
|
-
def data_vectors(variables=nil)
|
63
|
-
VectorEnumerable.new(self, variables)
|
56
|
+
def length
|
57
|
+
data.shape[0]
|
64
58
|
end
|
65
59
|
|
66
|
-
|
67
|
-
|
60
|
+
# Return a subset of the data with the same variables,
|
61
|
+
# but only the vectors specified by i
|
62
|
+
#
|
63
|
+
def [](*args)
|
64
|
+
unless args.any?
|
65
|
+
raise ArgumentError, "Indices not specified"
|
66
|
+
end
|
67
|
+
|
68
|
+
args.map do |arg|
|
69
|
+
subset = self.class.new
|
70
|
+
subset.variables = self.variables
|
71
|
+
subset.data = self.data[arg,true]
|
72
|
+
subset
|
73
|
+
end.inject(:+)
|
68
74
|
end
|
69
75
|
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
76
|
+
|
77
|
+
# Return the union of self and other
|
78
|
+
#
|
79
|
+
def +(other)
|
80
|
+
unless other.variables == variables
|
81
|
+
raise ArgumentError, "DataSets have different variables"
|
82
|
+
end
|
83
|
+
|
84
|
+
union = self.class.new
|
85
|
+
union.variables = variables
|
86
|
+
union.data = NMatrix[*(data.transpose.to_a + other.data.transpose.to_a)].transpose
|
87
|
+
union
|
75
88
|
end
|
76
89
|
|
77
|
-
|
78
|
-
|
90
|
+
# Return a subset of the data with the same vectors, but only
|
91
|
+
# the variables specified in args
|
92
|
+
#
|
93
|
+
def pick(*args)
|
94
|
+
unless args.any?
|
95
|
+
raise ArgumentError, "You didn't specify any variables to pick"
|
96
|
+
end
|
79
97
|
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
if @variables.kind_of?(Enumerable)
|
84
|
-
@variables = variables.map {|v| @data_set.variable_array.index(v)}
|
85
|
-
end
|
98
|
+
picked_indices = args.map do |variable|
|
99
|
+
raise ArgumentError, "Dataset doesn't include #{variable}" unless variables.include?(variable)
|
100
|
+
self.variable_array.index(variable)
|
86
101
|
end
|
87
102
|
|
88
|
-
|
89
|
-
|
103
|
+
self.class.new(args, self.data[true, picked_indices] )
|
104
|
+
end
|
105
|
+
|
106
|
+
def each(&block)
|
107
|
+
(0..length-1).to_a.each do |i|
|
108
|
+
yield data[i,true].transpose.to_a.first
|
90
109
|
end
|
110
|
+
end
|
91
111
|
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
112
|
+
def dup
|
113
|
+
instance = self.class.new( variables.dup, data.dup)
|
114
|
+
instance.instance_variable_set(:@variable_array, variable_array)
|
115
|
+
instance
|
116
|
+
end
|
117
|
+
|
118
|
+
def merge(other)
|
119
|
+
dup.merge!(other)
|
120
|
+
end
|
121
|
+
|
122
|
+
def merge!(other)
|
123
|
+
other = self.class.from_hash(other) if other.kind_of?(::Hash)
|
124
|
+
raise ArgumentError, "Lengths must be the same" if other.length != length
|
125
|
+
|
126
|
+
# Merge the variables. Existing variables should be updated,
|
127
|
+
# new variables should be appended to the hash in the same order
|
128
|
+
# as they appear in other
|
129
|
+
@variable_array += (other.variables - variables).to_a
|
130
|
+
@variables += other.variables
|
131
|
+
|
132
|
+
# Create the new data array, should be the size of the merged variables
|
133
|
+
# by the number of vectors
|
134
|
+
new_data = ::NArray.object(length, variables.length)
|
135
|
+
|
136
|
+
# Copy over the data from self (as if we had extended self.data to the
|
137
|
+
# right to allow for the new data)
|
138
|
+
new_data[true, 0..data.shape[1]-1] = data
|
139
|
+
|
140
|
+
# Merge in other's data, using the indices of other's variables as the
|
141
|
+
# slice keys
|
142
|
+
other.variables.each do |variable|
|
143
|
+
new_data[true, variable_array.index(variable)] = other.pick(variable).to_a.flatten
|
96
144
|
end
|
145
|
+
|
146
|
+
self.data = new_data
|
147
|
+
self
|
97
148
|
end
|
98
149
|
end
|