pest 0.0.0 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +49 -47
- data/VERSION +1 -1
- data/lib/pest.rb +0 -1
- data/lib/pest/data_set.rb +45 -5
- data/lib/pest/data_set/hash.rb +70 -36
- data/lib/pest/data_set/narray.rb +104 -53
- data/lib/pest/estimator.rb +8 -28
- data/lib/pest/estimator/frequency.rb +9 -7
- data/lib/pest/function.rb +0 -7
- data/lib/pest/function/entropy.rb +5 -3
- data/lib/pest/function/probability.rb +45 -10
- data/pest.gemspec +15 -5
- data/spec/pest/data_set/hash_spec.rb +78 -63
- data/spec/pest/data_set/narray_spec.rb +107 -47
- data/spec/pest/data_set_spec.rb +36 -1
- data/spec/pest/estimator/frequency_spec.rb +31 -32
- data/spec/pest/estimator_spec.rb +13 -11
- data/spec/pest/function/entropy_spec.rb +11 -14
- data/spec/pest/function/probability_spec.rb +97 -28
- metadata +68 -6
- data/lib/pest/variable.rb +0 -34
- data/spec/pest/variable_spec.rb +0 -73
data/README.md
CHANGED
@@ -2,18 +2,10 @@
|
|
2
2
|
|
3
3
|
[](http://travis-ci.org/kerinin/pest)
|
4
4
|
|
5
|
+
**A concise API focused on painless investigation of data sets**
|
5
6
|
|
6
|
-
Pest provides a
|
7
|
-
estimation models.
|
8
|
-
|
9
|
-
* Pest tries to be agnostic about the underlying data data structures,
|
10
|
-
so changing libraries (GSL -> Hadoop) is as simple as using a different data source.
|
11
|
-
* Pest is designed to create estimators using subsets of larger data sources, and
|
12
|
-
transparently constructs estimators to facilitate dynamic querying
|
13
|
-
* Implementing custom estimation models is easy, and Pest implements some model
|
14
|
-
common ones for you.
|
15
|
-
|
16
|
-
Pest abstracts common statstical operations including:
|
7
|
+
Pest provides a framework for interacting with different probability
|
8
|
+
estimation models. Pest abstracts common statistical operations including:
|
17
9
|
|
18
10
|
* Marginal, Joint and Conditional point probability
|
19
11
|
* Interval and Cumulative probability
|
@@ -21,24 +13,34 @@ Pest abstracts common statstical operations including:
|
|
21
13
|
* Mean, Median, Mode, etc
|
22
14
|
|
23
15
|
|
24
|
-
|
16
|
+
**Scalability if you need it**
|
25
17
|
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
brew install gsl # Forcing gsl v1.4
|
18
|
+
Pest tries to be agnostic about the underlying data structures,
|
19
|
+
so changing libraries (NArray -> Hadoop) is as simple as using a different data source.
|
20
|
+
Pest is designed to create estimators using subsets of larger data sources, and
|
21
|
+
transparently constructs estimators to facilitate dynamic querying
|
31
22
|
|
32
|
-
|
33
|
-
|
23
|
+
|
24
|
+
**Code structure designed to be extended**
|
25
|
+
|
26
|
+
Implementing custom estimation models is easy, and Pest implements some
|
27
|
+
common ones for you.
|
28
|
+
|
29
|
+
|
30
|
+
## Install
|
31
|
+
|
32
|
+
Add it to your Gemfile and bundle
|
33
|
+
|
34
|
+
gem "pest"
|
35
|
+
|
36
|
+
bundle install
|
34
37
|
|
35
38
|
## API
|
36
39
|
|
37
40
|
``` ruby
|
38
41
|
# Creating Datasets
|
39
|
-
test = Pest::DataSet::Hash.
|
40
|
-
|
41
|
-
train = Pest::DataSet::GSL.new file # Creates a GSL dataset from and IO instance
|
42
|
+
test = Pest::DataSet::Hash.from_hash hash # Creates a Hash dataset of observations from a hash
|
43
|
+
train = Pest::DataSet::NArray.from_hash hash # Creates a NArray dataset
|
42
44
|
|
43
45
|
# DataSet Variables
|
44
46
|
test.variables # hash of Variable instances detected in observation set
|
@@ -47,13 +49,14 @@ test.v[:foo] # a specific variable
|
|
47
49
|
test.v[:foo] = another_variable # explicit declaration
|
48
50
|
|
49
51
|
# Creating Estimators
|
50
|
-
e = Pest::Estimator::
|
51
|
-
e = Pest::Estimator::
|
52
|
+
e = Pest::Estimator::Frequency.new(data) # Frequentist estimator - values treated as unordered set
|
53
|
+
e = Pest::Estimator::Multinomial.new(data) # Multinomial estimator
|
54
|
+
e = Pest::Estimator::Gaussian.new(data) # Gaussian mean/variance ML estimator
|
52
55
|
|
53
56
|
# Descriptive Statistical Properties
|
54
|
-
e.mode(:foo) # Mode
|
55
|
-
e.mean(:foo) # Mean (discrete & continuous only)
|
56
|
-
e.median(:foo) # Median (discrete & continuous only)
|
57
|
+
#e.mode(:foo) # Mode
|
58
|
+
#e.mean(:foo) # Mean (discrete & continuous only)
|
59
|
+
#e.median(:foo) # Median (discrete & continuous only)
|
57
60
|
# quantile?
|
58
61
|
# variance?
|
59
62
|
# deviation?
|
@@ -65,26 +68,25 @@ e.h(:foo).given(:bar) # Cross entropy of 'foo' :
|
|
65
68
|
e.mutual_information(:foo, :bar) # Mutual information of 'foo' and 'bar'
|
66
69
|
e.i(:foo, :bar) # Alias
|
67
70
|
|
68
|
-
# Estimating Point Probability
|
69
|
-
e.probability(
|
70
|
-
e.p(:foo)
|
71
|
-
e.p(:foo)
|
72
|
-
e.p(:foo).given(:bar)
|
73
|
-
e.p(:foo, :bar).
|
74
|
-
|
75
|
-
|
76
|
-
e.
|
77
|
-
e.
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
e.
|
71
|
+
# Estimating Point Probability
|
72
|
+
e.probability(e.variables[:foo] => 1) # Estimate the probability that foo=1
|
73
|
+
e.p(:foo => 1) # Same as above, tries to find a variable named 'foo'
|
74
|
+
e.p(:foo => 1, :bar => 2) # Estimate the probability that foo=1 AND bar=2
|
75
|
+
e.p(:foo => 1).given(:bar => 2) # Estimate the probability that foo=1 given bar=2
|
76
|
+
e.p(:foo => 1, :bar => 2).given(:baz => 3, :qux => 4) # Moar
|
77
|
+
|
78
|
+
# Batch Point Probability Estimation
|
79
|
+
e.batch_probability(:foo).in(test) # Estimate the probability of each value in test
|
80
|
+
e.batch_p(:foo, :bar).in(test) # Joint probability
|
81
|
+
e.batch_p(:foo).given(:bar).in(test) # Conditional probability
|
82
|
+
e.batch_p(:foo, :bar).given(:baz, :qux).in(test) # Moar
|
83
|
+
|
84
|
+
# Estimating Cumulative & Interval Probability
|
85
|
+
#e.probability(:foo).greater_than(:bar).in(test)
|
86
|
+
#e.p(:foo).greater_than(:bar).less_than(:baz).in(test)
|
87
|
+
#e.p(:foo).gt(:bar).lt(:baz).given(:qux).in(test)
|
83
88
|
```
|
84
89
|
|
85
|
-
##
|
90
|
+
## TODO
|
86
91
|
|
87
|
-
|
88
|
-
variables named differently in different data sets to be equivalent. And how the
|
89
|
-
fuck do we handle variable type? I'm almost thinking we don't, and let the actual
|
90
|
-
estimators take care of type casting
|
92
|
+
The builders should validate the variables they're given and raise errors if they're not part of the estimator's data.
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.
|
1
|
+
0.1.0
|
data/lib/pest.rb
CHANGED
data/lib/pest/data_set.rb
CHANGED
@@ -3,13 +3,15 @@ module Pest::DataSet
|
|
3
3
|
base.extend(ClassMethods)
|
4
4
|
end
|
5
5
|
|
6
|
-
|
7
|
-
|
8
|
-
|
6
|
+
include Enumerable
|
7
|
+
|
8
|
+
attr_accessor :variables, :data
|
9
9
|
|
10
|
-
def
|
11
|
-
@variables
|
10
|
+
def initialize(variables = Set.new, data = nil)
|
11
|
+
@variables = variables
|
12
|
+
@data = data
|
12
13
|
end
|
14
|
+
alias :v :variables
|
13
15
|
|
14
16
|
def to_hash(*args)
|
15
17
|
raise NotImplementedError
|
@@ -27,6 +29,44 @@ module Pest::DataSet
|
|
27
29
|
raise NotImplementedError
|
28
30
|
end
|
29
31
|
|
32
|
+
def ==(other)
|
33
|
+
variables == other.variables and data == other.data
|
34
|
+
end
|
35
|
+
alias :eql? :==
|
36
|
+
|
37
|
+
def [](*args)
|
38
|
+
raise NotImplementedError
|
39
|
+
end
|
40
|
+
|
41
|
+
def except(start, finish)
|
42
|
+
left = start > 0 ? self[0..start] : nil
|
43
|
+
right = finish < length - 1 ? self[finish..-1] : nil
|
44
|
+
case [left.nil?, right.nil?]
|
45
|
+
when [true, false]
|
46
|
+
right
|
47
|
+
when [false, true]
|
48
|
+
left
|
49
|
+
when [false, false]
|
50
|
+
right + left
|
51
|
+
end
|
52
|
+
end
|
53
|
+
|
54
|
+
def +(other)
|
55
|
+
raise NotImplementedError
|
56
|
+
end
|
57
|
+
|
58
|
+
def pick(*args)
|
59
|
+
raise NotImplementedError
|
60
|
+
end
|
61
|
+
|
62
|
+
def each(&block)
|
63
|
+
raise NotImplementedError
|
64
|
+
end
|
65
|
+
|
66
|
+
def merge(other)
|
67
|
+
raise NotImplementedError
|
68
|
+
end
|
69
|
+
|
30
70
|
module ClassMethods
|
31
71
|
def from(data_source)
|
32
72
|
# Try to translate the data source directly
|
data/lib/pest/data_set/hash.rb
CHANGED
@@ -3,68 +3,102 @@ class Pest::DataSet::Hash
|
|
3
3
|
|
4
4
|
def self.translators
|
5
5
|
{
|
6
|
-
File => :from_file,
|
7
6
|
String => :from_file,
|
8
7
|
Symbol => :from_file
|
9
8
|
}
|
10
9
|
end
|
11
10
|
|
12
|
-
def self.
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
if object.kind_of?(::Hash)
|
18
|
-
self.new(object)
|
19
|
-
else
|
20
|
-
raise "File does not seem to contain valid data"
|
21
|
-
end
|
11
|
+
def self.from_hash(hash)
|
12
|
+
data_set = new
|
13
|
+
data_set.variables += hash.keys
|
14
|
+
data_set.instance_variable_set(:@hash, hash)
|
15
|
+
data_set
|
22
16
|
end
|
23
17
|
|
24
18
|
attr_reader :variables, :hash
|
25
19
|
|
26
|
-
def initialize(
|
27
|
-
|
28
|
-
@
|
29
|
-
hash.keys().each do |name|
|
30
|
-
@variables[name] = Pest::Variable.new(:name => name)
|
31
|
-
end
|
20
|
+
def initialize(*args)
|
21
|
+
super *args
|
22
|
+
@hash = {}
|
32
23
|
end
|
33
24
|
|
34
|
-
def
|
35
|
-
|
25
|
+
def data
|
26
|
+
hash.values
|
36
27
|
end
|
37
28
|
|
38
|
-
def
|
39
|
-
|
29
|
+
def to_hash
|
30
|
+
hash
|
40
31
|
end
|
41
32
|
|
42
33
|
def length
|
43
34
|
@hash.values.first.length
|
44
35
|
end
|
45
36
|
|
46
|
-
def
|
47
|
-
|
48
|
-
|
49
|
-
|
37
|
+
def [](*args)
|
38
|
+
unless args.any?
|
39
|
+
raise ArgumentError, "Indices not specified"
|
40
|
+
end
|
41
|
+
|
42
|
+
args.map do |arg|
|
43
|
+
subset = self.class.new
|
44
|
+
subset.variables = self.variables
|
45
|
+
variables.each do |var|
|
46
|
+
subset.hash[var.name] = hash[var.name][arg]
|
47
|
+
end
|
48
|
+
subset
|
49
|
+
end.inject(:+)
|
50
|
+
|
50
51
|
end
|
51
52
|
|
52
|
-
|
53
|
-
|
53
|
+
def +(other)
|
54
|
+
unless other.variables == variables
|
55
|
+
raise ArgumentError, "DataSets have different variables"
|
56
|
+
end
|
54
57
|
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
+
union = self.class.new
|
59
|
+
union.variables = variables
|
60
|
+
variables.each do |var|
|
61
|
+
union.hash[var.name] = hash[var.name] + other.hash[var.name]
|
58
62
|
end
|
63
|
+
union
|
64
|
+
end
|
59
65
|
|
60
|
-
|
61
|
-
|
66
|
+
def pick(*args)
|
67
|
+
unless args.any?
|
68
|
+
raise ArgumentError, "You didn't specify any variables to pick"
|
62
69
|
end
|
63
70
|
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
71
|
+
subset = self.class.new
|
72
|
+
subset.variables += args
|
73
|
+
args.each do |var|
|
74
|
+
raise ArgumentError, "Dataset doesn't include '#{var}'" unless hash.has_key?(var)
|
75
|
+
subset.hash[var] = hash[var]
|
68
76
|
end
|
77
|
+
subset
|
78
|
+
end
|
79
|
+
|
80
|
+
def each(&block)
|
81
|
+
(0..length-1).to_a.each do |i| yield variables.map {|var| hash[var][i]} end
|
82
|
+
end
|
83
|
+
|
84
|
+
def dup
|
85
|
+
instance = self.class.new
|
86
|
+
instance.variables = variables.dup
|
87
|
+
instance.instance_variable_set(:@hash, hash.dup)
|
88
|
+
instance
|
89
|
+
end
|
90
|
+
|
91
|
+
def merge(other)
|
92
|
+
dup.merge!(other)
|
93
|
+
end
|
94
|
+
|
95
|
+
def merge!(other)
|
96
|
+
other = self.class.from_hash(other) if other.kind_of?(::Hash)
|
97
|
+
raise ArgumentError, "Lengths must be the same" if other.length != length
|
98
|
+
|
99
|
+
@variables += other.variables
|
100
|
+
hash.merge! other.hash
|
101
|
+
|
102
|
+
self
|
69
103
|
end
|
70
104
|
end
|
data/lib/pest/data_set/narray.rb
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
require 'narray'
|
2
2
|
|
3
|
-
class Pest::DataSet::NArray
|
3
|
+
class Pest::DataSet::NArray
|
4
4
|
include Pest::DataSet
|
5
5
|
|
6
6
|
def self.translators
|
@@ -13,86 +13,137 @@ class Pest::DataSet::NArray < NMatrix
|
|
13
13
|
end
|
14
14
|
|
15
15
|
def self.from_hash(hash)
|
16
|
-
data_set =
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
end
|
16
|
+
data_set = new(
|
17
|
+
hash.keys.to_set,
|
18
|
+
NMatrix.to_na(hash.values)
|
19
|
+
)
|
20
|
+
data_set.instance_variable_set(:@variable_array, hash.keys)
|
22
21
|
data_set
|
23
22
|
end
|
24
23
|
|
25
24
|
def self.from_file(file)
|
26
|
-
|
27
|
-
|
28
|
-
begin
|
29
|
-
variables, matrix = Marshal.restore(file)
|
30
|
-
data_set = to_na(matrix)
|
31
|
-
data_set.variables = variables
|
32
|
-
data_set
|
33
|
-
rescue
|
34
|
-
raise "File does not seem to contain valid data"
|
35
|
-
end
|
25
|
+
from_csv(file)
|
36
26
|
end
|
37
27
|
|
38
28
|
def self.from_csv(file, args={})
|
39
|
-
args =
|
40
|
-
|
41
|
-
|
42
|
-
data_set
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
29
|
+
args = {:col_sep => "\t", :headers => true, :converters => :all}.merge args
|
30
|
+
csv_data = CSV.read(file, args).map(&:to_hash)
|
31
|
+
|
32
|
+
data_set = new(
|
33
|
+
csv_data.first.keys.to_set,
|
34
|
+
NMatrix.to_na(csv_data.map(&:values)).transpose
|
35
|
+
)
|
36
|
+
# Ensure the ordering matches what's in the CSV
|
37
|
+
data_set.instance_variable_set(:@variable_array, csv_data.first.keys)
|
47
38
|
data_set
|
48
39
|
end
|
49
40
|
|
50
|
-
|
41
|
+
attr_reader :variable_array
|
42
|
+
|
43
|
+
def initialize(*args)
|
44
|
+
super *args
|
45
|
+
@variable_array = variables.to_a.sort
|
46
|
+
end
|
51
47
|
|
52
48
|
def to_hash
|
53
49
|
hash = {}
|
54
|
-
|
55
|
-
hash[
|
50
|
+
variable_array.each_index do |i|
|
51
|
+
hash[variable_array[i]] = data[true,i].to_a[0]
|
56
52
|
end
|
57
53
|
hash
|
58
54
|
end
|
59
55
|
|
60
|
-
|
61
|
-
|
62
|
-
def data_vectors(variables=nil)
|
63
|
-
VectorEnumerable.new(self, variables)
|
56
|
+
def length
|
57
|
+
data.shape[0]
|
64
58
|
end
|
65
59
|
|
66
|
-
|
67
|
-
|
60
|
+
# Return a subset of the data with the same variables,
|
61
|
+
# but only the vectors at the indices given in args
|
62
|
+
#
|
63
|
+
def [](*args)
|
64
|
+
unless args.any?
|
65
|
+
raise ArgumentError, "Indices not specified"
|
66
|
+
end
|
67
|
+
|
68
|
+
args.map do |arg|
|
69
|
+
subset = self.class.new
|
70
|
+
subset.variables = self.variables
|
71
|
+
subset.data = self.data[arg,true]
|
72
|
+
subset
|
73
|
+
end.inject(:+)
|
68
74
|
end
|
69
75
|
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
76
|
+
|
77
|
+
# Return the union of self and other
|
78
|
+
#
|
79
|
+
def +(other)
|
80
|
+
unless other.variables == variables
|
81
|
+
raise ArgumentError, "DataSets have different variables"
|
82
|
+
end
|
83
|
+
|
84
|
+
union = self.class.new
|
85
|
+
union.variables = variables
|
86
|
+
union.data = NMatrix[*(data.transpose.to_a + other.data.transpose.to_a)].transpose
|
87
|
+
union
|
75
88
|
end
|
76
89
|
|
77
|
-
|
78
|
-
|
90
|
+
# Return a subset of the data with the same vectors, but only
|
91
|
+
# the variables specified in args
|
92
|
+
#
|
93
|
+
def pick(*args)
|
94
|
+
unless args.any?
|
95
|
+
raise ArgumentError, "You didn't specify any variables to pick"
|
96
|
+
end
|
79
97
|
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
if @variables.kind_of?(Enumerable)
|
84
|
-
@variables = variables.map {|v| @data_set.variable_array.index(v)}
|
85
|
-
end
|
98
|
+
picked_indices = args.map do |variable|
|
99
|
+
raise ArgumentError, "Dataset doesn't include #{variable}" unless variables.include?(variable)
|
100
|
+
self.variable_array.index(variable)
|
86
101
|
end
|
87
102
|
|
88
|
-
|
89
|
-
|
103
|
+
self.class.new(args, self.data[true, picked_indices] )
|
104
|
+
end
|
105
|
+
|
106
|
+
def each(&block)
|
107
|
+
(0..length-1).to_a.each do |i|
|
108
|
+
yield data[i,true].transpose.to_a.first
|
90
109
|
end
|
110
|
+
end
|
91
111
|
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
112
|
+
def dup
|
113
|
+
instance = self.class.new( variables.dup, data.dup)
|
114
|
+
instance.instance_variable_set(:@variable_array, variable_array)
|
115
|
+
instance
|
116
|
+
end
|
117
|
+
|
118
|
+
def merge(other)
|
119
|
+
dup.merge!(other)
|
120
|
+
end
|
121
|
+
|
122
|
+
def merge!(other)
|
123
|
+
other = self.class.from_hash(other) if other.kind_of?(::Hash)
|
124
|
+
raise ArgumentError, "Lengths must be the same" if other.length != length
|
125
|
+
|
126
|
+
# Merge the variables. Existing variables should be updated,
|
127
|
+
# new variables should be appended to the hash in the same order
|
128
|
+
# as they appear in other
|
129
|
+
@variable_array += (other.variables - variables).to_a
|
130
|
+
@variables += other.variables
|
131
|
+
|
132
|
+
# Create the new data array, should be the size of the merged variables
|
133
|
+
# by the number of vectors
|
134
|
+
new_data = ::NArray.object(length, variables.length)
|
135
|
+
|
136
|
+
# Copy over the data from self (as if we had extended self.data to the
|
137
|
+
# right to allow for the new data)
|
138
|
+
new_data[true, 0..data.shape[1]-1] = data
|
139
|
+
|
140
|
+
# Merge in other's data, using the indices of other's variables as the
|
141
|
+
# slice keys
|
142
|
+
other.variables.each do |variable|
|
143
|
+
new_data[true, variable_array.index(variable)] = other.pick(variable).to_a.flatten
|
96
144
|
end
|
145
|
+
|
146
|
+
self.data = new_data
|
147
|
+
self
|
97
148
|
end
|
98
149
|
end
|