pest 0.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.travis.yml +6 -0
- data/Gemfile +13 -0
- data/README.md +90 -0
- data/Rakefile +53 -0
- data/VERSION +1 -0
- data/lib/pest.rb +23 -0
- data/lib/pest/data_set.rb +62 -0
- data/lib/pest/data_set/hash.rb +70 -0
- data/lib/pest/data_set/narray.rb +98 -0
- data/lib/pest/estimator.rb +72 -0
- data/lib/pest/estimator/frequency.rb +61 -0
- data/lib/pest/function.rb +28 -0
- data/lib/pest/function/entropy.rb +35 -0
- data/lib/pest/function/probability.rb +41 -0
- data/lib/pest/variable.rb +34 -0
- data/lib/pest/version.rb +7 -0
- data/pest.gemspec +112 -0
- data/spec/pest/data_set/hash_spec.rb +108 -0
- data/spec/pest/data_set/narray_spec.rb +141 -0
- data/spec/pest/data_set_spec.rb +95 -0
- data/spec/pest/estimator/bernoulli_spec.rb +21 -0
- data/spec/pest/estimator/frequency_spec.rb +85 -0
- data/spec/pest/estimator/gaussian_spec.rb +21 -0
- data/spec/pest/estimator/multinomial_spec.rb +21 -0
- data/spec/pest/estimator/parzen_spec.rb +21 -0
- data/spec/pest/estimator/svd_spec.rb +21 -0
- data/spec/pest/estimator_spec.rb +74 -0
- data/spec/pest/function/entropy_spec.rb +105 -0
- data/spec/pest/function/probability_spec.rb +118 -0
- data/spec/pest/variable_spec.rb +73 -0
- data/spec/pest_spec.rb +4 -0
- data/spec/spec_helper.rb +14 -0
- metadata +321 -0
data/.travis.yml
ADDED
data/Gemfile
ADDED
data/README.md
ADDED
@@ -0,0 +1,90 @@
|
|
1
|
+
# Pest, a framework for Probability Estimation
|
2
|
+
|
3
|
+
[![Build Status](https://secure.travis-ci.org/kerinin/pest.png)](http://travis-ci.org/kerinin/pest)
|
4
|
+
|
5
|
+
|
6
|
+
Pest provides a unified framework for interacting with different probability
|
7
|
+
estimation models.
|
8
|
+
|
9
|
+
* Pest tries to be agnostic about the underlying data structures,
|
10
|
+
so changing libraries (GSL -> Hadoop) is as simple as using a different data source.
|
11
|
+
* Pest is designed to create estimators using subsets of larger data sources, and
|
12
|
+
transparently constructs estimators to facilitate dynamic querying
|
13
|
+
* Implementing custom estimation models is easy, and Pest implements some of the more
|
14
|
+
common ones for you.
|
15
|
+
|
16
|
+
Pest abstracts common statistical operations including:
|
17
|
+
|
18
|
+
* Marginal, Joint and Conditional point probability
|
19
|
+
* Interval and Cumulative probability
|
20
|
+
* Entropy, Cross Entropy, and Mutual Information
|
21
|
+
* Mean, Median, Mode, etc
|
22
|
+
|
23
|
+
|
24
|
+
## Ruby Install
|
25
|
+
|
26
|
+
``` sh
|
27
|
+
brew install gnuplot # This may take awhile...
|
28
|
+
cd /usr/local
|
29
|
+
git checkout 83ed494 /usr/local/Library/Formula/gsl.rb
|
30
|
+
brew install gsl # Forcing gsl v1.4
|
31
|
+
|
32
|
+
bundle install
|
33
|
+
```
|
34
|
+
|
35
|
+
## API
|
36
|
+
|
37
|
+
``` ruby
|
38
|
+
# Creating Datasets
|
39
|
+
test = Pest::DataSet::Hash.new hash # Creates a Hash dataset of observations from a hash
|
40
|
+
test = Pest::DataSet::Hash.new file # Creates a Hash dataset of observations from an IO (Marshalled)
|
41
|
+
train = Pest::DataSet::GSL.new file # Creates a GSL dataset from an IO instance
|
42
|
+
|
43
|
+
# DataSet Variables
|
44
|
+
test.variables # hash of Variable instances detected in observation set
|
45
|
+
test.v # alias of 'variables'
|
46
|
+
test.v[:foo] # a specific variable
|
47
|
+
test.v[:foo] = another_variable # explicit declaration
|
48
|
+
|
49
|
+
# Creating Estimators
|
50
|
+
e = Pest::Estimator::Set::Multinomial.new(test) # Creates a multinomial estimator for set o
|
51
|
+
e = Pest::Estimator::Discrete::Gaussian.new(file) # Creating an estimator with the DataSet API
|
52
|
+
|
53
|
+
# Descriptive Statistical Properties
|
54
|
+
e.mode(:foo) # Mode
|
55
|
+
e.mean(:foo) # Mean (discrete & continuous only)
|
56
|
+
e.median(:foo) # Median (discrete & continuous only)
|
57
|
+
# quantile?
|
58
|
+
# variance?
|
59
|
+
# deviation?
|
60
|
+
|
61
|
+
# Estimating Entropy (Set & Discrete only)
|
62
|
+
e.entropy(:foo) # Entropy of 'foo'
|
63
|
+
e.h(:foo, :bar) # Joint entropy of 'foo' AND 'bar'
|
64
|
+
e.h(:foo).given(:bar) # Cross entropy of 'foo' : 'bar'
|
65
|
+
e.mutual_information(:foo, :bar) # Mutual information of 'foo' and 'bar'
|
66
|
+
e.i(:foo, :bar) # Alias
|
67
|
+
|
68
|
+
# Estimating Point Probability (Set & Discrete only)
|
69
|
+
e.probability(o.variables[:foo]) # (Set/Discrete only) Estimate the probability of all values of 'foo'
|
70
|
+
e.p(:foo) # Same as above, tries to find a variable named 'foo'
|
71
|
+
e.p(:foo).in(test) # Estimate the probability of values in dataset 'test'
|
72
|
+
e.p(:foo).given(:bar).in(test) # Estimate the conditional foo | bar for the values in 'test'
|
73
|
+
e.p(:foo, :bar).in(test) # Estimate the joint probability foo AND bar
|
74
|
+
e.p(:foo, :bar).given(:baz, :qux).in(test) # More complex joint & conditional probabilities
|
75
|
+
e.p(:foo => 4, :bar => 2).given(:baz => 0) # Single prediction (implicitly creates dataset)
|
76
|
+
e.p(:foo).given(:bar).cache # Builds and persists the model for 'foo|bar'
|
77
|
+
e.p(:foo).given(:bar).cache('path.csv') # Persist to a specific path (defaults to tmp)
|
78
|
+
|
79
|
+
# Estimating Cumulative & Interval Probability (Discrete & Continuous only)
|
80
|
+
e.probability(:foo).greater_than(:bar).in(test)
|
81
|
+
e.p(:foo).greater_than(:bar).less_than(:baz).in(test)
|
82
|
+
e.p(:foo).gt(:bar).lt(:baz).given(:qux).in(test)
|
83
|
+
```
|
84
|
+
|
85
|
+
## Working Notes
|
86
|
+
|
87
|
+
Do we want variable equality to be name-based? It may make more sense to allow
|
88
|
+
variables named differently in different data sets to be equivalent. And how the
|
89
|
+
fuck do we handle variable type? I'm almost thinking we don't, and let the actual
|
90
|
+
estimators take care of type casting
|
data/Rakefile
ADDED
@@ -0,0 +1,53 @@
|
|
1
|
+
# encoding: utf-8

# Rake tasks for the pest gem: packaging (Jeweler), tests and RDoc.

require 'rubygems'
require 'bundler'
begin
  # Make the default and development gem groups loadable before anything
  # else is required.
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => e
  $stderr.puts e.message
  $stderr.puts "Run `bundle install` to install missing gems"
  exit e.status_code
end
require 'rake'

require 'jeweler'
Jeweler::Tasks.new do |gem|
  # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
  gem.name = "pest"
  gem.homepage = "http://github.com/kerinin/pest"
  gem.license = "MIT"
  gem.description = %q{Wrappers to facilitate different classes of probability estimators}
  gem.summary = %q{Probability Estimation}
  gem.email = "kerinin@gmail.com"
  gem.authors = ["Ryan Michael"]
  # dependencies defined in Gemfile
end
Jeweler::RubygemsDotOrgTasks.new

require 'rake/testtask'
# NOTE(review): this task looks for test/**/test_*.rb, but the gem's file list
# only shows a spec/ directory with *_spec.rb files, so `rake test` (and the
# default task) presumably runs nothing -- confirm and point it at the specs.
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = 'test/**/test_*.rb'
  test.verbose = true
end

# require 'rcov/rcovtask'
# Rcov::RcovTask.new do |test|
#   test.libs << 'test'
#   test.pattern = 'test/**/test_*.rb'
#   test.verbose = true
#   test.rcov_opts << '--exclude "gems/*"'
# end

task :default => :test

require 'rdoc/task'
Rake::RDocTask.new do |rdoc|
  # Strip the version so a trailing newline in VERSION does not end up in the
  # generated documentation title.
  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "pest #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.0.0
|
data/lib/pest.rb
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
$LOAD_PATH.unshift(File.dirname(__FILE__))
|
2
|
+
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), 'lib'))
|
3
|
+
|
4
|
+
require 'uuidtools'
|
5
|
+
require 'csv'
|
6
|
+
|
7
|
+
require "pest/version"
|
8
|
+
require "pest/variable"
|
9
|
+
|
10
|
+
require "pest/function"
|
11
|
+
require "pest/function/probability"
|
12
|
+
require "pest/function/entropy"
|
13
|
+
|
14
|
+
require "pest/data_set"
|
15
|
+
require "pest/data_set/hash"
|
16
|
+
require "pest/data_set/narray"
|
17
|
+
|
18
|
+
require "pest/estimator"
|
19
|
+
require "pest/estimator/frequency"
|
20
|
+
|
21
|
+
# Top-level namespace of the pest library; the files required above add their
# classes and modules under it.
module Pest
|
22
|
+
# NOTE(review): not referenced by any code visible here; presumably toggles
# persisting cached models to disk -- confirm against the estimators.
CACHE_TO_FILE = false
|
23
|
+
end
|
@@ -0,0 +1,62 @@
|
|
1
|
+
# Mixin describing the interface of a Pest data set.
#
# Including this module adds the instance methods below and extends the
# including class with ClassMethods (see +included+). Apart from +variables+
# and +variable_array+, every method defined here raises NotImplementedError
# and is meant to be overridden by the including class.
module Pest::DataSet
  # Hook invoked by Ruby when the module is included; extends +base+ with the
  # factory methods defined in ClassMethods.
  def self.included(base)
    base.extend(ClassMethods)
  end

  # Returns the hash of variables, creating an empty one on first access.
  def variables
    @variables ||= {}
  end

  # Returns the values of +variables+ as a sorted array.
  #
  # Reads through the +variables+ accessor instead of the raw instance
  # variable, so a data set relying on the lazy reader above yields [] rather
  # than raising NoMethodError on nil when nothing has been registered yet.
  def variable_array
    variables.values.sort
  end

  # Must be overridden by the including class.
  def to_hash(*args)
    raise NotImplementedError
  end

  # Must be overridden by the including class.
  def save(*args)
    raise NotImplementedError
  end

  # Must be overridden by the including class.
  def destroy
    raise NotImplementedError
  end

  # Must be overridden by the including class.
  def length
    raise NotImplementedError
  end

  # Class-level methods added to every class that includes Pest::DataSet.
  module ClassMethods
    # Builds a data set from +data_source+.
    #
    # The class of +data_source+ is looked up in +translators+; when an entry
    # exists, the named class method is called with the source. Otherwise the
    # source is converted with +to_hash+ and the lookup is retried on the
    # result.
    #
    # Returns whatever the translator method returns, or nil when neither the
    # source nor its hash form has a translator.
    # Raises RuntimeError when the source has no translator and does not
    # respond to +to_hash+.
    def from(data_source)
      # Try to translate the data source directly
      translator_method = translators[data_source.class]
      return send(translator_method, data_source) if translator_method

      # Try to translate via hash. Checking respond_to? (instead of rescuing
      # NoMethodError) keeps genuine NoMethodErrors raised inside to_hash
      # from being reported as an unrecognized source.
      raise "Unrecognized data source type" unless data_source.respond_to?(:to_hash)

      hash_data = data_source.to_hash
      from(hash_data) if hash_data && translators.has_key?(hash_data.class)
    end

    # Mapping of source class => name of the class method that translates it.
    # Must be overridden by the including class.
    def translators(*args)
      raise NotImplementedError
    end

    # Must be overridden by the including class.
    def from_file(*args)
      raise NotImplementedError
    end

    # Must be overridden by the including class.
    def from_hash(*args)
      raise NotImplementedError
    end
  end
end
|
@@ -0,0 +1,70 @@
|
|
1
|
+
require 'tempfile' # Tempfile is used by #save

# Data set that wraps a plain Ruby hash. Each key of the hash becomes a
# Pest::Variable (named after the key); each value is indexed by position.
class Pest::DataSet::Hash
  include Pest::DataSet

  # Source classes accepted by +from+ and the class method handling each.
  def self.translators
    {
      File   => :from_file,
      String => :from_file,
      Symbol => :from_file
    }
  end

  # Builds a data set from marshalled data.
  #
  # file - a String or Symbol naming a path (opened, read and closed here),
  #        or an already-open IO that Marshal can read from (left open).
  #
  # Raises RuntimeError if the unmarshalled object is not a ::Hash.
  #
  # NOTE(review): Marshal can instantiate arbitrary objects, so this must only
  # be given trusted files.
  def self.from_file(file)
    object = if file.kind_of?(String) || file.kind_of?(Symbol)
      # Symbol is registered in translators, so it is treated as a path too.
      # Block form closes the handle; binary mode because Marshal data is binary.
      File.open(file.to_s, 'rb') { |io| Marshal.restore(io) }
    else
      Marshal.restore(file)
    end

    raise "File does not seem to contain valid data" unless object.kind_of?(::Hash)

    new(object)
  end

  # NOTE(review): the :hash reader overrides Object#hash, so instances should
  # not be used as Hash keys or Set members. Kept for backward compatibility.
  attr_reader :variables, :hash

  # hash - the ::Hash to wrap; one Pest::Variable is registered per key.
  def initialize(hash)
    @hash = hash
    @variables = {}
    hash.keys.each do |name|
      @variables[name] = Pest::Variable.new(:name => name)
    end
  end

  # Returns the wrapped hash itself (not a copy).
  def to_hash
    @hash
  end

  # Returns an enumerable of per-index value vectors.
  #
  # variables - optional collection of hash keys selecting (and ordering) the
  #             values in each vector; defaults to all variables.
  def data_vectors(variables=nil)
    VectorEnumerable.new(self,variables)
  end

  # Length of the first value in the hash, or 0 when the hash is empty.
  def length
    first_column = @hash.values.first
    first_column ? first_column.length : 0
  end

  # Marshals the wrapped hash into +file+ and returns the (still open) handle.
  #
  # file - a String path (opened for binary writing), an IO-like object, or
  #        nil to write to a new Tempfile.
  def save(file=nil)
    file ||= Tempfile.new('pest_hash_dataset')
    file = File.open(file, 'wb') if file.kind_of?(String)
    Marshal.dump(@hash, file)
    # Flush so the data is on disk for anyone re-opening the returned
    # handle's path before the handle is closed.
    file.flush if file.respond_to?(:flush)
    file
  end

  # Enumerates a Hash data set index by index, yielding one array of values
  # per index.
  class VectorEnumerable
    include Enumerable

    # data_set  - the Pest::DataSet::Hash to read from.
    # variables - hash keyed by variable name, or a list of keys; defaults to
    #             the data set's own variables.
    def initialize(data_set,variables=nil)
      @data_set = data_set
      @variables = variables || @data_set.variables
    end

    # Returns the values at index +i+, one per selected key.
    def [](i)
      columns = @data_set.to_hash
      keys.map {|key| columns[key][i]}
    end

    # Yields the vector for every index of the first value in the hash.
    # Returns an enumerator when no block is given.
    def each
      return enum_for(:each) unless block_given?

      first_column = @data_set.to_hash.values.first
      return if first_column.nil? # empty data set: nothing to yield

      first_column.each_index {|i| yield self[i] }
    end

    private

    # Keys used to look values up in the wrapped hash. Accepts both shapes
    # @variables can take (a name => variable hash, or a plain list of keys)
    # so that #[] and #each agree with each other.
    def keys
      @variables.respond_to?(:keys) ? @variables.keys : @variables.to_a
    end
  end
end
|
@@ -0,0 +1,98 @@
|
|
1
|
+
require 'narray'
|
2
|
+
|
3
|
+
require 'tempfile' # Tempfile is used by #save

# Data set stored as an NMatrix, with the variable metadata kept alongside in
# +variables+.
class Pest::DataSet::NArray < NMatrix
  include Pest::DataSet

  # Source classes accepted by +from+ and the class method handling each.
  #
  # ::Hash is written with a leading :: on purpose: inside this class a bare
  # +Hash+ resolves to Pest::DataSet::Hash (a constant of the included
  # Pest::DataSet module), which from_hash cannot consume because it calls
  # +keys+ on its argument.
  def self.translators
    {
      ::Hash => :from_hash,
      File   => :from_file,
      String => :from_file,
      Symbol => :from_file
    }
  end

  # Builds a data set from a hash of key => values.
  # Keys may be Pest::Variable instances or names to wrap in one.
  def self.from_hash(hash)
    # Ensure the matrix is sorted the same as the variables
    data_set = to_na(hash.keys.sort.map {|key| hash[key]})
    data_set.variables = build_variables(hash.keys)
    data_set
  end

  # Builds a data set from marshalled [variables, matrix] data, i.e. the
  # format written by #save.
  #
  # file - a String or Symbol naming a path (opened, read and closed here),
  #        or an already-open IO that Marshal can read from (left open).
  #
  # Raises RuntimeError if the content cannot be turned into a data set.
  # Errors from opening the path itself are not wrapped.
  #
  # NOTE(review): Marshal can instantiate arbitrary objects, so this must only
  # be given trusted files.
  def self.from_file(file)
    return load_data_set(file) unless file.kind_of?(String) || file.kind_of?(Symbol)

    # Symbol is registered in translators, so it is treated as a path too.
    # Block form closes the handle; binary mode because Marshal data is binary.
    File.open(file.to_s, 'rb') {|io| load_data_set(io) }
  end

  # Builds a data set from a CSV file whose first row holds the variable names.
  #
  # file - path handed to CSV.read.
  # args - extra options for CSV.read; :converters is always forced to :all.
  def self.from_csv(file, args={})
    args = args.merge({:converters => :all})
    data = CSV.read(file, args)
    data_set = to_na(data[1..-1]).transpose
    data_set.variables = build_variables(data[0])
    data_set
  end

  # Builds the name => Pest::Variable hash for +keys+, wrapping every key that
  # is not already a Pest::Variable.
  def self.build_variables(keys)
    variables = {}
    keys.each do |key|
      variable = key.kind_of?(Pest::Variable) ? key : Pest::Variable.new(:name => key)
      variables[variable.name] = variable
    end
    variables
  end

  # Restores a [variables, matrix] pair from +io+ and turns it into a data
  # set; any failure is reported as invalid file content.
  def self.load_data_set(io)
    variables, matrix = Marshal.restore(io)
    data_set = to_na(matrix)
    data_set.variables = variables
    data_set
  rescue
    raise "File does not seem to contain valid data"
  end

  private_class_method :build_variables, :load_data_set

  attr_accessor :variables

  # Returns a hash of Pest::Variable => values.
  #
  # NOTE(review): slices are paired with variables in the insertion order of
  # +variables+, whereas from_hash lays the matrix out in sorted key order --
  # confirm the two orders agree for unsorted input.
  def to_hash
    hash = {}
    variables.values.each_with_index do |variable, i|
      hash[variable] = self[true,i].to_a[0]
    end
    hash
  end

  # variables: an array of variables for which each vector should contain values
  # Order is retained in the returned value
  def data_vectors(variables=nil)
    VectorEnumerable.new(self, variables)
  end

  # Size of the first matrix dimension.
  def length
    shape[0]
  end

  # Marshals [variables, matrix-as-array] into +file+, closes it and returns
  # the closed handle (so a default Tempfile stays reachable via its path).
  #
  # file - a String path (opened for binary writing), an IO-like object, or
  #        nil to write to a new Tempfile.
  def save(file=nil)
    file ||= Tempfile.new('pest_hash_dataset')
    file = File.open(file, 'wb') if file.kind_of?(String)
    Marshal.dump([variables,to_a], file)
    file.close
    file
  end

  # Enumerates the data set along its first dimension.
  class VectorEnumerable
    include Enumerable

    # data_set  - the Pest::DataSet::NArray to read from.
    # variables - an Enumerable of variables, translated here into their
    #             positions in data_set.variable_array; any other value is
    #             passed to the matrix index unchanged.
    def initialize(data_set, variables = true)
      @data_set = data_set
      @variables = variables
      if @variables.kind_of?(Enumerable)
        @variables = variables.map {|v| @data_set.variable_array.index(v)}
      end
    end

    # Returns the transposed slice of the matrix at position +i+ for the
    # selected variables.
    def [](i)
      @data_set[i,@variables].transpose
    end

    # Yields one vector per position along the first dimension.
    def each
      @data_set.shape[0].times do |i|
        yield Array(self[i]).first
      end
    end
  end
end
|
@@ -0,0 +1,72 @@
|
|
1
|
+
require 'set' # ::Set is used by DistributionList

# Mixin shared by probability estimators. The including class is expected to
# provide +distribution_class+, which DistributionList uses to build
# distributions on demand.
module Pest::Estimator
  attr_accessor :data

  # data - the data set the estimator works on (optional).
  def initialize(data=nil)
    @data = data
  end

  # Variables of the underlying data set, or an empty hash without data.
  def variables
    @data.nil? ? {} : @data.variables
  end

  # Lazily created cache of distributions, keyed by variable set.
  def distributions
    @distributions ||= DistributionList.new(self)
  end

  # Resolves +arg+ to one of the estimator's variables.
  #
  # arg - a Pest::Variable, or a String/Symbol looked up in +variables+.
  #
  # Returns the resolved variable.
  # Raises ArgumentError if +arg+ is of another type or the resolved variable
  # is not among +variables+.
  def to_variable(arg)
    variable = case arg
    when Pest::Variable
      arg
    when String, Symbol
      variables[arg] || Pest::Variable.new(:name => arg)
    end
    unless variables.values.include?(variable)
      raise ArgumentError, "unknown variable: #{arg.inspect}"
    end
    variable
  end

  # Interface of a distribution over a set of an estimator's variables.
  module Distribution
    attr_reader :variables

    # estimator - the owning estimator.
    # variables - the collection of variables the distribution covers.
    def initialize(estimator, variables)
      @estimator = estimator
      @variables = variables
    end

    # The covered variables as a sorted array.
    def variable_array
      variables.to_a.sort
    end

    # Must be overridden by the including class.
    def probability
      raise NotImplementedError
    end
  end

  # Hash of variable set => distribution that creates missing entries on
  # lookup.
  class DistributionList < Hash
    def initialize(estimator)
      @estimator = estimator
    end

    # Normalises +args+ into a new ::Set of the estimator's variables.
    #
    # args - a ::Set, an Array (whose members may be single items, arrays or
    #        sets) or a single item.
    #
    # The caller's collection is never modified. Raises ArgumentError (from
    # to_variable) when a member cannot be resolved.
    def parse_args(args)
      members = if args.kind_of? Array
        if args.any? {|arg| arg.kind_of?(::Set)}
          # Array(el) lets sets and single items be mixed in one call
          args.inject(::Set.new) {|merged, el| merged + Array(el)}
        else
          args.flatten.to_set
        end
      elsif args.kind_of? ::Set
        args
      else
        Array(args).to_set
      end
      # map + to_set (rather than map!) so a Set passed in by the caller is
      # left untouched
      members.map {|arg| @estimator.to_variable(arg) }.to_set
    end

    # Returns the distribution for the given variables, building it with the
    # estimator's distribution_class on first access.
    def [](*args)
      set = parse_args(args)
      unless has_key? set
        self[set] = @estimator.distribution_class.new(@estimator, set)
      end
      super(set)
    end
  end
end
|