pest 0.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.travis.yml +6 -0
- data/Gemfile +13 -0
- data/README.md +90 -0
- data/Rakefile +53 -0
- data/VERSION +1 -0
- data/lib/pest.rb +23 -0
- data/lib/pest/data_set.rb +62 -0
- data/lib/pest/data_set/hash.rb +70 -0
- data/lib/pest/data_set/narray.rb +98 -0
- data/lib/pest/estimator.rb +72 -0
- data/lib/pest/estimator/frequency.rb +61 -0
- data/lib/pest/function.rb +28 -0
- data/lib/pest/function/entropy.rb +35 -0
- data/lib/pest/function/probability.rb +41 -0
- data/lib/pest/variable.rb +34 -0
- data/lib/pest/version.rb +7 -0
- data/pest.gemspec +112 -0
- data/spec/pest/data_set/hash_spec.rb +108 -0
- data/spec/pest/data_set/narray_spec.rb +141 -0
- data/spec/pest/data_set_spec.rb +95 -0
- data/spec/pest/estimator/bernoulli_spec.rb +21 -0
- data/spec/pest/estimator/frequency_spec.rb +85 -0
- data/spec/pest/estimator/gaussian_spec.rb +21 -0
- data/spec/pest/estimator/multinomial_spec.rb +21 -0
- data/spec/pest/estimator/parzen_spec.rb +21 -0
- data/spec/pest/estimator/svd_spec.rb +21 -0
- data/spec/pest/estimator_spec.rb +74 -0
- data/spec/pest/function/entropy_spec.rb +105 -0
- data/spec/pest/function/probability_spec.rb +118 -0
- data/spec/pest/variable_spec.rb +73 -0
- data/spec/pest_spec.rb +4 -0
- data/spec/spec_helper.rb +14 -0
- metadata +321 -0
data/.travis.yml
ADDED
data/Gemfile
ADDED
data/README.md
ADDED
@@ -0,0 +1,90 @@
|
|
1
|
+
# Pest, a framework for Probability Estimation
|
2
|
+
|
3
|
+
[Build Status](http://travis-ci.org/kerinin/pest)
|
4
|
+
|
5
|
+
|
6
|
+
Pest provides a unified framework for interacting with different probability
|
7
|
+
estimation models.
|
8
|
+
|
9
|
+
* Pest tries to be agnostic about the underlying data structures,
|
10
|
+
so changing libraries (GSL -> Hadoop) is as simple as using a different data source.
|
11
|
+
* Pest is designed to create estimators using subsets of larger data sources, and
|
12
|
+
transparently constructs estimators to facilitate dynamic querying
|
13
|
+
* Implementing custom estimation models is easy, and Pest implements some
|
14
|
+
common ones for you.
|
15
|
+
|
16
|
+
Pest abstracts common statistical operations including:
|
17
|
+
|
18
|
+
* Marginal, Joint and Conditional point probability
|
19
|
+
* Interval and Cumulative probability
|
20
|
+
* Entropy, Cross Entropy, and Mutual Information
|
21
|
+
* Mean, Median, Mode, etc
|
22
|
+
|
23
|
+
|
24
|
+
## Ruby Install
|
25
|
+
|
26
|
+
``` sh
|
27
|
+
brew install gnuplot # This may take awhile...
|
28
|
+
cd /usr/local
|
29
|
+
git checkout 83ed494 /usr/local/Library/Formula/gsl.rb
|
30
|
+
brew install gsl # Forcing gsl v1.4
|
31
|
+
|
32
|
+
bundle install
|
33
|
+
```
|
34
|
+
|
35
|
+
## API
|
36
|
+
|
37
|
+
``` ruby
|
38
|
+
# Creating Datasets
|
39
|
+
test = Pest::DataSet::Hash.new hash # Creates a Hash dataset of observations from a hash
|
40
|
+
test = Pest::DataSet::Hash.new file # Creates a Hash dataset of observations from an IO (Marshalled)
|
41
|
+
train = Pest::DataSet::GSL.new file # Creates a GSL dataset from an IO instance
|
42
|
+
|
43
|
+
# DataSet Variables
|
44
|
+
test.variables # hash of Variable instances detected in observation set
|
45
|
+
test.v # alias of 'variables'
|
46
|
+
test.v[:foo] # a specific variable
|
47
|
+
test.v[:foo] = another_variable # explicit declaration
|
48
|
+
|
49
|
+
# Creating Estimators
|
50
|
+
e = Pest::Estimator::Set::Multinomial.new(test) # Creates a multinomial estimator for set o
|
51
|
+
e = Pest::Estimator::Discrete::Gaussian.new(file) # Creating an estimator with the DataSet API
|
52
|
+
|
53
|
+
# Descriptive Statistical Properties
|
54
|
+
e.mode(:foo) # Mode
|
55
|
+
e.mean(:foo) # Mean (discrete & continuous only)
|
56
|
+
e.median(:foo) # Median (discrete & continuous only)
|
57
|
+
# quantile?
|
58
|
+
# variance?
|
59
|
+
# deviation?
|
60
|
+
|
61
|
+
# Estimating Entropy (Set & Discrete only)
|
62
|
+
e.entropy(:foo) # Entropy of 'foo'
|
63
|
+
e.h(:foo, :bar) # Joint entropy of 'foo' AND 'bar'
|
64
|
+
e.h(:foo).given(:bar) # Cross entropy of 'foo' : 'bar'
|
65
|
+
e.mutual_information(:foo, :bar) # Mutual information of 'foo' and 'bar'
|
66
|
+
e.i(:foo, :bar) # Alias
|
67
|
+
|
68
|
+
# Estimating Point Probability (Set & Discrete only)
|
69
|
+
e.probability(o.variables[:foo]) # (Set/Discrete only) Estimate the probability of all values of 'foo'
|
70
|
+
e.p(:foo) # Same as above, tries to find a variable named 'foo'
|
71
|
+
e.p(:foo).in(test) # Estimate the probability of values in dataset 'test'
|
72
|
+
e.p(:foo).given(:bar).in(test) # Estimate the conditional foo | bar for the values in 'test'
|
73
|
+
e.p(:foo, :bar).in(test) # Estimate the joint probability foo AND bar
|
74
|
+
e.p(:foo, :bar).given(:baz, :qux).in(test) # More complex joint & conditional probabilities
|
75
|
+
e.p(:foo => 4, :bar => 2).given(:baz => 0) # Single prediction (implicitly creates dataset)
|
76
|
+
e.p(:foo).given(:bar).cache # Builds and persists the model for 'foo|bar'
|
77
|
+
e.p(:foo).given(:bar).cache('path.csv') # Persist to a specific path (defaults to tmp)
|
78
|
+
|
79
|
+
# Estimating Cumulative & Interval Probability (Discrete & Continuous only)
|
80
|
+
e.probability(:foo).greater_than(:bar).in(test)
|
81
|
+
e.p(:foo).greater_than(:bar).less_than(:baz).in(test)
|
82
|
+
e.p(:foo).gt(:bar).lt(:baz).given(:qux).in(test)
|
83
|
+
```
|
84
|
+
|
85
|
+
## Working Notes
|
86
|
+
|
87
|
+
Do we want variable equality to be name-based? It may make more sense to allow
|
88
|
+
variables named differently in different data sets to be equivalent. And how the
|
89
|
+
fuck do we handle variable type? I'm almost thinking we don't, and let the actual
|
90
|
+
estimators take care of type casting
|
data/Rakefile
ADDED
@@ -0,0 +1,53 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require 'rubygems'
|
4
|
+
require 'bundler'
|
5
|
+
begin
|
6
|
+
Bundler.setup(:default, :development)
|
7
|
+
rescue Bundler::BundlerError => e
|
8
|
+
$stderr.puts e.message
|
9
|
+
$stderr.puts "Run `bundle install` to install missing gems"
|
10
|
+
exit e.status_code
|
11
|
+
end
|
12
|
+
require 'rake'
|
13
|
+
|
14
|
+
require 'jeweler'
|
15
|
+
Jeweler::Tasks.new do |gem|
|
16
|
+
# gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
|
17
|
+
gem.name = "pest"
|
18
|
+
gem.homepage = "http://github.com/kerinin/pest"
|
19
|
+
gem.license = "MIT"
|
20
|
+
gem.description = %q{Wrappers to facilitate different classes of probability estimators}
|
21
|
+
gem.summary = %q{Probability Estimation}
|
22
|
+
gem.email = "kerinin@gmail.com"
|
23
|
+
gem.authors = ["Ryan Michael"]
|
24
|
+
# dependencies defined in Gemfile
|
25
|
+
end
|
26
|
+
Jeweler::RubygemsDotOrgTasks.new
|
27
|
+
|
28
|
+
require 'rake/testtask'
|
29
|
+
Rake::TestTask.new(:test) do |test|
|
30
|
+
test.libs << 'lib' << 'test'
|
31
|
+
test.pattern = 'test/**/test_*.rb'
|
32
|
+
test.verbose = true
|
33
|
+
end
|
34
|
+
|
35
|
+
# require 'rcov/rcovtask'
|
36
|
+
# Rcov::RcovTask.new do |test|
|
37
|
+
# test.libs << 'test'
|
38
|
+
# test.pattern = 'test/**/test_*.rb'
|
39
|
+
# test.verbose = true
|
40
|
+
# test.rcov_opts << '--exclude "gems/*"'
|
41
|
+
# end
|
42
|
+
|
43
|
+
task :default => :test
|
44
|
+
|
45
|
+
require 'rdoc/task'
|
46
|
+
Rake::RDocTask.new do |rdoc|
|
47
|
+
version = File.exist?('VERSION') ? File.read('VERSION') : ""
|
48
|
+
|
49
|
+
rdoc.rdoc_dir = 'rdoc'
|
50
|
+
rdoc.title = "pest #{version}"
|
51
|
+
rdoc.rdoc_files.include('README*')
|
52
|
+
rdoc.rdoc_files.include('lib/**/*.rb')
|
53
|
+
end
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.0.0
|
data/lib/pest.rb
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
$LOAD_PATH.unshift(File.dirname(__FILE__))
|
2
|
+
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), 'lib'))
|
3
|
+
|
4
|
+
require 'uuidtools'
|
5
|
+
require 'csv'
|
6
|
+
|
7
|
+
require "pest/version"
|
8
|
+
require "pest/variable"
|
9
|
+
|
10
|
+
require "pest/function"
|
11
|
+
require "pest/function/probability"
|
12
|
+
require "pest/function/entropy"
|
13
|
+
|
14
|
+
require "pest/data_set"
|
15
|
+
require "pest/data_set/hash"
|
16
|
+
require "pest/data_set/narray"
|
17
|
+
|
18
|
+
require "pest/estimator"
|
19
|
+
require "pest/estimator/frequency"
|
20
|
+
|
21
|
+
module Pest
|
22
|
+
CACHE_TO_FILE = false
|
23
|
+
end
|
@@ -0,0 +1,62 @@
|
|
1
|
+
# Shared interface for every Pest data set.
#
# Classes that include this module gain the instance methods below and, via
# the +included+ hook, the class-level factory helpers in ClassMethods.
# Methods that raise NotImplementedError are placeholders meant to be
# overridden by the including class.
module Pest
  module DataSet
    # Hook run by Ruby when the module is included; adds ClassMethods to +base+.
    def self.included(base)
      base.extend(ClassMethods)
    end

    # Returns the hash of variables for this data set, creating an empty one
    # on first access.
    def variables
      @variables ||= {}
    end

    # Returns the variables as a sorted array.
    #
    # Goes through #variables instead of reading @variables directly so the
    # lazily created default is used when nothing has been assigned yet.
    def variable_array
      variables.values.sort
    end

    # Placeholder: must be overridden by the including class.
    def to_hash(*args)
      raise NotImplementedError
    end

    # Placeholder: must be overridden by the including class.
    def save(*args)
      raise NotImplementedError
    end

    # Placeholder: must be overridden by the including class.
    def destroy
      raise NotImplementedError
    end

    # Placeholder: must be overridden by the including class.
    def length
      raise NotImplementedError
    end

    # Class-level helpers added to every class that includes Pest::DataSet.
    module ClassMethods
      # Builds a data set from +data_source+.
      #
      # The source's class is looked up in +translators+ first. When there is
      # no direct match, the source is converted with +to_hash+ and the lookup
      # is retried on the result.
      #
      # Raises RuntimeError ("Unrecognized data source type") when neither
      # lookup finds a translator, rather than silently returning nil.
      def from(data_source)
        direct = translators[data_source.class]
        return send(direct, data_source) if direct

        # Checked up front so that a NoMethodError raised *inside* a to_hash
        # implementation is not mistaken for "cannot be converted".
        unless data_source.respond_to?(:to_hash)
          raise "Unrecognized data source type"
        end

        hash_data = data_source.to_hash
        via_hash = hash_data && translators[hash_data.class]
        raise "Unrecognized data source type" unless via_hash

        send(via_hash, hash_data)
      end

      # Placeholder: expected to return a hash mapping source classes to the
      # name of the class method that translates them.
      def translators(*args)
        raise NotImplementedError
      end

      # Placeholder: must be overridden by the including class.
      def from_file(*args)
        raise NotImplementedError
      end

      # Placeholder: must be overridden by the including class.
      def from_hash(*args)
        raise NotImplementedError
      end
    end
  end
end
|
@@ -0,0 +1,70 @@
|
|
1
|
+
require 'tempfile'

module Pest
  module DataSet
    # Data set that keeps its observations in a plain Ruby hash whose keys name
    # the variables and whose values are arrays indexed by observation.
    class Hash
      include Pest::DataSet

      # Maps supported source classes to the class method that translates them.
      def self.translators
        {
          File   => :from_file,
          String => :from_file,
          Symbol => :from_file
        }
      end

      # Builds a data set from a marshalled hash.
      #
      # file - an IO to read from, or a String/Symbol path. Paths are opened
      #        in binary mode and closed again once read; an IO passed in by
      #        the caller is left open.
      #
      # Raises RuntimeError when the file does not contain a Hash.
      #
      # NOTE: Marshal can instantiate arbitrary objects, so only files from a
      # trusted origin should be loaded this way.
      def self.from_file(file)
        object =
          if file.kind_of?(String) || file.kind_of?(Symbol)
            File.open(file.to_s, 'rb') { |io| Marshal.restore(io) }
          else
            Marshal.restore(file)
          end

        raise "File does not seem to contain valid data" unless object.kind_of?(::Hash)

        new(object)
      end

      # NOTE: the +hash+ reader replaces Object#hash for instances of this
      # class, so they should not be used as keys of another Hash.
      attr_reader :variables, :hash

      # hash - the observations; one Pest::Variable is created per key.
      def initialize(hash)
        @hash = hash
        @variables = {}
        hash.keys.each do |name|
          @variables[name] = Pest::Variable.new(:name => name)
        end
      end

      # Returns the underlying hash of observations.
      def to_hash
        @hash
      end

      # Returns an enumerable of observation vectors.
      #
      # variables - optional list restricting (and ordering) the values in each
      #             vector; defaults to every variable of the data set.
      def data_vectors(variables=nil)
        VectorEnumerable.new(self, variables)
      end

      # Number of observations, taken from the first value array; 0 when the
      # data set has no variables at all.
      def length
        first = @hash.values.first
        first ? first.length : 0
      end

      # Marshals the observations to +file+ and returns the IO written to.
      #
      # file - a String path (opened in binary mode and closed after writing),
      #        an IO supplied by the caller (flushed but left open), or nil to
      #        write to a new Tempfile.
      def save(file=nil)
        file ||= Tempfile.new('pest_hash_dataset')

        if file.kind_of?(String)
          File.open(file, 'wb') { |io| Marshal.dump(@hash, io) }
        else
          Marshal.dump(@hash, file)
          # Push buffered bytes out so the data is readable via the file path.
          file.flush if file.respond_to?(:flush)
          file
        end
      end

      # Presents a Hash data set as a sequence of per-observation vectors.
      class VectorEnumerable
        include Enumerable

        # data_set  - the Pest::DataSet::Hash to read from.
        # variables - hash keys (or Pest::Variable instances) selecting the
        #             values of each vector; a hash of variables is reduced to
        #             its keys. Defaults to all variables of the data set.
        def initialize(data_set, variables=nil)
          @data_set = data_set
          @variables = variables || @data_set.variables
          selected = @variables.respond_to?(:keys) ? @variables.keys : Array(@variables)
          # Resolved once so that #[] and #each always agree on the lookup keys.
          @keys = selected.map { |var| key_for(var) }
        end

        # Returns the vector of values for observation +i+.
        def [](i)
          @keys.map { |key| @data_set.to_hash[key][i] }
        end

        # Yields one vector per observation; returns an Enumerator when called
        # without a block.
        def each
          return enum_for(:each) unless block_given?

          @data_set.length.times { |i| yield self[i] }
          self
        end

        private

        # Translates a requested variable into a key of the underlying hash.
        # Anything that already is a key is used as-is; a Pest::Variable is
        # looked up by its name.
        def key_for(var)
          return var if @data_set.to_hash.key?(var)

          var.kind_of?(Pest::Variable) ? var.name : var
        end
      end
    end
  end
end
|
@@ -0,0 +1,98 @@
|
|
1
|
+
require 'narray'
|
2
|
+
|
3
|
+
require 'tempfile'

# Data set stored as an NMatrix. #length and VectorEnumerable#each both walk
# the first dimension (shape[0]), yielding one vector per index.
class Pest::DataSet::NArray < NMatrix
  include Pest::DataSet

  # Maps supported source classes to the class method that translates them.
  #
  # ::Hash is written with a leading "::" on purpose: this class includes
  # Pest::DataSet, so a bare +Hash+ is resolved through that ancestor to
  # Pest::DataSet::Hash once it is loaded, and plain Ruby hashes would never
  # match.
  def self.translators
    {
      ::Hash => :from_hash,
      File   => :from_file,
      String => :from_file,
      Symbol => :from_file
    }
  end

  # Builds a data set from a hash of variable (or variable name) => values.
  def self.from_hash(hash)
    data_set = to_na(hash.keys.sort.map {|key| hash[key]}) # Ensure the matrix is sorted the same as the variables
    data_set.variables = {}
    hash.keys.each do |key|
      variable = key.kind_of?(Pest::Variable) ? key : Pest::Variable.new(:name => key)
      data_set.variables[variable.name] = variable
    end
    data_set
  end

  # Builds a data set from a file written by #save.
  #
  # file - an IO to read from, or a String/Symbol path. Paths are opened in
  #        binary mode and closed again afterwards; an IO passed in by the
  #        caller is left open.
  #
  # Raises RuntimeError when the content cannot be restored.
  #
  # NOTE: Marshal can instantiate arbitrary objects, so only files from a
  # trusted origin should be loaded this way.
  def self.from_file(file)
    opened_here = file.kind_of?(String) || file.kind_of?(Symbol)
    file = File.open(file.to_s, 'rb') if opened_here

    begin
      variables, matrix = Marshal.restore(file)
      data_set = to_na(matrix)
      data_set.variables = variables
      data_set
    rescue
      raise "File does not seem to contain valid data"
    ensure
      file.close if opened_here
    end
  end

  # Builds a data set from a CSV file whose first row names the variables.
  #
  # args - options handed to CSV.read; :converters is always set to :all.
  #
  # NOTE(review): columns are kept in header order here, whereas
  # VectorEnumerable locates columns through the sorted variable_array --
  # confirm whether headers are expected to arrive pre-sorted.
  def self.from_csv(file, args={})
    args = args.merge({:converters => :all})
    data = CSV.read(file, args)
    data_set = to_na(data[1..-1]).transpose
    data_set.variables = {}
    data[0].each do |key|
      variable = key.kind_of?(Pest::Variable) ? key : Pest::Variable.new(:name => key)
      data_set.variables[variable.name] = variable
    end
    data_set
  end

  attr_accessor :variables

  # Returns a hash of variable => values.
  #
  # NOTE(review): slice i is paired with variables.values[i] (insertion
  # order), while from_hash lays the matrix out in sorted-key order. The two
  # only agree when the keys were inserted already sorted -- confirm.
  def to_hash
    hash = {}
    variables.values.each_index do |i|
      hash[variables.values[i]] = self[true,i].to_a[0]
    end
    hash
  end

  # variables: an array of variables for which each vector should contain values
  # Order is retained in the returned value
  def data_vectors(variables=nil)
    VectorEnumerable.new(self, variables)
  end

  # Size of the first dimension of the matrix.
  def length
    shape[0]
  end

  # Marshals [variables, matrix-as-array] to +file+ and closes it.
  #
  # file - a String path (opened in binary mode), an IO, or nil to write to a
  #        new Tempfile.
  def save(file=nil)
    file ||= Tempfile.new('pest_hash_dataset')
    file = File.open(file, 'wb') if file.kind_of?(String)
    Marshal.dump([variables,to_a], file)
    file.close
  end

  # Presents an NArray data set as a sequence of vectors.
  class VectorEnumerable
    include Enumerable

    # data_set  - the Pest::DataSet::NArray to read from.
    # variables - an enumerable of variables selecting (and ordering) the
    #             values of each vector, or true/nil for every variable.
    def initialize(data_set, variables = true)
      @data_set = data_set
      # data_vectors passes nil when no selection is given; use +true+ as the
      # index in that case, which is this constructor's own default.
      @variables = variables.nil? ? true : variables
      if @variables.kind_of?(Enumerable)
        ordered = @data_set.variable_array
        @variables = @variables.map {|v| ordered.index(v)}
      end
    end

    # Returns the selected values at index +i+ of the first dimension.
    def [](i)
      @data_set[i,@variables].transpose
    end

    # Yields one vector per index of the first dimension.
    def each
      (0..@data_set.shape[0]-1).each do |i|
        yield Array(self[i]).first
      end
    end
  end
end
|
@@ -0,0 +1,72 @@
|
|
1
|
+
require 'set'

module Pest
  # Mixin for estimators that derive distributions from a data set.
  #
  # Including classes are expected to provide +distribution_class+, which
  # DistributionList instantiates for every new combination of variables.
  module Estimator
    attr_accessor :data

    # data - the data set the estimator works on (may be nil).
    def initialize(data=nil)
      @data = data
    end

    # Returns the variables of the underlying data set, or {} without data.
    def variables
      @data.nil? ? {} : @data.variables
    end

    # Returns the lazily created cache of distributions for this estimator.
    def distributions
      @distributions ||= DistributionList.new(self)
    end

    # Resolves +arg+ to a variable of this estimator.
    #
    # arg - a Pest::Variable, or a String/Symbol naming one.
    #
    # Raises ArgumentError when +arg+ is of another type or does not match any
    # of the estimator's variables.
    def to_variable(arg)
      variable = case arg
                 when Pest::Variable
                   arg
                 when String, Symbol
                   variables[arg] || Pest::Variable.new(:name => arg)
                 end

      unless variables.values.include?(variable)
        raise ArgumentError, "unknown variable: #{arg.inspect}"
      end
      variable
    end

    # Behaviour shared by the distribution classes of concrete estimators.
    module Distribution
      attr_reader :variables

      # estimator - the estimator that owns the distribution.
      # variables - the variables the distribution covers.
      def initialize(estimator, variables)
        @estimator = estimator
        @variables = variables
      end

      # Returns the covered variables as a sorted array.
      def variable_array
        variables.to_a.sort
      end

      # Placeholder: must be overridden by the including class.
      def probability
        raise NotImplementedError
      end
    end

    # Cache of distributions keyed by the set of variables they cover.
    class DistributionList < ::Hash
      def initialize(estimator)
        @estimator = estimator
      end

      # Normalises +args+ into a new Set of the estimator's variables.
      #
      # args - a single variable or name, or an Array/Set of them; nested
      #        Arrays and Sets are flattened and may be mixed freely with
      #        single values. nil yields an empty Set.
      #
      # The argument itself is never modified.
      # Raises ArgumentError (from Estimator#to_variable) for unknown entries.
      def parse_args(args)
        resolved = ::Set.new
        flatten_args(args, ::Set.new).each do |arg|
          resolved << @estimator.to_variable(arg)
        end
        resolved
      end

      # Returns the distribution covering the given variables, creating and
      # caching it on first access.
      def [](*args)
        set = parse_args(args)
        unless has_key? set
          self[set] = @estimator.distribution_class.new(@estimator, set)
        end
        super(set)
      end

      private

      # Collects every leaf of +arg+ into +into+, descending into Arrays and
      # Sets.
      def flatten_args(arg, into)
        case arg
        when nil
          # nothing to collect
        when ::Set, ::Array
          arg.each { |el| flatten_args(el, into) }
        else
          into << arg
        end
        into
      end
    end
  end
end
|