pest 0.0.0

Sign up to get free protection for your applications and to get access to all the features.
data/.travis.yml ADDED
@@ -0,0 +1,6 @@
1
+ language: ruby
2
+ rvm:
3
+ - 1.9.2
4
+ - 1.9.3
5
+ # uncomment this line if your project needs to run something other than `rake`:
6
+ script: bundle exec rspec spec
data/Gemfile ADDED
@@ -0,0 +1,13 @@
1
# Bundler dependency manifest for the pest gem.

# Fetch gems over HTTPS rather than plain HTTP so downloads cannot be
# tampered with in transit.
source 'https://rubygems.org'

# Also load the dependencies declared in the project's gemspec.
gemspec

# Runtime dependencies
gem "narray"
gem "uuidtools"

# Tooling that is only needed while developing the gem
group :development do
  gem "jeweler"
  gem "rake"
  gem "pry"
  gem "rspec"
end
data/README.md ADDED
@@ -0,0 +1,90 @@
1
+ # Pest, a framework for Probability Estimation
2
+
3
+ [![Build Status](https://secure.travis-ci.org/kerinin/pest.png)](http://travis-ci.org/kerinin/pest)
4
+
5
+
6
+ Pest provides a unified framework for interacting with different probability
7
+ estimation models.
8
+
9
+ * Pest tries to be agnostic about the underlying data structures,
10
+ so changing libraries (GSL -> Hadoop) is as simple as using a different data source.
11
+ * Pest is designed to create estimators using subsets of larger data sources, and
12
+ transparently constructs estimators to facilitate dynamic querying
13
+ * Implementing custom estimation models is easy, and Pest implements some
14
+ common ones for you.
15
+
16
+ Pest abstracts common statistical operations including:
17
+
18
+ * Marginal, Joint and Conditional point probability
19
+ * Interval and Cumulative probability
20
+ * Entropy, Cross Entropy, and Mutual Information
21
+ * Mean, Median, Mode, etc
22
+
23
+
24
+ ## Ruby Install
25
+
26
+ ``` sh
27
+ brew install gnuplot # This may take a while...
28
+ cd /usr/local
29
+ git checkout 83ed494 /usr/local/Library/Formula/gsl.rb
30
+ brew install gsl # Forcing gsl v1.4
31
+
32
+ bundle install
33
+ ```
34
+
35
+ ## API
36
+
37
+ ``` ruby
38
+ # Creating Datasets
39
+ test = Pest::DataSet::Hash.new hash # Creates a Hash dataset of observations from a hash
40
+ test = Pest::DataSet::Hash.new file # Creates a Hash dataset of observations from an IO (Marshalled)
41
+ train = Pest::DataSet::GSL.new file # Creates a GSL dataset from an IO instance
42
+
43
+ # DataSet Variables
44
+ test.variables # hash of Variable instances detected in observation set
45
+ test.v # alias of 'variables'
46
+ test.v[:foo] # a specific variable
47
+ test.v[:foo] = another_variable # explicit declaration
48
+
49
+ # Creating Estimators
50
+ e = Pest::Estimator::Set::Multinomial.new(test) # Creates a multinomial estimator for set o
51
+ e = Pest::Estimator::Discrete::Gaussian.new(file) # Creating an estimator with the DataSet API
52
+
53
+ # Descriptive Statistical Properties
54
+ e.mode(:foo) # Mode
55
+ e.mean(:foo) # Mean (discrete & continuous only)
56
+ e.median(:foo) # Median (discrete & continuous only)
57
+ # quantile?
58
+ # variance?
59
+ # deviation?
60
+
61
+ # Estimating Entropy (Set & Discrete only)
62
+ e.entropy(:foo) # Entropy of 'foo'
63
+ e.h(:foo, :bar) # Joint entropy of 'foo' AND 'bar'
64
+ e.h(:foo).given(:bar) # Cross entropy of 'foo' : 'bar'
65
+ e.mutual_information(:foo, :bar) # Mutual information of 'foo' and 'bar'
66
+ e.i(:foo, :bar) # Alias
67
+
68
+ # Estimating Point Probability (Set & Discrete only)
69
+ e.probability(o.variables[:foo]) # (Set/Discrete only) Estimate the probability of all values of 'foo'
70
+ e.p(:foo) # Same as above, tries to find a variable named 'foo'
71
+ e.p(:foo).in(test) # Estimate the probability of values in dataset 'test'
72
+ e.p(:foo).given(:bar).in(test) # Estimate the conditional foo | bar for the values in 'test'
73
+ e.p(:foo, :bar).in(test) # Estimate the joint probability foo AND bar
74
+ e.p(:foo, :bar).given(:baz, :qux).in(test) # More complex joint & conditional probabilities
75
+ e.p(:foo => 4, :bar => 2).given(:baz => 0) # Single prediction (implicitly creates dataset)
76
+ e.p(:foo).given(:bar).cache # Builds and persists the model for 'foo|bar'
77
+ e.p(:foo).given(:bar).cache('path.csv') # Persist to a specific path (defaults to tmp)
78
+
79
+ # Estimating Cumulative & Interval Probability (Discrete & Continuous only)
80
+ e.probability(:foo).greater_than(:bar).in(test)
81
+ e.p(:foo).greater_than(:bar).less_than(:baz).in(test)
82
+ e.p(:foo).gt(:bar).lt(:baz).given(:qux).in(test)
83
+ ```
84
+
85
+ ## Working Notes
86
+
87
+ Do we want variable equality to be name-based? It may make more sense to allow
88
+ variables named differently in different data sets to be equivalent. And how the
89
+ fuck do we handle variable type? I'm almost thinking we don't, and let the actual
90
+ estimators take care of type casting
data/Rakefile ADDED
@@ -0,0 +1,53 @@
1
# encoding: utf-8

# Rake tasks for the pest gem: packaging (Jeweler), tests and RDoc.

require 'rubygems'
require 'bundler'
begin
  # Activate the default and development gem groups before anything else loads.
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => e
  $stderr.puts e.message
  $stderr.puts "Run `bundle install` to install missing gems"
  exit e.status_code
end
require 'rake'

require 'jeweler'
Jeweler::Tasks.new do |gem|
  # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
  gem.name = "pest"
  gem.homepage = "http://github.com/kerinin/pest"
  gem.license = "MIT"
  gem.description = %q{Wrappers to facilitate different classes of probability estimators}
  gem.summary = %q{Probability Estimation}
  gem.email = "kerinin@gmail.com"
  gem.authors = ["Ryan Michael"]
  # dependencies defined in Gemfile
end
Jeweler::RubygemsDotOrgTasks.new

require 'rake/testtask'
# NOTE(review): this task looks for test/**/test_*.rb, while .travis.yml runs
# `bundle exec rspec spec` and the Gemfile lists rspec. Confirm whether the
# default task should run the specs instead.
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = 'test/**/test_*.rb'
  test.verbose = true
end

# require 'rcov/rcovtask'
# Rcov::RcovTask.new do |test|
#   test.libs << 'test'
#   test.pattern = 'test/**/test_*.rb'
#   test.verbose = true
#   test.rcov_opts << '--exclude "gems/*"'
# end

task :default => :test

require 'rdoc/task'
Rake::RDocTask.new do |rdoc|
  # Strip the VERSION file's trailing newline so it does not end up in the title.
  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "pest #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.0.0
data/lib/pest.rb ADDED
@@ -0,0 +1,23 @@
1
# Entry point for the pest gem: sets up the load path, loads the standard
# library and third-party dependencies, then loads every Pest component.

$LOAD_PATH.unshift(File.dirname(__FILE__))
# NOTE(review): this adds a "lib" directory *below* this file's directory;
# confirm that such a directory exists, otherwise the entry is a no-op.
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), 'lib'))

# Standard library
require 'csv'
require 'set'      # ::Set is used by Pest::Estimator::DistributionList
require 'tempfile' # Tempfile is the default target of the data sets' #save

# Third-party
require 'uuidtools'

require "pest/version"
require "pest/variable"

require "pest/function"
require "pest/function/probability"
require "pest/function/entropy"

require "pest/data_set"
require "pest/data_set/hash"
require "pest/data_set/narray"

require "pest/estimator"
require "pest/estimator/frequency"

module Pest
  CACHE_TO_FILE = false
end
@@ -0,0 +1,62 @@
1
# Behaviour shared by Pest data sets.
#
# Including this module also extends the including class with ClassMethods
# (see +included+). Methods that raise NotImplementedError here are
# placeholders that an including class is expected to override.
module Pest::DataSet
  # Hook run on include: adds the ClassMethods helpers to the including class.
  def self.included(base)
    base.extend(ClassMethods)
  end

  # Returns the Hash of variables for this data set, creating an empty Hash
  # on first access.
  def variables
    @variables ||= {}
  end

  # Returns the values of +variables+ as a sorted Array.
  #
  # Reads through the +variables+ reader instead of the raw instance variable
  # so the memoized empty Hash is used when nothing has been assigned yet.
  def variable_array
    variables.values.sort
  end

  def to_hash(*args)
    raise NotImplementedError
  end

  def save(*args)
    raise NotImplementedError
  end

  def destroy
    raise NotImplementedError
  end

  def length
    raise NotImplementedError
  end

  module ClassMethods
    # Builds a data set from +data_source+.
    #
    # The source's class is looked up in +translators+; on a match the named
    # class method is called with the source. Otherwise the source is
    # converted with +to_hash+ and, when the converted object's class has a
    # translator, translated from that object.
    #
    # Raises RuntimeError ("Unrecognized data source type") when neither
    # route applies, instead of silently returning nil.
    def from(data_source)
      # Try to translate the data source directly
      translator_method = translators[data_source.class]
      return send(translator_method, data_source) if translator_method

      # Try to translate via hash. respond_to? is used rather than rescuing
      # NoMethodError so that errors raised inside to_hash are not masked.
      unless data_source.respond_to?(:to_hash)
        raise "Unrecognized data source type"
      end

      hash_data = data_source.to_hash
      unless hash_data && translators.has_key?(hash_data.class)
        raise "Unrecognized data source type"
      end

      from(hash_data)
    end

    # Maps source classes to the name of the class method that translates
    # them (this is how +from+ uses the return value). Must be overridden.
    def translators(*args)
      raise NotImplementedError
    end

    def from_file(*args)
      raise NotImplementedError
    end

    def from_hash(*args)
      raise NotImplementedError
    end
  end
end
@@ -0,0 +1,70 @@
1
require 'tempfile'

# Data set backed by a plain Ruby Hash whose keys become the variable names.
# The values are indexed by observation number (see VectorEnumerable).
class Pest::DataSet::Hash
  include Pest::DataSet

  # Source classes accepted by +from+, mapped to the translating class method.
  def self.translators
    {
      File   => :from_file,
      String => :from_file,
      Symbol => :from_file
    }
  end

  # Loads a marshalled Hash from +file+ and wraps it in a new data set.
  #
  # file - an IO to read from, or a String/Symbol path. Paths are opened
  #        here (Symbol included, since +translators+ routes Symbols to this
  #        method) and closed again once read.
  #
  # Raises RuntimeError if the file does not contain a Hash.
  #
  # SECURITY: Marshal can instantiate arbitrary objects while loading; only
  # use this with files from a trusted source.
  def self.from_file(file)
    object =
      if file.kind_of?(String) || file.kind_of?(Symbol)
        File.open(file.to_s, 'rb') { |io| Marshal.restore(io) }
      else
        Marshal.restore(file)
      end

    unless object.kind_of?(::Hash)
      raise "File does not seem to contain valid data"
    end

    new(object)
  end

  # NOTE: the :hash reader replaces Object#hash for instances of this class,
  # so #hash returns the wrapped data rather than an Integer hash code.
  attr_reader :variables, :hash

  # hash - the raw data; one Pest::Variable is created per key.
  def initialize(hash)
    @hash = hash
    @variables = {}
    hash.keys.each do |name|
      @variables[name] = Pest::Variable.new(:name => name)
    end
  end

  # Returns the wrapped Hash itself (not a copy).
  def to_hash
    @hash
  end

  # Returns an enumerable of per-observation vectors, optionally restricted
  # to +variables+.
  def data_vectors(variables=nil)
    VectorEnumerable.new(self, variables)
  end

  # Length of the first value in the wrapped Hash, or 0 when the Hash is empty.
  def length
    first_column = @hash.values.first
    first_column ? first_column.length : 0
  end

  # Marshals the wrapped Hash to +file+.
  #
  # file - an IO, a String path, or nil to write to a new Tempfile.
  #
  # A file opened here from a path is closed after writing; any other target
  # is flushed and left open. Returns the IO that was written to.
  def save(file=nil)
    file ||= Tempfile.new('pest_hash_dataset')
    opened_here = file.kind_of?(String)
    file = File.open(file, 'wb') if opened_here

    begin
      Marshal.dump(@hash, file)
      file.flush if !opened_here && file.respond_to?(:flush)
    ensure
      file.close if opened_here
    end

    file
  end

  # Enumerates observations of a Hash data set as arrays of values.
  class VectorEnumerable
    include Enumerable

    # data_set  - the Pest::DataSet::Hash to read from.
    # variables - keys of the data set's hash to include, either as a Hash
    #             (its keys are used) or as a list; defaults to all of the
    #             data set's variables.
    def initialize(data_set, variables=nil)
      @data_set = data_set
      selected = variables || @data_set.variables
      # Normalise to a list of keys so #[] and #each look values up the same way.
      @variables = selected.respond_to?(:keys) ? selected.keys : Array(selected)
    end

    # Returns the values at index +i+, one per selected variable.
    def [](i)
      @variables.map { |var| @data_set.hash[var][i] }
    end

    # Yields one vector per observation; returns an Enumerator without a block.
    def each
      return to_enum(:each) unless block_given?

      @data_set.length.times { |i| yield self[i] }
    end
  end
end
@@ -0,0 +1,98 @@
1
require 'narray'
require 'tempfile'

# Data set stored as an NMatrix; +length+ is the size of the first dimension.
class Pest::DataSet::NArray < NMatrix
  include Pest::DataSet

  # Source classes accepted by +from+, mapped to the translating class method.
  def self.translators
    {
      # ::Hash is spelled out because a bare Hash inside this class can
      # resolve to Pest::DataSet::Hash through the included Pest::DataSet
      # module, which would leave plain Ruby hashes without a translator.
      ::Hash => :from_hash,
      File   => :from_file,
      String => :from_file,
      Symbol => :from_file
    }
  end

  # Builds a data set from a Hash of variable (or variable name) => values.
  def self.from_hash(hash)
    # Ensure the matrix is sorted the same as the variables
    data_set = to_na(hash.keys.sort.map { |key| hash[key] })
    data_set.variables = {}
    hash.keys.each do |key|
      variable = key.kind_of?(Pest::Variable) ? key : Pest::Variable.new(:name => key)
      data_set.variables[variable.name] = variable
    end
    data_set
  end

  # Loads a data set previously written by +save+ (a marshalled
  # [variables, matrix] pair).
  #
  # file - an IO to read from, or a String/Symbol path. Paths are opened
  #        here (Symbol included, since +translators+ routes Symbols to this
  #        method) and closed again once read.
  #
  # Raises RuntimeError if the contents cannot be turned into a data set.
  #
  # SECURITY: Marshal can instantiate arbitrary objects while loading; only
  # use this with files from a trusted source.
  def self.from_file(file)
    opened_here = file.kind_of?(String) || file.kind_of?(Symbol)
    file = File.open(file.to_s, 'rb') if opened_here

    begin
      variables, matrix = Marshal.restore(file)
      data_set = to_na(matrix)
      data_set.variables = variables
      data_set
    rescue
      raise "File does not seem to contain valid data"
    ensure
      file.close if opened_here
    end
  end

  # Builds a data set from a CSV file whose first row holds the variable names.
  #
  # args is passed on to CSV.read; :converters is always forced to :all.
  #
  # NOTE(review): the matrix keeps the CSV's column order, whereas from_hash
  # lays the data out in sorted-key order and VectorEnumerable indexes
  # through the sorted variable_array. Confirm that headers are expected to
  # be pre-sorted.
  def self.from_csv(file, args={})
    args = args.merge({:converters => :all})
    data = CSV.read(file, args)
    data_set = to_na(data[1..-1]).transpose
    data_set.variables = {}
    data[0].each do |key|
      variable = key.kind_of?(Pest::Variable) ? key : Pest::Variable.new(:name => key)
      data_set.variables[variable.name] = variable
    end
    data_set
  end

  attr_accessor :variables

  # Returns a Hash of variable => values.
  #
  # NOTE(review): the i-th entry of variables.values is paired with index i
  # of the second dimension; from_hash lays that dimension out in sorted-key
  # order. Confirm the two orders always agree.
  def to_hash
    hash = {}
    variables.values.each_with_index do |variable, i|
      hash[variable] = self[true, i].to_a[0]
    end
    hash
  end

  # variables: an array of variables for which each vector should contain values
  # Order is retained in the returned value
  def data_vectors(variables=nil)
    VectorEnumerable.new(self, variables)
  end

  def length
    shape[0]
  end

  # Marshals [variables, to_a] to +file+ and closes it.
  #
  # file - an IO, a String path, or nil to write to a new Tempfile.
  #
  # Returns nil.
  def save(file=nil)
    file ||= Tempfile.new('pest_hash_dataset')
    file = File.open(file, 'wb') if file.kind_of?(String)

    begin
      Marshal.dump([variables, to_a], file)
    ensure
      # Close even when dumping fails so the handle is not leaked.
      file.close
    end

    nil
  end

  # Enumerates observations of an NArray data set.
  class VectorEnumerable
    include Enumerable

    # data_set  - the Pest::DataSet::NArray to read from.
    # variables - an Enumerable of variables to include, or true for all.
    def initialize(data_set, variables = true)
      @data_set = data_set
      # data_vectors passes nil when no subset was requested; treat that the
      # same as the default +true+ (used as an index, as in to_hash).
      @variables = variables.nil? ? true : variables
      if @variables.kind_of?(Enumerable)
        ordered = @data_set.variable_array
        @variables = @variables.map { |v| ordered.index(v) }
      end
    end

    def [](i)
      @data_set[i, @variables].transpose
    end

    def each
      @data_set.shape[0].times do |i|
        yield Array(self[i]).first
      end
    end
  end
end
@@ -0,0 +1,72 @@
1
require 'set'

# Behaviour shared by Pest estimators: holds the data being estimated from
# and hands out (and caches) one distribution per set of variables.
module Pest::Estimator
  attr_accessor :data

  # data - the data set to estimate from (optional).
  def initialize(data=nil)
    @data = data
  end

  # Returns the data's variables, or an empty Hash when no data is set.
  def variables
    @data.nil? ? {} : @data.variables
  end

  # Returns the memoized DistributionList for this estimator.
  def distributions
    @distributions ||= DistributionList.new(self)
  end

  # Resolves +arg+ to one of this estimator's variables.
  #
  # arg - a Pest::Variable, or a String/Symbol looked up in +variables+.
  #
  # Raises ArgumentError when the result is not among +variables+ (this
  # includes arguments of any other type).
  def to_variable(arg)
    # Match on the class itself (not its name) so subclasses are accepted too.
    variable = case arg
               when Pest::Variable
                 arg
               when String, Symbol
                 variables[arg] || Pest::Variable.new(:name => arg)
               end

    unless variables.values.include?(variable)
      raise ArgumentError, "#{arg.inspect} is not a variable of this estimator"
    end

    variable
  end

  # Behaviour shared by distributions over a set of variables.
  module Distribution
    attr_reader :variables

    def initialize(estimator, variables)
      @estimator = estimator
      @variables = variables
    end

    # Returns the distribution's variables as a sorted Array.
    def variable_array
      variables.to_a.sort
    end

    def probability
      raise NotImplementedError
    end
  end

  # Hash of Set-of-variables => distribution, filled lazily on lookup.
  class DistributionList < ::Hash
    def initialize(estimator)
      super()
      @estimator = estimator
    end

    # Normalises +args+ into a Set of this estimator's variables.
    #
    # args - an Array (nested arrays are flattened; when any element is a
    #        Set, all elements are merged into one set), a Set, or a single
    #        value.
    #
    # Always returns a new Set; a Set passed in by the caller is left
    # untouched. Raises ArgumentError (from to_variable) for unknown variables.
    def parse_args(args)
      names = if args.kind_of?(Array)
                if args.any? { |arg| arg.kind_of?(::Set) }
                  args.inject(::Set.new) do |merged, el|
                    # Scalars mixed in with sets have no to_set; wrap them.
                    merged + (el.respond_to?(:to_set) ? el.to_set : [el])
                  end
                else
                  args.flatten.to_set
                end
              elsif args.kind_of?(::Set)
                args
              else
                Array(args).to_set
              end

      names.map { |arg| @estimator.to_variable(arg) }.to_set
    end

    # Returns the distribution for the given variables, building it with the
    # estimator's distribution_class on first access.
    def [](*args)
      set = parse_args(args)
      unless has_key?(set)
        self[set] = @estimator.distribution_class.new(@estimator, set)
      end
      super(set)
    end
  end
end