pest 0.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.travis.yml ADDED
@@ -0,0 +1,6 @@
1
+ language: ruby
2
+ rvm:
3
+ - 1.9.2
4
+ - 1.9.3
5
+ # Run the RSpec suite directly instead of the default `rake` task:
6
+ script: bundle exec rspec spec
data/Gemfile ADDED
@@ -0,0 +1,13 @@
1
# Bundler dependency manifest for pest.

# Fetch gems over HTTPS rather than plaintext HTTP so downloads cannot be
# tampered with in transit.
source 'https://rubygems.org'

gemspec

# Runtime dependencies
gem "narray"
gem "uuidtools"

# Tooling that is only needed when working on pest itself
group :development do
  gem "jeweler"
  gem "rake"
  gem "pry"
  gem "rspec"
end
data/README.md ADDED
@@ -0,0 +1,90 @@
1
+ # Pest, a framework for Probability Estimation
2
+
3
+ [![Build Status](https://secure.travis-ci.org/kerinin/pest.png)](http://travis-ci.org/kerinin/pest)
4
+
5
+
6
+ Pest provides a unified framework for interacting with different probability
7
+ estimation models.
8
+
9
+ * Pest tries to be agnostic about the underlying data structures,
10
+ so changing libraries (GSL -> Hadoop) is as simple as using a different data source.
11
+ * Pest is designed to create estimators using subsets of larger data sources, and
12
+ transparently constructs estimators to facilitate dynamic querying
13
+ * Implementing custom estimation models is easy, and Pest implements some
14
+ common ones for you.
15
+
16
+ Pest abstracts common statistical operations including:
17
+
18
+ * Marginal, Joint and Conditional point probability
19
+ * Interval and Cumulative probability
20
+ * Entropy, Cross Entropy, and Mutual Information
21
+ * Mean, Median, Mode, etc
22
+
23
+
24
+ ## Ruby Install
25
+
26
+ ``` sh
27
+ brew install gnuplot # This may take a while...
28
+ cd /usr/local
29
+ git checkout 83ed494 /usr/local/Library/Formula/gsl.rb
30
+ brew install gsl # Forcing gsl v1.4
31
+
32
+ bundle install
33
+ ```
34
+
35
+ ## API
36
+
37
+ ``` ruby
38
+ # Creating Datasets
39
+ test = Pest::DataSet::Hash.new hash # Creates a Hash dataset of observations from a hash
40
+ test = Pest::DataSet::Hash.new file # Creates a Hash dataset of observations from an IO (Marshalled)
41
+ train = Pest::DataSet::GSL.new file # Creates a GSL dataset from an IO instance
42
+
43
+ # DataSet Variables
44
+ test.variables # hash of Variable instances detected in observation set
45
+ test.v # alias of 'variables'
46
+ test.v[:foo] # a specific variable
47
+ test.v[:foo] = another_variable # explicit declaration
48
+
49
+ # Creating Estimators
50
+ e = Pest::Estimator::Set::Multinomial.new(test) # Creates a multinomial estimator for a set of observations
51
+ e = Pest::Estimator::Discrete::Gaussian.new(file) # Creating an estimator with the DataSet API
52
+
53
+ # Descriptive Statistical Properties
54
+ e.mode(:foo) # Mode
55
+ e.mean(:foo) # Mean (discrete & continuous only)
56
+ e.median(:foo) # Median (discrete & continuous only)
57
+ # quantile?
58
+ # variance?
59
+ # deviation?
60
+
61
+ # Estimating Entropy (Set & Discrete only)
62
+ e.entropy(:foo) # Entropy of 'foo'
63
+ e.h(:foo, :bar) # Joint entropy of 'foo' AND 'bar'
64
+ e.h(:foo).given(:bar) # Cross entropy of 'foo' : 'bar'
65
+ e.mutual_information(:foo, :bar) # Mutual information of 'foo' and 'bar'
66
+ e.i(:foo, :bar) # Alias
67
+
68
+ # Estimating Point Probability (Set & Discrete only)
69
+ e.probability(o.variables[:foo]) # (Set/Discrete only) Estimate the probability of all values of 'foo'
70
+ e.p(:foo) # Same as above, tries to find a variable named 'foo'
71
+ e.p(:foo).in(test) # Estimate the probability of values in dataset 'test'
72
+ e.p(:foo).given(:bar).in(test) # Estimate the conditional foo | bar for the values in 'test'
73
+ e.p(:foo, :bar).in(test) # Estimate the joint probability foo AND bar
74
+ e.p(:foo, :bar).given(:baz, :qux).in(test) # More complex joint & conditional probabilities
75
+ e.p(:foo => 4, :bar => 2).given(:baz => 0) # Single prediction (implicitly creates dataset)
76
+ e.p(:foo).given(:bar).cache # Builds and persists the model for 'foo|bar'
77
+ e.p(:foo).given(:bar).cache('path.csv') # Persist to a specific path (defaults to tmp)
78
+
79
+ # Estimating Cumulative & Interval Probability (Discrete & Continuous only)
80
+ e.probability(:foo).greater_than(:bar).in(test)
81
+ e.p(:foo).greater_than(:bar).less_than(:baz).in(test)
82
+ e.p(:foo).gt(:bar).lt(:baz).given(:qux).in(test)
83
+ ```
84
+
85
+ ## Working Notes
86
+
87
+ Do we want variable equality to be name-based? It may make more sense to allow
88
+ variables named differently in different data sets to be equivalent. And how
89
+ do we handle variable type? I'm almost thinking we don't, and let the actual
90
+ estimators take care of type casting
data/Rakefile ADDED
@@ -0,0 +1,53 @@
1
# encoding: utf-8

# Rake tasks for the pest gem: gem packaging via Jeweler, a test task and
# RDoc generation.

require 'rubygems'
require 'bundler'
begin
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => e
  # Report the Bundler failure and exit with the status code it supplies.
  $stderr.puts e.message
  $stderr.puts "Run `bundle install` to install missing gems"
  exit e.status_code
end
require 'rake'

require 'jeweler'
Jeweler::Tasks.new do |gem|
  # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
  gem.name = "pest"
  gem.homepage = "http://github.com/kerinin/pest"
  gem.license = "MIT"
  gem.description = %q{Wrappers to facilitate different classes of probability estimators}
  gem.summary = %q{Probability Estimation}
  gem.email = "kerinin@gmail.com"
  gem.authors = ["Ryan Michael"]
  # dependencies defined in Gemfile
end
Jeweler::RubygemsDotOrgTasks.new

# NOTE(review): this task only picks up test/**/test_*.rb, while the Gemfile
# and CI configuration use rspec -- confirm whether the default task should
# run the specs instead.
require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = 'test/**/test_*.rb'
  test.verbose = true
end

# require 'rcov/rcovtask'
# Rcov::RcovTask.new do |test|
#   test.libs << 'test'
#   test.pattern = 'test/**/test_*.rb'
#   test.verbose = true
#   test.rcov_opts << '--exclude "gems/*"'
# end

task :default => :test

require 'rdoc/task'
# RDoc::Task is the class defined by 'rdoc/task'; Rake::RDocTask is only a
# deprecated alias for it.
RDoc::Task.new do |rdoc|
  # Strip the trailing newline so it does not end up inside the title.
  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "pest #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.0.0
data/lib/pest.rb ADDED
@@ -0,0 +1,23 @@
1
# Entry point of the pest library: extends the load path, loads the standard
# library and gem dependencies, then loads every pest component.
$LOAD_PATH.unshift(File.dirname(__FILE__))
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), 'lib'))

# Standard library
require 'csv'
require 'set'      # ::Set is used by Pest::Estimator::DistributionList
require 'tempfile' # Tempfile is the default target of the data sets' #save

# Gems
require 'uuidtools'

require "pest/version"
require "pest/variable"

require "pest/function"
require "pest/function/probability"
require "pest/function/entropy"

require "pest/data_set"
require "pest/data_set/hash"
require "pest/data_set/narray"

require "pest/estimator"
require "pest/estimator/frequency"

module Pest
  CACHE_TO_FILE = false
end
@@ -0,0 +1,62 @@
1
# Mixin shared by the data set classes. Including it also extends the host
# class with ClassMethods. The instance methods that raise NotImplementedError
# are placeholders the including class is expected to override.
module Pest::DataSet
  def self.included(base)
    base.extend(ClassMethods)
  end

  # Returns the variables hash, creating an empty one on first use.
  def variables
    @variables ||= {}
  end

  # Returns the variables as a sorted array.
  #
  # Goes through the #variables reader (rather than reading @variables
  # directly) so it does not fail on nil before the hash has been initialised.
  def variable_array
    variables.values.sort
  end

  def to_hash(*args)
    raise NotImplementedError
  end

  def save(*args)
    raise NotImplementedError
  end

  def destroy
    raise NotImplementedError
  end

  def length
    raise NotImplementedError
  end

  module ClassMethods
    # Builds a data set from +data_source+.
    #
    # The source's class is looked up in +translators+; when no translator is
    # registered for it, the source is converted with #to_hash and the lookup
    # is retried on the result.
    #
    # Raises RuntimeError ("Unrecognized data source type") when the source
    # can be translated neither directly nor through its hash representation.
    def from(data_source)
      # Try to translate the data source directly
      translator_method = translators[data_source.class]
      return send(translator_method, data_source) if translator_method

      # Try to translate via hash. Ask the object whether it can be converted
      # instead of rescuing NoMethodError, which would also hide genuine bugs
      # raised from inside #to_hash.
      raise "Unrecognized data source type" unless data_source.respond_to?(:to_hash)

      hash_data = data_source.to_hash
      unless hash_data && translators.has_key?(hash_data.class)
        # Previously this case fell through and returned nil without notice.
        raise "Unrecognized data source type"
      end

      from(hash_data)
    end

    # Maps source classes to the name of the class method that translates
    # them (see #from). Must be overridden by the extended class.
    def translators(*args)
      raise NotImplementedError
    end

    def from_file(*args)
      raise NotImplementedError
    end

    def from_hash(*args)
      raise NotImplementedError
    end
  end
end
@@ -0,0 +1,70 @@
1
require 'tempfile'

# Data set backed by a plain Ruby Hash. Each key of the hash becomes a
# variable, and each value is indexed by observation number (array-like).
class Pest::DataSet::Hash
  include Pest::DataSet

  # Source classes understood by .from and the class method handling each.
  # ::Hash is written with a leading :: because a bare Hash inside this class
  # refers to Pest::DataSet::Hash itself.
  def self.translators
    {
      ::Hash => :from_hash,
      File   => :from_file,
      String => :from_file,
      Symbol => :from_file
    }
  end

  # Builds a data set directly from a hash of observations.
  def self.from_hash(hash)
    new(hash)
  end

  # Builds a data set from a marshalled hash.
  #
  # file - an open IO, or a String/Symbol naming a file to read.
  #
  # Raises RuntimeError when the restored object is not a ::Hash.
  #
  # NOTE: Marshal must only be used on trusted data; restoring a crafted file
  # can execute arbitrary code.
  def self.from_file(file)
    object = if file.kind_of?(String) || file.kind_of?(Symbol)
      # Paths are opened (in binary mode) and closed here; Symbols are
      # accepted because `translators` routes them to this method.
      File.open(file.to_s, 'rb') { |io| Marshal.restore(io) }
    else
      Marshal.restore(file)
    end

    raise "File does not seem to contain valid data" unless object.kind_of?(::Hash)

    new(object)
  end

  # NOTE(review): the :hash reader replaces Object#hash, so instances cannot
  # be used reliably as Hash keys or Set members. It is kept because it is
  # part of the public interface; prefer #to_hash in new code.
  attr_reader :variables, :hash

  # hash - observations keyed by variable name; a Pest::Variable is created
  #        for every key.
  def initialize(hash)
    @hash = hash
    @variables = {}
    hash.keys.each do |name|
      @variables[name] = Pest::Variable.new(:name => name)
    end
  end

  def to_hash
    @hash
  end

  # Returns an enumerable over observation vectors, optionally restricted to
  # +variables+.
  def data_vectors(variables=nil)
    VectorEnumerable.new(self, variables)
  end

  # Number of observations, taken from the first value of the hash
  # (0 when the hash is empty).
  def length
    first_column = @hash.values.first
    first_column ? first_column.length : 0
  end

  # Marshals the underlying hash.
  #
  # file - a String path, an open IO, or nil to write to a new Tempfile.
  #
  # Returns the IO object that was written to. A file opened here from a path
  # is closed before returning; any other IO is flushed and left open.
  def save(file=nil)
    if file.kind_of?(String)
      # Block form closes the handle; Marshal.dump returns the IO it wrote to.
      return File.open(file, 'wb') { |io| Marshal.dump(@hash, io) }
    end

    if file.nil?
      file = Tempfile.new('pest_hash_dataset')
      file.binmode
    end

    Marshal.dump(@hash, file)
    file.flush if file.respond_to?(:flush)
    file
  end

  # Enumerates observations as arrays holding one value per selected variable.
  class VectorEnumerable
    include Enumerable

    # variables - a Hash keyed like the data set's hash, or a list of keys /
    #             Pest::Variable instances. Defaults to every variable of the
    #             data set.
    def initialize(data_set, variables=nil)
      @data_set = data_set
      @variables = variables || @data_set.variables
    end

    # Returns the values of observation +i+ for the selected variables.
    def [](i)
      columns = @data_set.to_hash
      keys.map { |key| columns[key][i] }
    end

    # Yields each observation vector in turn.
    def each
      return enum_for(:each) unless block_given?

      @data_set.length.times { |i| yield self[i] }
    end

    private

    # Keys of the underlying hash for the selected variables. #[] used to map
    # over a Hash (yielding pairs) while #each called #keys (failing on
    # Arrays); both now share this normalisation.
    def keys
      return @variables.keys if @variables.respond_to?(:keys)

      columns = @data_set.to_hash
      @variables.map do |var|
        # Fall back to the variable's name when the hash is keyed by name.
        if var.kind_of?(Pest::Variable) && !columns.has_key?(var)
          var.name
        else
          var
        end
      end
    end
  end
end
@@ -0,0 +1,98 @@
1
require 'narray'
require 'tempfile'

# Data set stored as an NMatrix; a `variables` hash (name => Pest::Variable)
# is kept alongside the matrix.
class Pest::DataSet::NArray < NMatrix
  include Pest::DataSet

  # Source classes understood by .from and the class method handling each.
  #
  # ::Hash needs the leading :: -- this class includes Pest::DataSet, so a
  # bare Hash resolves to Pest::DataSet::Hash and plain hashes would never
  # match.
  def self.translators
    {
      ::Hash => :from_hash,
      File   => :from_file,
      String => :from_file,
      Symbol => :from_file
    }
  end

  # Builds a data set from a hash of key => values.
  def self.from_hash(hash)
    data_set = to_na(hash.keys.sort.map {|key| hash[key]}) # Ensure the matrix is sorted the same as the variables
    data_set.variables = variables_from(hash.keys)
    data_set
  end

  # Builds a data set from a marshalled [variables, matrix] pair, the format
  # written by #save.
  #
  # file - an open IO, or a String/Symbol naming a file to read.
  #
  # Raises RuntimeError when the content cannot be restored into a matrix.
  #
  # NOTE: Marshal must only be used on trusted data; restoring a crafted file
  # can execute arbitrary code.
  def self.from_file(file)
    # Symbols are accepted as paths because `translators` routes them here.
    opened_here = file.kind_of?(String) || file.kind_of?(Symbol)
    io = opened_here ? File.open(file.to_s, 'rb') : file

    begin
      variables, matrix = Marshal.restore(io)
      data_set = to_na(matrix)
      data_set.variables = variables
      data_set
    rescue
      raise "File does not seem to contain valid data"
    ensure
      # Only close handles opened by this method; the caller owns the rest.
      io.close if opened_here
    end
  end

  # Builds a data set from a CSV file whose first row names the variables.
  #
  # args - options handed to CSV.read; :converters is always forced to :all.
  #
  # NOTE(review): the matrix keeps the CSV column order, whereas
  # VectorEnumerable looks columns up through the sorted variable_array --
  # confirm the two agree when the header row is not already sorted.
  # NOTE(review): CSV.read takes its options as keywords on Ruby 3; the
  # positional hash is kept for compatibility with the Ruby versions this
  # project targets.
  def self.from_csv(file, args={})
    args = args.merge({:converters => :all})
    data = CSV.read(file, args)
    data_set = to_na(data[1..-1]).transpose
    data_set.variables = variables_from(data[0])
    data_set
  end

  # Builds the name => Pest::Variable hash for +keys+, reusing keys that are
  # already Pest::Variable instances.
  def self.variables_from(keys)
    keys.inject({}) do |variables, key|
      variable = key.kind_of?(Pest::Variable) ? key : Pest::Variable.new(:name => key)
      variables[variable.name] = variable
      variables
    end
  end
  private_class_method :variables_from

  attr_accessor :variables

  # Returns a hash mapping each variable to its column of values.
  #
  # NOTE(review): columns are read in the insertion order of `variables`,
  # while from_hash lays the matrix out in sorted key order -- confirm that
  # the two orders agree.
  def to_hash
    hash = {}
    variables.values.each_with_index do |variable, i|
      hash[variable] = self[true,i].to_a[0]
    end
    hash
  end

  # variables: an array of variables for which each vector should contain values
  # Order is retained in the returned value
  def data_vectors(variables=nil)
    VectorEnumerable.new(self, variables)
  end

  def length
    shape[0]
  end

  # Marshals [variables, matrix rows].
  #
  # file - a String path, an open IO, or nil to write to a new Tempfile.
  #
  # The file is closed once written. Returns the (closed) file object so a
  # caller relying on the default Tempfile can still find out where the data
  # went; this method previously returned nil.
  def save(file=nil)
    file ||= Tempfile.new('pest_hash_dataset')
    file = File.open(file, 'wb') if file.kind_of?(String)
    begin
      file.binmode if file.respond_to?(:binmode) # marshalled data is binary
      Marshal.dump([variables,to_a], file)
    ensure
      file.close
    end
    file
  end

  # Enumerates the rows of the matrix, optionally restricted to a list of
  # variables.
  class VectorEnumerable
    include Enumerable

    # variables - list of variables to select, or nil/true for every column.
    #
    # Raises ArgumentError when a requested variable is not part of the
    # data set.
    def initialize(data_set, variables = true)
      @data_set = data_set
      # data_vectors passes an explicit nil, which bypasses the default
      # above; treat it as "all columns" too.
      @variables = variables.nil? ? true : variables
      if @variables.kind_of?(Enumerable)
        known = @data_set.variable_array # sort once, not once per variable
        @variables = @variables.map do |v|
          known.index(v) or raise ArgumentError, "unknown variable: #{v.inspect}"
        end
      end
    end

    def [](i)
      @data_set[i,@variables].transpose
    end

    def each
      return enum_for(:each) unless block_given?

      @data_set.shape[0].times do |i|
        yield Array(self[i]).first
      end
    end
  end
end
@@ -0,0 +1,72 @@
1
require 'set'

# Mixin for estimators: holds the data set and hands out one distribution per
# set of variables.
module Pest::Estimator
  attr_accessor :data

  def initialize(data=nil)
    @data = data
  end

  # Variables of the underlying data set ({} when no data is attached).
  def variables
    @data.nil? ? {} : @data.variables
  end

  # Cache of distributions, keyed by set of variables.
  def distributions
    @distributions ||= DistributionList.new(self)
  end

  # Resolves +arg+ to one of the estimator's variables.
  #
  # arg - a Pest::Variable, or a String/Symbol looked up in #variables.
  #
  # Raises ArgumentError when the result is not among the estimator's
  # variables (this includes arguments of any other type).
  def to_variable(arg)
    # Dispatch on the class itself rather than on its name, so subclasses
    # are recognised as well.
    variable = case arg
    when Pest::Variable
      arg
    when String, Symbol
      variables[arg] || Pest::Variable.new(:name => arg)
    end

    unless variables.values.include?(variable)
      raise ArgumentError, "unknown variable: #{arg.inspect}"
    end
    variable
  end

  # Mixin for the distributions created through DistributionList.
  module Distribution
    attr_reader :variables

    def initialize(estimator, variables)
      @estimator = estimator
      @variables = variables
    end

    def variable_array
      variables.to_a.sort
    end

    def probability
      raise NotImplementedError
    end
  end

  # Hash of distributions keyed by Set of variables. Missing entries are
  # built on first access through the estimator's distribution_class.
  class DistributionList < ::Hash
    def initialize(estimator)
      super()
      @estimator = estimator
    end

    # Normalises +args+ (a single value, an Array or a Set, possibly nested)
    # into a new Set of the estimator's variables.
    #
    # A fresh Set is always returned; a Set passed in by the caller is no
    # longer modified in place.
    def parse_args(args)
      items = args.kind_of?(::Set) ? args.to_a : Array(args)
      flatten_args(items).map {|arg| @estimator.to_variable(arg) }.to_set
    end

    # Returns the distribution for the given variables, creating and caching
    # it on first access.
    def [](*args)
      set = parse_args(args)
      unless has_key? set
        self[set] = @estimator.distribution_class.new(@estimator, set)
      end
      super(set)
    end

    private

    # Flattens nested Arrays and Sets into one flat Array. Array#flatten
    # alone leaves Sets untouched, and calling #to_set on every element fails
    # for scalars such as Symbols.
    def flatten_args(items)
      items.inject([]) do |flat, item|
        if item.kind_of?(::Array) || item.kind_of?(::Set)
          flat.concat(flatten_args(item.to_a))
        else
          flat << item
        end
      end
    end
  end
end