danielsdeleo-basset 1.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,93 @@
1
+ # This file contains extensions to built in Ruby classes.
2
+
3
+ require 'rubygems'
4
+ require 'stemmer'
5
+
6
# Extensions to the array class.
class Array
  # Returns a new array holding everything after the first element, like
  # lisp's cdr. An empty array is returned unchanged (same object).
  def rest
    empty? ? self : drop(1)
  end

  # Returns the element at index 1 (nil if the array is shorter than that).
  def second
    at(1)
  end

  # Returns an element chosen uniformly at random (nil for an empty array).
  def pick_random
    self[rand(size)]
  end

  # Returns a new array with the elements shuffled into random order.
  def randomize
    sort_by { rand }
  end

  # Sums the elements, starting from 0 (so an empty array sums to 0).
  def sum
    inject(0) { |total, element| total + element }
  end

  # Shuffles the array in place; returns self.
  def randomize!
    replace(randomize)
  end
end
38
+
39
class Float
  # Returns this float's decimal string truncated (not rounded) to
  # +decimal_places+ digits after the decimal point. The decimal point is
  # always included, matching the historic behaviour
  # (e.g. 3.0.to_s_decimal_places(0) => "3.").
  #
  # Fixes over the original regex-based version:
  # * the '.' in the pattern was unescaped (matched any character), which
  #   mis-truncated negative numbers (-1.55 with 1 place returned "-1");
  # * asking for more places than Float#to_s produced raised NoMethodError
  #   on the nil match — now the fraction is zero-padded instead.
  #
  # NOTE(review): values whose #to_s is scientific notation (e.g. 1.0e+20)
  # were garbage before and are still not meaningfully supported.
  def to_s_decimal_places(decimal_places)
    whole, fraction = to_s.split(".", 2)
    fraction = (fraction || "").ljust(decimal_places, "0")
    "#{whole}.#{fraction[0, decimal_places]}"
  end
end
46
+
47
# Backport of Symbol#to_proc for Rubies that predate it in core (added in
# Ruby 1.8.7). Enables the `collection.map(&:method_name)` shorthand.
class Symbol
  # Only define the shim when the runtime lacks a native (faster) version,
  # so we never clobber the built-in implementation.
  unless public_method_defined? :to_proc
    # Builds a Proc that sends this symbol to its first argument, forwarding
    # any remaining arguments, e.g. :+.to_proc.call(1, 2) == 3.
    def to_proc
      Proc.new { |*args| args.shift.__send__(self, *args) }
    end
  end
end
54
+
55
# Extensions to the String class.
# We're just including the Stemmable module (provided by the `stemmer` gem
# required at the top of this file) into String. This adds the String#stem
# method, which presumably reduces a word to its stem — see the stemmer gem
# docs for the exact algorithm.
class String
  include Stemmable
end
60
+
61
module Math

  # Population variance of a collection of numbers, computed in a single
  # pass with Welford's online algorithm for numerical stability.
  # (To get the variance of a *sample*, the final division would use
  # count - 1 instead of count.)
  def variance(population)
    count = 0
    running_mean = 0.0
    aggregate = 0.0
    population.each do |value|
      count += 1
      delta = value - running_mean
      running_mean += delta / count
      aggregate += delta * (value - running_mean)
    end
    aggregate / count
  end

  # Standard deviation of a population.
  # accepts: an array, the population
  # returns: the standard deviation (square root of the population variance)
  def stddev(population)
    sqrt(variance(population))
  end

  # Arithmetic mean of the collection, always as a Float.
  def avg(pop)
    total = pop.inject(0) { |running_total, value| running_total + value }
    total.to_f / pop.count.to_f
  end

  module_function :variance, :avg, :stddev

end
@@ -0,0 +1,84 @@
1
+ require 'uri'
2
+
3
module Basset

  # A class for representing a document as a vector of features. It takes the
  # text of the document and an optional classification. The vector-of-features
  # representation is a basic bag-of-words approach.
  class Document
    attr_reader :text, :classification

    # Initialize with the document text. Set an explicit classification to use
    # the document as training data.
    def initialize(text, classification = nil)
      @text, @classification = text, classification
      @tokens = stemmed_words
    end

    # Returns an array of feature (token) vectors — instances of Feature.
    # Memoized so the terms are only counted once.
    def vector_of_features
      @feature_vector ||= vector_of_features_from_terms_hash(terms_hash_from_words_array(@tokens))
    end

    # Alias for #vector_of_features.
    def feature_vectors
      vector_of_features
    end

    private

    # Returns a hash mapping each word to the number of times it appears in
    # the passed-in words array.
    def terms_hash_from_words_array(words)
      terms = Hash.new(0)
      words.each do |term|
        terms[term] += 1
      end
      terms
    end

    # Turns a {term => frequency} hash into an array of Feature objects.
    def vector_of_features_from_terms_hash(terms)
      terms.collect do |term, frequency|
        Feature.new(term, frequency)
      end
    end

    # Downcased, stemmed tokens (String#stem comes from the stemmer gem).
    def stemmed_words
      words.map { |w| w.stem.downcase }
    end

    # Whitespace-split tokens of the cleaned text.
    def words
      clean_text.split(" ")
    end

    # Remove punctuation, numbers and symbols: apostrophes, '@' and '_' are
    # stripped outright, every other non-word character becomes a space, and
    # digits are deleted.
    def clean_text
      text.tr("'@_", '').gsub(/\W/, ' ').gsub(/[0-9]/, '')
    end

  end

  # Subclass of Document intended to be used to classify URIs. The raw URI is
  # percent-decoded and split on URI delimiter characters instead of being
  # cleaned and stemmed like free text.
  class UriDocument < Document

    def initialize(uri, classification = nil)
      @text, @classification = uri, classification
      @tokens = uri_tokens
    end

    def vector_of_features
      @feature_vector ||= vector_of_features_from_terms_hash(terms_hash_from_words_array(@tokens))
    end

    # Percent-decodes the URI and splits it into tokens, keeping the URI
    # delimiter characters themselves as separate tokens.
    def uri_tokens
      # URI.decode was deprecated and removed in Ruby 3.0;
      # URI::DEFAULT_PARSER.unescape performs the same plain %XX decoding.
      URI::DEFAULT_PARSER.unescape(@text).gsub(/(\&|\?|\\\\|\\|\/\/|\/|\=|\[|\]|\.\.|\.)/) { |char| " " + char + " " }.split
    end

  end

end
@@ -0,0 +1,11 @@
1
module Basset
  # This class is an example of how to do custom document representations. In
  # this example the text cleaning is changed and the words are not stemmed.
  # It would also be easy to put in additional hard-coded features.
  # The important thing to note is that a custom document class only needs one
  # method: vector_of_features.
  class DocumentOverrideExample < Document
    # Bag-of-words features over raw (unstemmed) tokens; memoized.
    def vector_of_features
      return @vector_of_features if @vector_of_features

      tokens = text.gsub(/\W/, ' ').split(' ')
      @vector_of_features = vector_of_features_from_terms_hash(terms_hash_from_words_array(tokens))
    end
  end
end
@@ -0,0 +1,26 @@
1
module Basset

  # A class to hold a feature, which consists of a name and a value. In the
  # basic sense of document classification the name would be the word and the
  # value would be the number of times that word appeared in the document.
  class Feature
    # Comparable supplies <, <=, >, >=, between? on top of <=> below.
    include Comparable

    attr_accessor :name, :value

    # name  - the feature's identifier (typically a token String)
    # value - the feature's magnitude (typically a term count), default 0
    def initialize(name, value = 0)
      @name = name
      @value = value
    end

    # Orders features by name, breaking ties on value.
    def <=>(other)
      ret = self.name <=> other.name
      ret = self.value <=> other.value if ret.zero?
      ret
    end

    # Two features are equal when both name and value match.
    def ==(other)
      ret = self.name == other.name
      ret = self.value == other.value if ret
      ret
    end

    # Keep Hash-key equality consistent with #== (Ruby requires eql?/hash to
    # agree for objects used as hash keys).
    alias_method :eql?, :==

    def hash
      [name, value].hash
    end
  end
end
@@ -0,0 +1,52 @@
1
+ require File.join(File.dirname(__FILE__), "yaml_serialization")
2
+
3
module Basset

  # Extracts features from a document. On initialization it expects the set of
  # features that are to be extracted from documents. The extracted features
  # are numbered in ascending order, which makes it easy to output feature
  # sets for libraries like svmlight.
  class FeatureExtractor
    include YamlSerialization

    # Takes an array of feature names — the only features that will be
    # extracted from documents; all others are ignored. Each name is assigned
    # a stable 1-based integer identifier in the order given.
    def initialize(feature_names)
      @feature_names = {}
      feature_names.each_with_index do |feature_name, index|
        @feature_names[feature_name] = index + 1
      end
    end

    # Number of features this extractor knows about.
    def number_of_features
      @feature_names.size
    end

    # Returns the document's interesting features with their names replaced by
    # the integer identifier, sorted in ascending identifier order. This is a
    # generic representation that works well with other machine learning
    # packages like svm_light.
    def extract_numbered(document)
      numbered = extract(document).map do |feature|
        Feature.new(@feature_names[feature.name], feature.value)
      end
      numbered.sort
    end

    # Just the features from the document that this extractor is interested in.
    def extract(document)
      document.vector_of_features.select { |feature| @feature_names[feature.name] }
    end

    # def extract_with_duplicate_removal(document)
    #   features = extract(document)
    #   # # now remove the unigrams that dupe bigram features
    #   # # first grab an array of the bigram ones
    #   # bigram_features = []
    #   # sorted_features.each {|feature| bigram_features << feature if feature.name =~ /.*_AND_.*/}
    #   # # now remove all the ones that have a match in the bigram features
    #   # sorted_features.each_with_index do |feature, index|
    #   #   sorted_features.delete_at(index) if (feature.name !~ /_AND_/ and bigram_features.detect {|bf| bf.name =~ /^#{feature.name}_|_#{feature.name}$/})
    #   # end
    # end

  end
end
@@ -0,0 +1,126 @@
1
module Basset

  # The feature selector. All documents in the training set should be added to
  # the selector; afterwards, features may be selected by chi-square value.
  # When in doubt, call select_features with the defaults — it returns every
  # feature that has at least some statistical significance and occurs in more
  # than one document.
  class FeatureSelector
    attr_reader :docs

    def initialize
      @docs = 0
      @docs_in_class = Hash.new(0)
      # Per-key default so each feature gets its own counter object.
      @features = Hash.new { |hash, feature_name| hash[feature_name] = FeatureValues.new }
    end

    # Adds a document to the feature selector. The document must respond to
    # #vector_of_features (returning unique features) and #classification.
    def add_document(document)
      @docs += 1
      @docs_in_class[document.classification] += 1

      document.vector_of_features.each do |feature|
        @features[feature.name].add_document_with_class(document.classification)
      end
    end

    # All feature names seen so far, regardless of chi-square or frequency.
    def all_feature_names
      @features.keys
    end

    def number_of_features
      @features.size
    end

    # The +count+ best-scoring feature names for a classification.
    def best_features(count = 10, classification = nil)
      select_features(1.0, classification).first(count)
    end

    # One Feature per known name, valued with its chi-square score against
    # +classification+.
    def features_with_chi(classification)
      @features.keys.collect do |feature_name|
        Feature.new(feature_name, chi_squared(feature_name, classification))
      end
    end

    # Names of features scoring at least +chi_value+ that occur in more than
    # one document, ordered from most to least significant.
    def select_features(chi_value = 1.0, classification = nil)
      classification ||= @docs_in_class.keys.first

      kept = features_with_chi(classification).reject do |feature|
        docs_with_feature(feature.name) <= 1 || feature.value < chi_value
      end

      kept.sort_by { |feature| feature.value }.reverse.map { |feature| feature.name }
    end

    private

    # N11: documents containing the feature that belong to the class.
    def docs_with_feature_and_class(feature_name, classification)
      @features[feature_name].docs_with_class(classification)
    end

    # N10: documents containing the feature that do NOT belong to the class.
    def docs_with_feature_and_not_class(feature_name, classification)
      @features[feature_name].docs_with_feature - @features[feature_name].docs_with_class(classification)
    end

    # N01: documents in the class that lack the feature.
    def docs_with_class_and_not_feature(classification, feature_name)
      @docs_in_class[classification] - @features[feature_name].docs_with_class(classification)
    end

    # N00: documents with neither the feature nor the class.
    def docs_without_feature_or_class(feature_name, classification)
      @docs - @docs_in_class[classification] - docs_with_feature_and_not_class(feature_name, classification)
    end

    def docs_with_feature(feature_name)
      @features[feature_name].docs_with_feature
    end

    def docs_with_class(classification)
      @docs_in_class[classification]
    end

    # Returns the chi-square value for this feature with the passed
    # classification. This is formula 13.14 on page 215 of An Introduction to
    # Information Retrieval by Christopher D. Manning, Prabhakar Raghavan and
    # Hinrich Schütze.
    def chi_squared(feature_name, classification)
      chi_squared_algo(
        docs_with_feature_and_class(feature_name, classification),
        docs_with_class_and_not_feature(classification, feature_name),
        docs_with_feature_and_not_class(feature_name, classification),
        docs_without_feature_or_class(feature_name, classification)
      )
    end

    # o11/o10/o01/o00 are the four observed feature/class contingency counts.
    def chi_squared_algo(o11, o10, o01, o00)
      numerator = (o11 + o10 + o01 + o00) * ((o11 * o00 - o10 * o01)**2)
      denominator = (o11 + o01) * (o11 + o10) * (o10 + o00) * (o01 + o00)
      # Guard the degenerate case so we never divide by zero (Infinity/NaN).
      return 0.0 if denominator.zero?
      numerator.to_f / denominator.to_f
    end

    # Holds the per-feature counts that feature selection needs.
    class FeatureValues
      attr_accessor :docs_with_feature

      def initialize
        @classes = Hash.new(0)
        @docs_with_feature = 0
      end

      # Record one more document of +classification+ containing the feature.
      def add_document_with_class(classification)
        @classes[classification] += 1
        @docs_with_feature += 1
      end

      # How many documents of +classification+ contained the feature.
      def docs_with_class(classification)
        @classes[classification]
      end

    end

  end
end
@@ -0,0 +1,151 @@
1
+ require File.join(File.dirname(__FILE__), "yaml_serialization")
2
+
3
module Basset

  # A class for running Naive Bayes classification.
  # Documents are added to the classifier; once added, it can be used to
  # classify new documents.
  class NaiveBayes
    include YamlSerialization

    attr_reader :total_docs, :total_docs_in_class, :feature_counts

    def initialize
      @total_docs = 0
      @total_docs_in_class = Hash.new(0)
      @feature_counts = {}
      @occurrences_of_all_features_in_class = {}
    end

    # Takes a classification (which can be a string) and a vector of features.
    def add_document(classification, feature_vector)
      # Counts are changing, so any cached per-class totals are now stale.
      reset_cached_probabilities

      @total_docs_in_class[classification] += 1
      @total_docs += 1

      feature_vector.each do |feature|
        @feature_counts[feature.name] ||= FeatureCount.new(feature.name)
        @feature_counts[feature.name].add_count_for_class(feature.value, classification)
      end
    end

    def classes
      @total_docs_in_class.keys
    end

    # Returns a [score, classification] pair for the most likely class given a
    # vector of features. Pass :normalize_classes => false to ignore each
    # class's prior probability.
    def classify(feature_vectors, opts = {})
      # Merge over the defaults so a caller supplying a partial options hash
      # doesn't silently lose the other defaults (the old literal-default
      # signature replaced the whole hash).
      opts = { :normalize_classes => true }.merge(opts)
      class_probabilities = []

      classes.each do |classification|
        class_probability = 0
        class_probability += Math.log10(probability_of_class(classification)) if opts[:normalize_classes]
        class_probability += probability_of_vectors_for_class(feature_vectors, classification)
        class_probabilities << [class_probability, classification]
      end

      # Pick a random item first: this covers the case where all class
      # probabilities are equal and we need to randomly select a class.
      # (Array#pick_random is the monkey-patch from this gem's extensions.)
      max = class_probabilities.pick_random
      class_probabilities.each do |cp|
        max = cp if cp.first > max.first
      end
      max
    end

    # Gives a score for the probability of _feature_vectors_ being in class
    # _classification_.
    #
    # The score can be normalized to the number of feature vectors by passing
    # :normalize => true in the options hash.
    #
    # The score is not normalized for the relative probabilities of each class.
    def probability_of_vectors_for_class(feature_vectors, classification, opts = {})
      # Same defaults-merge fix as #classify.
      opts = { :normalize => false }.merge(opts)
      probability = 0
      feature_vectors.each do |feature_vector|
        probability += probability_of_vector_for_class(feature_vector, classification)
      end
      if opts[:normalize]
        probability / feature_vectors.count.to_f
      else
        probability
      end
    end

    # Returns the log10 probability of a feature given the class.
    def probability_of_vector_for_class(feature_vector, classification)
      # The `rescue 0` handles features never seen during training (because of
      # redundant-unigram removal a feature may never be initialized), and the
      # + 0.1 is additive smoothing so log10 never sees zero.
      decimal_probability = (((@feature_counts[feature_vector.name].count_for_class(classification) rescue 0) + 0.1) / occurrences_of_all_features_in_class(classification).to_f) * feature_vector.value
      Math.log10(decimal_probability)
    end

    # The sum total of times all features occur for a given class.
    # Cached until the next add_document call.
    def occurrences_of_all_features_in_class(classification)
      # Return the cached value, if there is one.
      return @occurrences_of_all_features_in_class[classification] if @occurrences_of_all_features_in_class[classification]

      @feature_counts.each_value do |feature_count|
        @occurrences_of_all_features_in_class[classification] ||= 0
        @occurrences_of_all_features_in_class[classification] += feature_count.count_for_class(classification)
      end
      @occurrences_of_all_features_in_class[classification]
    end

    def ==(other)
      other.is_a?(self.class) && other.total_docs == total_docs &&
        other.total_docs_in_class == total_docs_in_class && other.feature_counts == feature_counts
    end

    private

    # Probabilities are cached when classification is run; this resets the
    # cached totals.
    def reset_cached_probabilities
      @occurrences_of_all_features_in_class.clear
    end

    # Returns the prior probability of a given class.
    def probability_of_class(classification)
      @total_docs_in_class[classification] / @total_docs.to_f
    end

    # A class to store per-class occurrence counts for a single feature.
    class FeatureCount
      attr_reader :classes, :name

      def initialize(feature_name = nil, classification = nil, count = 0)
        @name, @classes = feature_name, {}
        add_count_for_class(count, classification) if classification
      end

      def add_count_for_class(count, classification)
        @classes[classification] ||= 0
        @classes[classification] += count
      end

      # Occurrences of this feature in documents of +classification+
      # (0 when the class was never seen).
      def count_for_class(classification)
        @classes[classification] || 0
      end

      # Total occurrences of this feature across all classes.
      def count
        @classes.values.sum
      end

      def ==(other)
        other.kind_of?(FeatureCount) && other.classes == @classes && other.name == @name
      end

      def inspect(opts = { :verbose => false })
        # super() with explicit empty parens: Object#inspect takes no
        # arguments, so a bare `super` (which forwards opts) would raise.
        return super() if opts[:verbose]
        "#<FeatureCount for ``" + @name.to_s + "''" + " --> " + @classes.inspect + " > "
      end

    end

  end
end