rjspotter-basset 1.0.5

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,93 @@
1
+ # This file contains extensions to built in Ruby classes.
2
+
3
+ require 'rubygems'
4
+ require 'stemmer'
5
+
6
# Extensions to the array class.
class Array
  # Returns a new array that contains everything except the first element
  # of this one (just like "rest"/cdr in lisp). Returns self when empty.
  def rest
    return self if empty?
    slice(1, size)
  end

  # Returns the second item in the array (nil if there is none).
  def second
    self[1]
  end

  # Returns a random item from the array (nil when empty).
  # Delegates to the built-in Array#sample instead of indexing with rand.
  def pick_random
    sample
  end

  # Returns a new array with the elements in random order.
  # Uses the built-in Array#shuffle instead of sort_by { rand }.
  def randomize
    shuffle
  end

  # Sums the elements, starting from 0.
  # NOTE(review): this shadows the built-in Array#sum on Ruby >= 2.4;
  # kept for backward compatibility with older Rubies this gem targeted.
  def sum
    inject(0) { |total, value| total + value }
  end

  # Randomizes the array in place and returns self.
  def randomize!
    replace(randomize)
  end
end
38
+
39
class Float
  # Returns the float as a string truncated (not rounded) to exactly
  # +decimal_places+ digits, e.g. 3.14159.to_s_decimal_places(2) => "3.14".
  #
  # Note: like the original implementation, this raises NoMethodError (nil[0])
  # when the float's string form has fewer decimal digits than requested.
  def to_s_decimal_places(decimal_places)
    # Bug fix: the old code built the pattern from the string "[0-9]*\.",
    # but in a double-quoted Ruby string "\." is just ".", so the regex dot
    # matched ANY character. Use a real regex with an escaped dot.
    pattern = /[0-9]*\.[0-9]{#{decimal_places}}/
    to_s.match(pattern)[0]
  end
end
46
+
47
class Symbol
  # Backport of Symbol#to_proc for very old Rubies (pre 1.8.7) that
  # lack it. On modern Rubies the guard is false and the built-in
  # definition is left untouched.
  unless public_method_defined? :to_proc
    def to_proc
      proc { |*args| args.shift.__send__(self, *args) }
    end
  end
end
54
+
55
# Extensions to the string class.
# We're just including the stemmable module into string. This adds the .stem method.
# Stemmable comes from the `stemmer` gem required at the top of this file;
# presumably a Porter stemmer (e.g. "running" -> "run") — confirm against the gem.
class String
  include Stemmable
end
60
+
61
module Math

  # Population variance, computed in a single pass with Welford's
  # online algorithm (numerically stable running mean).
  # accepts: an enumerable of numbers
  # returns: the variance as a Float
  def variance(population)
    count = 0
    running_mean = 0.0
    aggregate = 0.0
    population.each do |value|
      count += 1
      delta = value - running_mean
      running_mean += delta / count
      aggregate += delta * (value - running_mean)
    end
    # if you want to calculate std deviation
    # of a sample change this to "aggregate / (count - 1)"
    aggregate / count
  end

  # calculate the standard deviation of a population
  # accepts: an array, the population
  # returns: the standard deviation
  def stddev(population)
    sqrt(variance(population))
  end

  # Arithmetic mean of the population, always as a Float.
  def avg(pop)
    pop.inject(0) { |total, value| total + value }.to_f / pop.count
  end

  module_function :variance, :avg, :stddev

end
@@ -0,0 +1,84 @@
1
+ require 'uri'
2
+
3
+ module Basset
4
+
5
# A class for representing a document as a vector of features. It takes the text
# of the document and the classification. The vector of features representation is
# just a basic bag of words approach.
class Document
  attr_reader :text, :classification

  # initialize the object with document text. Set an explicit classification
  # to use the document as training data.
  def initialize(text, classification = nil)
    @text = text
    @classification = classification
    @tokens = stemmed_words
  end

  # returns an array of feature (token) vectors, which are instances of Feature
  def vector_of_features
    @feature_vector ||= vector_of_features_from_terms_hash(terms_hash_from_words_array(@tokens))
  end

  # Alias for #vector_of_features
  def feature_vectors
    vector_of_features
  end

  private

  # returns a hash with each word as a key and the value is the number of times
  # the word appears in the passed in words array
  def terms_hash_from_words_array(words)
    words.inject(Hash.new(0)) do |counts, term|
      counts[term] += 1
      counts
    end
  end

  # Turns a term => frequency hash into an array of Feature objects.
  def vector_of_features_from_terms_hash(terms)
    terms.map { |term, frequency| Feature.new(term, frequency) }
  end

  # Stemmed, lower-cased tokens of the cleaned document text.
  # (String#stem comes from the stemmer gem.)
  def stemmed_words
    words.map { |word| word.stem.downcase }
  end

  # Whitespace-separated tokens of the cleaned text.
  def words
    clean_text.split(" ")
  end

  # Remove punctuation, numbers and symbols
  def clean_text
    text.tr("'@_", '').gsub(/\W/, ' ').gsub(/[0-9]/, '')
  end

end
64
+
65
#
# Subclass of Document intended to be used to classify URIs
class UriDocument < Document

  # uri: the URI string to represent; classification as in Document.
  # Tokens are URI components rather than stemmed words.
  def initialize(uri, classification = nil)
    @text = uri
    @classification = classification
    @tokens = uri_tokens
  end

  # Bag-of-words feature vector over the URI tokens (no stemming).
  def vector_of_features
    @feature_vector ||= vector_of_features_from_terms_hash(terms_hash_from_words_array(@tokens))
  end

  # Percent-decodes the URI and splits it on common URI delimiters
  # (&, ?, \, /, =, [, ], ., ..), keeping each delimiter as its own token.
  def uri_tokens
    # Bug fix: URI.decode was deprecated and removed in Ruby 3.0.
    # URI::DEFAULT_PARSER.unescape performs the same plain percent-decoding
    # (unlike CGI.unescape, it does not convert "+" to a space).
    decoded = URI::DEFAULT_PARSER.unescape(@text)
    decoded.gsub(/(\&|\?|\\\\|\\|\/\/|\/|\=|\[|\]|\.\.|\.)/) { |delim| " " + delim + " " }.split
  end

end
83
+
84
+ end
@@ -0,0 +1,11 @@
1
module Basset
  # This class is an example for how to do custom document representations. In this
  # example, I change the way text is cleaned and don't stem the words. It would also
  # be easy to put in additional hard coded features.
  # The important thing to note is that the new document class only needs one function: vector_of_features
  class DocumentOverrideExample < Document
    # Bag-of-words features over the raw text split on non-word characters
    # and whitespace — no stemming, no digit stripping. Memoized.
    def vector_of_features
      return @vector_of_features if @vector_of_features

      tokens = text.gsub(/\W/, ' ').split(' ')
      @vector_of_features = vector_of_features_from_terms_hash(terms_hash_from_words_array(tokens))
    end
  end
end
@@ -0,0 +1,26 @@
1
module Basset

  # A class to hold a feature which consists of a name and a value. In the basic sense
  # of document classification the name would be the word and the value would be the
  # number of times that word appeared in the document.
  class Feature
    # The class already defined <=>; including Comparable makes the ordering
    # usable via <, <=, >, >= and between? as well (backward compatible).
    include Comparable

    attr_accessor :name, :value

    # name:  the feature identifier (e.g. a word)
    # value: its weight/count (defaults to 0)
    def initialize(name, value = 0)
      @name = name
      @value = value
    end

    # Orders by name first, then by value.
    def <=>(other)
      ret = name <=> other.name
      ret = value <=> other.value if ret.zero?
      ret
    end

    # Features are equal when both name and value match.
    def ==(other)
      name == other.name && value == other.value
    end
  end
end
@@ -0,0 +1,52 @@
1
+ require File.join(File.dirname(__FILE__), "yaml_serialization")
2
+
3
module Basset

  # Extracts features from a document. On initialization it expects the set of features that
  # are to be extracted from documents. The extracted features will just be numbered in
  # ascending order. This makes it easy to output feature sets for libraries like svmlight.
  class FeatureExtractor
    include YamlSerialization

    # feature_names: an array of feature names. These are the features that
    # will be extracted from documents (all others are ignored); each is
    # assigned a 1-based integer identifier in the order given.
    def initialize(feature_names)
      @feature_names = {}
      feature_names.each_with_index do |name, index|
        @feature_names[name] = index + 1
      end
    end

    # Number of features this extractor knows about.
    def number_of_features
      @feature_names.size
    end

    # returns an array of features, but with their names replaced with an integer identifier,
    # sorted in ascending identifier order. This is a generic representation that works
    # well with other machine learning packages like svm_light.
    def extract_numbered(document)
      extract(document).map do |feature|
        Feature.new(@feature_names[feature.name], feature.value)
      end.sort
    end

    # just returns the features from the document that the extractor is interested in
    def extract(document)
      document.vector_of_features.select { |feature| @feature_names[feature.name] }
    end

    # def extract_with_duplicate_removal(document)
    #   features = extract(document)
    #   # kept from original (commented out): remove unigram features that are
    #   # duplicated inside bigram (FOO_AND_BAR) features before returning.
    # end

  end
end
@@ -0,0 +1,126 @@
1
module Basset

  # This class is the feature selector. All documents in the training set should be added
  # to the selector. Once they are in, a number of features may be selected based on the
  # chi square value. When in doubt just call feature_with_chi_value_greater_than with an
  # empty hash. It will return all features that have at least some statistical significance
  # and occur in more than one document.
  class FeatureSelector
    # docs: total number of documents added so far.
    attr_reader :docs

    def initialize
      @docs = 0
      # classification => number of documents with that classification
      @docs_in_class = Hash.new(0)
      # feature name => FeatureValues (per-class document counts); the block
      # form gives each new key its OWN FeatureValues instance.
      @features = Hash.new { |h, k| h[k] = FeatureValues.new }
    end

    # Adds a document to the feature selector. The document should respond_to a
    # method vector_of_features which returns a vector of unique features.
    # (Counts are per-document: each feature counts once per document it appears in.)
    def add_document(document)
      @docs += 1
      @docs_in_class[document.classification] += 1

      document.vector_of_features.each do |feature|
        @features[feature.name].add_document_with_class(document.classification)
      end
    end

    # returns all features, regardless of chi_square or frequency
    def all_feature_names
      @features.keys
    end

    # Number of distinct feature names seen so far.
    def number_of_features
      @features.size
    end

    # returns an array of the best features for a given classification
    # (top +count+ names by chi-square, minimum chi value 1.0).
    def best_features(count = 10, classification = nil)
      select_features(1.0, classification).first(count)
    end

    # Returns one Feature per known feature name whose value is the
    # chi-square score of that feature for +classification+.
    def features_with_chi(classification)
      @features.keys.map do |feature_name|
        Feature.new(feature_name, chi_squared(feature_name, classification))
      end
    end

    # returns an array of feature NAMES that have a minimum or better chi_square
    # value and occur in more than one document, sorted by descending chi-square.
    def select_features(chi_value = 1.0, classification = nil)
      # Default to the first classification seen if none is given.
      classification ||= @docs_in_class.keys.first

      selected_features = features_with_chi(classification).select do |feature|
        (docs_with_feature(feature.name) > 1) && (feature.value >= chi_value)
      end

      selected_features.sort_by(&:value).reverse.collect(&:name)
    end

    private

    # Contingency-table helpers for the chi-square computation. NOTE(review):
    # parameter order is NOT uniform — docs_with_class_and_not_feature takes
    # (classification, feature_name) while the others take
    # (feature_name, classification). Take care when calling.

    # N(feature present, class matches)
    def docs_with_feature_and_class(feature_name, classification)
      @features[feature_name].docs_with_class(classification)
    end

    # N(feature present, class differs)
    def docs_with_feature_and_not_class(feature_name, classification)
      @features[feature_name].docs_with_feature - @features[feature_name].docs_with_class(classification)
    end

    # N(feature absent, class matches) — note the reversed parameter order.
    def docs_with_class_and_not_feature(classification, feature_name)
      @docs_in_class[classification] - @features[feature_name].docs_with_class(classification)
    end

    # N(feature absent, class differs)
    def docs_without_feature_or_class(feature_name, classification)
      @docs - @docs_in_class[classification] - docs_with_feature_and_not_class(feature_name, classification)
    end

    # Number of documents containing the feature, any class.
    def docs_with_feature(feature_name)
      @features[feature_name].docs_with_feature
    end

    # Number of documents with the given classification.
    def docs_with_class(classification)
      @docs_in_class[classification]
    end

    # Returns the chi_squared value for this feature with the passed classification
    # This is formula 13.14 on page 215 of An Introduction to Information Retrieval by
    # Christopher D. Manning, Prabhakar Raghavan and Hinrich Schütze.
    def chi_squared(feature_name, classification)
      chi_squared_algo(
        docs_with_feature_and_class(feature_name, classification),
        docs_with_class_and_not_feature(classification, feature_name),
        docs_with_feature_and_not_class(feature_name, classification),
        docs_without_feature_or_class(feature_name, classification)
      )
    end

    # o11/o10/o01/o00 are the observed counts of the 2x2 contingency table
    # (feature present/absent x class matches/differs).
    def chi_squared_algo(o11, o10, o01, o00)
      denominator = ((o11 + o01) * (o11 + o10) * (o10 + o00) * (o01 + o00))
      numerator = ((o11 + o10 + o01 + o00) * ((o11 * o00 - o10 * o01)**2))
      # Checking zero to avoid producing Infinity
      denominator.zero? ? 0.0 : numerator.to_f / denominator.to_f
    end

    # A class to hold the values associated with a feature. These values are
    # important for feature selection.
    class FeatureValues
      # Total number of documents this feature appeared in.
      attr_accessor :docs_with_feature

      def initialize()
        # classification => number of documents of that class containing the feature
        @classes = Hash.new(0)
        @docs_with_feature = 0
      end

      # Record one more document of +classification+ containing this feature.
      def add_document_with_class(classification)
        @classes[classification] += 1
        @docs_with_feature += 1
      end

      # Number of documents of +classification+ containing this feature.
      def docs_with_class(classification)
        @classes[classification]
      end

    end

  end
end
@@ -0,0 +1,151 @@
1
+ require File.join(File.dirname(__FILE__), "yaml_serialization")
2
+
3
module Basset

  # A class for running Naive Bayes classification.
  # Documents are added to the classifier. Once they are added
  # it can be used to classify new documents.
  class NaiveBayes
    include YamlSerialization

    attr_reader :total_docs, :total_docs_in_class, :feature_counts

    def initialize
      @total_docs = 0
      # classification => number of training documents of that class
      @total_docs_in_class = Hash.new(0)
      # feature name => FeatureCount (per-class occurrence totals)
      @feature_counts = {}
      # classification => cached sum of all feature counts for that class;
      # invalidated whenever a new document is added.
      @occurrences_of_all_features_in_class = {}
    end

    # takes a classification which can be a string and
    # a vector of features.
    def add_document(classification, feature_vector)
      reset_cached_probabilities

      @total_docs_in_class[classification] += 1
      @total_docs += 1

      feature_vector.each do |feature|
        @feature_counts[feature.name] ||= FeatureCount.new(feature.name)
        @feature_counts[feature.name].add_count_for_class(feature.value, classification)
      end
    end

    # All classifications seen in training, in insertion order.
    def classes
      @total_docs_in_class.keys
    end

    # returns the most likely class given a vector of features, as a
    # [log_probability, classification] pair.
    # Pass :normalize_classes => false to skip weighting by class priors.
    def classify(feature_vectors, opts={:normalize_classes=>true})
      class_probabilities = []

      classes.each do |classification|
        class_probability = 0
        class_probability += Math.log10(probability_of_class(classification)) if opts[:normalize_classes]
        class_probability += probability_of_vectors_for_class(feature_vectors, classification)
        class_probabilities << [class_probability, classification]
      end

      # this next bit picks a random item first
      # this covers the case that all the class probabilities are equal and we need to randomly select a class
      # (pick_random is this gem's Array extension, defined in its core-class extensions file)
      max = class_probabilities.pick_random
      class_probabilities.each do |cp|
        max = cp if cp.first > max.first
      end
      max
    end

    #
    # Gives a score for probability of _feature_vector_ being in
    # class _classification_.
    #
    # This score can be normalized to the number of feature vectors by passing
    # :normalize => true for the third argument.
    #
    # Score is not normalized for the relatives probabilities of each class.
    def probability_of_vectors_for_class(feature_vectors, classification, opts={:normalize=>false})
      probability = 0
      feature_vectors.each do |feature_vector|
        probability += probability_of_vector_for_class(feature_vector, classification)
      end
      if opts[:normalize]
        probability / feature_vectors.count.to_f
      else
        probability
      end
    end

    # returns the probability (log10) of a feature given the class.
    # The +0.1 is additive smoothing so unseen features never yield log10(0).
    def probability_of_vector_for_class(feature_vector, classification)
      # the reason the rescue 0 is in there is tricky
      # because of the removal of redundant unigrams, it's possible that one of the features is never used/initialized
      # (i.e. @feature_counts has no entry for this name, so count_for_class would be called on nil)
      decimal_probability = (((@feature_counts[feature_vector.name].count_for_class(classification) rescue 0) + 0.1)/ occurrences_of_all_features_in_class(classification).to_f) * feature_vector.value
      Math.log10(decimal_probability)
    end

    # The sum total of times all features occurs for a given class.
    # Cached per classification until the next add_document call.
    def occurrences_of_all_features_in_class(classification)
      # return the cached value, if there is one
      return @occurrences_of_all_features_in_class[classification] if @occurrences_of_all_features_in_class[classification]

      @feature_counts.each_value do |feature_count|
        @occurrences_of_all_features_in_class[classification] ||= 0
        @occurrences_of_all_features_in_class[classification] += feature_count.count_for_class(classification)
      end
      @occurrences_of_all_features_in_class[classification]
    end

    # Classifiers are equal when trained on identical data (same totals
    # and identical per-feature counts).
    def ==(other)
      other.is_a?(self.class) && other.total_docs == total_docs &&
        other.total_docs_in_class == total_docs_in_class && other.feature_counts == feature_counts
    end

    private

    # probabilities are cached when the classification is run. This method resets
    # the cached probabities.
    def reset_cached_probabilities
      @occurrences_of_all_features_in_class.clear
    end

    # returns the probability of a given class (its prior: fraction of
    # training documents with that classification)
    def probability_of_class(classification)
      @total_docs_in_class[classification] / @total_docs.to_f
    end

    # A class to store feature counts
    class FeatureCount
      attr_reader :classes, :name

      # Optionally seeds an initial count for one classification.
      def initialize(feature_name=nil, classification=nil, count=0)
        @name, @classes = feature_name, {}
        add_count_for_class(count, classification) if classification
      end

      # Add +count+ occurrences of this feature under +classification+.
      def add_count_for_class(count, classification)
        @classes[classification] ||= 0
        @classes[classification] += count
      end

      # Occurrences of this feature in the given class (0 if never seen).
      def count_for_class(classification)
        #@classes[classification] || 1 um, what?
        @classes[classification] || 0
      end

      # Total occurrences across all classes.
      # (Relies on Array#sum — built-in on Ruby >= 2.4, and also defined by
      # this gem's Array extensions.)
      def count
        @classes.values.sum
      end

      def ==(other)
        other.kind_of?(FeatureCount) && other.classes == @classes && other.name == @name
      end

      # Compact one-line representation; pass :verbose => true for the default
      # Object#inspect output. NOTE(review): non-standard — Ruby's own inspect
      # takes no arguments.
      def inspect(opts={:verbose=>false})
        return super if opts[:verbose]
        "#<FeatureCount for ``" + @name.to_s + "''" + " --> " + @classes.inspect + " > "
      end

    end

  end
end