rjspotter-basset 1.0.5
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +7 -0
- data/License.txt +20 -0
- data/Manifest.txt +21 -0
- data/README.rdoc +58 -0
- data/Rakefile +63 -0
- data/VERSION.yml +4 -0
- data/basset.gemspec +38 -0
- data/examples/example.rb +25 -0
- data/lib/basset.rb +9 -0
- data/lib/basset/classification_evaluator.rb +170 -0
- data/lib/basset/classifier.rb +188 -0
- data/lib/basset/core_extensions.rb +93 -0
- data/lib/basset/document.rb +84 -0
- data/lib/basset/document_override_example.rb +11 -0
- data/lib/basset/feature.rb +26 -0
- data/lib/basset/feature_extractor.rb +52 -0
- data/lib/basset/feature_selector.rb +126 -0
- data/lib/basset/naive_bayes.rb +151 -0
- data/lib/basset/svm.rb +180 -0
- data/lib/basset/yaml_serialization.rb +41 -0
- data/spec/spec.opts +1 -0
- data/spec/spec_helper.rb +12 -0
- data/spec/unit/classifier_spec.rb +166 -0
- data/spec/unit/core_extension_spec.rb +33 -0
- data/spec/unit/document_spec.rb +59 -0
- data/spec/unit/feature_extractor_spec.rb +33 -0
- data/spec/unit/feature_selector_spec.rb +108 -0
- data/spec/unit/feature_spec.rb +40 -0
- data/spec/unit/naive_bayes_spec.rb +119 -0
- data/spec/unit/svm_spec.rb +83 -0
- metadata +115 -0
@@ -0,0 +1,93 @@
|
|
1
|
+
# This file contains extensions to built in Ruby classes.
|
2
|
+
|
3
|
+
require 'rubygems'
|
4
|
+
require 'stemmer'
|
5
|
+
|
6
|
+
# Extensions to the core Array class.
class Array
  # Returns a new array containing everything except the first element
  # (like Lisp's rest/cdr). Always returns a new array: previously an empty
  # receiver was returned itself, which let callers accidentally mutate the
  # original through the result.
  def rest
    return [] if empty?
    slice(1, size)
  end

  # Returns the second item in the array (nil if absent).
  def second
    self[1]
  end

  # Returns a random item from the array (nil when empty).
  def pick_random
    self[rand(size)]
  end

  # Returns a new array with the elements in random order.
  def randomize
    shuffle
  end

  # Sums the elements, starting from 0. Kept for pre-1.8.7 compatibility;
  # note it overrides the built-in Array#sum on modern Rubies.
  def sum
    inject(0) { |total, val| total + val }
  end

  # Randomizes the array in place; returns self.
  def randomize!
    replace(randomize)
  end
end
|
38
|
+
|
39
|
+
class Float
  # Returns the float as a string truncated (not rounded) to at most
  # +decimal_places+ digits after the decimal point.
  #
  # The previous implementation built the pattern in a double-quoted string,
  # so "\." became an unescaped regex wildcard, and it demanded exactly
  # +decimal_places+ digits — 1.5.to_s_decimal_places(3) raised NoMethodError
  # on the nil match. The dot is now escaped and shorter fractional parts
  # are returned as-is.
  def to_s_decimal_places(decimal_places)
    to_s.match(/[0-9]*\.[0-9]{0,#{decimal_places}}/)[0]
  end
end
|
46
|
+
|
47
|
+
class Symbol
  # Backport of Symbol#to_proc for old Rubies (pre-1.8.7); on modern Rubies
  # the guard leaves the built-in untouched.
  unless public_method_defined? :to_proc
    def to_proc
      proc { |*call_args| call_args.shift.__send__(self, *call_args) }
    end
  end
end
|
54
|
+
|
55
|
+
# Extensions to the core String class: mixes in Stemmable (from the
# `stemmer` gem required at the top of this file), which adds the #stem
# method used by Basset::Document for bag-of-words tokenization.
class String
  include Stemmable
end
|
60
|
+
|
61
|
+
module Math

  # Population variance via Welford's single-pass online algorithm.
  # To compute the variance of a *sample* instead, divide the aggregate by
  # (n - 1) rather than n.
  def variance(population)
    count = 0
    running_mean = 0.0
    aggregate = 0.0
    population.each do |value|
      count += 1
      delta = value - running_mean
      running_mean += delta / count
      aggregate += delta * (value - running_mean)
    end
    aggregate / count
  end

  # Standard deviation of a population.
  # accepts: an array, the population
  # returns: the standard deviation
  def stddev(population)
    sqrt(variance(population))
  end

  # Arithmetic mean of a population, as a Float.
  def avg(pop)
    pop.inject(0) { |total, value| total + value }.to_f / pop.count.to_f
  end

  module_function :variance, :avg, :stddev
end
|
@@ -0,0 +1,84 @@
|
|
1
|
+
require 'uri'
|
2
|
+
|
3
|
+
module Basset

  # Represents a document as a vector of features using a simple
  # bag-of-words model: the text is cleaned, split into words, stemmed and
  # counted.
  class Document
    attr_reader :text, :classification

    # text::           the raw document text
    # classification:: optional label; set it when using the document as
    #                  training data
    def initialize(text, classification = nil)
      @text, @classification = text, classification
      @tokens = stemmed_words
    end

    # Returns the document's features as an array of Feature instances
    # (one per unique token, valued by its frequency). Memoized.
    def vector_of_features
      @feature_vector ||= vector_of_features_from_terms_hash(terms_hash_from_words_array(@tokens))
    end

    # Alias for #vector_of_features.
    def feature_vectors
      vector_of_features
    end

    private

    # Returns a hash of term => number of occurrences in +words+.
    def terms_hash_from_words_array(words)
      terms = Hash.new(0)
      words.each do |term|
        terms[term] += 1
      end
      terms
    end

    # Converts a term-frequency hash into an array of Feature objects.
    def vector_of_features_from_terms_hash(terms)
      terms.collect do |term, frequency|
        Feature.new(term, frequency)
      end
    end

    def stemmed_words
      words.map { |w| w.stem.downcase }
    end

    def words
      clean_text.split(" ")
    end

    # Remove punctuation, numbers and symbols.
    def clean_text
      text.tr("'@_", '').gsub(/\W/, ' ').gsub(/[0-9]/, '')
    end
  end

  # Subclass of Document intended for classifying URIs: tokens are the
  # decoded URI split around its structural delimiters instead of stemmed
  # words.
  class UriDocument < Document

    def initialize(uri, classification = nil)
      @text, @classification = uri, classification
      @tokens = uri_tokens
    end

    def vector_of_features
      @feature_vector ||= vector_of_features_from_terms_hash(terms_hash_from_words_array(@tokens))
    end

    # Percent-decodes the URI and splits it on delimiter characters, keeping
    # each delimiter as its own token.
    def uri_tokens
      # URI.decode was deprecated and removed in Ruby 3; the default parser's
      # unescape is the drop-in replacement (unlike CGI.unescape it does NOT
      # treat '+' as a space).
      URI::DEFAULT_PARSER.unescape(@text).gsub(/(\&|\?|\\\\|\\|\/\/|\/|\=|\[|\]|\.\.|\.)/) { |char| " " + char + " " }.split
    end
  end
end
|
@@ -0,0 +1,11 @@
|
|
1
|
+
module Basset
  # Example of a custom document representation: the text cleaning is
  # changed and words are not stemmed. Additional hard-coded features could
  # be injected the same way. The important point is that a custom document
  # class only needs to provide one method: #vector_of_features.
  class DocumentOverrideExample < Document
    def vector_of_features
      @vector_of_features ||= begin
        raw_words = text.gsub(/\W/, ' ').split(' ')
        vector_of_features_from_terms_hash(terms_hash_from_words_array(raw_words))
      end
    end
  end
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
module Basset

  # A single feature: a name (in basic document classification, a word)
  # paired with a numeric value (the number of times that word appeared in
  # the document).
  class Feature
    include Comparable

    attr_accessor :name, :value

    # name::  the feature identifier (typically a stemmed word)
    # value:: its count/weight; defaults to 0
    def initialize(name, value = 0)
      @name = name
      @value = value
    end

    # Order by name first, then by value. Comparable derives ==, <, >, etc.
    # from this, so equality stays consistent with ordering.
    def <=>(other)
      comparison = name <=> other.name
      comparison.zero? ? value <=> other.value : comparison
    end

    # Equal features are interchangeable as Hash keys / Set members.
    def eql?(other)
      other.is_a?(Feature) && self == other
    end

    def hash
      [name, value].hash
    end
  end
end
|
@@ -0,0 +1,52 @@
|
|
1
|
+
require File.join(File.dirname(__FILE__), "yaml_serialization")
|
2
|
+
|
3
|
+
module Basset

  # Extracts a known set of features from documents. The extractor is built
  # with the feature names it cares about; each name is assigned an
  # ascending integer id, which makes it easy to emit feature sets for
  # libraries like svmlight.
  class FeatureExtractor
    include YamlSerialization

    # feature_names:: array of names to extract from documents; all other
    #                 features are ignored.
    def initialize(feature_names)
      @feature_names = {}
      feature_names.each_with_index do |feature_name, position|
        @feature_names[feature_name] = position + 1
      end
    end

    def number_of_features
      @feature_names.size
    end

    # The document's interesting features with their names replaced by the
    # integer identifiers, sorted in ascending identifier order — a generic
    # representation that works well with packages like svm_light.
    def extract_numbered(document)
      extract(document).map do |feature|
        Feature.new(@feature_names[feature.name], feature.value)
      end.sort
    end

    # Just the subset of the document's features that this extractor is
    # interested in.
    def extract(document)
      document.vector_of_features.find_all { |feature| @feature_names[feature.name] }
    end

    # NOTE: the original author left a commented-out
    # extract_with_duplicate_removal here (dropping unigrams duplicated by
    # bigram features); see version-control history if it is ever needed.
  end
end
|
@@ -0,0 +1,126 @@
|
|
1
|
+
module Basset

  # Selects statistically significant features from a training set via the
  # chi-square test. Add every training document to the selector, then ask
  # for features. When in doubt, call #select_features with its defaults:
  # it returns every feature that has at least some statistical significance
  # and occurs in more than one document.
  class FeatureSelector
    attr_reader :docs

    def initialize
      @docs = 0
      @docs_in_class = Hash.new(0)
      @features = Hash.new { |hash, feature_name| hash[feature_name] = FeatureValues.new }
    end

    # Registers a training document. It must respond to #classification and
    # to #vector_of_features (a vector of unique features).
    def add_document(document)
      @docs += 1
      @docs_in_class[document.classification] += 1

      document.vector_of_features.each do |feature|
        @features[feature.name].add_document_with_class(document.classification)
      end
    end

    # Every feature name seen so far, regardless of chi-square or frequency.
    def all_feature_names
      @features.keys
    end

    def number_of_features
      @features.size
    end

    # The +count+ best-scoring features for a classification.
    def best_features(count = 10, classification = nil)
      select_features(1.0, classification).first(count)
    end

    # Wraps each known feature name in a Feature whose value is its
    # chi-square score against +classification+.
    def features_with_chi(classification)
      all_feature_names.map do |feature_name|
        Feature.new(feature_name, chi_squared(feature_name, classification))
      end
    end

    # Names of features whose chi-square score is at least +chi_value+ and
    # which occur in more than one document, best first.
    def select_features(chi_value = 1.0, classification = nil)
      classification ||= @docs_in_class.keys.first

      good_features = features_with_chi(classification).select do |feature|
        feature.value >= chi_value && docs_with_feature(feature.name) > 1
      end

      good_features.sort_by(&:value).reverse.collect(&:name)
    end

    private

    # Docs containing the feature AND belonging to the class (o11).
    def docs_with_feature_and_class(feature_name, classification)
      @features[feature_name].docs_with_class(classification)
    end

    # Docs containing the feature but NOT the class (o01).
    def docs_with_feature_and_not_class(feature_name, classification)
      @features[feature_name].docs_with_feature - @features[feature_name].docs_with_class(classification)
    end

    # Docs of the class that do NOT contain the feature (o10).
    def docs_with_class_and_not_feature(classification, feature_name)
      @docs_in_class[classification] - @features[feature_name].docs_with_class(classification)
    end

    # Docs with neither the feature nor the class (o00).
    def docs_without_feature_or_class(feature_name, classification)
      @docs - @docs_in_class[classification] - docs_with_feature_and_not_class(feature_name, classification)
    end

    def docs_with_feature(feature_name)
      @features[feature_name].docs_with_feature
    end

    def docs_with_class(classification)
      @docs_in_class[classification]
    end

    # Chi-square value of a feature for the given classification. This is
    # formula 13.14, p.215 of "An Introduction to Information Retrieval"
    # (Manning, Raghavan, Schuetze).
    def chi_squared(feature_name, classification)
      chi_squared_algo(
        docs_with_feature_and_class(feature_name, classification),
        docs_with_class_and_not_feature(classification, feature_name),
        docs_with_feature_and_not_class(feature_name, classification),
        docs_without_feature_or_class(feature_name, classification)
      )
    end

    def chi_squared_algo(o11, o10, o01, o00)
      denominator = (o11 + o01) * (o11 + o10) * (o10 + o00) * (o01 + o00)
      # Guard the zero denominator to return 0.0 instead of Infinity/NaN.
      return 0.0 if denominator.zero?
      numerator = (o11 + o10 + o01 + o00) * ((o11 * o00 - o10 * o01)**2)
      numerator.to_f / denominator.to_f
    end

    # Per-feature bookkeeping needed for selection: the total document
    # frequency plus a per-class breakdown.
    class FeatureValues
      attr_accessor :docs_with_feature

      def initialize
        @classes = Hash.new(0)
        @docs_with_feature = 0
      end

      def add_document_with_class(classification)
        @classes[classification] += 1
        @docs_with_feature += 1
      end

      def docs_with_class(classification)
        @classes[classification]
      end
    end

  end
end
|
@@ -0,0 +1,151 @@
|
|
1
|
+
require File.join(File.dirname(__FILE__), "yaml_serialization")
|
2
|
+
|
3
|
+
module Basset

  # Naive Bayes classifier. Training documents are added as
  # (classification, feature vector) pairs; the classifier can then score
  # and classify new feature vectors.
  class NaiveBayes
    include YamlSerialization

    attr_reader :total_docs, :total_docs_in_class, :feature_counts

    def initialize
      @total_docs = 0
      # classification => number of training docs labelled with it
      @total_docs_in_class = Hash.new(0)
      # feature name => FeatureCount (per-class occurrence totals)
      @feature_counts = {}
      # memoized per-class totals; invalidated whenever training data changes
      @occurrences_of_all_features_in_class = {}
    end

    # Adds a training document. +classification+ is an arbitrary label and
    # +feature_vector+ is an Enumerable of objects responding to #name and
    # #value.
    def add_document(classification, feature_vector)
      reset_cached_probabilities

      @total_docs_in_class[classification] += 1
      @total_docs += 1

      feature_vector.each do |feature|
        @feature_counts[feature.name] ||= FeatureCount.new(feature.name)
        @feature_counts[feature.name].add_count_for_class(feature.value, classification)
      end
    end

    # All classification labels seen in training.
    def classes
      @total_docs_in_class.keys
    end

    # Returns the most likely [score, classification] pair for a vector of
    # features. Pass :normalize_classes => false to ignore class priors.
    def classify(feature_vectors, opts = { :normalize_classes => true })
      class_probabilities = []

      classes.each do |classification|
        class_probability = 0
        class_probability += Math.log10(probability_of_class(classification)) if opts[:normalize_classes]
        class_probability += probability_of_vectors_for_class(feature_vectors, classification)
        class_probabilities << [class_probability, classification]
      end

      # Start from a random candidate so that when all class scores are
      # equal a class is still picked at random rather than always first.
      max = class_probabilities.pick_random
      class_probabilities.each do |cp|
        max = cp if cp.first > max.first
      end
      max
    end

    #
    # Log-probability score of +feature_vectors+ belonging to
    # +classification+. Pass :normalize => true to average the score over
    # the number of vectors. The score is NOT normalized for the relative
    # probabilities of each class (see #classify for that).
    def probability_of_vectors_for_class(feature_vectors, classification, opts = { :normalize => false })
      probability = 0
      feature_vectors.each do |feature_vector|
        probability += probability_of_vector_for_class(feature_vector, classification)
      end
      if opts[:normalize]
        probability / feature_vectors.count.to_f
      else
        probability
      end
    end

    # Log10 probability of a single feature vector given the class, with
    # additive (0.1) smoothing so unseen features never zero out the score.
    def probability_of_vector_for_class(feature_vector, classification)
      # A feature may never have been seen in training (e.g. after redundant
      # unigram removal), so look it up explicitly. The previous inline
      # `rescue 0` swallowed ALL StandardErrors, masking real bugs.
      feature_count = @feature_counts[feature_vector.name]
      count_in_class = feature_count ? feature_count.count_for_class(classification) : 0
      decimal_probability = ((count_in_class + 0.1) / occurrences_of_all_features_in_class(classification).to_f) * feature_vector.value
      Math.log10(decimal_probability)
    end

    # The sum total of times all features occur for a given class (memoized).
    def occurrences_of_all_features_in_class(classification)
      # return the cached value, if there is one
      return @occurrences_of_all_features_in_class[classification] if @occurrences_of_all_features_in_class[classification]

      @feature_counts.each_value do |feature_count|
        @occurrences_of_all_features_in_class[classification] ||= 0
        @occurrences_of_all_features_in_class[classification] += feature_count.count_for_class(classification)
      end
      @occurrences_of_all_features_in_class[classification]
    end

    def ==(other)
      other.is_a?(self.class) && other.total_docs == total_docs &&
        other.total_docs_in_class == total_docs_in_class && other.feature_counts == feature_counts
    end

    private

    # Clears the memoized per-class totals; must run whenever a training
    # document is added.
    def reset_cached_probabilities
      @occurrences_of_all_features_in_class.clear
    end

    # Prior probability of a class from the training distribution.
    def probability_of_class(classification)
      @total_docs_in_class[classification] / @total_docs.to_f
    end

    # Per-feature occurrence counts, broken down by classification.
    class FeatureCount
      attr_reader :classes, :name

      def initialize(feature_name = nil, classification = nil, count = 0)
        @name, @classes = feature_name, {}
        add_count_for_class(count, classification) if classification
      end

      def add_count_for_class(count, classification)
        @classes[classification] ||= 0
        @classes[classification] += count
      end

      # Occurrences of this feature in the given class (0 when unseen).
      def count_for_class(classification)
        @classes[classification] || 0
      end

      # Total occurrences across all classes.
      def count
        @classes.values.sum
      end

      def ==(other)
        other.kind_of?(FeatureCount) && other.classes == @classes && other.name == @name
      end

      def inspect(opts = { :verbose => false })
        # Bare `super` would forward opts to the zero-arity Object#inspect
        # and raise ArgumentError; super() passes no arguments.
        return super() if opts[:verbose]
        "#<FeatureCount for ``" + @name.to_s + "''" + " --> " + @classes.inspect + " > "
      end
    end

  end
end
|