rjspotter-basset 1.0.5
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +7 -0
- data/License.txt +20 -0
- data/Manifest.txt +21 -0
- data/README.rdoc +58 -0
- data/Rakefile +63 -0
- data/VERSION.yml +4 -0
- data/basset.gemspec +38 -0
- data/examples/example.rb +25 -0
- data/lib/basset.rb +9 -0
- data/lib/basset/classification_evaluator.rb +170 -0
- data/lib/basset/classifier.rb +188 -0
- data/lib/basset/core_extensions.rb +93 -0
- data/lib/basset/document.rb +84 -0
- data/lib/basset/document_override_example.rb +11 -0
- data/lib/basset/feature.rb +26 -0
- data/lib/basset/feature_extractor.rb +52 -0
- data/lib/basset/feature_selector.rb +126 -0
- data/lib/basset/naive_bayes.rb +151 -0
- data/lib/basset/svm.rb +180 -0
- data/lib/basset/yaml_serialization.rb +41 -0
- data/spec/spec.opts +1 -0
- data/spec/spec_helper.rb +12 -0
- data/spec/unit/classifier_spec.rb +166 -0
- data/spec/unit/core_extension_spec.rb +33 -0
- data/spec/unit/document_spec.rb +59 -0
- data/spec/unit/feature_extractor_spec.rb +33 -0
- data/spec/unit/feature_selector_spec.rb +108 -0
- data/spec/unit/feature_spec.rb +40 -0
- data/spec/unit/naive_bayes_spec.rb +119 -0
- data/spec/unit/svm_spec.rb +83 -0
- metadata +115 -0
data/lib/basset/svm.rb
ADDED
@@ -0,0 +1,180 @@
|
|
1
|
+
require "svm" #File.dirname(__FILE__) + "/../../../libsvm-ruby-swig/lib/svm"
|
2
|
+
|
3
|
+
require "bloomfilter" # igrigorik-bloomfilter (github)
|
4
|
+
|
5
|
+
module Basset
  # =Overview
  # A class for SVM document classification. Follows the same basic interface
  # as NaiveBayes; add labeled training documents to the classifier, then
  # use it to classify unlabeled documents. Do test your accuracy before
  # using the classifier in production, there are a lot of knobs to tweak.
  # When testing, it is usually best to use a separate set of documents, i.e.,
  # not the training set.
  #
  # =Learning Resources
  # SVM can be tricky to understand at first, try the following references:
  # http://en.wikipedia.org/wiki/Support_vector_machine
  # http://www.igvita.com/2008/01/07/support-vector-machines-svm-in-ruby/
  # http://www.csie.ntu.edu.tw/~cjlin/papers/guide/guide.pdf
  #
  # =Implementation
  # This class wraps libsvm-ruby-swig, which is itself a swig based wrapper
  # for libsvm.
  # libsvm-ruby-swig: http://github.com/tomz/libsvm-ruby-swig
  # libsvm: http://www.csie.ntu.edu.tw/~cjlin/libsvm
  # verbose version:
  # Chih-Chung Chang and Chih-Jen Lin, LIBSVM : a library for support vector machines, 2001. Software available at http://www.csie.ntu.edu.tw/~cjlin/libsvm
  class Svm
    #include YamlSerialization
    attr_reader :class_labels, :feature_dictionary

    def initialize
      @total_classes = 0
      @feature_dictionary = []  # ordered list of every distinct feature name seen
      @class_labels = {}        # classification => integer label handed to libsvm
      @documents_for_class = Hash.new { |docs_hash, key| docs_hash[key] = [] }
      @svm_parameter = default_svm_parameter
    end

    # Adds a new document to the training set.
    # +feature_vectors+ is a collection of objects responding to #name.
    def add_document(classification, feature_vectors)
      update_class_labels_with_new(classification) if new_class?(classification)
      feature_names = feature_vectors.map { |fv| fv.name }
      @feature_dictionary += feature_names
      @feature_dictionary.uniq!
      @documents_for_class[classification] << feature_names
      # Dictionary and label bookkeeping changed, so cached model/problem/
      # lookup tables are stale.
      reset_memoized_vars!
    end

    # Gives the vector representation of the training documents of class
    # _classification_. Hardwired to a binary (presence/absence) encoding.
    def vectorized_docs(classification)
      @documents_for_class[classification].map do |features|
        vectorize_doc(features)
      end
    end

    # Returns the vectorized representation of the training data, suitable for
    # use in the constructor for the libsvm Problem class:
    # {:labels => [label-per-document, ...], :features => [vector-per-document, ...]}
    def labels_and_document_vectors
      labels_features = { :labels => [], :features => [] }
      @class_labels.each do |classification, label|
        vectorized_docs(classification).each do |document_vector|
          labels_features[:labels] << label
          labels_features[:features] << document_vector
        end
      end
      labels_features
    end

    # Classifies a document (a collection of objects responding to #name)
    # and returns the original classification object, not the numeric label.
    def classify(feature_vectors)
      class_of_label(model.predict(vectorize_doc(feature_vectors.map { |fv| fv.name })))
    end

    # All classifications seen so far, in insertion order.
    def classes
      @class_labels.keys
    end

    # Exposes the libsvm-ruby-swig Parameter object. If given
    # a block, the parameter object is yielded, otherwise,
    # it's returned.
    #
    # For example, to set parameters:
    #
    #   basset_svm_obj.parameters do |param|
    #     param.C = 100
    #     param.svm_type = NU_SVC
    #     param.degree = 1
    #     param.coef0 = 0
    #     param.eps= 0.001
    #     param.kernel_type = RBF
    #   end
    #
    # To access one value:
    #   basset_svm_obj.parameters.svm_type
    #   => NU_SVC
    def parameters
      if block_given?
        yield @svm_parameter
      else
        @svm_parameter
      end
    end

    private

    # Binary-encodes a document: a dictionary-sized array with 1 at the index
    # of each feature the document contains, 0 elsewhere. Unknown features
    # (not in the dictionary) are ignored.
    def vectorize_doc(features)
      vectorized_doc = Array.new(@feature_dictionary.size, 0)
      features.each do |feature|
        if index = feature_dictionary_hash[feature]
          vectorized_doc[index] = 1
        end
      end
      vectorized_doc
    end

    # Memoized feature-name => dictionary-index lookup table.
    #
    # FIX: this was previously backed by a BloomFilter. A bloom filter is a
    # probabilistic *membership* structure: it cannot reliably return the
    # stored index, and its false positives would set the wrong entries in
    # vectorize_doc, silently corrupting classifications. A plain Hash gives
    # exact O(1) lookups and needs no extra dependency here.
    def feature_dictionary_hash
      unless @memoized_feature_dictionary_hash
        @memoized_feature_dictionary_hash = {}
        @feature_dictionary.each_with_index do |feature, index|
          @memoized_feature_dictionary_hash[feature] = index
        end
      end
      @memoized_feature_dictionary_hash
    end

    # Clears every cached derived value; called whenever training data changes.
    def reset_memoized_vars!
      @memoized_model, @memoized_problem, @memoized_feature_dictionary_hash = nil, nil, nil
      @memoized_inverted_class_labels = nil
    end

    # Lazily-trained libsvm model over the current problem and parameters.
    def model
      @memoized_model ||= Model.new(problem, @svm_parameter)
    end

    # Lazily-built libsvm Problem from the vectorized training set.
    def problem
      unless @memoized_problem
        labels_features = labels_and_document_vectors
        @memoized_problem = Problem.new(labels_features[:labels], labels_features[:features])
      end
      @memoized_problem
    end

    # True when this classification has never been trained on before.
    def new_class?(classification)
      !@class_labels.keys.include?(classification)
    end

    # Reasonable default libsvm parameters; tune via #parameters.
    def default_svm_parameter
      param = ::Parameter.new
      param.C = 100
      param.svm_type = NU_SVC
      param.degree = 1
      param.coef0 = 0
      param.eps= 0.001
      param.nu = 0.5 #?! this blows up on my dataset...
      param.kernel_type = RBF
      param
    end

    # Assigns the next free integer label to a brand-new classification.
    def update_class_labels_with_new(classification)
      @class_labels[classification] = @total_classes
      @total_classes += 1
    end

    # Inverse lookup: numeric libsvm label back to the classification object.
    # libsvm's predict returns a float, hence the to_i.
    def class_of_label(label)
      unless @memoized_inverted_class_labels
        @memoized_inverted_class_labels = @class_labels.invert
      end
      @memoized_inverted_class_labels[label.to_i]
    end

  end
end
|
@@ -0,0 +1,41 @@
|
|
1
|
+
require "yaml"
|
2
|
+
|
3
|
+
# Mixin giving any object trivial YAML persistence: instances gain
# #save_to_file, and the including class gains .load_from_file.
module YamlSerialization
  # Inclusion hook: also extends the including class so that
  # load_from_file is available at the class level.
  def self.included(base)
    base.extend(ClassMethods)
  end

  module ClassMethods
    # Reads and deserializes an object previously written with #save_to_file.
    def load_from_file(file_name)
      YAML.load_file(file_name)
    end
  end

  # Serializes this object as YAML into the given path, overwriting it.
  def save_to_file(file_name)
    File.open(file_name, 'w') { |io| YAML.dump(self, io) }
  end
end
|
23
|
+
|
24
|
+
# Teaches YAML to (de)serialize Class objects by fully-qualified name, so
# classes held in instance variables (e.g. a classifier's doctype) survive
# a round-trip through YamlSerialization.
#
# NOTE(review): yaml_as, Class.yaml_new and YAML::quick_emit are Syck-era
# APIs that were removed when Psych became Ruby's default YAML engine
# (Ruby >= 1.9.3) -- confirm the targeted Ruby/YAML engine before relying
# on this monkey-patch.
class ::Class
  yaml_as "tag:ruby.yaml.org,2002:class"

  # Loading hook: resolves the dumped (possibly namespaced, e.g.
  # "Basset::Document") class name back to the constant.
  def Class.yaml_new( klass, tag, val )
    if String === val
      val.split(/::/).inject(Object) {|m, n| m.const_get(n)}
    else
      raise YAML::TypeError, "Invalid Class: " + val.inspect
    end
  end

  # Dumping hook: emits the class as a plain scalar of its full name.
  def to_yaml( opts = {} )
    YAML::quick_emit( nil, opts ) { |out|
      out.scalar( "tag:ruby.yaml.org,2002:class", self.name, :plain )
    }
  end

end
|
data/spec/spec.opts
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
-c
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,166 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/../spec_helper'
|
2
|
+
|
3
|
+
# Specs for Basset::Classifier, the high-level facade that pairs a
# classification engine (default NaiveBayes) with a document type
# (default Document). Uses the legacy RSpec 1.x `.should` syntax.
describe Classifier do

  before(:each) do
    @classifier = Classifier.new
  end

  it "should automagically determine the ruby class of the classifier engine a la rails' constantize" do
    # :naive_bayes symbol should be constantized to the NaiveBayes engine class.
    Classifier.new(:type => :naive_bayes).engine.class.should == NaiveBayes
  end

  it "should automagically determine the ruby class of the document type" do
    Classifier.new(:type => :naive_bayes, :doctype => :document).doctype.should == Document
  end

  it "should default to NaiveBayes engine and Document doctype" do
    classifier = Classifier.new()
    classifier.engine.class.should == NaiveBayes
    classifier.doctype.should == Document
  end


  it "should accept training docs as plain strings, extracting features automatically" do
    @classifier.train(:hip, "that hipster has an asymmetrical haircut")
    @classifier.train(:unhip, "that dude is a frat boy")
    @classifier.engine.classes.should == [:hip, :unhip]
    # "that dude is a frat boy" is expected to yield 6 feature occurrences.
    @classifier.engine.occurrences_of_all_features_in_class(:unhip).should == 6
  end

  it "should classify documents" do
    @classifier.train(:hip, "that hipster has an asymmetrical haircut")
    @classifier.train(:unhip, "that dude is a frat boy")
    @classifier.classify("hipsters").should == :hip
  end

  it "should train iteratively for speed learning" do
    @classifier.train(:hip, "turntables", "techno music", "DJs with turntables", "techno DJs")
    @classifier.train(:unhip, "rock music", "guitar bass drums", "guitar rock", "guitar players")
    @classifier.classify("guitar music").should == :unhip
    # now everyone likes rock music again! retrain fast! cf LCD Soundsystem
    @classifier.train_iterative(:hip, "guitars") # takes 3 iterations
    @classifier.classify("guitars").should == :hip
  end

  it "should give document scores for a class" do
    @classifier.train(:hip, "turntables", "techno music", "DJs with turntables", "techno DJs")
    @classifier.train(:unhip, "rock music", "guitar bass drums", "guitar rock", "guitar players")
    # Score is a (log-probability-like) Float, not a classification.
    @classifier.similarity_score(:hip, "guitars").should be_a Float
  end

end
|
53
|
+
|
54
|
+
|
55
|
+
# Specs for AnomalyDetector: a one-class classifier that trains only on
# "normal" text and flags documents whose similarity score falls more than
# 4 standard deviations below the training-set average. The exact numeric
# expectations (be_close values) are tied to the tokenization of the two
# training sentences in train_detector_on_code_love.
describe AnomalyDetector do

  YAML_FILE_BASENAME = "/tmp/basset_anomaly_detector_rspec_savefile"

  # Random suffix so concurrent runs don't collide; cleaned up in after(:each).
  def tmp_file
    YAML_FILE_BASENAME + rand(2 ** 32).to_s(16)
  end

  before(:each) do
    @detector = AnomalyDetector.new
  end

  after(:each) do
    Dir.glob(YAML_FILE_BASENAME + '*').each {|file| File.delete file}
  end

  # Shared fixture: two "normal" training documents.
  def train_detector_on_code_love
    @detector.train("coding all night and loving it", "coding and drinking jolt")
  end

  it "should train on the normal set only" do
    @detector.train("coding all night", "coding and drinking jolt")
    @detector.engine.classes.should == [:normal]
    # 7 feature occurrences across both training strings.
    @detector.engine.occurrences_of_all_features_in_class(:normal).should == 7
  end

  it "should give a score for the probability of a document to be in the ``normal'' set" do
    train_detector_on_code_love
    score = @detector.similarity_score("I love coding and jolt")
    score.should be_a Float
    # Log-probability-like score; loose bounds only.
    score.should be_close(-1, 2)
  end

  it "should give a list of the probability scores for the training set" do
    train_detector_on_code_love
    @detector.scores_for_training_set.should have(2).items
    @detector.scores_for_training_set.each do |score|
      score.should be_close(-1, 1)
    end
  end

  it "should compute the average probability score for training set" do
    train_detector_on_code_love
    @detector.avg_score_of_training_set.should be_close(-0.841, 0.001)
  end

  it "should give the range of probability scores for the training set" do
    train_detector_on_code_love
    @detector.score_range_of_training_set.should be_a Range
    @detector.score_range_of_training_set.first.should be_close(-0.864, 0.001)
    @detector.score_range_of_training_set.last.should be_close(-0.818, 0.001)
  end

  it "should give the standard deviation of probability scores for the training set" do
    train_detector_on_code_love
    @detector.stddev_of_scores_of_training_set.should be_close(0.0234, 0.001) #0.0234
  end

  it "should use the average minus 4 times the stddev as the lower bound for normal" do
    train_detector_on_code_love
    # avg - 4 * stddev, using the values asserted in the two specs above.
    expected = -0.841 - (4 * 0.0234)
    @detector.minimum_acceptable_score.should be_close(expected, 0.001)
  end

  it "should say if text is anomalous or not" do
    train_detector_on_code_love
    @detector.should be_anomalous("watching tv")
    @detector.should_not be_anomalous("code and jolt")
  end

  it "should classify text as anomalous or normal" do
    train_detector_on_code_love
    @detector.classify("watching tv").should == :anomalous
    @detector.classify("code and jolt").should == :normal
  end

  it "should give an anomaly score based on the std deviations from mean" do
    train_detector_on_code_love
    @detector.anomaly_score("watching_tv").should be_close( 80, 10)
  end

  it "should train iteratively" do
    train_detector_on_code_love
    # Heavily reinforce the normal set so one iterative call must adapt it.
    50.times {@detector.train("coding drinking jolt")}
    @detector.train_iterative("watching tv")
    @detector.should be_normal("watching tv")
  end

  it "should reset memoized values to nil after retraining" do
    train_detector_on_code_love
    @detector.score_range_of_training_set
    @detector.scores_for_training_set.should_not be_nil
    # Retraining must invalidate the memoized scores cache.
    train_detector_on_code_love
    @detector.instance_variable_get(:@scores_for_training_set).should be_nil
  end

  it "should serialize itself to YAML" do
    train_detector_on_code_love
    file = tmp_file
    @detector.save_to_file(file)
    File.exist?(file).should be_true
  end

  it "should load itself from YAML" do
    train_detector_on_code_love
    file = tmp_file
    @detector.save_to_file(file)
    reloaded_detector = AnomalyDetector.load_from_file(file)
    reloaded_detector.should == @detector
  end

end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/../spec_helper'
|
2
|
+
|
3
|
+
# Specs for the Basset core extensions added to Array
# (lib/basset/core_extensions.rb): rest, pick_random, randomize, sum.
describe Array, "with Basset extensions" do

  it "should give the tail of an array like FP lists do" do
    [1,2,3].rest.should == [2,3]
  end

  it "should not choke when giving the tail of an empty list" do
    [].rest.should == []
  end

  it "should return a random element" do
    # Fixed seed makes the "random" pick deterministic for the assertion.
    srand(123456)
    [1,2,3,4].pick_random.should == 2
  end

  it "should randomly rearrange itself" do
    srand(123456)
    [1,2,3,4].randomize.should == [1,3,4,2]
  end

  it "should sum itself" do
    [1,2,3,4].sum.should == 10
  end

end
|
28
|
+
|
29
|
+
# Spec for the Basset core extension added to Float: fixed-precision
# string formatting via to_s_decimal_places (truncates, per the example).
describe Float, "with Basset extensions" do
  it "should convert itself to a string with variable precsion" do
    1.23456.to_s_decimal_places(3).should == "1.234"
  end
end
|
@@ -0,0 +1,59 @@
|
|
1
|
+
require File.join(File.dirname(__FILE__), "..", "spec_helper")
|
2
|
+
|
3
|
+
# Specs for Basset::Document feature extraction: cleanup (punctuation,
# digits, symbols), lowercasing, stemming, and per-feature occurrence counts.
describe Document do
  it "should remove punctuation from words" do
    Document.new("abc.").vector_of_features.should == [Feature.new("abc", 1)]
  end

  it "should remove numbers from words" do
    Document.new("abc1").vector_of_features.should == [Feature.new("abc", 1)]
  end

  it "should remove symbols from words" do
    Document.new("abc%").vector_of_features.should == [Feature.new("abc", 1)]
  end

  it "should lowercase text" do
    Document.new("ABC").vector_of_features.should == [Feature.new("abc", 1)]
  end

  it "should stem words" do
    # "testing" stems to "test".
    Document.new("testing").vector_of_features.should == [Feature.new("test", 1)]
  end

  it "should count feature occurances" do
    # Second constructor arg is the document's classification tag.
    Document.new("test doc test", :test).vector_of_features.should ==
      [Feature.new("doc", 1), Feature.new("test", 2)]
  end
end
|
29
|
+
|
30
|
+
# Specs for UriDocument: URI-oriented feature extraction that keeps
# separator characters as their own features, does not stem, and
# URI-decodes percent-encoded input.
describe UriDocument do

  # Builds the expected feature list: each token once, count 1.
  def single_features(*uris)
    uris.flatten.map { |uri| Feature.new(uri.to_s, 1) }
  end

  it "should extract URI token separators &, ?, \\, /, =, [, ], and . separately" do
    expected_features = [:a,:b,:c,:d,:e,:f,:g,:h, :i, '&', '?', "\\", '/', '=', '[', ']', '.']
    expected = single_features(expected_features).sort
    UriDocument.new('a&b?c\d/e=f[g]h.i').feature_vectors.sort.should == expected
  end

  it "should extract two dots as a single feature instead of two dots" do
    # ".." is significant in URIs (path traversal), so keep it whole.
    UriDocument.new('..').feature_vectors.should == [Feature.new("..", 1)]
  end

  it "should extract two slashes as a single feature" do
    UriDocument.new('//').feature_vectors.should == [Feature.new('//', 1)]
    UriDocument.new("\\\\").feature_vectors.should == [Feature.new('\\\\', 1)]
  end

  it "should not stem words" do
    UriDocument.new("testing").feature_vectors.should == [Feature.new("testing", 1)]
  end

  it "should URI decode encoded strings" do
    # %23 => '#', %25 => '%'
    UriDocument.new("%23%25").feature_vectors.should == [Feature.new("#%", 1)]
  end

end
|