classifier 1.1.1 → 1.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +341 -0
- data/README +59 -6
- data/Rakefile +16 -4
- data/bin/bayes.rb +8 -2
- data/doc/classes/Classifier.html +15 -10
- data/doc/classes/Classifier/Bayes.html +68 -38
- data/doc/classes/Classifier/Bayes.src/{M000005.html → M000023.html} +1 -1
- data/doc/classes/Classifier/Bayes.src/{M000006.html → M000024.html} +1 -1
- data/doc/classes/Classifier/Bayes.src/M000025.html +30 -0
- data/doc/classes/Classifier/Bayes.src/{M000007.html → M000026.html} +1 -1
- data/doc/classes/Classifier/Bayes.src/{M000008.html → M000027.html} +1 -1
- data/doc/classes/Classifier/Bayes.src/{M000009.html → M000028.html} +4 -4
- data/doc/classes/Classifier/Bayes.src/{M000010.html → M000029.html} +2 -2
- data/doc/classes/Classifier/ContentNode.html +252 -0
- data/doc/classes/Classifier/ContentNode.src/M000031.html +21 -0
- data/doc/classes/Classifier/ContentNode.src/M000032.html +18 -0
- data/doc/classes/Classifier/ContentNode.src/M000033.html +18 -0
- data/doc/classes/Classifier/ContentNode.src/M000034.html +41 -0
- data/doc/classes/Classifier/LSI.html +449 -0
- data/doc/classes/Classifier/LSI.src/M000011.html +20 -0
- data/doc/classes/Classifier/LSI.src/M000012.html +18 -0
- data/doc/classes/Classifier/LSI.src/M000013.html +20 -0
- data/doc/classes/Classifier/LSI.src/M000014.html +18 -0
- data/doc/classes/Classifier/LSI.src/M000015.html +21 -0
- data/doc/classes/Classifier/LSI.src/M000016.html +18 -0
- data/doc/classes/Classifier/LSI.src/M000017.html +32 -0
- data/doc/classes/Classifier/LSI.src/M000018.html +26 -0
- data/doc/classes/Classifier/LSI.src/M000019.html +26 -0
- data/doc/classes/Classifier/LSI.src/M000020.html +23 -0
- data/doc/classes/Classifier/LSI.src/M000021.html +21 -0
- data/doc/classes/Classifier/LSI.src/M000022.html +31 -0
- data/doc/classes/Classifier/WordList.html +202 -0
- data/doc/classes/Classifier/WordList.src/M000007.html +18 -0
- data/doc/classes/Classifier/WordList.src/M000008.html +19 -0
- data/doc/classes/Classifier/WordList.src/M000009.html +19 -0
- data/doc/classes/Classifier/WordList.src/M000010.html +18 -0
- data/doc/classes/GSL.html +111 -0
- data/doc/classes/GSL/Vector.html +156 -0
- data/doc/classes/GSL/Vector.src/M000005.html +18 -0
- data/doc/classes/GSL/Vector.src/M000006.html +19 -0
- data/doc/classes/Object.html +139 -0
- data/doc/classes/Object.src/M000001.html +16 -0
- data/doc/classes/String.html +95 -9
- data/doc/classes/{Classifier/WordHash.src/M000001.html → String.src/M000002.html} +3 -3
- data/doc/classes/String.src/M000003.html +18 -0
- data/doc/classes/String.src/M000004.html +18 -0
- data/doc/created.rid +1 -1
- data/doc/files/README.html +102 -12
- data/doc/files/lib/classifier/bayes_rb.html +1 -1
- data/doc/files/lib/classifier/{string_extensions/porter_stemmer_rb.html → extensions/vector_serialize_rb.html} +4 -15
- data/doc/files/lib/classifier/{string_extensions → extensions}/word_hash_rb.html +2 -2
- data/doc/files/lib/classifier/extensions/word_list_rb.html +115 -0
- data/doc/files/lib/classifier/lsi/content_node_rb.html +115 -0
- data/doc/files/lib/classifier/lsi_rb.html +125 -0
- data/doc/files/lib/classifier/string_extensions_rb.html +2 -3
- data/doc/files/lib/classifier_rb.html +3 -1
- data/doc/fr_class_index.html +6 -2
- data/doc/fr_file_index.html +5 -2
- data/doc/fr_method_index.html +34 -11
- data/lib/classifier.rb +3 -1
- data/lib/classifier/bayes.rb +34 -9
- data/lib/classifier/extensions/vector_serialize.rb +14 -0
- data/lib/classifier/extensions/word_hash.rb +125 -0
- data/lib/classifier/extensions/word_list.rb +31 -0
- data/lib/classifier/lsi.rb +248 -0
- data/lib/classifier/lsi/content_node.rb +67 -0
- data/lib/classifier/string_extensions.rb +10 -5
- data/test/bayes/bayesian_test.rb +2 -2
- data/test/lsi/lsi_test.rb +88 -0
- data/test/string_extensions/word_hash_test.rb +7 -5
- metadata +79 -24
- data/doc/classes/Classifier/Stemmable.html +0 -243
- data/doc/classes/Classifier/Stemmable.src/M000003.html +0 -102
- data/doc/classes/Classifier/WordHash.html +0 -178
- data/doc/classes/Classifier/WordHash.src/M000002.html +0 -28
- data/lib/classifier/string_extensions/porter_stemmer.rb +0 -199
- data/lib/classifier/string_extensions/word_hash.rb +0 -119
@@ -2,10 +2,15 @@
|
|
2
2
|
# Copyright:: Copyright (c) 2005 Lucas Carlson
|
3
3
|
# License:: LGPL
|
4
4
|
|
5
|
-
|
6
|
-
|
5
|
+
begin
|
6
|
+
require_gem 'stemmer'
|
7
|
+
rescue LoadError
|
8
|
+
puts "Please install stemmer from http://rubyforge.org/projects/stemmer or 'gem install stemmer'"
|
9
|
+
exit(-1)
|
10
|
+
end
|
7
11
|
|
8
|
-
|
9
|
-
|
10
|
-
|
12
|
+
require 'classifier/extensions/word_hash'
|
13
|
+
|
14
|
+
class Object
|
15
|
+
def prepare_category_name; to_s.gsub("_"," ").capitalize.intern end
|
11
16
|
end
|
data/test/bayes/bayesian_test.rb
CHANGED
@@ -17,12 +17,12 @@ class BayesianTest < Test::Unit::TestCase
|
|
17
17
|
end
|
18
18
|
|
19
19
|
def test_categories
|
20
|
-
assert_equal ['Interesting', 'Uninteresting'], @classifier.categories
|
20
|
+
assert_equal ['Interesting', 'Uninteresting'].sort, @classifier.categories.sort
|
21
21
|
end
|
22
22
|
|
23
23
|
def test_add_category
|
24
24
|
@classifier.add_category 'Test'
|
25
|
-
assert_equal ['
|
25
|
+
assert_equal ['Test', 'Interesting', 'Uninteresting'].sort, @classifier.categories.sort
|
26
26
|
end
|
27
27
|
|
28
28
|
def test_classification
|
@@ -0,0 +1,88 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/../test_helper'
|
2
|
+
class LSITest < Test::Unit::TestCase
|
3
|
+
def setup
|
4
|
+
# we repeat principle words to help weight them.
|
5
|
+
# This test is rather delicate, since this system is mostly noise.
|
6
|
+
@str1 = "This text deals with dogs. Dogs."
|
7
|
+
@str2 = "This text involves dogs too. Dogs! "
|
8
|
+
@str3 = "This text revolves around cats. Cats."
|
9
|
+
@str4 = "This text also involves cats. Cats!"
|
10
|
+
@str5 = "This text involves birds. Birds."
|
11
|
+
end
|
12
|
+
|
13
|
+
def test_basic_indexing
|
14
|
+
lsi = Classifier::LSI.new
|
15
|
+
[@str1, @str2, @str3, @str4, @str5].each { |x| lsi << x }
|
16
|
+
assert ! lsi.needs_rebuild?
|
17
|
+
|
18
|
+
# note that the closest match to str1 is str2, even though it is not
|
19
|
+
# the closest text match.
|
20
|
+
assert_equal [@str2, @str5, @str3], lsi.find_related(@str1, 3)
|
21
|
+
end
|
22
|
+
|
23
|
+
def test_not_auto_rebuild
|
24
|
+
lsi = Classifier::LSI.new :auto_rebuild => false
|
25
|
+
lsi.add_item @str1, "Dog"
|
26
|
+
assert lsi.needs_rebuild?
|
27
|
+
lsi.build_index
|
28
|
+
assert ! lsi.needs_rebuild?
|
29
|
+
end
|
30
|
+
|
31
|
+
def test_basic_categorizing
|
32
|
+
lsi = Classifier::LSI.new
|
33
|
+
lsi.add_item @str2, "Dog"
|
34
|
+
lsi.add_item @str3, "Cat"
|
35
|
+
lsi.add_item @str4, "Cat"
|
36
|
+
lsi.add_item @str5, "Bird"
|
37
|
+
|
38
|
+
assert_equal "Dog", lsi.classify( @str1 )
|
39
|
+
assert_equal "Cat", lsi.classify( @str3 )
|
40
|
+
assert_equal "Bird", lsi.classify( @str5 )
|
41
|
+
end
|
42
|
+
|
43
|
+
def test_external_classifying
|
44
|
+
lsi = Classifier::LSI.new
|
45
|
+
bayes = Classifier::Bayes.new 'Dog', 'Cat', 'Bird'
|
46
|
+
lsi.add_item @str1, "Dog" ; bayes.train_dog @str1
|
47
|
+
lsi.add_item @str2, "Dog" ; bayes.train_dog @str2
|
48
|
+
lsi.add_item @str3, "Cat" ; bayes.train_cat @str3
|
49
|
+
lsi.add_item @str4, "Cat" ; bayes.train_cat @str4
|
50
|
+
lsi.add_item @str5, "Bird" ; bayes.train_bird @str5
|
51
|
+
|
52
|
+
# We're talking about dogs. Even though the text matches the corpus on
|
53
|
+
# cats better. Dogs have more semantic weight than cats. So bayes
|
54
|
+
# will fail here, but the LSI recognizes content.
|
55
|
+
tricky_case = "This text revolves around dogs."
|
56
|
+
assert_equal "Dog", lsi.classify( tricky_case )
|
57
|
+
assert_not_equal "Dog", bayes.classify( tricky_case )
|
58
|
+
end
|
59
|
+
|
60
|
+
def test_search
|
61
|
+
lsi = Classifier::LSI.new
|
62
|
+
[@str1, @str2, @str3, @str4, @str5].each { |x| lsi << x }
|
63
|
+
|
64
|
+
# Searching by content and text, note that @str2 comes up first, because
|
65
|
+
# both "dog" and "involve" are present. But, the next match is @str1 instead
|
66
|
+
# of @str4, because "dog" carries more weight than involves.
|
67
|
+
assert_equal( [@str2, @str1, @str4, @str5, @str3],
|
68
|
+
lsi.search("dog involves", 100) )
|
69
|
+
|
70
|
+
# Keyword search shows how the space is mapped out in relation to
|
71
|
+
# dog when magnitude is remove. Note the relations. We move from dog
|
72
|
+
# through involve and then finally to other words.
|
73
|
+
assert_equal( [@str1, @str2, @str4, @str5, @str3],
|
74
|
+
lsi.search("dog", 5) )
|
75
|
+
end
|
76
|
+
|
77
|
+
def test_serialize_safe
|
78
|
+
lsi = Classifier::LSI.new
|
79
|
+
[@str1, @str2, @str3, @str4, @str5].each { |x| lsi << x }
|
80
|
+
|
81
|
+
lsi_md = Marshal.dump lsi
|
82
|
+
lsi_m = Marshal.load lsi_md
|
83
|
+
|
84
|
+
assert_equal lsi_m.search("cat", 3), lsi.search("cat", 3)
|
85
|
+
assert_equal lsi_m.find_related(@str1, 3), lsi.find_related(@str1, 3)
|
86
|
+
end
|
87
|
+
|
88
|
+
end
|
@@ -1,12 +1,14 @@
|
|
1
1
|
require File.dirname(__FILE__) + '/../test_helper'
|
2
2
|
class StringExtensionsTest < Test::Unit::TestCase
|
3
3
|
def test_word_hash
|
4
|
-
hash = {:good=>1, :"!"=>1, :hope=>1, :"'."=>1, :love=>1, :word=>1, :them=>1, :test=>1}
|
4
|
+
hash = {:good=>1, :"!"=>1, :hope=>1, :"'"=>1, :"."=>1, :love=>1, :word=>1, :them=>1, :test=>1}
|
5
5
|
assert_equal hash, "here are some good words of test's. I hope you love them!".word_hash
|
6
6
|
end
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
7
|
+
|
8
|
+
|
9
|
+
def test_clean_word_hash
|
10
|
+
hash = {:good=>1, :word=>1, :hope=>1, :love=>1, :them=>1, :test=>1}
|
11
|
+
assert_equal hash, "here are some good words of test's. I hope you love them!".clean_word_hash
|
11
12
|
end
|
13
|
+
|
12
14
|
end
|
metadata
CHANGED
@@ -3,13 +3,13 @@ rubygems_version: 0.8.6
|
|
3
3
|
specification_version: 1
|
4
4
|
name: classifier
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 1.
|
7
|
-
date: 2005-04-
|
6
|
+
version: 1.2.0
|
7
|
+
date: 2005-04-24
|
8
8
|
summary: A general classifier module to allow Bayesian and other types of classifications.
|
9
9
|
require_paths:
|
10
10
|
- lib
|
11
11
|
email: lucas@rufy.com
|
12
|
-
homepage: http://
|
12
|
+
homepage: http://classifier.rufy.com/
|
13
13
|
rubyforge_project:
|
14
14
|
description: A general classifier module to allow Bayesian and other types of classifications.
|
15
15
|
autorequire: classifier
|
@@ -30,16 +30,23 @@ files:
|
|
30
30
|
- lib/classifier
|
31
31
|
- lib/classifier.rb
|
32
32
|
- lib/classifier/bayes.rb
|
33
|
-
- lib/classifier/
|
33
|
+
- lib/classifier/extensions
|
34
|
+
- lib/classifier/lsi
|
35
|
+
- lib/classifier/lsi.rb
|
34
36
|
- lib/classifier/string_extensions.rb
|
35
|
-
- lib/classifier/
|
36
|
-
- lib/classifier/
|
37
|
+
- lib/classifier/extensions/vector_serialize.rb
|
38
|
+
- lib/classifier/extensions/word_hash.rb
|
39
|
+
- lib/classifier/extensions/word_list.rb
|
40
|
+
- lib/classifier/lsi/content_node.rb
|
37
41
|
- bin/bayes.rb
|
38
42
|
- test/bayes
|
43
|
+
- test/lsi
|
39
44
|
- test/string_extensions
|
40
45
|
- test/test_helper.rb
|
41
46
|
- test/bayes/bayesian_test.rb
|
47
|
+
- test/lsi/lsi_test.rb
|
42
48
|
- test/string_extensions/word_hash_test.rb
|
49
|
+
- LICENSE
|
43
50
|
- Rakefile
|
44
51
|
- README
|
45
52
|
- doc/classes
|
@@ -52,35 +59,83 @@ files:
|
|
52
59
|
- doc/rdoc-style.css
|
53
60
|
- doc/classes/Classifier
|
54
61
|
- doc/classes/Classifier.html
|
62
|
+
- doc/classes/GSL
|
63
|
+
- doc/classes/GSL.html
|
64
|
+
- doc/classes/Object.html
|
65
|
+
- doc/classes/Object.src
|
55
66
|
- doc/classes/String.html
|
67
|
+
- doc/classes/String.src
|
56
68
|
- doc/classes/Classifier/Bayes.html
|
57
69
|
- doc/classes/Classifier/Bayes.src
|
58
|
-
- doc/classes/Classifier/
|
59
|
-
- doc/classes/Classifier/
|
60
|
-
- doc/classes/Classifier/
|
61
|
-
- doc/classes/Classifier/
|
62
|
-
- doc/classes/Classifier/
|
63
|
-
- doc/classes/Classifier/
|
64
|
-
- doc/classes/Classifier/Bayes.src/
|
65
|
-
- doc/classes/Classifier/Bayes.src/
|
66
|
-
- doc/classes/Classifier/Bayes.src/
|
67
|
-
- doc/classes/Classifier/Bayes.src/
|
68
|
-
- doc/classes/Classifier/
|
69
|
-
- doc/classes/Classifier/
|
70
|
-
- doc/classes/Classifier/
|
70
|
+
- doc/classes/Classifier/ContentNode.html
|
71
|
+
- doc/classes/Classifier/ContentNode.src
|
72
|
+
- doc/classes/Classifier/LSI.html
|
73
|
+
- doc/classes/Classifier/LSI.src
|
74
|
+
- doc/classes/Classifier/WordList.html
|
75
|
+
- doc/classes/Classifier/WordList.src
|
76
|
+
- doc/classes/Classifier/Bayes.src/M000023.html
|
77
|
+
- doc/classes/Classifier/Bayes.src/M000024.html
|
78
|
+
- doc/classes/Classifier/Bayes.src/M000025.html
|
79
|
+
- doc/classes/Classifier/Bayes.src/M000026.html
|
80
|
+
- doc/classes/Classifier/Bayes.src/M000027.html
|
81
|
+
- doc/classes/Classifier/Bayes.src/M000028.html
|
82
|
+
- doc/classes/Classifier/Bayes.src/M000029.html
|
83
|
+
- doc/classes/Classifier/ContentNode.src/M000031.html
|
84
|
+
- doc/classes/Classifier/ContentNode.src/M000032.html
|
85
|
+
- doc/classes/Classifier/ContentNode.src/M000033.html
|
86
|
+
- doc/classes/Classifier/ContentNode.src/M000034.html
|
87
|
+
- doc/classes/Classifier/LSI.src/M000011.html
|
88
|
+
- doc/classes/Classifier/LSI.src/M000012.html
|
89
|
+
- doc/classes/Classifier/LSI.src/M000013.html
|
90
|
+
- doc/classes/Classifier/LSI.src/M000014.html
|
91
|
+
- doc/classes/Classifier/LSI.src/M000015.html
|
92
|
+
- doc/classes/Classifier/LSI.src/M000016.html
|
93
|
+
- doc/classes/Classifier/LSI.src/M000017.html
|
94
|
+
- doc/classes/Classifier/LSI.src/M000018.html
|
95
|
+
- doc/classes/Classifier/LSI.src/M000019.html
|
96
|
+
- doc/classes/Classifier/LSI.src/M000020.html
|
97
|
+
- doc/classes/Classifier/LSI.src/M000021.html
|
98
|
+
- doc/classes/Classifier/LSI.src/M000022.html
|
99
|
+
- doc/classes/Classifier/WordList.src/M000007.html
|
100
|
+
- doc/classes/Classifier/WordList.src/M000008.html
|
101
|
+
- doc/classes/Classifier/WordList.src/M000009.html
|
102
|
+
- doc/classes/Classifier/WordList.src/M000010.html
|
103
|
+
- doc/classes/GSL/Vector.html
|
104
|
+
- doc/classes/GSL/Vector.src
|
105
|
+
- doc/classes/GSL/Vector.src/M000005.html
|
106
|
+
- doc/classes/GSL/Vector.src/M000006.html
|
107
|
+
- doc/classes/Object.src/M000001.html
|
108
|
+
- doc/classes/String.src/M000002.html
|
109
|
+
- doc/classes/String.src/M000003.html
|
110
|
+
- doc/classes/String.src/M000004.html
|
71
111
|
- doc/files/lib
|
72
112
|
- doc/files/README.html
|
73
113
|
- doc/files/lib/classifier
|
74
114
|
- doc/files/lib/classifier_rb.html
|
75
115
|
- doc/files/lib/classifier/bayes_rb.html
|
76
|
-
- doc/files/lib/classifier/
|
116
|
+
- doc/files/lib/classifier/extensions
|
117
|
+
- doc/files/lib/classifier/lsi
|
118
|
+
- doc/files/lib/classifier/lsi_rb.html
|
77
119
|
- doc/files/lib/classifier/string_extensions_rb.html
|
78
|
-
- doc/files/lib/classifier/
|
79
|
-
- doc/files/lib/classifier/
|
120
|
+
- doc/files/lib/classifier/extensions/vector_serialize_rb.html
|
121
|
+
- doc/files/lib/classifier/extensions/word_hash_rb.html
|
122
|
+
- doc/files/lib/classifier/extensions/word_list_rb.html
|
123
|
+
- doc/files/lib/classifier/lsi/content_node_rb.html
|
80
124
|
test_files: []
|
81
125
|
rdoc_options: []
|
82
126
|
extra_rdoc_files: []
|
83
127
|
executables: []
|
84
128
|
extensions: []
|
85
|
-
requirements:
|
86
|
-
|
129
|
+
requirements:
|
130
|
+
- A porter-stemmer module to split word stems.
|
131
|
+
dependencies:
|
132
|
+
- !ruby/object:Gem::Dependency
|
133
|
+
name: stemmer
|
134
|
+
version_requirement:
|
135
|
+
version_requirements: !ruby/object:Gem::Version::Requirement
|
136
|
+
requirements:
|
137
|
+
-
|
138
|
+
- ">="
|
139
|
+
- !ruby/object:Gem::Version
|
140
|
+
version: 1.0.0
|
141
|
+
version:
|
@@ -1,243 +0,0 @@
|
|
1
|
-
<?xml version="1.0" encoding="iso-8859-1"?>
|
2
|
-
<!DOCTYPE html
|
3
|
-
PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
|
4
|
-
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
5
|
-
|
6
|
-
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
|
7
|
-
<head>
|
8
|
-
<title>Module: Classifier::Stemmable</title>
|
9
|
-
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
|
10
|
-
<meta http-equiv="Content-Script-Type" content="text/javascript" />
|
11
|
-
<link rel="stylesheet" href="../.././rdoc-style.css" type="text/css" media="screen" />
|
12
|
-
<script type="text/javascript">
|
13
|
-
// <![CDATA[
|
14
|
-
|
15
|
-
function popupCode( url ) {
|
16
|
-
window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400")
|
17
|
-
}
|
18
|
-
|
19
|
-
function toggleCode( id ) {
|
20
|
-
if ( document.getElementById )
|
21
|
-
elem = document.getElementById( id );
|
22
|
-
else if ( document.all )
|
23
|
-
elem = eval( "document.all." + id );
|
24
|
-
else
|
25
|
-
return false;
|
26
|
-
|
27
|
-
elemStyle = elem.style;
|
28
|
-
|
29
|
-
if ( elemStyle.display != "block" ) {
|
30
|
-
elemStyle.display = "block"
|
31
|
-
} else {
|
32
|
-
elemStyle.display = "none"
|
33
|
-
}
|
34
|
-
|
35
|
-
return true;
|
36
|
-
}
|
37
|
-
|
38
|
-
// Make codeblocks hidden by default
|
39
|
-
document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }</style>" )
|
40
|
-
|
41
|
-
// ]]>
|
42
|
-
</script>
|
43
|
-
|
44
|
-
</head>
|
45
|
-
<body>
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
<div id="classHeader">
|
50
|
-
<table class="header-table">
|
51
|
-
<tr class="top-aligned-row">
|
52
|
-
<td><strong>Module</strong></td>
|
53
|
-
<td class="class-name-in-header">Classifier::Stemmable</td>
|
54
|
-
</tr>
|
55
|
-
<tr class="top-aligned-row">
|
56
|
-
<td><strong>In:</strong></td>
|
57
|
-
<td>
|
58
|
-
<a href="../../files/lib/classifier/string_extensions/porter_stemmer_rb.html">
|
59
|
-
lib/classifier/string_extensions/porter_stemmer.rb
|
60
|
-
</a>
|
61
|
-
<br />
|
62
|
-
</td>
|
63
|
-
</tr>
|
64
|
-
|
65
|
-
</table>
|
66
|
-
</div>
|
67
|
-
<!-- banner header -->
|
68
|
-
|
69
|
-
<div id="bodyContent">
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
<div id="contextContent">
|
74
|
-
|
75
|
-
<div id="description">
|
76
|
-
<p>
|
77
|
-
Porter stemmer in Ruby.
|
78
|
-
</p>
|
79
|
-
<p>
|
80
|
-
This is the Porter stemming algorithm, ported to Ruby from the version
|
81
|
-
coded up in Perl. It’s easy to follow against the rules in the
|
82
|
-
original paper in:
|
83
|
-
</p>
|
84
|
-
<pre>
|
85
|
-
Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
|
86
|
-
no. 3, pp 130-137,
|
87
|
-
</pre>
|
88
|
-
<p>
|
89
|
-
See also <a
|
90
|
-
href="http://www.tartarus.org/~martin/PorterStemmer">www.tartarus.org/~martin/PorterStemmer</a>
|
91
|
-
</p>
|
92
|
-
<p>
|
93
|
-
Send comments to raypereda@hotmail.com
|
94
|
-
</p>
|
95
|
-
|
96
|
-
</div>
|
97
|
-
|
98
|
-
|
99
|
-
</div>
|
100
|
-
|
101
|
-
<div id="method-list">
|
102
|
-
<h3 class="section-bar">Methods</h3>
|
103
|
-
|
104
|
-
<div class="name-list">
|
105
|
-
<a href="#M000004">stem</a>
|
106
|
-
<a href="#M000003">stem_porter</a>
|
107
|
-
</div>
|
108
|
-
</div>
|
109
|
-
|
110
|
-
</div>
|
111
|
-
|
112
|
-
|
113
|
-
<!-- if includes -->
|
114
|
-
|
115
|
-
<div id="section">
|
116
|
-
|
117
|
-
|
118
|
-
<div id="constants-list">
|
119
|
-
<h3 class="section-bar">Constants</h3>
|
120
|
-
|
121
|
-
<div class="name-list">
|
122
|
-
<table summary="Constants">
|
123
|
-
<tr class="top-aligned-row context-row">
|
124
|
-
<td class="context-item-name">STEP_2_LIST</td>
|
125
|
-
<td>=</td>
|
126
|
-
<td class="context-item-value">{
|
127
|
'ational'=>'ate', 'tional'=>'tion', 'enci'=>'ence', 'anci'=>'ance',
|
128
1
|
'izer'=>'ize', 'bli'=>'ble',
|
129
2
|
'alli'=>'al', 'entli'=>'ent', 'eli'=>'e', 'ousli'=>'ous',
|
130
3
|
'ization'=>'ize', 'ation'=>'ate',
|
131
4
|
'ator'=>'ate', 'alism'=>'al', 'iveness'=>'ive', 'fulness'=>'ful',
|
132
5
|
'ousness'=>'ous', 'aliti'=>'al',
|
133
6
|
'iviti'=>'ive', 'biliti'=>'ble', 'logi'=>'log'</td>
|
134
|
-
</tr>
|
135
|
-
<tr class="top-aligned-row context-row">
|
136
|
-
<td class="context-item-name">STEP_3_LIST</td>
|
137
|
-
<td>=</td>
|
138
|
-
<td class="context-item-value">{
|
139
7
|
'icate'=>'ic', 'ative'=>'', 'alize'=>'al', 'iciti'=>'ic',
|
140
8
|
'ical'=>'ic', 'ful'=>'', 'ness'=>''</td>
|
141
|
-
</tr>
|
142
|
-
<tr class="top-aligned-row context-row">
|
143
|
-
<td class="context-item-name">SUFFIX_1_REGEXP</td>
|
144
|
-
<td>=</td>
|
145
|
-
<td class="context-item-value">/(
|
146
9
|
ational |
|
147
10
|
tional |
|
148
11
|
enci |
|
149
12
|
anci |
|
150
13
|
izer |
|
151
14
|
bli |
|
152
15
|
alli |
|
153
16
|
entli |
|
154
17
|
eli |
|
155
18
|
ousli |
|
156
19
|
ization |
|
157
20
|
ation |
|
158
21
|
ator |
|
159
22
|
alism |
|
160
23
|
iveness |
|
161
24
|
fulness |
|
162
25
|
ousness |
|
163
26
|
aliti |
|
164
27
|
iviti |
|
165
28
|
biliti |
|
166
29
|
logi)$/x</td>
|
167
|
-
</tr>
|
168
|
-
<tr class="top-aligned-row context-row">
|
169
|
-
<td class="context-item-name">SUFFIX_2_REGEXP</td>
|
170
|
-
<td>=</td>
|
171
|
-
<td class="context-item-value">/(
|
172
30
|
al |
|
173
31
|
ance |
|
174
32
|
ence |
|
175
33
|
er |
|
176
34
|
ic |
|
177
35
|
able |
|
178
36
|
ible |
|
179
37
|
ant |
|
180
38
|
ement |
|
181
39
|
ment |
|
182
40
|
ent |
|
183
41
|
ou |
|
184
42
|
ism |
|
185
43
|
ate |
|
186
44
|
iti |
|
187
45
|
ous |
|
188
46
|
ive |
|
189
47
|
ize)$/x</td>
|
190
|
-
</tr>
|
191
|
-
<tr class="top-aligned-row context-row">
|
192
|
-
<td class="context-item-name">C</td>
|
193
|
-
<td>=</td>
|
194
|
-
<td class="context-item-value">"[^aeiou]"</td>
|
195
|
-
</tr>
|
196
|
-
<tr class="top-aligned-row context-row">
|
197
|
-
<td class="context-item-name">V</td>
|
198
|
-
<td>=</td>
|
199
|
-
<td class="context-item-value">"[aeiouy]"</td>
|
200
|
-
</tr>
|
201
|
-
<tr class="top-aligned-row context-row">
|
202
|
-
<td class="context-item-name">CC</td>
|
203
|
-
<td>=</td>
|
204
|
-
<td class="context-item-value">"#{C}(?>[^aeiouy]*)"</td>
|
205
|
-
</tr>
|
206
|
-
<tr class="top-aligned-row context-row">
|
207
|
-
<td class="context-item-name">VV</td>
|
208
|
-
<td>=</td>
|
209
|
-
<td class="context-item-value">"#{V}(?>[aeiou]*)"</td>
|
210
|
-
</tr>
|
211
|
-
<tr class="top-aligned-row context-row">
|
212
|
-
<td class="context-item-name">MGR0</td>
|
213
|
-
<td>=</td>
|
214
|
-
<td class="context-item-value">/^(#{CC})?#{VV}#{CC}/o</td>
|
215
|
-
</tr>
|
216
|
-
<tr class="top-aligned-row context-row">
|
217
|
-
<td class="context-item-name">MEQ1</td>
|
218
|
-
<td>=</td>
|
219
|
-
<td class="context-item-value">/^(#{CC})?#{VV}#{CC}(#{VV})?$/o</td>
|
220
|
-
</tr>
|
221
|
-
<tr class="top-aligned-row context-row">
|
222
|
-
<td class="context-item-name">MGR1</td>
|
223
|
-
<td>=</td>
|
224
|
-
<td class="context-item-value">/^(#{CC})?#{VV}#{CC}#{VV}#{CC}/o</td>
|
225
|
-
</tr>
|
226
|
-
<tr class="top-aligned-row context-row">
|
227
|
-
<td class="context-item-name">VOWEL_IN_STEM</td>
|
228
|
-
<td>=</td>
|
229
|
-
<td class="context-item-value">/^(#{CC})?#{V}/o</td>
|
230
|
-
</tr>
|
231
|
-
</table>
|
232
|
-
</div>
|
233
|
-
</div>
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
<!-- if method_list -->
|
241
|
-
<div id="methods">
|
242
|
-
<h3 class="section-bar">Public Instance methods</h3>
|
243
|
-
|
244
|
-
<div id="method-M000004" class="method-detail">
|
245
|
-
<a name="M000004"></a>
|
246
|
-
|
247
|
-
<div class="method-heading">
|
248
|
-
<span class="method-name">stem</span><span class="method-args">()</span>
|
249
|
-
</div>
|
250
|
-
|
251
|
-
<div class="method-description">
|
252
|
-
<p>
|
253
|
-
Alias for <a href="Stemmable.html#M000003">stem_porter</a>
|
254
|
-
</p>
|
255
|
-
</div>
|
256
|
-
</div>
|
257
|
-
|
258
|
-
<div id="method-M000003" class="method-detail">
|
259
|
-
<a name="M000003"></a>
|
260
|
-
|
261
|
-
<div class="method-heading">
|
262
|
-
<a href="Stemmable.src/M000003.html" target="Code" class="method-signature"
|
263
|
-
onclick="popupCode('Stemmable.src/M000003.html');return false;">
|
264
|
-
<span class="method-name">stem_porter</span><span class="method-args">()</span>
|
265
|
-
</a>
|
266
|
-
</div>
|
267
|
-
|
268
|
-
<div class="method-description">
|
269
|
-
<p>
|
270
|
-
Stems the word contained in the current object. E.g.,
|
271
|
-
</p>
|
272
|
-
<pre>
|
273
|
-
"actually".stem_porter
|
274
|
-
=> "actual"
|
275
|
-
</pre>
|
276
|
-
</div>
|
277
|
-
</div>
|
278
|
-
|
279
|
-
|
280
|
-
</div>
|
281
|
-
|
282
|
-
|
283
|
-
</div>
|
284
|
-
|
285
|
-
|
286
|
-
<div id="validator-badges">
|
287
|
-
<p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p>
|
288
|
-
</div>
|
289
|
-
|
290
|
-
</body>
|
291
|
-
</html>
|