classifier 1.1.1 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +341 -0
- data/README +59 -6
- data/Rakefile +16 -4
- data/bin/bayes.rb +8 -2
- data/doc/classes/Classifier.html +15 -10
- data/doc/classes/Classifier/Bayes.html +68 -38
- data/doc/classes/Classifier/Bayes.src/{M000005.html → M000023.html} +1 -1
- data/doc/classes/Classifier/Bayes.src/{M000006.html → M000024.html} +1 -1
- data/doc/classes/Classifier/Bayes.src/M000025.html +30 -0
- data/doc/classes/Classifier/Bayes.src/{M000007.html → M000026.html} +1 -1
- data/doc/classes/Classifier/Bayes.src/{M000008.html → M000027.html} +1 -1
- data/doc/classes/Classifier/Bayes.src/{M000009.html → M000028.html} +4 -4
- data/doc/classes/Classifier/Bayes.src/{M000010.html → M000029.html} +2 -2
- data/doc/classes/Classifier/ContentNode.html +252 -0
- data/doc/classes/Classifier/ContentNode.src/M000031.html +21 -0
- data/doc/classes/Classifier/ContentNode.src/M000032.html +18 -0
- data/doc/classes/Classifier/ContentNode.src/M000033.html +18 -0
- data/doc/classes/Classifier/ContentNode.src/M000034.html +41 -0
- data/doc/classes/Classifier/LSI.html +449 -0
- data/doc/classes/Classifier/LSI.src/M000011.html +20 -0
- data/doc/classes/Classifier/LSI.src/M000012.html +18 -0
- data/doc/classes/Classifier/LSI.src/M000013.html +20 -0
- data/doc/classes/Classifier/LSI.src/M000014.html +18 -0
- data/doc/classes/Classifier/LSI.src/M000015.html +21 -0
- data/doc/classes/Classifier/LSI.src/M000016.html +18 -0
- data/doc/classes/Classifier/LSI.src/M000017.html +32 -0
- data/doc/classes/Classifier/LSI.src/M000018.html +26 -0
- data/doc/classes/Classifier/LSI.src/M000019.html +26 -0
- data/doc/classes/Classifier/LSI.src/M000020.html +23 -0
- data/doc/classes/Classifier/LSI.src/M000021.html +21 -0
- data/doc/classes/Classifier/LSI.src/M000022.html +31 -0
- data/doc/classes/Classifier/WordList.html +202 -0
- data/doc/classes/Classifier/WordList.src/M000007.html +18 -0
- data/doc/classes/Classifier/WordList.src/M000008.html +19 -0
- data/doc/classes/Classifier/WordList.src/M000009.html +19 -0
- data/doc/classes/Classifier/WordList.src/M000010.html +18 -0
- data/doc/classes/GSL.html +111 -0
- data/doc/classes/GSL/Vector.html +156 -0
- data/doc/classes/GSL/Vector.src/M000005.html +18 -0
- data/doc/classes/GSL/Vector.src/M000006.html +19 -0
- data/doc/classes/Object.html +139 -0
- data/doc/classes/Object.src/M000001.html +16 -0
- data/doc/classes/String.html +95 -9
- data/doc/classes/{Classifier/WordHash.src/M000001.html → String.src/M000002.html} +3 -3
- data/doc/classes/String.src/M000003.html +18 -0
- data/doc/classes/String.src/M000004.html +18 -0
- data/doc/created.rid +1 -1
- data/doc/files/README.html +102 -12
- data/doc/files/lib/classifier/bayes_rb.html +1 -1
- data/doc/files/lib/classifier/{string_extensions/porter_stemmer_rb.html → extensions/vector_serialize_rb.html} +4 -15
- data/doc/files/lib/classifier/{string_extensions → extensions}/word_hash_rb.html +2 -2
- data/doc/files/lib/classifier/extensions/word_list_rb.html +115 -0
- data/doc/files/lib/classifier/lsi/content_node_rb.html +115 -0
- data/doc/files/lib/classifier/lsi_rb.html +125 -0
- data/doc/files/lib/classifier/string_extensions_rb.html +2 -3
- data/doc/files/lib/classifier_rb.html +3 -1
- data/doc/fr_class_index.html +6 -2
- data/doc/fr_file_index.html +5 -2
- data/doc/fr_method_index.html +34 -11
- data/lib/classifier.rb +3 -1
- data/lib/classifier/bayes.rb +34 -9
- data/lib/classifier/extensions/vector_serialize.rb +14 -0
- data/lib/classifier/extensions/word_hash.rb +125 -0
- data/lib/classifier/extensions/word_list.rb +31 -0
- data/lib/classifier/lsi.rb +248 -0
- data/lib/classifier/lsi/content_node.rb +67 -0
- data/lib/classifier/string_extensions.rb +10 -5
- data/test/bayes/bayesian_test.rb +2 -2
- data/test/lsi/lsi_test.rb +88 -0
- data/test/string_extensions/word_hash_test.rb +7 -5
- metadata +79 -24
- data/doc/classes/Classifier/Stemmable.html +0 -243
- data/doc/classes/Classifier/Stemmable.src/M000003.html +0 -102
- data/doc/classes/Classifier/WordHash.html +0 -178
- data/doc/classes/Classifier/WordHash.src/M000002.html +0 -28
- data/lib/classifier/string_extensions/porter_stemmer.rb +0 -199
- data/lib/classifier/string_extensions/word_hash.rb +0 -119
@@ -2,10 +2,15 @@
|
|
2
2
|
# Copyright:: Copyright (c) 2005 Lucas Carlson
|
3
3
|
# License:: LGPL
|
4
4
|
|
5
|
-
|
6
|
-
|
5
|
+
begin
|
6
|
+
require_gem 'stemmer'
|
7
|
+
rescue LoadError
|
8
|
+
puts "Please install stemmer from http://rubyforge.org/projects/stemmer or 'gem install stemmer'"
|
9
|
+
exit(-1)
|
10
|
+
end
|
7
11
|
|
8
|
-
|
9
|
-
|
10
|
-
|
12
|
+
require 'classifier/extensions/word_hash'
|
13
|
+
|
14
|
+
class Object
|
15
|
+
def prepare_category_name; to_s.gsub("_"," ").capitalize.intern end
|
11
16
|
end
|
data/test/bayes/bayesian_test.rb
CHANGED
@@ -17,12 +17,12 @@ class BayesianTest < Test::Unit::TestCase
|
|
17
17
|
end
|
18
18
|
|
19
19
|
def test_categories
|
20
|
-
assert_equal ['Interesting', 'Uninteresting'], @classifier.categories
|
20
|
+
assert_equal ['Interesting', 'Uninteresting'].sort, @classifier.categories.sort
|
21
21
|
end
|
22
22
|
|
23
23
|
def test_add_category
|
24
24
|
@classifier.add_category 'Test'
|
25
|
-
assert_equal ['
|
25
|
+
assert_equal ['Test', 'Interesting', 'Uninteresting'].sort, @classifier.categories.sort
|
26
26
|
end
|
27
27
|
|
28
28
|
def test_classification
|
@@ -0,0 +1,88 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/../test_helper'
|
2
|
+
class LSITest < Test::Unit::TestCase
|
3
|
+
def setup
|
4
|
+
# we repeat principle words to help weight them.
|
5
|
+
# This test is rather delicate, since this system is mostly noise.
|
6
|
+
@str1 = "This text deals with dogs. Dogs."
|
7
|
+
@str2 = "This text involves dogs too. Dogs! "
|
8
|
+
@str3 = "This text revolves around cats. Cats."
|
9
|
+
@str4 = "This text also involves cats. Cats!"
|
10
|
+
@str5 = "This text involves birds. Birds."
|
11
|
+
end
|
12
|
+
|
13
|
+
def test_basic_indexing
|
14
|
+
lsi = Classifier::LSI.new
|
15
|
+
[@str1, @str2, @str3, @str4, @str5].each { |x| lsi << x }
|
16
|
+
assert ! lsi.needs_rebuild?
|
17
|
+
|
18
|
+
# note that the closest match to str1 is str2, even though it is not
|
19
|
+
# the closest text match.
|
20
|
+
assert_equal [@str2, @str5, @str3], lsi.find_related(@str1, 3)
|
21
|
+
end
|
22
|
+
|
23
|
+
def test_not_auto_rebuild
|
24
|
+
lsi = Classifier::LSI.new :auto_rebuild => false
|
25
|
+
lsi.add_item @str1, "Dog"
|
26
|
+
assert lsi.needs_rebuild?
|
27
|
+
lsi.build_index
|
28
|
+
assert ! lsi.needs_rebuild?
|
29
|
+
end
|
30
|
+
|
31
|
+
def test_basic_categorizing
|
32
|
+
lsi = Classifier::LSI.new
|
33
|
+
lsi.add_item @str2, "Dog"
|
34
|
+
lsi.add_item @str3, "Cat"
|
35
|
+
lsi.add_item @str4, "Cat"
|
36
|
+
lsi.add_item @str5, "Bird"
|
37
|
+
|
38
|
+
assert_equal "Dog", lsi.classify( @str1 )
|
39
|
+
assert_equal "Cat", lsi.classify( @str3 )
|
40
|
+
assert_equal "Bird", lsi.classify( @str5 )
|
41
|
+
end
|
42
|
+
|
43
|
+
def test_external_classifying
|
44
|
+
lsi = Classifier::LSI.new
|
45
|
+
bayes = Classifier::Bayes.new 'Dog', 'Cat', 'Bird'
|
46
|
+
lsi.add_item @str1, "Dog" ; bayes.train_dog @str1
|
47
|
+
lsi.add_item @str2, "Dog" ; bayes.train_dog @str2
|
48
|
+
lsi.add_item @str3, "Cat" ; bayes.train_cat @str3
|
49
|
+
lsi.add_item @str4, "Cat" ; bayes.train_cat @str4
|
50
|
+
lsi.add_item @str5, "Bird" ; bayes.train_bird @str5
|
51
|
+
|
52
|
+
# We're talking about dogs. Even though the text matches the corpus on
|
53
|
+
# cats better. Dogs have more semantic weight than cats. So bayes
|
54
|
+
# will fail here, but the LSI recognizes content.
|
55
|
+
tricky_case = "This text revolves around dogs."
|
56
|
+
assert_equal "Dog", lsi.classify( tricky_case )
|
57
|
+
assert_not_equal "Dog", bayes.classify( tricky_case )
|
58
|
+
end
|
59
|
+
|
60
|
+
def test_search
|
61
|
+
lsi = Classifier::LSI.new
|
62
|
+
[@str1, @str2, @str3, @str4, @str5].each { |x| lsi << x }
|
63
|
+
|
64
|
+
# Searching by content and text, note that @str2 comes up first, because
|
65
|
+
# both "dog" and "involve" are present. But, the next match is @str1 instead
|
66
|
+
# of @str4, because "dog" carries more weight than involves.
|
67
|
+
assert_equal( [@str2, @str1, @str4, @str5, @str3],
|
68
|
+
lsi.search("dog involves", 100) )
|
69
|
+
|
70
|
+
# Keyword search shows how the space is mapped out in relation to
|
71
|
+
# dog when magnitude is remove. Note the relations. We move from dog
|
72
|
+
# through involve and then finally to other words.
|
73
|
+
assert_equal( [@str1, @str2, @str4, @str5, @str3],
|
74
|
+
lsi.search("dog", 5) )
|
75
|
+
end
|
76
|
+
|
77
|
+
def test_serialize_safe
|
78
|
+
lsi = Classifier::LSI.new
|
79
|
+
[@str1, @str2, @str3, @str4, @str5].each { |x| lsi << x }
|
80
|
+
|
81
|
+
lsi_md = Marshal.dump lsi
|
82
|
+
lsi_m = Marshal.load lsi_md
|
83
|
+
|
84
|
+
assert_equal lsi_m.search("cat", 3), lsi.search("cat", 3)
|
85
|
+
assert_equal lsi_m.find_related(@str1, 3), lsi.find_related(@str1, 3)
|
86
|
+
end
|
87
|
+
|
88
|
+
end
|
@@ -1,12 +1,14 @@
|
|
1
1
|
require File.dirname(__FILE__) + '/../test_helper'
|
2
2
|
class StringExtensionsTest < Test::Unit::TestCase
|
3
3
|
def test_word_hash
|
4
|
-
hash = {:good=>1, :"!"=>1, :hope=>1, :"'."=>1, :love=>1, :word=>1, :them=>1, :test=>1}
|
4
|
+
hash = {:good=>1, :"!"=>1, :hope=>1, :"'"=>1, :"."=>1, :love=>1, :word=>1, :them=>1, :test=>1}
|
5
5
|
assert_equal hash, "here are some good words of test's. I hope you love them!".word_hash
|
6
6
|
end
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
7
|
+
|
8
|
+
|
9
|
+
def test_clean_word_hash
|
10
|
+
hash = {:good=>1, :word=>1, :hope=>1, :love=>1, :them=>1, :test=>1}
|
11
|
+
assert_equal hash, "here are some good words of test's. I hope you love them!".clean_word_hash
|
11
12
|
end
|
13
|
+
|
12
14
|
end
|
metadata
CHANGED
@@ -3,13 +3,13 @@ rubygems_version: 0.8.6
|
|
3
3
|
specification_version: 1
|
4
4
|
name: classifier
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 1.
|
7
|
-
date: 2005-04-
|
6
|
+
version: 1.2.0
|
7
|
+
date: 2005-04-24
|
8
8
|
summary: A general classifier module to allow Bayesian and other types of classifications.
|
9
9
|
require_paths:
|
10
10
|
- lib
|
11
11
|
email: lucas@rufy.com
|
12
|
-
homepage: http://
|
12
|
+
homepage: http://classifier.rufy.com/
|
13
13
|
rubyforge_project:
|
14
14
|
description: A general classifier module to allow Bayesian and other types of classifications.
|
15
15
|
autorequire: classifier
|
@@ -30,16 +30,23 @@ files:
|
|
30
30
|
- lib/classifier
|
31
31
|
- lib/classifier.rb
|
32
32
|
- lib/classifier/bayes.rb
|
33
|
-
- lib/classifier/
|
33
|
+
- lib/classifier/extensions
|
34
|
+
- lib/classifier/lsi
|
35
|
+
- lib/classifier/lsi.rb
|
34
36
|
- lib/classifier/string_extensions.rb
|
35
|
-
- lib/classifier/
|
36
|
-
- lib/classifier/
|
37
|
+
- lib/classifier/extensions/vector_serialize.rb
|
38
|
+
- lib/classifier/extensions/word_hash.rb
|
39
|
+
- lib/classifier/extensions/word_list.rb
|
40
|
+
- lib/classifier/lsi/content_node.rb
|
37
41
|
- bin/bayes.rb
|
38
42
|
- test/bayes
|
43
|
+
- test/lsi
|
39
44
|
- test/string_extensions
|
40
45
|
- test/test_helper.rb
|
41
46
|
- test/bayes/bayesian_test.rb
|
47
|
+
- test/lsi/lsi_test.rb
|
42
48
|
- test/string_extensions/word_hash_test.rb
|
49
|
+
- LICENSE
|
43
50
|
- Rakefile
|
44
51
|
- README
|
45
52
|
- doc/classes
|
@@ -52,35 +59,83 @@ files:
|
|
52
59
|
- doc/rdoc-style.css
|
53
60
|
- doc/classes/Classifier
|
54
61
|
- doc/classes/Classifier.html
|
62
|
+
- doc/classes/GSL
|
63
|
+
- doc/classes/GSL.html
|
64
|
+
- doc/classes/Object.html
|
65
|
+
- doc/classes/Object.src
|
55
66
|
- doc/classes/String.html
|
67
|
+
- doc/classes/String.src
|
56
68
|
- doc/classes/Classifier/Bayes.html
|
57
69
|
- doc/classes/Classifier/Bayes.src
|
58
|
-
- doc/classes/Classifier/
|
59
|
-
- doc/classes/Classifier/
|
60
|
-
- doc/classes/Classifier/
|
61
|
-
- doc/classes/Classifier/
|
62
|
-
- doc/classes/Classifier/
|
63
|
-
- doc/classes/Classifier/
|
64
|
-
- doc/classes/Classifier/Bayes.src/
|
65
|
-
- doc/classes/Classifier/Bayes.src/
|
66
|
-
- doc/classes/Classifier/Bayes.src/
|
67
|
-
- doc/classes/Classifier/Bayes.src/
|
68
|
-
- doc/classes/Classifier/
|
69
|
-
- doc/classes/Classifier/
|
70
|
-
- doc/classes/Classifier/
|
70
|
+
- doc/classes/Classifier/ContentNode.html
|
71
|
+
- doc/classes/Classifier/ContentNode.src
|
72
|
+
- doc/classes/Classifier/LSI.html
|
73
|
+
- doc/classes/Classifier/LSI.src
|
74
|
+
- doc/classes/Classifier/WordList.html
|
75
|
+
- doc/classes/Classifier/WordList.src
|
76
|
+
- doc/classes/Classifier/Bayes.src/M000023.html
|
77
|
+
- doc/classes/Classifier/Bayes.src/M000024.html
|
78
|
+
- doc/classes/Classifier/Bayes.src/M000025.html
|
79
|
+
- doc/classes/Classifier/Bayes.src/M000026.html
|
80
|
+
- doc/classes/Classifier/Bayes.src/M000027.html
|
81
|
+
- doc/classes/Classifier/Bayes.src/M000028.html
|
82
|
+
- doc/classes/Classifier/Bayes.src/M000029.html
|
83
|
+
- doc/classes/Classifier/ContentNode.src/M000031.html
|
84
|
+
- doc/classes/Classifier/ContentNode.src/M000032.html
|
85
|
+
- doc/classes/Classifier/ContentNode.src/M000033.html
|
86
|
+
- doc/classes/Classifier/ContentNode.src/M000034.html
|
87
|
+
- doc/classes/Classifier/LSI.src/M000011.html
|
88
|
+
- doc/classes/Classifier/LSI.src/M000012.html
|
89
|
+
- doc/classes/Classifier/LSI.src/M000013.html
|
90
|
+
- doc/classes/Classifier/LSI.src/M000014.html
|
91
|
+
- doc/classes/Classifier/LSI.src/M000015.html
|
92
|
+
- doc/classes/Classifier/LSI.src/M000016.html
|
93
|
+
- doc/classes/Classifier/LSI.src/M000017.html
|
94
|
+
- doc/classes/Classifier/LSI.src/M000018.html
|
95
|
+
- doc/classes/Classifier/LSI.src/M000019.html
|
96
|
+
- doc/classes/Classifier/LSI.src/M000020.html
|
97
|
+
- doc/classes/Classifier/LSI.src/M000021.html
|
98
|
+
- doc/classes/Classifier/LSI.src/M000022.html
|
99
|
+
- doc/classes/Classifier/WordList.src/M000007.html
|
100
|
+
- doc/classes/Classifier/WordList.src/M000008.html
|
101
|
+
- doc/classes/Classifier/WordList.src/M000009.html
|
102
|
+
- doc/classes/Classifier/WordList.src/M000010.html
|
103
|
+
- doc/classes/GSL/Vector.html
|
104
|
+
- doc/classes/GSL/Vector.src
|
105
|
+
- doc/classes/GSL/Vector.src/M000005.html
|
106
|
+
- doc/classes/GSL/Vector.src/M000006.html
|
107
|
+
- doc/classes/Object.src/M000001.html
|
108
|
+
- doc/classes/String.src/M000002.html
|
109
|
+
- doc/classes/String.src/M000003.html
|
110
|
+
- doc/classes/String.src/M000004.html
|
71
111
|
- doc/files/lib
|
72
112
|
- doc/files/README.html
|
73
113
|
- doc/files/lib/classifier
|
74
114
|
- doc/files/lib/classifier_rb.html
|
75
115
|
- doc/files/lib/classifier/bayes_rb.html
|
76
|
-
- doc/files/lib/classifier/
|
116
|
+
- doc/files/lib/classifier/extensions
|
117
|
+
- doc/files/lib/classifier/lsi
|
118
|
+
- doc/files/lib/classifier/lsi_rb.html
|
77
119
|
- doc/files/lib/classifier/string_extensions_rb.html
|
78
|
-
- doc/files/lib/classifier/
|
79
|
-
- doc/files/lib/classifier/
|
120
|
+
- doc/files/lib/classifier/extensions/vector_serialize_rb.html
|
121
|
+
- doc/files/lib/classifier/extensions/word_hash_rb.html
|
122
|
+
- doc/files/lib/classifier/extensions/word_list_rb.html
|
123
|
+
- doc/files/lib/classifier/lsi/content_node_rb.html
|
80
124
|
test_files: []
|
81
125
|
rdoc_options: []
|
82
126
|
extra_rdoc_files: []
|
83
127
|
executables: []
|
84
128
|
extensions: []
|
85
|
-
requirements:
|
86
|
-
|
129
|
+
requirements:
|
130
|
+
- A porter-stemmer module to split word stems.
|
131
|
+
dependencies:
|
132
|
+
- !ruby/object:Gem::Dependency
|
133
|
+
name: stemmer
|
134
|
+
version_requirement:
|
135
|
+
version_requirements: !ruby/object:Gem::Version::Requirement
|
136
|
+
requirements:
|
137
|
+
-
|
138
|
+
- ">="
|
139
|
+
- !ruby/object:Gem::Version
|
140
|
+
version: 1.0.0
|
141
|
+
version:
|
@@ -1,243 +0,0 @@
|
|
1
|
-
<?xml version="1.0" encoding="iso-8859-1"?>
|
2
|
-
<!DOCTYPE html
|
3
|
-
PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
|
4
|
-
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
5
|
-
|
6
|
-
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
|
7
|
-
<head>
|
8
|
-
<title>Module: Classifier::Stemmable</title>
|
9
|
-
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
|
10
|
-
<meta http-equiv="Content-Script-Type" content="text/javascript" />
|
11
|
-
<link rel="stylesheet" href="../.././rdoc-style.css" type="text/css" media="screen" />
|
12
|
-
<script type="text/javascript">
|
13
|
-
// <![CDATA[
|
14
|
-
|
15
|
-
function popupCode( url ) {
|
16
|
-
window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400")
|
17
|
-
}
|
18
|
-
|
19
|
-
function toggleCode( id ) {
|
20
|
-
if ( document.getElementById )
|
21
|
-
elem = document.getElementById( id );
|
22
|
-
else if ( document.all )
|
23
|
-
elem = eval( "document.all." + id );
|
24
|
-
else
|
25
|
-
return false;
|
26
|
-
|
27
|
-
elemStyle = elem.style;
|
28
|
-
|
29
|
-
if ( elemStyle.display != "block" ) {
|
30
|
-
elemStyle.display = "block"
|
31
|
-
} else {
|
32
|
-
elemStyle.display = "none"
|
33
|
-
}
|
34
|
-
|
35
|
-
return true;
|
36
|
-
}
|
37
|
-
|
38
|
-
// Make codeblocks hidden by default
|
39
|
-
document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }</style>" )
|
40
|
-
|
41
|
-
// ]]>
|
42
|
-
</script>
|
43
|
-
|
44
|
-
</head>
|
45
|
-
<body>
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
<div id="classHeader">
|
50
|
-
<table class="header-table">
|
51
|
-
<tr class="top-aligned-row">
|
52
|
-
<td><strong>Module</strong></td>
|
53
|
-
<td class="class-name-in-header">Classifier::Stemmable</td>
|
54
|
-
</tr>
|
55
|
-
<tr class="top-aligned-row">
|
56
|
-
<td><strong>In:</strong></td>
|
57
|
-
<td>
|
58
|
-
<a href="../../files/lib/classifier/string_extensions/porter_stemmer_rb.html">
|
59
|
-
lib/classifier/string_extensions/porter_stemmer.rb
|
60
|
-
</a>
|
61
|
-
<br />
|
62
|
-
</td>
|
63
|
-
</tr>
|
64
|
-
|
65
|
-
</table>
|
66
|
-
</div>
|
67
|
-
<!-- banner header -->
|
68
|
-
|
69
|
-
<div id="bodyContent">
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
<div id="contextContent">
|
74
|
-
|
75
|
-
<div id="description">
|
76
|
-
<p>
|
77
|
-
Porter stemmer in Ruby.
|
78
|
-
</p>
|
79
|
-
<p>
|
80
|
-
This is the Porter stemming algorithm, ported to Ruby from the version
|
81
|
-
coded up in Perl. It’s easy to follow against the rules in the
|
82
|
-
original paper in:
|
83
|
-
</p>
|
84
|
-
<pre>
|
85
|
-
Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
|
86
|
-
no. 3, pp 130-137,
|
87
|
-
</pre>
|
88
|
-
<p>
|
89
|
-
See also <a
|
90
|
-
href="http://www.tartarus.org/~martin/PorterStemmer">www.tartarus.org/~martin/PorterStemmer</a>
|
91
|
-
</p>
|
92
|
-
<p>
|
93
|
-
Send comments to raypereda@hotmail.com
|
94
|
-
</p>
|
95
|
-
|
96
|
-
</div>
|
97
|
-
|
98
|
-
|
99
|
-
</div>
|
100
|
-
|
101
|
-
<div id="method-list">
|
102
|
-
<h3 class="section-bar">Methods</h3>
|
103
|
-
|
104
|
-
<div class="name-list">
|
105
|
-
<a href="#M000004">stem</a>
|
106
|
-
<a href="#M000003">stem_porter</a>
|
107
|
-
</div>
|
108
|
-
</div>
|
109
|
-
|
110
|
-
</div>
|
111
|
-
|
112
|
-
|
113
|
-
<!-- if includes -->
|
114
|
-
|
115
|
-
<div id="section">
|
116
|
-
|
117
|
-
|
118
|
-
<div id="constants-list">
|
119
|
-
<h3 class="section-bar">Constants</h3>
|
120
|
-
|
121
|
-
<div class="name-list">
|
122
|
-
<table summary="Constants">
|
123
|
-
<tr class="top-aligned-row context-row">
|
124
|
-
<td class="context-item-name">STEP_2_LIST</td>
|
125
|
-
<td>=</td>
|
126
|
-
<td class="context-item-value">{
|
127
|
'ational'=>'ate', 'tional'=>'tion', 'enci'=>'ence', 'anci'=>'ance',
|
128
1
|
'izer'=>'ize', 'bli'=>'ble',
|
129
2
|
'alli'=>'al', 'entli'=>'ent', 'eli'=>'e', 'ousli'=>'ous',
|
130
3
|
'ization'=>'ize', 'ation'=>'ate',
|
131
4
|
'ator'=>'ate', 'alism'=>'al', 'iveness'=>'ive', 'fulness'=>'ful',
|
132
5
|
'ousness'=>'ous', 'aliti'=>'al',
|
133
6
|
'iviti'=>'ive', 'biliti'=>'ble', 'logi'=>'log'</td>
|
134
|
-
</tr>
|
135
|
-
<tr class="top-aligned-row context-row">
|
136
|
-
<td class="context-item-name">STEP_3_LIST</td>
|
137
|
-
<td>=</td>
|
138
|
-
<td class="context-item-value">{
|
139
7
|
'icate'=>'ic', 'ative'=>'', 'alize'=>'al', 'iciti'=>'ic',
|
140
8
|
'ical'=>'ic', 'ful'=>'', 'ness'=>''</td>
|
141
|
-
</tr>
|
142
|
-
<tr class="top-aligned-row context-row">
|
143
|
-
<td class="context-item-name">SUFFIX_1_REGEXP</td>
|
144
|
-
<td>=</td>
|
145
|
-
<td class="context-item-value">/(
|
146
9
|
ational |
|
147
10
|
tional |
|
148
11
|
enci |
|
149
12
|
anci |
|
150
13
|
izer |
|
151
14
|
bli |
|
152
15
|
alli |
|
153
16
|
entli |
|
154
17
|
eli |
|
155
18
|
ousli |
|
156
19
|
ization |
|
157
20
|
ation |
|
158
21
|
ator |
|
159
22
|
alism |
|
160
23
|
iveness |
|
161
24
|
fulness |
|
162
25
|
ousness |
|
163
26
|
aliti |
|
164
27
|
iviti |
|
165
28
|
biliti |
|
166
29
|
logi)$/x</td>
|
167
|
-
</tr>
|
168
|
-
<tr class="top-aligned-row context-row">
|
169
|
-
<td class="context-item-name">SUFFIX_2_REGEXP</td>
|
170
|
-
<td>=</td>
|
171
|
-
<td class="context-item-value">/(
|
172
30
|
al |
|
173
31
|
ance |
|
174
32
|
ence |
|
175
33
|
er |
|
176
34
|
ic |
|
177
35
|
able |
|
178
36
|
ible |
|
179
37
|
ant |
|
180
38
|
ement |
|
181
39
|
ment |
|
182
40
|
ent |
|
183
41
|
ou |
|
184
42
|
ism |
|
185
43
|
ate |
|
186
44
|
iti |
|
187
45
|
ous |
|
188
46
|
ive |
|
189
47
|
ize)$/x</td>
|
190
|
-
</tr>
|
191
|
-
<tr class="top-aligned-row context-row">
|
192
|
-
<td class="context-item-name">C</td>
|
193
|
-
<td>=</td>
|
194
|
-
<td class="context-item-value">"[^aeiou]"</td>
|
195
|
-
</tr>
|
196
|
-
<tr class="top-aligned-row context-row">
|
197
|
-
<td class="context-item-name">V</td>
|
198
|
-
<td>=</td>
|
199
|
-
<td class="context-item-value">"[aeiouy]"</td>
|
200
|
-
</tr>
|
201
|
-
<tr class="top-aligned-row context-row">
|
202
|
-
<td class="context-item-name">CC</td>
|
203
|
-
<td>=</td>
|
204
|
-
<td class="context-item-value">"#{C}(?>[^aeiouy]*)"</td>
|
205
|
-
</tr>
|
206
|
-
<tr class="top-aligned-row context-row">
|
207
|
-
<td class="context-item-name">VV</td>
|
208
|
-
<td>=</td>
|
209
|
-
<td class="context-item-value">"#{V}(?>[aeiou]*)"</td>
|
210
|
-
</tr>
|
211
|
-
<tr class="top-aligned-row context-row">
|
212
|
-
<td class="context-item-name">MGR0</td>
|
213
|
-
<td>=</td>
|
214
|
-
<td class="context-item-value">/^(#{CC})?#{VV}#{CC}/o</td>
|
215
|
-
</tr>
|
216
|
-
<tr class="top-aligned-row context-row">
|
217
|
-
<td class="context-item-name">MEQ1</td>
|
218
|
-
<td>=</td>
|
219
|
-
<td class="context-item-value">/^(#{CC})?#{VV}#{CC}(#{VV})?$/o</td>
|
220
|
-
</tr>
|
221
|
-
<tr class="top-aligned-row context-row">
|
222
|
-
<td class="context-item-name">MGR1</td>
|
223
|
-
<td>=</td>
|
224
|
-
<td class="context-item-value">/^(#{CC})?#{VV}#{CC}#{VV}#{CC}/o</td>
|
225
|
-
</tr>
|
226
|
-
<tr class="top-aligned-row context-row">
|
227
|
-
<td class="context-item-name">VOWEL_IN_STEM</td>
|
228
|
-
<td>=</td>
|
229
|
-
<td class="context-item-value">/^(#{CC})?#{V}/o</td>
|
230
|
-
</tr>
|
231
|
-
</table>
|
232
|
-
</div>
|
233
|
-
</div>
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
<!-- if method_list -->
|
241
|
-
<div id="methods">
|
242
|
-
<h3 class="section-bar">Public Instance methods</h3>
|
243
|
-
|
244
|
-
<div id="method-M000004" class="method-detail">
|
245
|
-
<a name="M000004"></a>
|
246
|
-
|
247
|
-
<div class="method-heading">
|
248
|
-
<span class="method-name">stem</span><span class="method-args">()</span>
|
249
|
-
</div>
|
250
|
-
|
251
|
-
<div class="method-description">
|
252
|
-
<p>
|
253
|
-
Alias for <a href="Stemmable.html#M000003">stem_porter</a>
|
254
|
-
</p>
|
255
|
-
</div>
|
256
|
-
</div>
|
257
|
-
|
258
|
-
<div id="method-M000003" class="method-detail">
|
259
|
-
<a name="M000003"></a>
|
260
|
-
|
261
|
-
<div class="method-heading">
|
262
|
-
<a href="Stemmable.src/M000003.html" target="Code" class="method-signature"
|
263
|
-
onclick="popupCode('Stemmable.src/M000003.html');return false;">
|
264
|
-
<span class="method-name">stem_porter</span><span class="method-args">()</span>
|
265
|
-
</a>
|
266
|
-
</div>
|
267
|
-
|
268
|
-
<div class="method-description">
|
269
|
-
<p>
|
270
|
-
Stems the word contained in the current object. E.g.,
|
271
|
-
</p>
|
272
|
-
<pre>
|
273
|
-
"actually".stem_porter
|
274
|
-
=> "actual"
|
275
|
-
</pre>
|
276
|
-
</div>
|
277
|
-
</div>
|
278
|
-
|
279
|
-
|
280
|
-
</div>
|
281
|
-
|
282
|
-
|
283
|
-
</div>
|
284
|
-
|
285
|
-
|
286
|
-
<div id="validator-badges">
|
287
|
-
<p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p>
|
288
|
-
</div>
|
289
|
-
|
290
|
-
</body>
|
291
|
-
</html>
|