classifier 1.1.1 → 1.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +341 -0
- data/README +59 -6
- data/Rakefile +16 -4
- data/bin/bayes.rb +8 -2
- data/doc/classes/Classifier.html +15 -10
- data/doc/classes/Classifier/Bayes.html +68 -38
- data/doc/classes/Classifier/Bayes.src/{M000005.html → M000023.html} +1 -1
- data/doc/classes/Classifier/Bayes.src/{M000006.html → M000024.html} +1 -1
- data/doc/classes/Classifier/Bayes.src/M000025.html +30 -0
- data/doc/classes/Classifier/Bayes.src/{M000007.html → M000026.html} +1 -1
- data/doc/classes/Classifier/Bayes.src/{M000008.html → M000027.html} +1 -1
- data/doc/classes/Classifier/Bayes.src/{M000009.html → M000028.html} +4 -4
- data/doc/classes/Classifier/Bayes.src/{M000010.html → M000029.html} +2 -2
- data/doc/classes/Classifier/ContentNode.html +252 -0
- data/doc/classes/Classifier/ContentNode.src/M000031.html +21 -0
- data/doc/classes/Classifier/ContentNode.src/M000032.html +18 -0
- data/doc/classes/Classifier/ContentNode.src/M000033.html +18 -0
- data/doc/classes/Classifier/ContentNode.src/M000034.html +41 -0
- data/doc/classes/Classifier/LSI.html +449 -0
- data/doc/classes/Classifier/LSI.src/M000011.html +20 -0
- data/doc/classes/Classifier/LSI.src/M000012.html +18 -0
- data/doc/classes/Classifier/LSI.src/M000013.html +20 -0
- data/doc/classes/Classifier/LSI.src/M000014.html +18 -0
- data/doc/classes/Classifier/LSI.src/M000015.html +21 -0
- data/doc/classes/Classifier/LSI.src/M000016.html +18 -0
- data/doc/classes/Classifier/LSI.src/M000017.html +32 -0
- data/doc/classes/Classifier/LSI.src/M000018.html +26 -0
- data/doc/classes/Classifier/LSI.src/M000019.html +26 -0
- data/doc/classes/Classifier/LSI.src/M000020.html +23 -0
- data/doc/classes/Classifier/LSI.src/M000021.html +21 -0
- data/doc/classes/Classifier/LSI.src/M000022.html +31 -0
- data/doc/classes/Classifier/WordList.html +202 -0
- data/doc/classes/Classifier/WordList.src/M000007.html +18 -0
- data/doc/classes/Classifier/WordList.src/M000008.html +19 -0
- data/doc/classes/Classifier/WordList.src/M000009.html +19 -0
- data/doc/classes/Classifier/WordList.src/M000010.html +18 -0
- data/doc/classes/GSL.html +111 -0
- data/doc/classes/GSL/Vector.html +156 -0
- data/doc/classes/GSL/Vector.src/M000005.html +18 -0
- data/doc/classes/GSL/Vector.src/M000006.html +19 -0
- data/doc/classes/Object.html +139 -0
- data/doc/classes/Object.src/M000001.html +16 -0
- data/doc/classes/String.html +95 -9
- data/doc/classes/{Classifier/WordHash.src/M000001.html → String.src/M000002.html} +3 -3
- data/doc/classes/String.src/M000003.html +18 -0
- data/doc/classes/String.src/M000004.html +18 -0
- data/doc/created.rid +1 -1
- data/doc/files/README.html +102 -12
- data/doc/files/lib/classifier/bayes_rb.html +1 -1
- data/doc/files/lib/classifier/{string_extensions/porter_stemmer_rb.html → extensions/vector_serialize_rb.html} +4 -15
- data/doc/files/lib/classifier/{string_extensions → extensions}/word_hash_rb.html +2 -2
- data/doc/files/lib/classifier/extensions/word_list_rb.html +115 -0
- data/doc/files/lib/classifier/lsi/content_node_rb.html +115 -0
- data/doc/files/lib/classifier/lsi_rb.html +125 -0
- data/doc/files/lib/classifier/string_extensions_rb.html +2 -3
- data/doc/files/lib/classifier_rb.html +3 -1
- data/doc/fr_class_index.html +6 -2
- data/doc/fr_file_index.html +5 -2
- data/doc/fr_method_index.html +34 -11
- data/lib/classifier.rb +3 -1
- data/lib/classifier/bayes.rb +34 -9
- data/lib/classifier/extensions/vector_serialize.rb +14 -0
- data/lib/classifier/extensions/word_hash.rb +125 -0
- data/lib/classifier/extensions/word_list.rb +31 -0
- data/lib/classifier/lsi.rb +248 -0
- data/lib/classifier/lsi/content_node.rb +67 -0
- data/lib/classifier/string_extensions.rb +10 -5
- data/test/bayes/bayesian_test.rb +2 -2
- data/test/lsi/lsi_test.rb +88 -0
- data/test/string_extensions/word_hash_test.rb +7 -5
- metadata +79 -24
- data/doc/classes/Classifier/Stemmable.html +0 -243
- data/doc/classes/Classifier/Stemmable.src/M000003.html +0 -102
- data/doc/classes/Classifier/WordHash.html +0 -178
- data/doc/classes/Classifier/WordHash.src/M000002.html +0 -28
- data/lib/classifier/string_extensions/porter_stemmer.rb +0 -199
- data/lib/classifier/string_extensions/word_hash.rb +0 -119
@@ -1,102 +0,0 @@
|
|
1
|
-
<?xml version="1.0" encoding="iso-8859-1"?>
|
2
|
-
<!DOCTYPE html
|
3
|
-
PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
|
4
|
-
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
5
|
-
|
6
|
-
<html>
|
7
|
-
<head>
|
8
|
-
<title>stem_porter (Classifier::Stemmable)</title>
|
9
|
-
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
|
10
|
-
<link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" />
|
11
|
-
</head>
|
12
|
-
<body class="standalone-code">
|
13
|
-
<pre><span class="ruby-comment cmt"># File lib/classifier/string_extensions/porter_stemmer.rb, line 102</span>
|
14
|
-
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">stem_porter</span>
|
15
|
-
|
16
|
-
<span class="ruby-comment cmt"># make a copy of the given object and convert it to a string.
|
17
|
-
<span class="ruby-identifier">w</span> = <span class="ruby-keyword kw">self</span>.<span class="ruby-identifier">dup</span>.<span class="ruby-identifier">to_str</span>
|
18
|
-
|
19
|
-
<span class="ruby-keyword kw">return</span> <span class="ruby-identifier">w</span> <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">w</span>.<span class="ruby-identifier">length</span> <span class="ruby-operator"><</span> <span class="ruby-value">3</span>
|
20
|
-
|
21
|
-
<span class="ruby-comment cmt"># now map initial y to Y so that the patterns never treat it as vowel
|
22
|
-
<span class="ruby-identifier">w</span>[<span class="ruby-value">0</span>] = <span class="ruby-value str">'Y'</span> <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">w</span>[<span class="ruby-value">0</span>] <span class="ruby-operator">==</span> <span class="ruby-value">?y</span>
|
23
|
-
|
24
|
-
<span class="ruby-comment cmt"># Step 1a
|
25
|
-
<span class="ruby-keyword kw">if</span> <span class="ruby-identifier">w</span> <span class="ruby-operator">=~</span> <span class="ruby-regexp re">/(ss|i)es$/</span>
|
26
|
-
<span class="ruby-identifier">w</span> = <span class="ruby-identifier">$`</span> <span class="ruby-operator">+</span> <span class="ruby-identifier">$1</span>
|
27
|
-
<span class="ruby-keyword kw">elsif</span> <span class="ruby-identifier">w</span> <span class="ruby-operator">=~</span> <span class="ruby-regexp re">/([^s])s$/</span>
|
28
|
-
<span class="ruby-identifier">w</span> = <span class="ruby-identifier">$`</span> <span class="ruby-operator">+</span> <span class="ruby-identifier">$1</span>
|
29
|
-
<span class="ruby-keyword kw">end</span>
|
30
|
-
|
31
|
-
<span class="ruby-comment cmt"># Step 1b
|
32
|
-
<span class="ruby-keyword kw">if</span> <span class="ruby-identifier">w</span> <span class="ruby-operator">=~</span> <span class="ruby-regexp re">/eed$/</span>
|
33
|
-
<span class="ruby-identifier">w</span>.<span class="ruby-identifier">chop!</span> <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">$`</span> <span class="ruby-operator">=~</span> <span class="ruby-constant">MGR0</span>
|
34
|
-
<span class="ruby-keyword kw">elsif</span> <span class="ruby-identifier">w</span> <span class="ruby-operator">=~</span> <span class="ruby-regexp re">/(ed|ing)$/</span>
|
35
|
-
<span class="ruby-identifier">stem</span> = <span class="ruby-identifier">$`</span>
|
36
|
-
<span class="ruby-keyword kw">if</span> <span class="ruby-identifier">stem</span> <span class="ruby-operator">=~</span> <span class="ruby-constant">VOWEL_IN_STEM</span>
|
37
|
-
<span class="ruby-identifier">w</span> = <span class="ruby-identifier">stem</span>
|
38
|
-
<span class="ruby-keyword kw">case</span> <span class="ruby-identifier">w</span>
|
39
|
-
<span class="ruby-keyword kw">when</span> <span class="ruby-regexp re">/(at|bl|iz)$/</span> <span class="ruby-keyword kw">then</span> <span class="ruby-identifier">w</span> <span class="ruby-operator"><<</span> <span class="ruby-value str">"e"</span>
|
40
|
-
<span class="ruby-keyword kw">when</span> <span class="ruby-regexp re">/([^aeiouylsz])\1$/</span> <span class="ruby-keyword kw">then</span> <span class="ruby-identifier">w</span>.<span class="ruby-identifier">chop!</span>
|
41
|
-
<span class="ruby-keyword kw">when</span> <span class="ruby-node">/^#{CC}#{V}[^aeiouwxy]$/o</span> <span class="ruby-keyword kw">then</span> <span class="ruby-identifier">w</span> <span class="ruby-operator"><<</span> <span class="ruby-value str">"e"</span>
|
42
|
-
<span class="ruby-keyword kw">end</span>
|
43
|
-
<span class="ruby-keyword kw">end</span>
|
44
|
-
<span class="ruby-keyword kw">end</span>
|
45
|
-
|
46
|
-
<span class="ruby-keyword kw">if</span> <span class="ruby-identifier">w</span> <span class="ruby-operator">=~</span> <span class="ruby-regexp re">/y$/</span>
|
47
|
-
<span class="ruby-identifier">stem</span> = <span class="ruby-identifier">$`</span>
|
48
|
-
<span class="ruby-identifier">w</span> = <span class="ruby-identifier">stem</span> <span class="ruby-operator">+</span> <span class="ruby-value str">"i"</span> <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">stem</span> <span class="ruby-operator">=~</span> <span class="ruby-constant">VOWEL_IN_STEM</span>
|
49
|
-
<span class="ruby-keyword kw">end</span>
|
50
|
-
|
51
|
-
<span class="ruby-comment cmt"># Step 2
|
52
|
-
<span class="ruby-keyword kw">if</span> <span class="ruby-identifier">w</span> <span class="ruby-operator">=~</span> <span class="ruby-constant">SUFFIX_1_REGEXP</span>
|
53
|
-
<span class="ruby-identifier">stem</span> = <span class="ruby-identifier">$`</span>
|
54
|
-
<span class="ruby-identifier">suffix</span> = <span class="ruby-identifier">$1</span>
|
55
|
-
<span class="ruby-comment cmt"># print "stem= " + stem + "\n" + "suffix=" + suffix + "\n"
|
56
|
-
<span class="ruby-keyword kw">if</span> <span class="ruby-identifier">stem</span> <span class="ruby-operator">=~</span> <span class="ruby-constant">MGR0</span>
|
57
|
-
<span class="ruby-identifier">w</span> = <span class="ruby-identifier">stem</span> <span class="ruby-operator">+</span> <span class="ruby-constant">STEP_2_LIST</span>[<span class="ruby-identifier">suffix</span>]
|
58
|
-
<span class="ruby-keyword kw">end</span>
|
59
|
-
<span class="ruby-keyword kw">end</span>
|
60
|
-
|
61
|
-
<span class="ruby-comment cmt"># Step 3
|
62
|
-
<span class="ruby-keyword kw">if</span> <span class="ruby-identifier">w</span> <span class="ruby-operator">=~</span> <span class="ruby-regexp re">/(icate|ative|alize|iciti|ical|ful|ness)$/</span>
|
63
|
-
<span class="ruby-identifier">stem</span> = <span class="ruby-identifier">$`</span>
|
64
|
-
<span class="ruby-identifier">suffix</span> = <span class="ruby-identifier">$1</span>
|
65
|
-
<span class="ruby-keyword kw">if</span> <span class="ruby-identifier">stem</span> <span class="ruby-operator">=~</span> <span class="ruby-constant">MGR0</span>
|
66
|
-
<span class="ruby-identifier">w</span> = <span class="ruby-identifier">stem</span> <span class="ruby-operator">+</span> <span class="ruby-constant">STEP_3_LIST</span>[<span class="ruby-identifier">suffix</span>]
|
67
|
-
<span class="ruby-keyword kw">end</span>
|
68
|
-
<span class="ruby-keyword kw">end</span>
|
69
|
-
|
70
|
-
<span class="ruby-comment cmt"># Step 4
|
71
|
-
<span class="ruby-keyword kw">if</span> <span class="ruby-identifier">w</span> <span class="ruby-operator">=~</span> <span class="ruby-constant">SUFFIX_2_REGEXP</span>
|
72
|
-
<span class="ruby-identifier">stem</span> = <span class="ruby-identifier">$`</span>
|
73
|
-
<span class="ruby-keyword kw">if</span> <span class="ruby-identifier">stem</span> <span class="ruby-operator">=~</span> <span class="ruby-constant">MGR1</span>
|
74
|
-
<span class="ruby-identifier">w</span> = <span class="ruby-identifier">stem</span>
|
75
|
-
<span class="ruby-keyword kw">end</span>
|
76
|
-
<span class="ruby-keyword kw">elsif</span> <span class="ruby-identifier">w</span> <span class="ruby-operator">=~</span> <span class="ruby-regexp re">/(s|t)(ion)$/</span>
|
77
|
-
<span class="ruby-identifier">stem</span> = <span class="ruby-identifier">$`</span> <span class="ruby-operator">+</span> <span class="ruby-identifier">$1</span>
|
78
|
-
<span class="ruby-keyword kw">if</span> <span class="ruby-identifier">stem</span> <span class="ruby-operator">=~</span> <span class="ruby-constant">MGR1</span>
|
79
|
-
<span class="ruby-identifier">w</span> = <span class="ruby-identifier">stem</span>
|
80
|
-
<span class="ruby-keyword kw">end</span>
|
81
|
-
<span class="ruby-keyword kw">end</span>
|
82
|
-
|
83
|
-
<span class="ruby-comment cmt"># Step 5
|
84
|
-
<span class="ruby-keyword kw">if</span> <span class="ruby-identifier">w</span> <span class="ruby-operator">=~</span> <span class="ruby-regexp re">/e$/</span>
|
85
|
-
<span class="ruby-identifier">stem</span> = <span class="ruby-identifier">$`</span>
|
86
|
-
<span class="ruby-keyword kw">if</span> (<span class="ruby-identifier">stem</span> <span class="ruby-operator">=~</span> <span class="ruby-constant">MGR1</span>) <span class="ruby-operator">||</span>
|
87
|
-
(<span class="ruby-identifier">stem</span> <span class="ruby-operator">=~</span> <span class="ruby-constant">MEQ1</span> <span class="ruby-operator">&&</span> <span class="ruby-identifier">stem</span> <span class="ruby-operator">!~</span> <span class="ruby-node">/^#{CC}#{V}[^aeiouwxy]$/o</span>)
|
88
|
-
<span class="ruby-identifier">w</span> = <span class="ruby-identifier">stem</span>
|
89
|
-
<span class="ruby-keyword kw">end</span>
|
90
|
-
<span class="ruby-keyword kw">end</span>
|
91
|
-
|
92
|
-
<span class="ruby-keyword kw">if</span> <span class="ruby-identifier">w</span> <span class="ruby-operator">=~</span> <span class="ruby-regexp re">/ll$/</span> <span class="ruby-operator">&&</span> <span class="ruby-identifier">w</span> <span class="ruby-operator">=~</span> <span class="ruby-constant">MGR1</span>
|
93
|
-
<span class="ruby-identifier">w</span>.<span class="ruby-identifier">chop!</span>
|
94
|
-
<span class="ruby-keyword kw">end</span>
|
95
|
-
|
96
|
-
<span class="ruby-comment cmt"># and turn initial Y back to y
|
97
|
-
<span class="ruby-identifier">w</span>[<span class="ruby-value">0</span>] = <span class="ruby-value str">'y'</span> <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">w</span>[<span class="ruby-value">0</span>] <span class="ruby-operator">==</span> <span class="ruby-value">?Y</span>
|
98
|
-
|
99
|
-
<span class="ruby-identifier">w</span>
|
100
|
-
<span class="ruby-keyword kw">end</span></pre>
|
101
|
-
</body>
|
102
|
-
</html>
|
@@ -1,178 +0,0 @@
|
|
1
|
-
<?xml version="1.0" encoding="iso-8859-1"?>
|
2
|
-
<!DOCTYPE html
|
3
|
-
PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
|
4
|
-
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
5
|
-
|
6
|
-
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
|
7
|
-
<head>
|
8
|
-
<title>Module: Classifier::WordHash</title>
|
9
|
-
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
|
10
|
-
<meta http-equiv="Content-Script-Type" content="text/javascript" />
|
11
|
-
<link rel="stylesheet" href="../.././rdoc-style.css" type="text/css" media="screen" />
|
12
|
-
<script type="text/javascript">
|
13
|
-
// <![CDATA[
|
14
|
-
|
15
|
-
function popupCode( url ) {
|
16
|
-
window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400")
|
17
|
-
}
|
18
|
-
|
19
|
-
function toggleCode( id ) {
|
20
|
-
if ( document.getElementById )
|
21
|
-
elem = document.getElementById( id );
|
22
|
-
else if ( document.all )
|
23
|
-
elem = eval( "document.all." + id );
|
24
|
-
else
|
25
|
-
return false;
|
26
|
-
|
27
|
-
elemStyle = elem.style;
|
28
|
-
|
29
|
-
if ( elemStyle.display != "block" ) {
|
30
|
-
elemStyle.display = "block"
|
31
|
-
} else {
|
32
|
-
elemStyle.display = "none"
|
33
|
-
}
|
34
|
-
|
35
|
-
return true;
|
36
|
-
}
|
37
|
-
|
38
|
-
// Make codeblocks hidden by default
|
39
|
-
document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }</style>" )
|
40
|
-
|
41
|
-
// ]]>
|
42
|
-
</script>
|
43
|
-
|
44
|
-
</head>
|
45
|
-
<body>
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
<div id="classHeader">
|
50
|
-
<table class="header-table">
|
51
|
-
<tr class="top-aligned-row">
|
52
|
-
<td><strong>Module</strong></td>
|
53
|
-
<td class="class-name-in-header">Classifier::WordHash</td>
|
54
|
-
</tr>
|
55
|
-
<tr class="top-aligned-row">
|
56
|
-
<td><strong>In:</strong></td>
|
57
|
-
<td>
|
58
|
-
<a href="../../files/lib/classifier/string_extensions/word_hash_rb.html">
|
59
|
-
lib/classifier/string_extensions/word_hash.rb
|
60
|
-
</a>
|
61
|
-
<br />
|
62
|
-
</td>
|
63
|
-
</tr>
|
64
|
-
|
65
|
-
</table>
|
66
|
-
</div>
|
67
|
-
<!-- banner header -->
|
68
|
-
|
69
|
-
<div id="bodyContent">
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
<div id="contextContent">
|
74
|
-
|
75
|
-
<div id="description">
|
76
|
-
<p>
|
77
|
-
This module is mixed into <a href="../String.html">String</a> to provide
|
78
|
-
convenience methods for the <a href="../Classifier.html">Classifier</a>
|
79
|
-
package.
|
80
|
-
</p>
|
81
|
-
|
82
|
-
</div>
|
83
|
-
|
84
|
-
|
85
|
-
</div>
|
86
|
-
|
87
|
-
<div id="method-list">
|
88
|
-
<h3 class="section-bar">Methods</h3>
|
89
|
-
|
90
|
-
<div class="name-list">
|
91
|
-
<a href="#M000001">without_punctuation</a>
|
92
|
-
<a href="#M000002">word_hash</a>
|
93
|
-
</div>
|
94
|
-
</div>
|
95
|
-
|
96
|
-
</div>
|
97
|
-
|
98
|
-
|
99
|
-
<!-- if includes -->
|
100
|
-
|
101
|
-
<div id="section">
|
102
|
-
|
103
|
-
|
104
|
-
<div id="constants-list">
|
105
|
-
<h3 class="section-bar">Constants</h3>
|
106
|
-
|
107
|
-
<div class="name-list">
|
108
|
-
<table summary="Constants">
|
109
|
-
<tr class="top-aligned-row context-row">
|
110
|
-
<td class="context-item-name">CORPUS_SKIP_WORDS</td>
|
111
|
-
<td>=</td>
|
112
|
-
<td class="context-item-value">{ "a" => 1, "again" => 1, "all" => 1, "along" => 1, "are" => 1, "also" => 1, "an" => 1, "and" => 1, "as" => 1, "at" => 1, "but" => 1, "by" => 1, "came" => 1, "can" => 1, "cant" => 1, "couldnt" => 1, "did" => 1, "didn" => 1, "didnt" => 1, "do" => 1, "doesnt" => 1, "dont" => 1, "ever" => 1, "first" => 1, "from" => 1, "have" => 1, "her" => 1, "here" => 1, "him" => 1, "how" => 1, "i" => 1, "if" => 1, "in" => 1, "into" => 1, "is" => 1, "isnt" => 1, "it" => 1, "itll" => 1, "just" => 1, "last" => 1, "least" => 1, "like" => 1, "most" => 1, "my" => 1, "new" => 1, "no" => 1, "not" => 1, "now" => 1, "of" => 1, "on" => 1, "or" => 1, "should" => 1, "sinc" => 1, "so" => 1, "some" => 1, "th" => 1, "than" => 1, "this" => 1, "that" => 1, "the" => 1, "their" => 1, "then" => 1, "those" => 1, "to" => 1, "told" => 1, "too" => 1, "true" => 1, "try" => 1, "until" => 1, "url" => 1, "us" => 1, "were" => 1, "when" => 1, "whether" => 1, "while" => 1, "with" => 1, "within" => 1, "yes" => 1, "you" => 1, "youll" => 1, }</td>
|
113
|
-
</tr>
|
114
|
-
</table>
|
115
|
-
</div>
|
116
|
-
</div>
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
<!-- if method_list -->
|
124
|
-
<div id="methods">
|
125
|
-
<h3 class="section-bar">Public Instance methods</h3>
|
126
|
-
|
127
|
-
<div id="method-M000001" class="method-detail">
|
128
|
-
<a name="M000001"></a>
|
129
|
-
|
130
|
-
<div class="method-heading">
|
131
|
-
<a href="WordHash.src/M000001.html" target="Code" class="method-signature"
|
132
|
-
onclick="popupCode('WordHash.src/M000001.html');return false;">
|
133
|
-
<span class="method-name">without_punctuation</span><span class="method-args">()</span>
|
134
|
-
</a>
|
135
|
-
</div>
|
136
|
-
|
137
|
-
<div class="method-description">
|
138
|
-
<p>
|
139
|
-
Removes common punctuation symbols, returning a new string. E.g.,
|
140
|
-
</p>
|
141
|
-
<pre>
|
142
|
-
"Hello (greeting's), with {braces} < >...?".without_punctuation
|
143
|
-
=> "Hello greetings with braces "
|
144
|
-
</pre>
|
145
|
-
</div>
|
146
|
-
</div>
|
147
|
-
|
148
|
-
<div id="method-M000002" class="method-detail">
|
149
|
-
<a name="M000002"></a>
|
150
|
-
|
151
|
-
<div class="method-heading">
|
152
|
-
<a href="WordHash.src/M000002.html" target="Code" class="method-signature"
|
153
|
-
onclick="popupCode('WordHash.src/M000002.html');return false;">
|
154
|
-
<span class="method-name">word_hash</span><span class="method-args">()</span>
|
155
|
-
</a>
|
156
|
-
</div>
|
157
|
-
|
158
|
-
<div class="method-description">
|
159
|
-
<p>
|
160
|
-
Return a Hash of strings => ints. Each word in the string is stemmed,
|
161
|
-
interned, and indexes to its frequency in the document.
|
162
|
-
</p>
|
163
|
-
</div>
|
164
|
-
</div>
|
165
|
-
|
166
|
-
|
167
|
-
</div>
|
168
|
-
|
169
|
-
|
170
|
-
</div>
|
171
|
-
|
172
|
-
|
173
|
-
<div id="validator-badges">
|
174
|
-
<p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p>
|
175
|
-
</div>
|
176
|
-
|
177
|
-
</body>
|
178
|
-
</html>
|
@@ -1,28 +0,0 @@
|
|
1
|
-
<?xml version="1.0" encoding="iso-8859-1"?>
|
2
|
-
<!DOCTYPE html
|
3
|
-
PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
|
4
|
-
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
5
|
-
|
6
|
-
<html>
|
7
|
-
<head>
|
8
|
-
<title>word_hash (Classifier::WordHash)</title>
|
9
|
-
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
|
10
|
-
<link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" />
|
11
|
-
</head>
|
12
|
-
<body class="standalone-code">
|
13
|
-
<pre><span class="ruby-comment cmt"># File lib/classifier/string_extensions/word_hash.rb, line 20</span>
|
14
|
-
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">word_hash</span>
|
15
|
-
<span class="ruby-identifier">d</span> = <span class="ruby-constant">Hash</span>.<span class="ruby-identifier">new</span>
|
16
|
-
<span class="ruby-identifier">corpus</span> = <span class="ruby-identifier">without_punctuation</span>
|
17
|
-
(<span class="ruby-identifier">corpus</span>.<span class="ruby-identifier">split</span> <span class="ruby-operator">+</span> <span class="ruby-identifier">gsub</span>(<span class="ruby-regexp re">/[\w+]/</span>,<span class="ruby-value str">""</span>).<span class="ruby-identifier">split</span>).<span class="ruby-identifier">each</span> <span class="ruby-keyword kw">do</span> <span class="ruby-operator">|</span><span class="ruby-identifier">word</span><span class="ruby-operator">|</span>
|
18
|
-
<span class="ruby-identifier">item</span> = <span class="ruby-identifier">word</span>.<span class="ruby-identifier">downcase</span>
|
19
|
-
<span class="ruby-identifier">key</span> = <span class="ruby-identifier">item</span>.<span class="ruby-identifier">stem</span>.<span class="ruby-identifier">intern</span>
|
20
|
-
<span class="ruby-keyword kw">if</span> <span class="ruby-operator">!</span>(<span class="ruby-identifier">word</span> <span class="ruby-operator">=~</span> <span class="ruby-regexp re">/[\w+]/</span>) <span class="ruby-operator">||</span> <span class="ruby-identifier">word</span>.<span class="ruby-identifier">length</span> <span class="ruby-operator">></span> <span class="ruby-value">2</span>
|
21
|
-
<span class="ruby-identifier">d</span>[<span class="ruby-identifier">key</span>] <span class="ruby-operator">||=</span> <span class="ruby-value">0</span>
|
22
|
-
<span class="ruby-identifier">d</span>[<span class="ruby-identifier">key</span>] <span class="ruby-operator">+=</span> <span class="ruby-value">1</span>
|
23
|
-
<span class="ruby-keyword kw">end</span> <span class="ruby-keyword kw">unless</span> <span class="ruby-constant">CORPUS_SKIP_WORDS</span>[<span class="ruby-identifier">item</span>]
|
24
|
-
<span class="ruby-keyword kw">end</span>
|
25
|
-
<span class="ruby-keyword kw">return</span> <span class="ruby-identifier">d</span>
|
26
|
-
<span class="ruby-keyword kw">end</span></pre>
|
27
|
-
</body>
|
28
|
-
</html>
|
@@ -1,199 +0,0 @@
|
|
1
|
-
# = Author
|
2
|
-
#
|
3
|
-
# Greg Fast, gdf@speakeasy.net
|
4
|
-
#
|
5
|
-
# = Copyright
|
6
|
-
#
|
7
|
-
# Copyright 2005 Greg Fast <gdf@speakeasy.net>
|
8
|
-
|
9
|
-
module Classifier
|
10
|
-
|
11
|
-
#
|
12
|
-
# Porter stemmer in Ruby.
|
13
|
-
#
|
14
|
-
# This is the Porter stemming algorithm, ported to Ruby from the
|
15
|
-
# version coded up in Perl. It's easy to follow against the rules
|
16
|
-
# in the original paper in:
|
17
|
-
#
|
18
|
-
# Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
|
19
|
-
# no. 3, pp 130-137,
|
20
|
-
#
|
21
|
-
# See also http://www.tartarus.org/~martin/PorterStemmer
|
22
|
-
#
|
23
|
-
# Send comments to raypereda@hotmail.com
|
24
|
-
#
|
25
|
-
module Stemmable
|
26
|
-
|
27
|
-
STEP_2_LIST = {
|
28
|
-
'ational'=>'ate', 'tional'=>'tion', 'enci'=>'ence', 'anci'=>'ance',
|
29
|
-
'izer'=>'ize', 'bli'=>'ble',
|
30
|
-
'alli'=>'al', 'entli'=>'ent', 'eli'=>'e', 'ousli'=>'ous',
|
31
|
-
'ization'=>'ize', 'ation'=>'ate',
|
32
|
-
'ator'=>'ate', 'alism'=>'al', 'iveness'=>'ive', 'fulness'=>'ful',
|
33
|
-
'ousness'=>'ous', 'aliti'=>'al',
|
34
|
-
'iviti'=>'ive', 'biliti'=>'ble', 'logi'=>'log'
|
35
|
-
}
|
36
|
-
|
37
|
-
STEP_3_LIST = {
|
38
|
-
'icate'=>'ic', 'ative'=>'', 'alize'=>'al', 'iciti'=>'ic',
|
39
|
-
'ical'=>'ic', 'ful'=>'', 'ness'=>''
|
40
|
-
}
|
41
|
-
|
42
|
-
|
43
|
-
SUFFIX_1_REGEXP = /(
|
44
|
-
ational |
|
45
|
-
tional |
|
46
|
-
enci |
|
47
|
-
anci |
|
48
|
-
izer |
|
49
|
-
bli |
|
50
|
-
alli |
|
51
|
-
entli |
|
52
|
-
eli |
|
53
|
-
ousli |
|
54
|
-
ization |
|
55
|
-
ation |
|
56
|
-
ator |
|
57
|
-
alism |
|
58
|
-
iveness |
|
59
|
-
fulness |
|
60
|
-
ousness |
|
61
|
-
aliti |
|
62
|
-
iviti |
|
63
|
-
biliti |
|
64
|
-
logi)$/x
|
65
|
-
|
66
|
-
|
67
|
-
SUFFIX_2_REGEXP = /(
|
68
|
-
al |
|
69
|
-
ance |
|
70
|
-
ence |
|
71
|
-
er |
|
72
|
-
ic |
|
73
|
-
able |
|
74
|
-
ible |
|
75
|
-
ant |
|
76
|
-
ement |
|
77
|
-
ment |
|
78
|
-
ent |
|
79
|
-
ou |
|
80
|
-
ism |
|
81
|
-
ate |
|
82
|
-
iti |
|
83
|
-
ous |
|
84
|
-
ive |
|
85
|
-
ize)$/x
|
86
|
-
|
87
|
-
|
88
|
-
C = "[^aeiou]" # consonant
|
89
|
-
V = "[aeiouy]" # vowel
|
90
|
-
CC = "#{C}(?>[^aeiouy]*)" # consonant sequence
|
91
|
-
VV = "#{V}(?>[aeiou]*)" # vowel sequence
|
92
|
-
|
93
|
-
MGR0 = /^(#{CC})?#{VV}#{CC}/o # [cc]vvcc... is m>0
|
94
|
-
MEQ1 = /^(#{CC})?#{VV}#{CC}(#{VV})?$/o # [cc]vvcc[vv] is m=1
|
95
|
-
MGR1 = /^(#{CC})?#{VV}#{CC}#{VV}#{CC}/o # [cc]vvccvvcc... is m>1
|
96
|
-
VOWEL_IN_STEM = /^(#{CC})?#{V}/o # vowel in stem
|
97
|
-
|
98
|
-
#
|
99
|
-
# Stems the word contained in the current object. E.g.,
|
100
|
-
# "actually".stem_porter
|
101
|
-
# => "actual"
|
102
|
-
def stem_porter
|
103
|
-
|
104
|
-
# make a copy of the given object and convert it to a string.
|
105
|
-
w = self.dup.to_str
|
106
|
-
|
107
|
-
return w if w.length < 3
|
108
|
-
|
109
|
-
# now map initial y to Y so that the patterns never treat it as vowel
|
110
|
-
w[0] = 'Y' if w[0] == ?y
|
111
|
-
|
112
|
-
# Step 1a
|
113
|
-
if w =~ /(ss|i)es$/
|
114
|
-
w = $` + $1
|
115
|
-
elsif w =~ /([^s])s$/
|
116
|
-
w = $` + $1
|
117
|
-
end
|
118
|
-
|
119
|
-
# Step 1b
|
120
|
-
if w =~ /eed$/
|
121
|
-
w.chop! if $` =~ MGR0
|
122
|
-
elsif w =~ /(ed|ing)$/
|
123
|
-
stem = $`
|
124
|
-
if stem =~ VOWEL_IN_STEM
|
125
|
-
w = stem
|
126
|
-
case w
|
127
|
-
when /(at|bl|iz)$/ then w << "e"
|
128
|
-
when /([^aeiouylsz])\1$/ then w.chop!
|
129
|
-
when /^#{CC}#{V}[^aeiouwxy]$/o then w << "e"
|
130
|
-
end
|
131
|
-
end
|
132
|
-
end
|
133
|
-
|
134
|
-
if w =~ /y$/
|
135
|
-
stem = $`
|
136
|
-
w = stem + "i" if stem =~ VOWEL_IN_STEM
|
137
|
-
end
|
138
|
-
|
139
|
-
# Step 2
|
140
|
-
if w =~ SUFFIX_1_REGEXP
|
141
|
-
stem = $`
|
142
|
-
suffix = $1
|
143
|
-
# print "stem= " + stem + "\n" + "suffix=" + suffix + "\n"
|
144
|
-
if stem =~ MGR0
|
145
|
-
w = stem + STEP_2_LIST[suffix]
|
146
|
-
end
|
147
|
-
end
|
148
|
-
|
149
|
-
# Step 3
|
150
|
-
if w =~ /(icate|ative|alize|iciti|ical|ful|ness)$/
|
151
|
-
stem = $`
|
152
|
-
suffix = $1
|
153
|
-
if stem =~ MGR0
|
154
|
-
w = stem + STEP_3_LIST[suffix]
|
155
|
-
end
|
156
|
-
end
|
157
|
-
|
158
|
-
# Step 4
|
159
|
-
if w =~ SUFFIX_2_REGEXP
|
160
|
-
stem = $`
|
161
|
-
if stem =~ MGR1
|
162
|
-
w = stem
|
163
|
-
end
|
164
|
-
elsif w =~ /(s|t)(ion)$/
|
165
|
-
stem = $` + $1
|
166
|
-
if stem =~ MGR1
|
167
|
-
w = stem
|
168
|
-
end
|
169
|
-
end
|
170
|
-
|
171
|
-
# Step 5
|
172
|
-
if w =~ /e$/
|
173
|
-
stem = $`
|
174
|
-
if (stem =~ MGR1) ||
|
175
|
-
(stem =~ MEQ1 && stem !~ /^#{CC}#{V}[^aeiouwxy]$/o)
|
176
|
-
w = stem
|
177
|
-
end
|
178
|
-
end
|
179
|
-
|
180
|
-
if w =~ /ll$/ && w =~ MGR1
|
181
|
-
w.chop!
|
182
|
-
end
|
183
|
-
|
184
|
-
# and turn initial Y back to y
|
185
|
-
w[0] = 'y' if w[0] == ?Y
|
186
|
-
|
187
|
-
w
|
188
|
-
end
|
189
|
-
|
190
|
-
|
191
|
-
#
|
192
|
-
# make the stem_porter the default stem method, just in case we
|
193
|
-
# feel like having multiple stemmers available later.
|
194
|
-
#
|
195
|
-
alias stem stem_porter
|
196
|
-
|
197
|
-
end
|
198
|
-
|
199
|
-
end
|