classifier 1.1.1 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +341 -0
- data/README +59 -6
- data/Rakefile +16 -4
- data/bin/bayes.rb +8 -2
- data/doc/classes/Classifier.html +15 -10
- data/doc/classes/Classifier/Bayes.html +68 -38
- data/doc/classes/Classifier/Bayes.src/{M000005.html → M000023.html} +1 -1
- data/doc/classes/Classifier/Bayes.src/{M000006.html → M000024.html} +1 -1
- data/doc/classes/Classifier/Bayes.src/M000025.html +30 -0
- data/doc/classes/Classifier/Bayes.src/{M000007.html → M000026.html} +1 -1
- data/doc/classes/Classifier/Bayes.src/{M000008.html → M000027.html} +1 -1
- data/doc/classes/Classifier/Bayes.src/{M000009.html → M000028.html} +4 -4
- data/doc/classes/Classifier/Bayes.src/{M000010.html → M000029.html} +2 -2
- data/doc/classes/Classifier/ContentNode.html +252 -0
- data/doc/classes/Classifier/ContentNode.src/M000031.html +21 -0
- data/doc/classes/Classifier/ContentNode.src/M000032.html +18 -0
- data/doc/classes/Classifier/ContentNode.src/M000033.html +18 -0
- data/doc/classes/Classifier/ContentNode.src/M000034.html +41 -0
- data/doc/classes/Classifier/LSI.html +449 -0
- data/doc/classes/Classifier/LSI.src/M000011.html +20 -0
- data/doc/classes/Classifier/LSI.src/M000012.html +18 -0
- data/doc/classes/Classifier/LSI.src/M000013.html +20 -0
- data/doc/classes/Classifier/LSI.src/M000014.html +18 -0
- data/doc/classes/Classifier/LSI.src/M000015.html +21 -0
- data/doc/classes/Classifier/LSI.src/M000016.html +18 -0
- data/doc/classes/Classifier/LSI.src/M000017.html +32 -0
- data/doc/classes/Classifier/LSI.src/M000018.html +26 -0
- data/doc/classes/Classifier/LSI.src/M000019.html +26 -0
- data/doc/classes/Classifier/LSI.src/M000020.html +23 -0
- data/doc/classes/Classifier/LSI.src/M000021.html +21 -0
- data/doc/classes/Classifier/LSI.src/M000022.html +31 -0
- data/doc/classes/Classifier/WordList.html +202 -0
- data/doc/classes/Classifier/WordList.src/M000007.html +18 -0
- data/doc/classes/Classifier/WordList.src/M000008.html +19 -0
- data/doc/classes/Classifier/WordList.src/M000009.html +19 -0
- data/doc/classes/Classifier/WordList.src/M000010.html +18 -0
- data/doc/classes/GSL.html +111 -0
- data/doc/classes/GSL/Vector.html +156 -0
- data/doc/classes/GSL/Vector.src/M000005.html +18 -0
- data/doc/classes/GSL/Vector.src/M000006.html +19 -0
- data/doc/classes/Object.html +139 -0
- data/doc/classes/Object.src/M000001.html +16 -0
- data/doc/classes/String.html +95 -9
- data/doc/classes/{Classifier/WordHash.src/M000001.html → String.src/M000002.html} +3 -3
- data/doc/classes/String.src/M000003.html +18 -0
- data/doc/classes/String.src/M000004.html +18 -0
- data/doc/created.rid +1 -1
- data/doc/files/README.html +102 -12
- data/doc/files/lib/classifier/bayes_rb.html +1 -1
- data/doc/files/lib/classifier/{string_extensions/porter_stemmer_rb.html → extensions/vector_serialize_rb.html} +4 -15
- data/doc/files/lib/classifier/{string_extensions → extensions}/word_hash_rb.html +2 -2
- data/doc/files/lib/classifier/extensions/word_list_rb.html +115 -0
- data/doc/files/lib/classifier/lsi/content_node_rb.html +115 -0
- data/doc/files/lib/classifier/lsi_rb.html +125 -0
- data/doc/files/lib/classifier/string_extensions_rb.html +2 -3
- data/doc/files/lib/classifier_rb.html +3 -1
- data/doc/fr_class_index.html +6 -2
- data/doc/fr_file_index.html +5 -2
- data/doc/fr_method_index.html +34 -11
- data/lib/classifier.rb +3 -1
- data/lib/classifier/bayes.rb +34 -9
- data/lib/classifier/extensions/vector_serialize.rb +14 -0
- data/lib/classifier/extensions/word_hash.rb +125 -0
- data/lib/classifier/extensions/word_list.rb +31 -0
- data/lib/classifier/lsi.rb +248 -0
- data/lib/classifier/lsi/content_node.rb +67 -0
- data/lib/classifier/string_extensions.rb +10 -5
- data/test/bayes/bayesian_test.rb +2 -2
- data/test/lsi/lsi_test.rb +88 -0
- data/test/string_extensions/word_hash_test.rb +7 -5
- metadata +79 -24
- data/doc/classes/Classifier/Stemmable.html +0 -243
- data/doc/classes/Classifier/Stemmable.src/M000003.html +0 -102
- data/doc/classes/Classifier/WordHash.html +0 -178
- data/doc/classes/Classifier/WordHash.src/M000002.html +0 -28
- data/lib/classifier/string_extensions/porter_stemmer.rb +0 -199
- data/lib/classifier/string_extensions/word_hash.rb +0 -119
@@ -1,102 +0,0 @@
|
|
1
|
-
<?xml version="1.0" encoding="iso-8859-1"?>
|
2
|
-
<!DOCTYPE html
|
3
|
-
PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
|
4
|
-
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
5
|
-
|
6
|
-
<html>
|
7
|
-
<head>
|
8
|
-
<title>stem_porter (Classifier::Stemmable)</title>
|
9
|
-
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
|
10
|
-
<link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" />
|
11
|
-
</head>
|
12
|
-
<body class="standalone-code">
|
13
|
-
<pre><span class="ruby-comment cmt"># File lib/classifier/string_extensions/porter_stemmer.rb, line 102</span>
|
14
|
-
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">stem_porter</span>
|
15
|
-
|
16
|
-
<span class="ruby-comment cmt"># make a copy of the given object and convert it to a string.
|
17
|
-
<span class="ruby-identifier">w</span> = <span class="ruby-keyword kw">self</span>.<span class="ruby-identifier">dup</span>.<span class="ruby-identifier">to_str</span>
|
18
|
-
|
19
|
-
<span class="ruby-keyword kw">return</span> <span class="ruby-identifier">w</span> <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">w</span>.<span class="ruby-identifier">length</span> <span class="ruby-operator"><</span> <span class="ruby-value">3</span>
|
20
|
-
|
21
|
-
<span class="ruby-comment cmt"># now map initial y to Y so that the patterns never treat it as vowel
|
22
|
-
<span class="ruby-identifier">w</span>[<span class="ruby-value">0</span>] = <span class="ruby-value str">'Y'</span> <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">w</span>[<span class="ruby-value">0</span>] <span class="ruby-operator">==</span> <span class="ruby-value">?y</span>
|
23
|
-
|
24
|
-
<span class="ruby-comment cmt"># Step 1a
|
25
|
-
<span class="ruby-keyword kw">if</span> <span class="ruby-identifier">w</span> <span class="ruby-operator">=~</span> <span class="ruby-regexp re">/(ss|i)es$/</span>
|
26
|
-
<span class="ruby-identifier">w</span> = <span class="ruby-identifier">$`</span> <span class="ruby-operator">+</span> <span class="ruby-identifier">$1</span>
|
27
|
-
<span class="ruby-keyword kw">elsif</span> <span class="ruby-identifier">w</span> <span class="ruby-operator">=~</span> <span class="ruby-regexp re">/([^s])s$/</span>
|
28
|
-
<span class="ruby-identifier">w</span> = <span class="ruby-identifier">$`</span> <span class="ruby-operator">+</span> <span class="ruby-identifier">$1</span>
|
29
|
-
<span class="ruby-keyword kw">end</span>
|
30
|
-
|
31
|
-
<span class="ruby-comment cmt"># Step 1b
|
32
|
-
<span class="ruby-keyword kw">if</span> <span class="ruby-identifier">w</span> <span class="ruby-operator">=~</span> <span class="ruby-regexp re">/eed$/</span>
|
33
|
-
<span class="ruby-identifier">w</span>.<span class="ruby-identifier">chop!</span> <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">$`</span> <span class="ruby-operator">=~</span> <span class="ruby-constant">MGR0</span>
|
34
|
-
<span class="ruby-keyword kw">elsif</span> <span class="ruby-identifier">w</span> <span class="ruby-operator">=~</span> <span class="ruby-regexp re">/(ed|ing)$/</span>
|
35
|
-
<span class="ruby-identifier">stem</span> = <span class="ruby-identifier">$`</span>
|
36
|
-
<span class="ruby-keyword kw">if</span> <span class="ruby-identifier">stem</span> <span class="ruby-operator">=~</span> <span class="ruby-constant">VOWEL_IN_STEM</span>
|
37
|
-
<span class="ruby-identifier">w</span> = <span class="ruby-identifier">stem</span>
|
38
|
-
<span class="ruby-keyword kw">case</span> <span class="ruby-identifier">w</span>
|
39
|
-
<span class="ruby-keyword kw">when</span> <span class="ruby-regexp re">/(at|bl|iz)$/</span> <span class="ruby-keyword kw">then</span> <span class="ruby-identifier">w</span> <span class="ruby-operator"><<</span> <span class="ruby-value str">"e"</span>
|
40
|
-
<span class="ruby-keyword kw">when</span> <span class="ruby-regexp re">/([^aeiouylsz])\1$/</span> <span class="ruby-keyword kw">then</span> <span class="ruby-identifier">w</span>.<span class="ruby-identifier">chop!</span>
|
41
|
-
<span class="ruby-keyword kw">when</span> <span class="ruby-node">/^#{CC}#{V}[^aeiouwxy]$/o</span> <span class="ruby-keyword kw">then</span> <span class="ruby-identifier">w</span> <span class="ruby-operator"><<</span> <span class="ruby-value str">"e"</span>
|
42
|
-
<span class="ruby-keyword kw">end</span>
|
43
|
-
<span class="ruby-keyword kw">end</span>
|
44
|
-
<span class="ruby-keyword kw">end</span>
|
45
|
-
|
46
|
-
<span class="ruby-keyword kw">if</span> <span class="ruby-identifier">w</span> <span class="ruby-operator">=~</span> <span class="ruby-regexp re">/y$/</span>
|
47
|
-
<span class="ruby-identifier">stem</span> = <span class="ruby-identifier">$`</span>
|
48
|
-
<span class="ruby-identifier">w</span> = <span class="ruby-identifier">stem</span> <span class="ruby-operator">+</span> <span class="ruby-value str">"i"</span> <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">stem</span> <span class="ruby-operator">=~</span> <span class="ruby-constant">VOWEL_IN_STEM</span>
|
49
|
-
<span class="ruby-keyword kw">end</span>
|
50
|
-
|
51
|
-
<span class="ruby-comment cmt"># Step 2
|
52
|
-
<span class="ruby-keyword kw">if</span> <span class="ruby-identifier">w</span> <span class="ruby-operator">=~</span> <span class="ruby-constant">SUFFIX_1_REGEXP</span>
|
53
|
-
<span class="ruby-identifier">stem</span> = <span class="ruby-identifier">$`</span>
|
54
|
-
<span class="ruby-identifier">suffix</span> = <span class="ruby-identifier">$1</span>
|
55
|
-
<span class="ruby-comment cmt"># print "stem= " + stem + "\n" + "suffix=" + suffix + "\n"
|
56
|
-
<span class="ruby-keyword kw">if</span> <span class="ruby-identifier">stem</span> <span class="ruby-operator">=~</span> <span class="ruby-constant">MGR0</span>
|
57
|
-
<span class="ruby-identifier">w</span> = <span class="ruby-identifier">stem</span> <span class="ruby-operator">+</span> <span class="ruby-constant">STEP_2_LIST</span>[<span class="ruby-identifier">suffix</span>]
|
58
|
-
<span class="ruby-keyword kw">end</span>
|
59
|
-
<span class="ruby-keyword kw">end</span>
|
60
|
-
|
61
|
-
<span class="ruby-comment cmt"># Step 3
|
62
|
-
<span class="ruby-keyword kw">if</span> <span class="ruby-identifier">w</span> <span class="ruby-operator">=~</span> <span class="ruby-regexp re">/(icate|ative|alize|iciti|ical|ful|ness)$/</span>
|
63
|
-
<span class="ruby-identifier">stem</span> = <span class="ruby-identifier">$`</span>
|
64
|
-
<span class="ruby-identifier">suffix</span> = <span class="ruby-identifier">$1</span>
|
65
|
-
<span class="ruby-keyword kw">if</span> <span class="ruby-identifier">stem</span> <span class="ruby-operator">=~</span> <span class="ruby-constant">MGR0</span>
|
66
|
-
<span class="ruby-identifier">w</span> = <span class="ruby-identifier">stem</span> <span class="ruby-operator">+</span> <span class="ruby-constant">STEP_3_LIST</span>[<span class="ruby-identifier">suffix</span>]
|
67
|
-
<span class="ruby-keyword kw">end</span>
|
68
|
-
<span class="ruby-keyword kw">end</span>
|
69
|
-
|
70
|
-
<span class="ruby-comment cmt"># Step 4
|
71
|
-
<span class="ruby-keyword kw">if</span> <span class="ruby-identifier">w</span> <span class="ruby-operator">=~</span> <span class="ruby-constant">SUFFIX_2_REGEXP</span>
|
72
|
-
<span class="ruby-identifier">stem</span> = <span class="ruby-identifier">$`</span>
|
73
|
-
<span class="ruby-keyword kw">if</span> <span class="ruby-identifier">stem</span> <span class="ruby-operator">=~</span> <span class="ruby-constant">MGR1</span>
|
74
|
-
<span class="ruby-identifier">w</span> = <span class="ruby-identifier">stem</span>
|
75
|
-
<span class="ruby-keyword kw">end</span>
|
76
|
-
<span class="ruby-keyword kw">elsif</span> <span class="ruby-identifier">w</span> <span class="ruby-operator">=~</span> <span class="ruby-regexp re">/(s|t)(ion)$/</span>
|
77
|
-
<span class="ruby-identifier">stem</span> = <span class="ruby-identifier">$`</span> <span class="ruby-operator">+</span> <span class="ruby-identifier">$1</span>
|
78
|
-
<span class="ruby-keyword kw">if</span> <span class="ruby-identifier">stem</span> <span class="ruby-operator">=~</span> <span class="ruby-constant">MGR1</span>
|
79
|
-
<span class="ruby-identifier">w</span> = <span class="ruby-identifier">stem</span>
|
80
|
-
<span class="ruby-keyword kw">end</span>
|
81
|
-
<span class="ruby-keyword kw">end</span>
|
82
|
-
|
83
|
-
<span class="ruby-comment cmt"># Step 5
|
84
|
-
<span class="ruby-keyword kw">if</span> <span class="ruby-identifier">w</span> <span class="ruby-operator">=~</span> <span class="ruby-regexp re">/e$/</span>
|
85
|
-
<span class="ruby-identifier">stem</span> = <span class="ruby-identifier">$`</span>
|
86
|
-
<span class="ruby-keyword kw">if</span> (<span class="ruby-identifier">stem</span> <span class="ruby-operator">=~</span> <span class="ruby-constant">MGR1</span>) <span class="ruby-operator">||</span>
|
87
|
-
(<span class="ruby-identifier">stem</span> <span class="ruby-operator">=~</span> <span class="ruby-constant">MEQ1</span> <span class="ruby-operator">&&</span> <span class="ruby-identifier">stem</span> <span class="ruby-operator">!~</span> <span class="ruby-node">/^#{CC}#{V}[^aeiouwxy]$/o</span>)
|
88
|
-
<span class="ruby-identifier">w</span> = <span class="ruby-identifier">stem</span>
|
89
|
-
<span class="ruby-keyword kw">end</span>
|
90
|
-
<span class="ruby-keyword kw">end</span>
|
91
|
-
|
92
|
-
<span class="ruby-keyword kw">if</span> <span class="ruby-identifier">w</span> <span class="ruby-operator">=~</span> <span class="ruby-regexp re">/ll$/</span> <span class="ruby-operator">&&</span> <span class="ruby-identifier">w</span> <span class="ruby-operator">=~</span> <span class="ruby-constant">MGR1</span>
|
93
|
-
<span class="ruby-identifier">w</span>.<span class="ruby-identifier">chop!</span>
|
94
|
-
<span class="ruby-keyword kw">end</span>
|
95
|
-
|
96
|
-
<span class="ruby-comment cmt"># and turn initial Y back to y
|
97
|
-
<span class="ruby-identifier">w</span>[<span class="ruby-value">0</span>] = <span class="ruby-value str">'y'</span> <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">w</span>[<span class="ruby-value">0</span>] <span class="ruby-operator">==</span> <span class="ruby-value">?Y</span>
|
98
|
-
|
99
|
-
<span class="ruby-identifier">w</span>
|
100
|
-
<span class="ruby-keyword kw">end</span></pre>
|
101
|
-
</body>
|
102
|
-
</html>
|
@@ -1,178 +0,0 @@
|
|
1
|
-
<?xml version="1.0" encoding="iso-8859-1"?>
|
2
|
-
<!DOCTYPE html
|
3
|
-
PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
|
4
|
-
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
5
|
-
|
6
|
-
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
|
7
|
-
<head>
|
8
|
-
<title>Module: Classifier::WordHash</title>
|
9
|
-
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
|
10
|
-
<meta http-equiv="Content-Script-Type" content="text/javascript" />
|
11
|
-
<link rel="stylesheet" href="../.././rdoc-style.css" type="text/css" media="screen" />
|
12
|
-
<script type="text/javascript">
|
13
|
-
// <![CDATA[
|
14
|
-
|
15
|
-
function popupCode( url ) {
|
16
|
-
window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400")
|
17
|
-
}
|
18
|
-
|
19
|
-
function toggleCode( id ) {
|
20
|
-
if ( document.getElementById )
|
21
|
-
elem = document.getElementById( id );
|
22
|
-
else if ( document.all )
|
23
|
-
elem = eval( "document.all." + id );
|
24
|
-
else
|
25
|
-
return false;
|
26
|
-
|
27
|
-
elemStyle = elem.style;
|
28
|
-
|
29
|
-
if ( elemStyle.display != "block" ) {
|
30
|
-
elemStyle.display = "block"
|
31
|
-
} else {
|
32
|
-
elemStyle.display = "none"
|
33
|
-
}
|
34
|
-
|
35
|
-
return true;
|
36
|
-
}
|
37
|
-
|
38
|
-
// Make codeblocks hidden by default
|
39
|
-
document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }</style>" )
|
40
|
-
|
41
|
-
// ]]>
|
42
|
-
</script>
|
43
|
-
|
44
|
-
</head>
|
45
|
-
<body>
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
<div id="classHeader">
|
50
|
-
<table class="header-table">
|
51
|
-
<tr class="top-aligned-row">
|
52
|
-
<td><strong>Module</strong></td>
|
53
|
-
<td class="class-name-in-header">Classifier::WordHash</td>
|
54
|
-
</tr>
|
55
|
-
<tr class="top-aligned-row">
|
56
|
-
<td><strong>In:</strong></td>
|
57
|
-
<td>
|
58
|
-
<a href="../../files/lib/classifier/string_extensions/word_hash_rb.html">
|
59
|
-
lib/classifier/string_extensions/word_hash.rb
|
60
|
-
</a>
|
61
|
-
<br />
|
62
|
-
</td>
|
63
|
-
</tr>
|
64
|
-
|
65
|
-
</table>
|
66
|
-
</div>
|
67
|
-
<!-- banner header -->
|
68
|
-
|
69
|
-
<div id="bodyContent">
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
<div id="contextContent">
|
74
|
-
|
75
|
-
<div id="description">
|
76
|
-
<p>
|
77
|
-
This module is mixed into <a href="../String.html">String</a> to provide
|
78
|
-
convenience methods for the <a href="../Classifier.html">Classifier</a>
|
79
|
-
package.
|
80
|
-
</p>
|
81
|
-
|
82
|
-
</div>
|
83
|
-
|
84
|
-
|
85
|
-
</div>
|
86
|
-
|
87
|
-
<div id="method-list">
|
88
|
-
<h3 class="section-bar">Methods</h3>
|
89
|
-
|
90
|
-
<div class="name-list">
|
91
|
-
<a href="#M000001">without_punctuation</a>
|
92
|
-
<a href="#M000002">word_hash</a>
|
93
|
-
</div>
|
94
|
-
</div>
|
95
|
-
|
96
|
-
</div>
|
97
|
-
|
98
|
-
|
99
|
-
<!-- if includes -->
|
100
|
-
|
101
|
-
<div id="section">
|
102
|
-
|
103
|
-
|
104
|
-
<div id="constants-list">
|
105
|
-
<h3 class="section-bar">Constants</h3>
|
106
|
-
|
107
|
-
<div class="name-list">
|
108
|
-
<table summary="Constants">
|
109
|
-
<tr class="top-aligned-row context-row">
|
110
|
-
<td class="context-item-name">CORPUS_SKIP_WORDS</td>
|
111
|
-
<td>=</td>
|
112
|
-
<td class="context-item-value">{ "a" => 1, "again" => 1, "all" => 1, "along" => 1, "are" => 1, "also" => 1, "an" => 1, "and" => 1, "as" => 1, "at" => 1, "but" => 1, "by" => 1, "came" => 1, "can" => 1, "cant" => 1, "couldnt" => 1, "did" => 1, "didn" => 1, "didnt" => 1, "do" => 1, "doesnt" => 1, "dont" => 1, "ever" => 1, "first" => 1, "from" => 1, "have" => 1, "her" => 1, "here" => 1, "him" => 1, "how" => 1, "i" => 1, "if" => 1, "in" => 1, "into" => 1, "is" => 1, "isnt" => 1, "it" => 1, "itll" => 1, "just" => 1, "last" => 1, "least" => 1, "like" => 1, "most" => 1, "my" => 1, "new" => 1, "no" => 1, "not" => 1, "now" => 1, "of" => 1, "on" => 1, "or" => 1, "should" => 1, "sinc" => 1, "so" => 1, "some" => 1, "th" => 1, "than" => 1, "this" => 1, "that" => 1, "the" => 1, "their" => 1, "then" => 1, "those" => 1, "to" => 1, "told" => 1, "too" => 1, "true" => 1, "try" => 1, "until" => 1, "url" => 1, "us" => 1, "were" => 1, "when" => 1, "whether" => 1, "while" => 1, "with" => 1, "within" => 1, "yes" => 1, "you" => 1, "youll" => 1, }</td>
|
113
|
-
</tr>
|
114
|
-
</table>
|
115
|
-
</div>
|
116
|
-
</div>
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
<!-- if method_list -->
|
124
|
-
<div id="methods">
|
125
|
-
<h3 class="section-bar">Public Instance methods</h3>
|
126
|
-
|
127
|
-
<div id="method-M000001" class="method-detail">
|
128
|
-
<a name="M000001"></a>
|
129
|
-
|
130
|
-
<div class="method-heading">
|
131
|
-
<a href="WordHash.src/M000001.html" target="Code" class="method-signature"
|
132
|
-
onclick="popupCode('WordHash.src/M000001.html');return false;">
|
133
|
-
<span class="method-name">without_punctuation</span><span class="method-args">()</span>
|
134
|
-
</a>
|
135
|
-
</div>
|
136
|
-
|
137
|
-
<div class="method-description">
|
138
|
-
<p>
|
139
|
-
Removes common punctuation symbols, returning a new string. E.g.,
|
140
|
-
</p>
|
141
|
-
<pre>
|
142
|
-
"Hello (greeting's), with {braces} < >...?".without_punctuation
|
143
|
-
=> "Hello greetings with braces "
|
144
|
-
</pre>
|
145
|
-
</div>
|
146
|
-
</div>
|
147
|
-
|
148
|
-
<div id="method-M000002" class="method-detail">
|
149
|
-
<a name="M000002"></a>
|
150
|
-
|
151
|
-
<div class="method-heading">
|
152
|
-
<a href="WordHash.src/M000002.html" target="Code" class="method-signature"
|
153
|
-
onclick="popupCode('WordHash.src/M000002.html');return false;">
|
154
|
-
<span class="method-name">word_hash</span><span class="method-args">()</span>
|
155
|
-
</a>
|
156
|
-
</div>
|
157
|
-
|
158
|
-
<div class="method-description">
|
159
|
-
<p>
|
160
|
-
Return a Hash of strings => ints. Each word in the string is stemmed,
|
161
|
-
interned, and indexes to its frequency in the document.
|
162
|
-
</p>
|
163
|
-
</div>
|
164
|
-
</div>
|
165
|
-
|
166
|
-
|
167
|
-
</div>
|
168
|
-
|
169
|
-
|
170
|
-
</div>
|
171
|
-
|
172
|
-
|
173
|
-
<div id="validator-badges">
|
174
|
-
<p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p>
|
175
|
-
</div>
|
176
|
-
|
177
|
-
</body>
|
178
|
-
</html>
|
@@ -1,28 +0,0 @@
|
|
1
|
-
<?xml version="1.0" encoding="iso-8859-1"?>
|
2
|
-
<!DOCTYPE html
|
3
|
-
PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
|
4
|
-
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
5
|
-
|
6
|
-
<html>
|
7
|
-
<head>
|
8
|
-
<title>word_hash (Classifier::WordHash)</title>
|
9
|
-
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
|
10
|
-
<link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" />
|
11
|
-
</head>
|
12
|
-
<body class="standalone-code">
|
13
|
-
<pre><span class="ruby-comment cmt"># File lib/classifier/string_extensions/word_hash.rb, line 20</span>
|
14
|
-
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">word_hash</span>
|
15
|
-
<span class="ruby-identifier">d</span> = <span class="ruby-constant">Hash</span>.<span class="ruby-identifier">new</span>
|
16
|
-
<span class="ruby-identifier">corpus</span> = <span class="ruby-identifier">without_punctuation</span>
|
17
|
-
(<span class="ruby-identifier">corpus</span>.<span class="ruby-identifier">split</span> <span class="ruby-operator">+</span> <span class="ruby-identifier">gsub</span>(<span class="ruby-regexp re">/[\w+]/</span>,<span class="ruby-value str">""</span>).<span class="ruby-identifier">split</span>).<span class="ruby-identifier">each</span> <span class="ruby-keyword kw">do</span> <span class="ruby-operator">|</span><span class="ruby-identifier">word</span><span class="ruby-operator">|</span>
|
18
|
-
<span class="ruby-identifier">item</span> = <span class="ruby-identifier">word</span>.<span class="ruby-identifier">downcase</span>
|
19
|
-
<span class="ruby-identifier">key</span> = <span class="ruby-identifier">item</span>.<span class="ruby-identifier">stem</span>.<span class="ruby-identifier">intern</span>
|
20
|
-
<span class="ruby-keyword kw">if</span> <span class="ruby-operator">!</span>(<span class="ruby-identifier">word</span> <span class="ruby-operator">=~</span> <span class="ruby-regexp re">/[\w+]/</span>) <span class="ruby-operator">||</span> <span class="ruby-identifier">word</span>.<span class="ruby-identifier">length</span> <span class="ruby-operator">></span> <span class="ruby-value">2</span>
|
21
|
-
<span class="ruby-identifier">d</span>[<span class="ruby-identifier">key</span>] <span class="ruby-operator">||=</span> <span class="ruby-value">0</span>
|
22
|
-
<span class="ruby-identifier">d</span>[<span class="ruby-identifier">key</span>] <span class="ruby-operator">+=</span> <span class="ruby-value">1</span>
|
23
|
-
<span class="ruby-keyword kw">end</span> <span class="ruby-keyword kw">unless</span> <span class="ruby-constant">CORPUS_SKIP_WORDS</span>[<span class="ruby-identifier">item</span>]
|
24
|
-
<span class="ruby-keyword kw">end</span>
|
25
|
-
<span class="ruby-keyword kw">return</span> <span class="ruby-identifier">d</span>
|
26
|
-
<span class="ruby-keyword kw">end</span></pre>
|
27
|
-
</body>
|
28
|
-
</html>
|
@@ -1,199 +0,0 @@
|
|
1
|
-
# = Author
|
2
|
-
#
|
3
|
-
# Greg Fast, gdf@speakeasy.net
|
4
|
-
#
|
5
|
-
# = Copyright
|
6
|
-
#
|
7
|
-
# Copyright 2005 Greg Fast <gdf@speakeasy.net>
|
8
|
-
|
9
|
-
module Classifier
|
10
|
-
|
11
|
-
#
|
12
|
-
# Porter stemmer in Ruby.
|
13
|
-
#
|
14
|
-
# This is the Porter stemming algorithm, ported to Ruby from the
|
15
|
-
# version coded up in Perl. It's easy to follow against the rules
|
16
|
-
# in the original paper in:
|
17
|
-
#
|
18
|
-
# Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
|
19
|
-
# no. 3, pp 130-137,
|
20
|
-
#
|
21
|
-
# See also http://www.tartarus.org/~martin/PorterStemmer
|
22
|
-
#
|
23
|
-
# Send comments to raypereda@hotmail.com
|
24
|
-
#
|
25
|
-
module Stemmable
|
26
|
-
|
27
|
-
STEP_2_LIST = {
|
28
|
-
'ational'=>'ate', 'tional'=>'tion', 'enci'=>'ence', 'anci'=>'ance',
|
29
|
-
'izer'=>'ize', 'bli'=>'ble',
|
30
|
-
'alli'=>'al', 'entli'=>'ent', 'eli'=>'e', 'ousli'=>'ous',
|
31
|
-
'ization'=>'ize', 'ation'=>'ate',
|
32
|
-
'ator'=>'ate', 'alism'=>'al', 'iveness'=>'ive', 'fulness'=>'ful',
|
33
|
-
'ousness'=>'ous', 'aliti'=>'al',
|
34
|
-
'iviti'=>'ive', 'biliti'=>'ble', 'logi'=>'log'
|
35
|
-
}
|
36
|
-
|
37
|
-
STEP_3_LIST = {
|
38
|
-
'icate'=>'ic', 'ative'=>'', 'alize'=>'al', 'iciti'=>'ic',
|
39
|
-
'ical'=>'ic', 'ful'=>'', 'ness'=>''
|
40
|
-
}
|
41
|
-
|
42
|
-
|
43
|
-
SUFFIX_1_REGEXP = /(
|
44
|
-
ational |
|
45
|
-
tional |
|
46
|
-
enci |
|
47
|
-
anci |
|
48
|
-
izer |
|
49
|
-
bli |
|
50
|
-
alli |
|
51
|
-
entli |
|
52
|
-
eli |
|
53
|
-
ousli |
|
54
|
-
ization |
|
55
|
-
ation |
|
56
|
-
ator |
|
57
|
-
alism |
|
58
|
-
iveness |
|
59
|
-
fulness |
|
60
|
-
ousness |
|
61
|
-
aliti |
|
62
|
-
iviti |
|
63
|
-
biliti |
|
64
|
-
logi)$/x
|
65
|
-
|
66
|
-
|
67
|
-
SUFFIX_2_REGEXP = /(
|
68
|
-
al |
|
69
|
-
ance |
|
70
|
-
ence |
|
71
|
-
er |
|
72
|
-
ic |
|
73
|
-
able |
|
74
|
-
ible |
|
75
|
-
ant |
|
76
|
-
ement |
|
77
|
-
ment |
|
78
|
-
ent |
|
79
|
-
ou |
|
80
|
-
ism |
|
81
|
-
ate |
|
82
|
-
iti |
|
83
|
-
ous |
|
84
|
-
ive |
|
85
|
-
ize)$/x
|
86
|
-
|
87
|
-
|
88
|
-
C = "[^aeiou]" # consonant
|
89
|
-
V = "[aeiouy]" # vowel
|
90
|
-
CC = "#{C}(?>[^aeiouy]*)" # consonant sequence
|
91
|
-
VV = "#{V}(?>[aeiou]*)" # vowel sequence
|
92
|
-
|
93
|
-
MGR0 = /^(#{CC})?#{VV}#{CC}/o # [cc]vvcc... is m>0
|
94
|
-
MEQ1 = /^(#{CC})?#{VV}#{CC}(#{VV})?$/o # [cc]vvcc[vv] is m=1
|
95
|
-
MGR1 = /^(#{CC})?#{VV}#{CC}#{VV}#{CC}/o # [cc]vvccvvcc... is m>1
|
96
|
-
VOWEL_IN_STEM = /^(#{CC})?#{V}/o # vowel in stem
|
97
|
-
|
98
|
-
#
|
99
|
-
# Stems the word contained in the current object. E.g.,
|
100
|
-
# "actually".stem_porter
|
101
|
-
# => "actual"
|
102
|
-
def stem_porter
|
103
|
-
|
104
|
-
# make a copy of the given object and convert it to a string.
|
105
|
-
w = self.dup.to_str
|
106
|
-
|
107
|
-
return w if w.length < 3
|
108
|
-
|
109
|
-
# now map initial y to Y so that the patterns never treat it as vowel
|
110
|
-
w[0] = 'Y' if w[0] == ?y
|
111
|
-
|
112
|
-
# Step 1a
|
113
|
-
if w =~ /(ss|i)es$/
|
114
|
-
w = $` + $1
|
115
|
-
elsif w =~ /([^s])s$/
|
116
|
-
w = $` + $1
|
117
|
-
end
|
118
|
-
|
119
|
-
# Step 1b
|
120
|
-
if w =~ /eed$/
|
121
|
-
w.chop! if $` =~ MGR0
|
122
|
-
elsif w =~ /(ed|ing)$/
|
123
|
-
stem = $`
|
124
|
-
if stem =~ VOWEL_IN_STEM
|
125
|
-
w = stem
|
126
|
-
case w
|
127
|
-
when /(at|bl|iz)$/ then w << "e"
|
128
|
-
when /([^aeiouylsz])\1$/ then w.chop!
|
129
|
-
when /^#{CC}#{V}[^aeiouwxy]$/o then w << "e"
|
130
|
-
end
|
131
|
-
end
|
132
|
-
end
|
133
|
-
|
134
|
-
if w =~ /y$/
|
135
|
-
stem = $`
|
136
|
-
w = stem + "i" if stem =~ VOWEL_IN_STEM
|
137
|
-
end
|
138
|
-
|
139
|
-
# Step 2
|
140
|
-
if w =~ SUFFIX_1_REGEXP
|
141
|
-
stem = $`
|
142
|
-
suffix = $1
|
143
|
-
# print "stem= " + stem + "\n" + "suffix=" + suffix + "\n"
|
144
|
-
if stem =~ MGR0
|
145
|
-
w = stem + STEP_2_LIST[suffix]
|
146
|
-
end
|
147
|
-
end
|
148
|
-
|
149
|
-
# Step 3
|
150
|
-
if w =~ /(icate|ative|alize|iciti|ical|ful|ness)$/
|
151
|
-
stem = $`
|
152
|
-
suffix = $1
|
153
|
-
if stem =~ MGR0
|
154
|
-
w = stem + STEP_3_LIST[suffix]
|
155
|
-
end
|
156
|
-
end
|
157
|
-
|
158
|
-
# Step 4
|
159
|
-
if w =~ SUFFIX_2_REGEXP
|
160
|
-
stem = $`
|
161
|
-
if stem =~ MGR1
|
162
|
-
w = stem
|
163
|
-
end
|
164
|
-
elsif w =~ /(s|t)(ion)$/
|
165
|
-
stem = $` + $1
|
166
|
-
if stem =~ MGR1
|
167
|
-
w = stem
|
168
|
-
end
|
169
|
-
end
|
170
|
-
|
171
|
-
# Step 5
|
172
|
-
if w =~ /e$/
|
173
|
-
stem = $`
|
174
|
-
if (stem =~ MGR1) ||
|
175
|
-
(stem =~ MEQ1 && stem !~ /^#{CC}#{V}[^aeiouwxy]$/o)
|
176
|
-
w = stem
|
177
|
-
end
|
178
|
-
end
|
179
|
-
|
180
|
-
if w =~ /ll$/ && w =~ MGR1
|
181
|
-
w.chop!
|
182
|
-
end
|
183
|
-
|
184
|
-
# and turn initial Y back to y
|
185
|
-
w[0] = 'y' if w[0] == ?Y
|
186
|
-
|
187
|
-
w
|
188
|
-
end
|
189
|
-
|
190
|
-
|
191
|
-
#
|
192
|
-
# make the stem_porter the default stem method, just in case we
|
193
|
-
# feel like having multiple stemmers available later.
|
194
|
-
#
|
195
|
-
alias stem stem_porter
|
196
|
-
|
197
|
-
end
|
198
|
-
|
199
|
-
end
|