classifier 1.1.1 → 1.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (77) hide show
  1. data/LICENSE +341 -0
  2. data/README +59 -6
  3. data/Rakefile +16 -4
  4. data/bin/bayes.rb +8 -2
  5. data/doc/classes/Classifier.html +15 -10
  6. data/doc/classes/Classifier/Bayes.html +68 -38
  7. data/doc/classes/Classifier/Bayes.src/{M000005.html → M000023.html} +1 -1
  8. data/doc/classes/Classifier/Bayes.src/{M000006.html → M000024.html} +1 -1
  9. data/doc/classes/Classifier/Bayes.src/M000025.html +30 -0
  10. data/doc/classes/Classifier/Bayes.src/{M000007.html → M000026.html} +1 -1
  11. data/doc/classes/Classifier/Bayes.src/{M000008.html → M000027.html} +1 -1
  12. data/doc/classes/Classifier/Bayes.src/{M000009.html → M000028.html} +4 -4
  13. data/doc/classes/Classifier/Bayes.src/{M000010.html → M000029.html} +2 -2
  14. data/doc/classes/Classifier/ContentNode.html +252 -0
  15. data/doc/classes/Classifier/ContentNode.src/M000031.html +21 -0
  16. data/doc/classes/Classifier/ContentNode.src/M000032.html +18 -0
  17. data/doc/classes/Classifier/ContentNode.src/M000033.html +18 -0
  18. data/doc/classes/Classifier/ContentNode.src/M000034.html +41 -0
  19. data/doc/classes/Classifier/LSI.html +449 -0
  20. data/doc/classes/Classifier/LSI.src/M000011.html +20 -0
  21. data/doc/classes/Classifier/LSI.src/M000012.html +18 -0
  22. data/doc/classes/Classifier/LSI.src/M000013.html +20 -0
  23. data/doc/classes/Classifier/LSI.src/M000014.html +18 -0
  24. data/doc/classes/Classifier/LSI.src/M000015.html +21 -0
  25. data/doc/classes/Classifier/LSI.src/M000016.html +18 -0
  26. data/doc/classes/Classifier/LSI.src/M000017.html +32 -0
  27. data/doc/classes/Classifier/LSI.src/M000018.html +26 -0
  28. data/doc/classes/Classifier/LSI.src/M000019.html +26 -0
  29. data/doc/classes/Classifier/LSI.src/M000020.html +23 -0
  30. data/doc/classes/Classifier/LSI.src/M000021.html +21 -0
  31. data/doc/classes/Classifier/LSI.src/M000022.html +31 -0
  32. data/doc/classes/Classifier/WordList.html +202 -0
  33. data/doc/classes/Classifier/WordList.src/M000007.html +18 -0
  34. data/doc/classes/Classifier/WordList.src/M000008.html +19 -0
  35. data/doc/classes/Classifier/WordList.src/M000009.html +19 -0
  36. data/doc/classes/Classifier/WordList.src/M000010.html +18 -0
  37. data/doc/classes/GSL.html +111 -0
  38. data/doc/classes/GSL/Vector.html +156 -0
  39. data/doc/classes/GSL/Vector.src/M000005.html +18 -0
  40. data/doc/classes/GSL/Vector.src/M000006.html +19 -0
  41. data/doc/classes/Object.html +139 -0
  42. data/doc/classes/Object.src/M000001.html +16 -0
  43. data/doc/classes/String.html +95 -9
  44. data/doc/classes/{Classifier/WordHash.src/M000001.html → String.src/M000002.html} +3 -3
  45. data/doc/classes/String.src/M000003.html +18 -0
  46. data/doc/classes/String.src/M000004.html +18 -0
  47. data/doc/created.rid +1 -1
  48. data/doc/files/README.html +102 -12
  49. data/doc/files/lib/classifier/bayes_rb.html +1 -1
  50. data/doc/files/lib/classifier/{string_extensions/porter_stemmer_rb.html → extensions/vector_serialize_rb.html} +4 -15
  51. data/doc/files/lib/classifier/{string_extensions → extensions}/word_hash_rb.html +2 -2
  52. data/doc/files/lib/classifier/extensions/word_list_rb.html +115 -0
  53. data/doc/files/lib/classifier/lsi/content_node_rb.html +115 -0
  54. data/doc/files/lib/classifier/lsi_rb.html +125 -0
  55. data/doc/files/lib/classifier/string_extensions_rb.html +2 -3
  56. data/doc/files/lib/classifier_rb.html +3 -1
  57. data/doc/fr_class_index.html +6 -2
  58. data/doc/fr_file_index.html +5 -2
  59. data/doc/fr_method_index.html +34 -11
  60. data/lib/classifier.rb +3 -1
  61. data/lib/classifier/bayes.rb +34 -9
  62. data/lib/classifier/extensions/vector_serialize.rb +14 -0
  63. data/lib/classifier/extensions/word_hash.rb +125 -0
  64. data/lib/classifier/extensions/word_list.rb +31 -0
  65. data/lib/classifier/lsi.rb +248 -0
  66. data/lib/classifier/lsi/content_node.rb +67 -0
  67. data/lib/classifier/string_extensions.rb +10 -5
  68. data/test/bayes/bayesian_test.rb +2 -2
  69. data/test/lsi/lsi_test.rb +88 -0
  70. data/test/string_extensions/word_hash_test.rb +7 -5
  71. metadata +79 -24
  72. data/doc/classes/Classifier/Stemmable.html +0 -243
  73. data/doc/classes/Classifier/Stemmable.src/M000003.html +0 -102
  74. data/doc/classes/Classifier/WordHash.html +0 -178
  75. data/doc/classes/Classifier/WordHash.src/M000002.html +0 -28
  76. data/lib/classifier/string_extensions/porter_stemmer.rb +0 -199
  77. data/lib/classifier/string_extensions/word_hash.rb +0 -119
@@ -1,102 +0,0 @@
1
- <?xml version="1.0" encoding="iso-8859-1"?>
2
- <!DOCTYPE html
3
- PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
4
- "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
5
-
6
- <html>
7
- <head>
8
- <title>stem_porter (Classifier::Stemmable)</title>
9
- <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
10
- <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" />
11
- </head>
12
- <body class="standalone-code">
13
- <pre><span class="ruby-comment cmt"># File lib/classifier/string_extensions/porter_stemmer.rb, line 102</span>
14
- <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">stem_porter</span>
15
-
16
- <span class="ruby-comment cmt"># make a copy of the given object and convert it to a string.
17
- <span class="ruby-identifier">w</span> = <span class="ruby-keyword kw">self</span>.<span class="ruby-identifier">dup</span>.<span class="ruby-identifier">to_str</span>
18
-
19
- <span class="ruby-keyword kw">return</span> <span class="ruby-identifier">w</span> <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">w</span>.<span class="ruby-identifier">length</span> <span class="ruby-operator">&lt;</span> <span class="ruby-value">3</span>
20
-
21
- <span class="ruby-comment cmt"># now map initial y to Y so that the patterns never treat it as vowel
22
- <span class="ruby-identifier">w</span>[<span class="ruby-value">0</span>] = <span class="ruby-value str">'Y'</span> <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">w</span>[<span class="ruby-value">0</span>] <span class="ruby-operator">==</span> <span class="ruby-value">?y</span>
23
-
24
- <span class="ruby-comment cmt"># Step 1a
25
- <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">w</span> <span class="ruby-operator">=~</span> <span class="ruby-regexp re">/(ss|i)es$/</span>
26
- <span class="ruby-identifier">w</span> = <span class="ruby-identifier">$`</span> <span class="ruby-operator">+</span> <span class="ruby-identifier">$1</span>
27
- <span class="ruby-keyword kw">elsif</span> <span class="ruby-identifier">w</span> <span class="ruby-operator">=~</span> <span class="ruby-regexp re">/([^s])s$/</span>
28
- <span class="ruby-identifier">w</span> = <span class="ruby-identifier">$`</span> <span class="ruby-operator">+</span> <span class="ruby-identifier">$1</span>
29
- <span class="ruby-keyword kw">end</span>
30
-
31
- <span class="ruby-comment cmt"># Step 1b
32
- <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">w</span> <span class="ruby-operator">=~</span> <span class="ruby-regexp re">/eed$/</span>
33
- <span class="ruby-identifier">w</span>.<span class="ruby-identifier">chop!</span> <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">$`</span> <span class="ruby-operator">=~</span> <span class="ruby-constant">MGR0</span>
34
- <span class="ruby-keyword kw">elsif</span> <span class="ruby-identifier">w</span> <span class="ruby-operator">=~</span> <span class="ruby-regexp re">/(ed|ing)$/</span>
35
- <span class="ruby-identifier">stem</span> = <span class="ruby-identifier">$`</span>
36
- <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">stem</span> <span class="ruby-operator">=~</span> <span class="ruby-constant">VOWEL_IN_STEM</span>
37
- <span class="ruby-identifier">w</span> = <span class="ruby-identifier">stem</span>
38
- <span class="ruby-keyword kw">case</span> <span class="ruby-identifier">w</span>
39
- <span class="ruby-keyword kw">when</span> <span class="ruby-regexp re">/(at|bl|iz)$/</span> <span class="ruby-keyword kw">then</span> <span class="ruby-identifier">w</span> <span class="ruby-operator">&lt;&lt;</span> <span class="ruby-value str">&quot;e&quot;</span>
40
- <span class="ruby-keyword kw">when</span> <span class="ruby-regexp re">/([^aeiouylsz])\1$/</span> <span class="ruby-keyword kw">then</span> <span class="ruby-identifier">w</span>.<span class="ruby-identifier">chop!</span>
41
- <span class="ruby-keyword kw">when</span> <span class="ruby-node">/^#{CC}#{V}[^aeiouwxy]$/o</span> <span class="ruby-keyword kw">then</span> <span class="ruby-identifier">w</span> <span class="ruby-operator">&lt;&lt;</span> <span class="ruby-value str">&quot;e&quot;</span>
42
- <span class="ruby-keyword kw">end</span>
43
- <span class="ruby-keyword kw">end</span>
44
- <span class="ruby-keyword kw">end</span>
45
-
46
- <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">w</span> <span class="ruby-operator">=~</span> <span class="ruby-regexp re">/y$/</span>
47
- <span class="ruby-identifier">stem</span> = <span class="ruby-identifier">$`</span>
48
- <span class="ruby-identifier">w</span> = <span class="ruby-identifier">stem</span> <span class="ruby-operator">+</span> <span class="ruby-value str">&quot;i&quot;</span> <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">stem</span> <span class="ruby-operator">=~</span> <span class="ruby-constant">VOWEL_IN_STEM</span>
49
- <span class="ruby-keyword kw">end</span>
50
-
51
- <span class="ruby-comment cmt"># Step 2
52
- <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">w</span> <span class="ruby-operator">=~</span> <span class="ruby-constant">SUFFIX_1_REGEXP</span>
53
- <span class="ruby-identifier">stem</span> = <span class="ruby-identifier">$`</span>
54
- <span class="ruby-identifier">suffix</span> = <span class="ruby-identifier">$1</span>
55
- <span class="ruby-comment cmt"># print &quot;stem= &quot; + stem + &quot;\n&quot; + &quot;suffix=&quot; + suffix + &quot;\n&quot;
56
- <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">stem</span> <span class="ruby-operator">=~</span> <span class="ruby-constant">MGR0</span>
57
- <span class="ruby-identifier">w</span> = <span class="ruby-identifier">stem</span> <span class="ruby-operator">+</span> <span class="ruby-constant">STEP_2_LIST</span>[<span class="ruby-identifier">suffix</span>]
58
- <span class="ruby-keyword kw">end</span>
59
- <span class="ruby-keyword kw">end</span>
60
-
61
- <span class="ruby-comment cmt"># Step 3
62
- <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">w</span> <span class="ruby-operator">=~</span> <span class="ruby-regexp re">/(icate|ative|alize|iciti|ical|ful|ness)$/</span>
63
- <span class="ruby-identifier">stem</span> = <span class="ruby-identifier">$`</span>
64
- <span class="ruby-identifier">suffix</span> = <span class="ruby-identifier">$1</span>
65
- <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">stem</span> <span class="ruby-operator">=~</span> <span class="ruby-constant">MGR0</span>
66
- <span class="ruby-identifier">w</span> = <span class="ruby-identifier">stem</span> <span class="ruby-operator">+</span> <span class="ruby-constant">STEP_3_LIST</span>[<span class="ruby-identifier">suffix</span>]
67
- <span class="ruby-keyword kw">end</span>
68
- <span class="ruby-keyword kw">end</span>
69
-
70
- <span class="ruby-comment cmt"># Step 4
71
- <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">w</span> <span class="ruby-operator">=~</span> <span class="ruby-constant">SUFFIX_2_REGEXP</span>
72
- <span class="ruby-identifier">stem</span> = <span class="ruby-identifier">$`</span>
73
- <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">stem</span> <span class="ruby-operator">=~</span> <span class="ruby-constant">MGR1</span>
74
- <span class="ruby-identifier">w</span> = <span class="ruby-identifier">stem</span>
75
- <span class="ruby-keyword kw">end</span>
76
- <span class="ruby-keyword kw">elsif</span> <span class="ruby-identifier">w</span> <span class="ruby-operator">=~</span> <span class="ruby-regexp re">/(s|t)(ion)$/</span>
77
- <span class="ruby-identifier">stem</span> = <span class="ruby-identifier">$`</span> <span class="ruby-operator">+</span> <span class="ruby-identifier">$1</span>
78
- <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">stem</span> <span class="ruby-operator">=~</span> <span class="ruby-constant">MGR1</span>
79
- <span class="ruby-identifier">w</span> = <span class="ruby-identifier">stem</span>
80
- <span class="ruby-keyword kw">end</span>
81
- <span class="ruby-keyword kw">end</span>
82
-
83
- <span class="ruby-comment cmt"># Step 5
84
- <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">w</span> <span class="ruby-operator">=~</span> <span class="ruby-regexp re">/e$/</span>
85
- <span class="ruby-identifier">stem</span> = <span class="ruby-identifier">$`</span>
86
- <span class="ruby-keyword kw">if</span> (<span class="ruby-identifier">stem</span> <span class="ruby-operator">=~</span> <span class="ruby-constant">MGR1</span>) <span class="ruby-operator">||</span>
87
- (<span class="ruby-identifier">stem</span> <span class="ruby-operator">=~</span> <span class="ruby-constant">MEQ1</span> <span class="ruby-operator">&amp;&amp;</span> <span class="ruby-identifier">stem</span> <span class="ruby-operator">!~</span> <span class="ruby-node">/^#{CC}#{V}[^aeiouwxy]$/o</span>)
88
- <span class="ruby-identifier">w</span> = <span class="ruby-identifier">stem</span>
89
- <span class="ruby-keyword kw">end</span>
90
- <span class="ruby-keyword kw">end</span>
91
-
92
- <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">w</span> <span class="ruby-operator">=~</span> <span class="ruby-regexp re">/ll$/</span> <span class="ruby-operator">&amp;&amp;</span> <span class="ruby-identifier">w</span> <span class="ruby-operator">=~</span> <span class="ruby-constant">MGR1</span>
93
- <span class="ruby-identifier">w</span>.<span class="ruby-identifier">chop!</span>
94
- <span class="ruby-keyword kw">end</span>
95
-
96
- <span class="ruby-comment cmt"># and turn initial Y back to y
97
- <span class="ruby-identifier">w</span>[<span class="ruby-value">0</span>] = <span class="ruby-value str">'y'</span> <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">w</span>[<span class="ruby-value">0</span>] <span class="ruby-operator">==</span> <span class="ruby-value">?Y</span>
98
-
99
- <span class="ruby-identifier">w</span>
100
- <span class="ruby-keyword kw">end</span></pre>
101
- </body>
102
- </html>
@@ -1,178 +0,0 @@
1
- <?xml version="1.0" encoding="iso-8859-1"?>
2
- <!DOCTYPE html
3
- PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
4
- "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
5
-
6
- <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
7
- <head>
8
- <title>Module: Classifier::WordHash</title>
9
- <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
10
- <meta http-equiv="Content-Script-Type" content="text/javascript" />
11
- <link rel="stylesheet" href="../.././rdoc-style.css" type="text/css" media="screen" />
12
- <script type="text/javascript">
13
- // <![CDATA[
14
-
15
- function popupCode( url ) {
16
- window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400")
17
- }
18
-
19
- function toggleCode( id ) {
20
- if ( document.getElementById )
21
- elem = document.getElementById( id );
22
- else if ( document.all )
23
- elem = eval( "document.all." + id );
24
- else
25
- return false;
26
-
27
- elemStyle = elem.style;
28
-
29
- if ( elemStyle.display != "block" ) {
30
- elemStyle.display = "block"
31
- } else {
32
- elemStyle.display = "none"
33
- }
34
-
35
- return true;
36
- }
37
-
38
- // Make codeblocks hidden by default
39
- document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }</style>" )
40
-
41
- // ]]>
42
- </script>
43
-
44
- </head>
45
- <body>
46
-
47
-
48
-
49
- <div id="classHeader">
50
- <table class="header-table">
51
- <tr class="top-aligned-row">
52
- <td><strong>Module</strong></td>
53
- <td class="class-name-in-header">Classifier::WordHash</td>
54
- </tr>
55
- <tr class="top-aligned-row">
56
- <td><strong>In:</strong></td>
57
- <td>
58
- <a href="../../files/lib/classifier/string_extensions/word_hash_rb.html">
59
- lib/classifier/string_extensions/word_hash.rb
60
- </a>
61
- <br />
62
- </td>
63
- </tr>
64
-
65
- </table>
66
- </div>
67
- <!-- banner header -->
68
-
69
- <div id="bodyContent">
70
-
71
-
72
-
73
- <div id="contextContent">
74
-
75
- <div id="description">
76
- <p>
77
- This module is mixed into <a href="../String.html">String</a> to provide
78
- convenience methods for the <a href="../Classifier.html">Classifier</a>
79
- package.
80
- </p>
81
-
82
- </div>
83
-
84
-
85
- </div>
86
-
87
- <div id="method-list">
88
- <h3 class="section-bar">Methods</h3>
89
-
90
- <div class="name-list">
91
- <a href="#M000001">without_punctuation</a>&nbsp;&nbsp;
92
- <a href="#M000002">word_hash</a>&nbsp;&nbsp;
93
- </div>
94
- </div>
95
-
96
- </div>
97
-
98
-
99
- <!-- if includes -->
100
-
101
- <div id="section">
102
-
103
-
104
- <div id="constants-list">
105
- <h3 class="section-bar">Constants</h3>
106
-
107
- <div class="name-list">
108
- <table summary="Constants">
109
- <tr class="top-aligned-row context-row">
110
- <td class="context-item-name">CORPUS_SKIP_WORDS</td>
111
- <td>=</td>
112
- <td class="context-item-value">{ &quot;a&quot; =&gt; 1, &quot;again&quot; =&gt; 1, &quot;all&quot; =&gt; 1, &quot;along&quot; =&gt; 1, &quot;are&quot; =&gt; 1, &quot;also&quot; =&gt; 1, &quot;an&quot; =&gt; 1, &quot;and&quot; =&gt; 1, &quot;as&quot; =&gt; 1, &quot;at&quot; =&gt; 1, &quot;but&quot; =&gt; 1, &quot;by&quot; =&gt; 1, &quot;came&quot; =&gt; 1, &quot;can&quot; =&gt; 1, &quot;cant&quot; =&gt; 1, &quot;couldnt&quot; =&gt; 1, &quot;did&quot; =&gt; 1, &quot;didn&quot; =&gt; 1, &quot;didnt&quot; =&gt; 1, &quot;do&quot; =&gt; 1, &quot;doesnt&quot; =&gt; 1, &quot;dont&quot; =&gt; 1, &quot;ever&quot; =&gt; 1, &quot;first&quot; =&gt; 1, &quot;from&quot; =&gt; 1, &quot;have&quot; =&gt; 1, &quot;her&quot; =&gt; 1, &quot;here&quot; =&gt; 1, &quot;him&quot; =&gt; 1, &quot;how&quot; =&gt; 1, &quot;i&quot; =&gt; 1, &quot;if&quot; =&gt; 1, &quot;in&quot; =&gt; 1, &quot;into&quot; =&gt; 1, &quot;is&quot; =&gt; 1, &quot;isnt&quot; =&gt; 1, &quot;it&quot; =&gt; 1, &quot;itll&quot; =&gt; 1, &quot;just&quot; =&gt; 1, &quot;last&quot; =&gt; 1, &quot;least&quot; =&gt; 1, &quot;like&quot; =&gt; 1, &quot;most&quot; =&gt; 1, &quot;my&quot; =&gt; 1, &quot;new&quot; =&gt; 1, &quot;no&quot; =&gt; 1, &quot;not&quot; =&gt; 1, &quot;now&quot; =&gt; 1, &quot;of&quot; =&gt; 1, &quot;on&quot; =&gt; 1, &quot;or&quot; =&gt; 1, &quot;should&quot; =&gt; 1, &quot;sinc&quot; =&gt; 1, &quot;so&quot; =&gt; 1, &quot;some&quot; =&gt; 1, &quot;th&quot; =&gt; 1, &quot;than&quot; =&gt; 1, &quot;this&quot; =&gt; 1, &quot;that&quot; =&gt; 1, &quot;the&quot; =&gt; 1, &quot;their&quot; =&gt; 1, &quot;then&quot; =&gt; 1, &quot;those&quot; =&gt; 1, &quot;to&quot; =&gt; 1, &quot;told&quot; =&gt; 1, &quot;too&quot; =&gt; 1, &quot;true&quot; =&gt; 1, &quot;try&quot; =&gt; 1, &quot;until&quot; =&gt; 1, &quot;url&quot; =&gt; 1, &quot;us&quot; =&gt; 1, &quot;were&quot; =&gt; 1, &quot;when&quot; =&gt; 1, &quot;whether&quot; =&gt; 1, &quot;while&quot; =&gt; 1, &quot;with&quot; =&gt; 1, &quot;within&quot; =&gt; 1, &quot;yes&quot; =&gt; 1, &quot;you&quot; =&gt; 1, &quot;youll&quot; =&gt; 1, }</td>
113
- </tr>
114
- </table>
115
- </div>
116
- </div>
117
-
118
-
119
-
120
-
121
-
122
-
123
- <!-- if method_list -->
124
- <div id="methods">
125
- <h3 class="section-bar">Public Instance methods</h3>
126
-
127
- <div id="method-M000001" class="method-detail">
128
- <a name="M000001"></a>
129
-
130
- <div class="method-heading">
131
- <a href="WordHash.src/M000001.html" target="Code" class="method-signature"
132
- onclick="popupCode('WordHash.src/M000001.html');return false;">
133
- <span class="method-name">without_punctuation</span><span class="method-args">()</span>
134
- </a>
135
- </div>
136
-
137
- <div class="method-description">
138
- <p>
139
- Removes common punctuation symbols, returning a new string. E.g.,
140
- </p>
141
- <pre>
142
- &quot;Hello (greeting's), with {braces} &lt; &gt;...?&quot;.without_punctuation
143
- =&gt; &quot;Hello greetings with braces &quot;
144
- </pre>
145
- </div>
146
- </div>
147
-
148
- <div id="method-M000002" class="method-detail">
149
- <a name="M000002"></a>
150
-
151
- <div class="method-heading">
152
- <a href="WordHash.src/M000002.html" target="Code" class="method-signature"
153
- onclick="popupCode('WordHash.src/M000002.html');return false;">
154
- <span class="method-name">word_hash</span><span class="method-args">()</span>
155
- </a>
156
- </div>
157
-
158
- <div class="method-description">
159
- <p>
160
- Return a Hash of strings =&gt; ints. Each word in the string is stemmed,
161
- interned, and indexes to its frequency in the document.
162
- </p>
163
- </div>
164
- </div>
165
-
166
-
167
- </div>
168
-
169
-
170
- </div>
171
-
172
-
173
- <div id="validator-badges">
174
- <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p>
175
- </div>
176
-
177
- </body>
178
- </html>
@@ -1,28 +0,0 @@
1
- <?xml version="1.0" encoding="iso-8859-1"?>
2
- <!DOCTYPE html
3
- PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
4
- "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
5
-
6
- <html>
7
- <head>
8
- <title>word_hash (Classifier::WordHash)</title>
9
- <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
10
- <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" />
11
- </head>
12
- <body class="standalone-code">
13
- <pre><span class="ruby-comment cmt"># File lib/classifier/string_extensions/word_hash.rb, line 20</span>
14
- <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">word_hash</span>
15
- <span class="ruby-identifier">d</span> = <span class="ruby-constant">Hash</span>.<span class="ruby-identifier">new</span>
16
- <span class="ruby-identifier">corpus</span> = <span class="ruby-identifier">without_punctuation</span>
17
- (<span class="ruby-identifier">corpus</span>.<span class="ruby-identifier">split</span> <span class="ruby-operator">+</span> <span class="ruby-identifier">gsub</span>(<span class="ruby-regexp re">/[\w+]/</span>,<span class="ruby-value str">&quot;&quot;</span>).<span class="ruby-identifier">split</span>).<span class="ruby-identifier">each</span> <span class="ruby-keyword kw">do</span> <span class="ruby-operator">|</span><span class="ruby-identifier">word</span><span class="ruby-operator">|</span>
18
- <span class="ruby-identifier">item</span> = <span class="ruby-identifier">word</span>.<span class="ruby-identifier">downcase</span>
19
- <span class="ruby-identifier">key</span> = <span class="ruby-identifier">item</span>.<span class="ruby-identifier">stem</span>.<span class="ruby-identifier">intern</span>
20
- <span class="ruby-keyword kw">if</span> <span class="ruby-operator">!</span>(<span class="ruby-identifier">word</span> <span class="ruby-operator">=~</span> <span class="ruby-regexp re">/[\w+]/</span>) <span class="ruby-operator">||</span> <span class="ruby-identifier">word</span>.<span class="ruby-identifier">length</span> <span class="ruby-operator">&gt;</span> <span class="ruby-value">2</span>
21
- <span class="ruby-identifier">d</span>[<span class="ruby-identifier">key</span>] <span class="ruby-operator">||=</span> <span class="ruby-value">0</span>
22
- <span class="ruby-identifier">d</span>[<span class="ruby-identifier">key</span>] <span class="ruby-operator">+=</span> <span class="ruby-value">1</span>
23
- <span class="ruby-keyword kw">end</span> <span class="ruby-keyword kw">unless</span> <span class="ruby-constant">CORPUS_SKIP_WORDS</span>[<span class="ruby-identifier">item</span>]
24
- <span class="ruby-keyword kw">end</span>
25
- <span class="ruby-keyword kw">return</span> <span class="ruby-identifier">d</span>
26
- <span class="ruby-keyword kw">end</span></pre>
27
- </body>
28
- </html>
@@ -1,199 +0,0 @@
1
- # = Author
2
- #
3
- # Greg Fast, gdf@speakeasy.net
4
- #
5
- # = Copyright
6
- #
7
- # Copyright 2005 Greg Fast <gdf@speakeasy.net>
8
-
9
- module Classifier
10
-
11
- #
12
- # Porter stemmer in Ruby.
13
- #
14
- # This is the Porter stemming algorithm, ported to Ruby from the
15
- # version coded up in Perl. It's easy to follow against the rules
16
- # in the original paper in:
17
- #
18
- # Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
19
- # no. 3, pp 130-137,
20
- #
21
- # See also http://www.tartarus.org/~martin/PorterStemmer
22
- #
23
- # Send comments to raypereda@hotmail.com
24
- #
25
- module Stemmable
26
-
27
- STEP_2_LIST = {
28
- 'ational'=>'ate', 'tional'=>'tion', 'enci'=>'ence', 'anci'=>'ance',
29
- 'izer'=>'ize', 'bli'=>'ble',
30
- 'alli'=>'al', 'entli'=>'ent', 'eli'=>'e', 'ousli'=>'ous',
31
- 'ization'=>'ize', 'ation'=>'ate',
32
- 'ator'=>'ate', 'alism'=>'al', 'iveness'=>'ive', 'fulness'=>'ful',
33
- 'ousness'=>'ous', 'aliti'=>'al',
34
- 'iviti'=>'ive', 'biliti'=>'ble', 'logi'=>'log'
35
- }
36
-
37
- STEP_3_LIST = {
38
- 'icate'=>'ic', 'ative'=>'', 'alize'=>'al', 'iciti'=>'ic',
39
- 'ical'=>'ic', 'ful'=>'', 'ness'=>''
40
- }
41
-
42
-
43
- SUFFIX_1_REGEXP = /(
44
- ational |
45
- tional |
46
- enci |
47
- anci |
48
- izer |
49
- bli |
50
- alli |
51
- entli |
52
- eli |
53
- ousli |
54
- ization |
55
- ation |
56
- ator |
57
- alism |
58
- iveness |
59
- fulness |
60
- ousness |
61
- aliti |
62
- iviti |
63
- biliti |
64
- logi)$/x
65
-
66
-
67
- SUFFIX_2_REGEXP = /(
68
- al |
69
- ance |
70
- ence |
71
- er |
72
- ic |
73
- able |
74
- ible |
75
- ant |
76
- ement |
77
- ment |
78
- ent |
79
- ou |
80
- ism |
81
- ate |
82
- iti |
83
- ous |
84
- ive |
85
- ize)$/x
86
-
87
-
88
- C = "[^aeiou]" # consonant
89
- V = "[aeiouy]" # vowel
90
- CC = "#{C}(?>[^aeiouy]*)" # consonant sequence
91
- VV = "#{V}(?>[aeiou]*)" # vowel sequence
92
-
93
- MGR0 = /^(#{CC})?#{VV}#{CC}/o # [cc]vvcc... is m>0
94
- MEQ1 = /^(#{CC})?#{VV}#{CC}(#{VV})?$/o # [cc]vvcc[vv] is m=1
95
- MGR1 = /^(#{CC})?#{VV}#{CC}#{VV}#{CC}/o # [cc]vvccvvcc... is m>1
96
- VOWEL_IN_STEM = /^(#{CC})?#{V}/o # vowel in stem
97
-
98
- #
99
- # Stems the word contained in the current object. E.g.,
100
- # "actually".stem_porter
101
- # => "actual"
102
- def stem_porter
103
-
104
- # make a copy of the given object and convert it to a string.
105
- w = self.dup.to_str
106
-
107
- return w if w.length < 3
108
-
109
- # now map initial y to Y so that the patterns never treat it as vowel
110
- w[0] = 'Y' if w[0] == ?y
111
-
112
- # Step 1a
113
- if w =~ /(ss|i)es$/
114
- w = $` + $1
115
- elsif w =~ /([^s])s$/
116
- w = $` + $1
117
- end
118
-
119
- # Step 1b
120
- if w =~ /eed$/
121
- w.chop! if $` =~ MGR0
122
- elsif w =~ /(ed|ing)$/
123
- stem = $`
124
- if stem =~ VOWEL_IN_STEM
125
- w = stem
126
- case w
127
- when /(at|bl|iz)$/ then w << "e"
128
- when /([^aeiouylsz])\1$/ then w.chop!
129
- when /^#{CC}#{V}[^aeiouwxy]$/o then w << "e"
130
- end
131
- end
132
- end
133
-
134
- if w =~ /y$/
135
- stem = $`
136
- w = stem + "i" if stem =~ VOWEL_IN_STEM
137
- end
138
-
139
- # Step 2
140
- if w =~ SUFFIX_1_REGEXP
141
- stem = $`
142
- suffix = $1
143
- # print "stem= " + stem + "\n" + "suffix=" + suffix + "\n"
144
- if stem =~ MGR0
145
- w = stem + STEP_2_LIST[suffix]
146
- end
147
- end
148
-
149
- # Step 3
150
- if w =~ /(icate|ative|alize|iciti|ical|ful|ness)$/
151
- stem = $`
152
- suffix = $1
153
- if stem =~ MGR0
154
- w = stem + STEP_3_LIST[suffix]
155
- end
156
- end
157
-
158
- # Step 4
159
- if w =~ SUFFIX_2_REGEXP
160
- stem = $`
161
- if stem =~ MGR1
162
- w = stem
163
- end
164
- elsif w =~ /(s|t)(ion)$/
165
- stem = $` + $1
166
- if stem =~ MGR1
167
- w = stem
168
- end
169
- end
170
-
171
- # Step 5
172
- if w =~ /e$/
173
- stem = $`
174
- if (stem =~ MGR1) ||
175
- (stem =~ MEQ1 && stem !~ /^#{CC}#{V}[^aeiouwxy]$/o)
176
- w = stem
177
- end
178
- end
179
-
180
- if w =~ /ll$/ && w =~ MGR1
181
- w.chop!
182
- end
183
-
184
- # and turn initial Y back to y
185
- w[0] = 'y' if w[0] == ?Y
186
-
187
- w
188
- end
189
-
190
-
191
- #
192
- # make the stem_porter the default stem method, just in case we
193
- # feel like having multiple stemmers available later.
194
- #
195
- alias stem stem_porter
196
-
197
- end
198
-
199
- end