classifier 1.1.1 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77) hide show
  1. data/LICENSE +341 -0
  2. data/README +59 -6
  3. data/Rakefile +16 -4
  4. data/bin/bayes.rb +8 -2
  5. data/doc/classes/Classifier.html +15 -10
  6. data/doc/classes/Classifier/Bayes.html +68 -38
  7. data/doc/classes/Classifier/Bayes.src/{M000005.html → M000023.html} +1 -1
  8. data/doc/classes/Classifier/Bayes.src/{M000006.html → M000024.html} +1 -1
  9. data/doc/classes/Classifier/Bayes.src/M000025.html +30 -0
  10. data/doc/classes/Classifier/Bayes.src/{M000007.html → M000026.html} +1 -1
  11. data/doc/classes/Classifier/Bayes.src/{M000008.html → M000027.html} +1 -1
  12. data/doc/classes/Classifier/Bayes.src/{M000009.html → M000028.html} +4 -4
  13. data/doc/classes/Classifier/Bayes.src/{M000010.html → M000029.html} +2 -2
  14. data/doc/classes/Classifier/ContentNode.html +252 -0
  15. data/doc/classes/Classifier/ContentNode.src/M000031.html +21 -0
  16. data/doc/classes/Classifier/ContentNode.src/M000032.html +18 -0
  17. data/doc/classes/Classifier/ContentNode.src/M000033.html +18 -0
  18. data/doc/classes/Classifier/ContentNode.src/M000034.html +41 -0
  19. data/doc/classes/Classifier/LSI.html +449 -0
  20. data/doc/classes/Classifier/LSI.src/M000011.html +20 -0
  21. data/doc/classes/Classifier/LSI.src/M000012.html +18 -0
  22. data/doc/classes/Classifier/LSI.src/M000013.html +20 -0
  23. data/doc/classes/Classifier/LSI.src/M000014.html +18 -0
  24. data/doc/classes/Classifier/LSI.src/M000015.html +21 -0
  25. data/doc/classes/Classifier/LSI.src/M000016.html +18 -0
  26. data/doc/classes/Classifier/LSI.src/M000017.html +32 -0
  27. data/doc/classes/Classifier/LSI.src/M000018.html +26 -0
  28. data/doc/classes/Classifier/LSI.src/M000019.html +26 -0
  29. data/doc/classes/Classifier/LSI.src/M000020.html +23 -0
  30. data/doc/classes/Classifier/LSI.src/M000021.html +21 -0
  31. data/doc/classes/Classifier/LSI.src/M000022.html +31 -0
  32. data/doc/classes/Classifier/WordList.html +202 -0
  33. data/doc/classes/Classifier/WordList.src/M000007.html +18 -0
  34. data/doc/classes/Classifier/WordList.src/M000008.html +19 -0
  35. data/doc/classes/Classifier/WordList.src/M000009.html +19 -0
  36. data/doc/classes/Classifier/WordList.src/M000010.html +18 -0
  37. data/doc/classes/GSL.html +111 -0
  38. data/doc/classes/GSL/Vector.html +156 -0
  39. data/doc/classes/GSL/Vector.src/M000005.html +18 -0
  40. data/doc/classes/GSL/Vector.src/M000006.html +19 -0
  41. data/doc/classes/Object.html +139 -0
  42. data/doc/classes/Object.src/M000001.html +16 -0
  43. data/doc/classes/String.html +95 -9
  44. data/doc/classes/{Classifier/WordHash.src/M000001.html → String.src/M000002.html} +3 -3
  45. data/doc/classes/String.src/M000003.html +18 -0
  46. data/doc/classes/String.src/M000004.html +18 -0
  47. data/doc/created.rid +1 -1
  48. data/doc/files/README.html +102 -12
  49. data/doc/files/lib/classifier/bayes_rb.html +1 -1
  50. data/doc/files/lib/classifier/{string_extensions/porter_stemmer_rb.html → extensions/vector_serialize_rb.html} +4 -15
  51. data/doc/files/lib/classifier/{string_extensions → extensions}/word_hash_rb.html +2 -2
  52. data/doc/files/lib/classifier/extensions/word_list_rb.html +115 -0
  53. data/doc/files/lib/classifier/lsi/content_node_rb.html +115 -0
  54. data/doc/files/lib/classifier/lsi_rb.html +125 -0
  55. data/doc/files/lib/classifier/string_extensions_rb.html +2 -3
  56. data/doc/files/lib/classifier_rb.html +3 -1
  57. data/doc/fr_class_index.html +6 -2
  58. data/doc/fr_file_index.html +5 -2
  59. data/doc/fr_method_index.html +34 -11
  60. data/lib/classifier.rb +3 -1
  61. data/lib/classifier/bayes.rb +34 -9
  62. data/lib/classifier/extensions/vector_serialize.rb +14 -0
  63. data/lib/classifier/extensions/word_hash.rb +125 -0
  64. data/lib/classifier/extensions/word_list.rb +31 -0
  65. data/lib/classifier/lsi.rb +248 -0
  66. data/lib/classifier/lsi/content_node.rb +67 -0
  67. data/lib/classifier/string_extensions.rb +10 -5
  68. data/test/bayes/bayesian_test.rb +2 -2
  69. data/test/lsi/lsi_test.rb +88 -0
  70. data/test/string_extensions/word_hash_test.rb +7 -5
  71. metadata +79 -24
  72. data/doc/classes/Classifier/Stemmable.html +0 -243
  73. data/doc/classes/Classifier/Stemmable.src/M000003.html +0 -102
  74. data/doc/classes/Classifier/WordHash.html +0 -178
  75. data/doc/classes/Classifier/WordHash.src/M000002.html +0 -28
  76. data/lib/classifier/string_extensions/porter_stemmer.rb +0 -199
  77. data/lib/classifier/string_extensions/word_hash.rb +0 -119
@@ -1,102 +0,0 @@
1
- <?xml version="1.0" encoding="iso-8859-1"?>
2
- <!DOCTYPE html
3
- PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
4
- "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
5
-
6
- <html>
7
- <head>
8
- <title>stem_porter (Classifier::Stemmable)</title>
9
- <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
10
- <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" />
11
- </head>
12
- <body class="standalone-code">
13
- <pre><span class="ruby-comment cmt"># File lib/classifier/string_extensions/porter_stemmer.rb, line 102</span>
14
- <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">stem_porter</span>
15
-
16
- <span class="ruby-comment cmt"># make a copy of the given object and convert it to a string.
17
- <span class="ruby-identifier">w</span> = <span class="ruby-keyword kw">self</span>.<span class="ruby-identifier">dup</span>.<span class="ruby-identifier">to_str</span>
18
-
19
- <span class="ruby-keyword kw">return</span> <span class="ruby-identifier">w</span> <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">w</span>.<span class="ruby-identifier">length</span> <span class="ruby-operator">&lt;</span> <span class="ruby-value">3</span>
20
-
21
- <span class="ruby-comment cmt"># now map initial y to Y so that the patterns never treat it as vowel
22
- <span class="ruby-identifier">w</span>[<span class="ruby-value">0</span>] = <span class="ruby-value str">'Y'</span> <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">w</span>[<span class="ruby-value">0</span>] <span class="ruby-operator">==</span> <span class="ruby-value">?y</span>
23
-
24
- <span class="ruby-comment cmt"># Step 1a
25
- <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">w</span> <span class="ruby-operator">=~</span> <span class="ruby-regexp re">/(ss|i)es$/</span>
26
- <span class="ruby-identifier">w</span> = <span class="ruby-identifier">$`</span> <span class="ruby-operator">+</span> <span class="ruby-identifier">$1</span>
27
- <span class="ruby-keyword kw">elsif</span> <span class="ruby-identifier">w</span> <span class="ruby-operator">=~</span> <span class="ruby-regexp re">/([^s])s$/</span>
28
- <span class="ruby-identifier">w</span> = <span class="ruby-identifier">$`</span> <span class="ruby-operator">+</span> <span class="ruby-identifier">$1</span>
29
- <span class="ruby-keyword kw">end</span>
30
-
31
- <span class="ruby-comment cmt"># Step 1b
32
- <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">w</span> <span class="ruby-operator">=~</span> <span class="ruby-regexp re">/eed$/</span>
33
- <span class="ruby-identifier">w</span>.<span class="ruby-identifier">chop!</span> <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">$`</span> <span class="ruby-operator">=~</span> <span class="ruby-constant">MGR0</span>
34
- <span class="ruby-keyword kw">elsif</span> <span class="ruby-identifier">w</span> <span class="ruby-operator">=~</span> <span class="ruby-regexp re">/(ed|ing)$/</span>
35
- <span class="ruby-identifier">stem</span> = <span class="ruby-identifier">$`</span>
36
- <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">stem</span> <span class="ruby-operator">=~</span> <span class="ruby-constant">VOWEL_IN_STEM</span>
37
- <span class="ruby-identifier">w</span> = <span class="ruby-identifier">stem</span>
38
- <span class="ruby-keyword kw">case</span> <span class="ruby-identifier">w</span>
39
- <span class="ruby-keyword kw">when</span> <span class="ruby-regexp re">/(at|bl|iz)$/</span> <span class="ruby-keyword kw">then</span> <span class="ruby-identifier">w</span> <span class="ruby-operator">&lt;&lt;</span> <span class="ruby-value str">&quot;e&quot;</span>
40
- <span class="ruby-keyword kw">when</span> <span class="ruby-regexp re">/([^aeiouylsz])\1$/</span> <span class="ruby-keyword kw">then</span> <span class="ruby-identifier">w</span>.<span class="ruby-identifier">chop!</span>
41
- <span class="ruby-keyword kw">when</span> <span class="ruby-node">/^#{CC}#{V}[^aeiouwxy]$/o</span> <span class="ruby-keyword kw">then</span> <span class="ruby-identifier">w</span> <span class="ruby-operator">&lt;&lt;</span> <span class="ruby-value str">&quot;e&quot;</span>
42
- <span class="ruby-keyword kw">end</span>
43
- <span class="ruby-keyword kw">end</span>
44
- <span class="ruby-keyword kw">end</span>
45
-
46
- <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">w</span> <span class="ruby-operator">=~</span> <span class="ruby-regexp re">/y$/</span>
47
- <span class="ruby-identifier">stem</span> = <span class="ruby-identifier">$`</span>
48
- <span class="ruby-identifier">w</span> = <span class="ruby-identifier">stem</span> <span class="ruby-operator">+</span> <span class="ruby-value str">&quot;i&quot;</span> <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">stem</span> <span class="ruby-operator">=~</span> <span class="ruby-constant">VOWEL_IN_STEM</span>
49
- <span class="ruby-keyword kw">end</span>
50
-
51
- <span class="ruby-comment cmt"># Step 2
52
- <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">w</span> <span class="ruby-operator">=~</span> <span class="ruby-constant">SUFFIX_1_REGEXP</span>
53
- <span class="ruby-identifier">stem</span> = <span class="ruby-identifier">$`</span>
54
- <span class="ruby-identifier">suffix</span> = <span class="ruby-identifier">$1</span>
55
- <span class="ruby-comment cmt"># print &quot;stem= &quot; + stem + &quot;\n&quot; + &quot;suffix=&quot; + suffix + &quot;\n&quot;
56
- <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">stem</span> <span class="ruby-operator">=~</span> <span class="ruby-constant">MGR0</span>
57
- <span class="ruby-identifier">w</span> = <span class="ruby-identifier">stem</span> <span class="ruby-operator">+</span> <span class="ruby-constant">STEP_2_LIST</span>[<span class="ruby-identifier">suffix</span>]
58
- <span class="ruby-keyword kw">end</span>
59
- <span class="ruby-keyword kw">end</span>
60
-
61
- <span class="ruby-comment cmt"># Step 3
62
- <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">w</span> <span class="ruby-operator">=~</span> <span class="ruby-regexp re">/(icate|ative|alize|iciti|ical|ful|ness)$/</span>
63
- <span class="ruby-identifier">stem</span> = <span class="ruby-identifier">$`</span>
64
- <span class="ruby-identifier">suffix</span> = <span class="ruby-identifier">$1</span>
65
- <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">stem</span> <span class="ruby-operator">=~</span> <span class="ruby-constant">MGR0</span>
66
- <span class="ruby-identifier">w</span> = <span class="ruby-identifier">stem</span> <span class="ruby-operator">+</span> <span class="ruby-constant">STEP_3_LIST</span>[<span class="ruby-identifier">suffix</span>]
67
- <span class="ruby-keyword kw">end</span>
68
- <span class="ruby-keyword kw">end</span>
69
-
70
- <span class="ruby-comment cmt"># Step 4
71
- <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">w</span> <span class="ruby-operator">=~</span> <span class="ruby-constant">SUFFIX_2_REGEXP</span>
72
- <span class="ruby-identifier">stem</span> = <span class="ruby-identifier">$`</span>
73
- <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">stem</span> <span class="ruby-operator">=~</span> <span class="ruby-constant">MGR1</span>
74
- <span class="ruby-identifier">w</span> = <span class="ruby-identifier">stem</span>
75
- <span class="ruby-keyword kw">end</span>
76
- <span class="ruby-keyword kw">elsif</span> <span class="ruby-identifier">w</span> <span class="ruby-operator">=~</span> <span class="ruby-regexp re">/(s|t)(ion)$/</span>
77
- <span class="ruby-identifier">stem</span> = <span class="ruby-identifier">$`</span> <span class="ruby-operator">+</span> <span class="ruby-identifier">$1</span>
78
- <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">stem</span> <span class="ruby-operator">=~</span> <span class="ruby-constant">MGR1</span>
79
- <span class="ruby-identifier">w</span> = <span class="ruby-identifier">stem</span>
80
- <span class="ruby-keyword kw">end</span>
81
- <span class="ruby-keyword kw">end</span>
82
-
83
- <span class="ruby-comment cmt"># Step 5
84
- <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">w</span> <span class="ruby-operator">=~</span> <span class="ruby-regexp re">/e$/</span>
85
- <span class="ruby-identifier">stem</span> = <span class="ruby-identifier">$`</span>
86
- <span class="ruby-keyword kw">if</span> (<span class="ruby-identifier">stem</span> <span class="ruby-operator">=~</span> <span class="ruby-constant">MGR1</span>) <span class="ruby-operator">||</span>
87
- (<span class="ruby-identifier">stem</span> <span class="ruby-operator">=~</span> <span class="ruby-constant">MEQ1</span> <span class="ruby-operator">&amp;&amp;</span> <span class="ruby-identifier">stem</span> <span class="ruby-operator">!~</span> <span class="ruby-node">/^#{CC}#{V}[^aeiouwxy]$/o</span>)
88
- <span class="ruby-identifier">w</span> = <span class="ruby-identifier">stem</span>
89
- <span class="ruby-keyword kw">end</span>
90
- <span class="ruby-keyword kw">end</span>
91
-
92
- <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">w</span> <span class="ruby-operator">=~</span> <span class="ruby-regexp re">/ll$/</span> <span class="ruby-operator">&amp;&amp;</span> <span class="ruby-identifier">w</span> <span class="ruby-operator">=~</span> <span class="ruby-constant">MGR1</span>
93
- <span class="ruby-identifier">w</span>.<span class="ruby-identifier">chop!</span>
94
- <span class="ruby-keyword kw">end</span>
95
-
96
- <span class="ruby-comment cmt"># and turn initial Y back to y
97
- <span class="ruby-identifier">w</span>[<span class="ruby-value">0</span>] = <span class="ruby-value str">'y'</span> <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">w</span>[<span class="ruby-value">0</span>] <span class="ruby-operator">==</span> <span class="ruby-value">?Y</span>
98
-
99
- <span class="ruby-identifier">w</span>
100
- <span class="ruby-keyword kw">end</span></pre>
101
- </body>
102
- </html>
@@ -1,178 +0,0 @@
1
- <?xml version="1.0" encoding="iso-8859-1"?>
2
- <!DOCTYPE html
3
- PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
4
- "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
5
-
6
- <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
7
- <head>
8
- <title>Module: Classifier::WordHash</title>
9
- <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
10
- <meta http-equiv="Content-Script-Type" content="text/javascript" />
11
- <link rel="stylesheet" href="../.././rdoc-style.css" type="text/css" media="screen" />
12
- <script type="text/javascript">
13
- // <![CDATA[
14
-
15
- function popupCode( url ) {
16
- window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400")
17
- }
18
-
19
- function toggleCode( id ) {
20
- if ( document.getElementById )
21
- elem = document.getElementById( id );
22
- else if ( document.all )
23
- elem = eval( "document.all." + id );
24
- else
25
- return false;
26
-
27
- elemStyle = elem.style;
28
-
29
- if ( elemStyle.display != "block" ) {
30
- elemStyle.display = "block"
31
- } else {
32
- elemStyle.display = "none"
33
- }
34
-
35
- return true;
36
- }
37
-
38
- // Make codeblocks hidden by default
39
- document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }</style>" )
40
-
41
- // ]]>
42
- </script>
43
-
44
- </head>
45
- <body>
46
-
47
-
48
-
49
- <div id="classHeader">
50
- <table class="header-table">
51
- <tr class="top-aligned-row">
52
- <td><strong>Module</strong></td>
53
- <td class="class-name-in-header">Classifier::WordHash</td>
54
- </tr>
55
- <tr class="top-aligned-row">
56
- <td><strong>In:</strong></td>
57
- <td>
58
- <a href="../../files/lib/classifier/string_extensions/word_hash_rb.html">
59
- lib/classifier/string_extensions/word_hash.rb
60
- </a>
61
- <br />
62
- </td>
63
- </tr>
64
-
65
- </table>
66
- </div>
67
- <!-- banner header -->
68
-
69
- <div id="bodyContent">
70
-
71
-
72
-
73
- <div id="contextContent">
74
-
75
- <div id="description">
76
- <p>
77
- This module is mixed into <a href="../String.html">String</a> to provide
78
- convenience methods for the <a href="../Classifier.html">Classifier</a>
79
- package.
80
- </p>
81
-
82
- </div>
83
-
84
-
85
- </div>
86
-
87
- <div id="method-list">
88
- <h3 class="section-bar">Methods</h3>
89
-
90
- <div class="name-list">
91
- <a href="#M000001">without_punctuation</a>&nbsp;&nbsp;
92
- <a href="#M000002">word_hash</a>&nbsp;&nbsp;
93
- </div>
94
- </div>
95
-
96
- </div>
97
-
98
-
99
- <!-- if includes -->
100
-
101
- <div id="section">
102
-
103
-
104
- <div id="constants-list">
105
- <h3 class="section-bar">Constants</h3>
106
-
107
- <div class="name-list">
108
- <table summary="Constants">
109
- <tr class="top-aligned-row context-row">
110
- <td class="context-item-name">CORPUS_SKIP_WORDS</td>
111
- <td>=</td>
112
- <td class="context-item-value">{ &quot;a&quot; =&gt; 1, &quot;again&quot; =&gt; 1, &quot;all&quot; =&gt; 1, &quot;along&quot; =&gt; 1, &quot;are&quot; =&gt; 1, &quot;also&quot; =&gt; 1, &quot;an&quot; =&gt; 1, &quot;and&quot; =&gt; 1, &quot;as&quot; =&gt; 1, &quot;at&quot; =&gt; 1, &quot;but&quot; =&gt; 1, &quot;by&quot; =&gt; 1, &quot;came&quot; =&gt; 1, &quot;can&quot; =&gt; 1, &quot;cant&quot; =&gt; 1, &quot;couldnt&quot; =&gt; 1, &quot;did&quot; =&gt; 1, &quot;didn&quot; =&gt; 1, &quot;didnt&quot; =&gt; 1, &quot;do&quot; =&gt; 1, &quot;doesnt&quot; =&gt; 1, &quot;dont&quot; =&gt; 1, &quot;ever&quot; =&gt; 1, &quot;first&quot; =&gt; 1, &quot;from&quot; =&gt; 1, &quot;have&quot; =&gt; 1, &quot;her&quot; =&gt; 1, &quot;here&quot; =&gt; 1, &quot;him&quot; =&gt; 1, &quot;how&quot; =&gt; 1, &quot;i&quot; =&gt; 1, &quot;if&quot; =&gt; 1, &quot;in&quot; =&gt; 1, &quot;into&quot; =&gt; 1, &quot;is&quot; =&gt; 1, &quot;isnt&quot; =&gt; 1, &quot;it&quot; =&gt; 1, &quot;itll&quot; =&gt; 1, &quot;just&quot; =&gt; 1, &quot;last&quot; =&gt; 1, &quot;least&quot; =&gt; 1, &quot;like&quot; =&gt; 1, &quot;most&quot; =&gt; 1, &quot;my&quot; =&gt; 1, &quot;new&quot; =&gt; 1, &quot;no&quot; =&gt; 1, &quot;not&quot; =&gt; 1, &quot;now&quot; =&gt; 1, &quot;of&quot; =&gt; 1, &quot;on&quot; =&gt; 1, &quot;or&quot; =&gt; 1, &quot;should&quot; =&gt; 1, &quot;sinc&quot; =&gt; 1, &quot;so&quot; =&gt; 1, &quot;some&quot; =&gt; 1, &quot;th&quot; =&gt; 1, &quot;than&quot; =&gt; 1, &quot;this&quot; =&gt; 1, &quot;that&quot; =&gt; 1, &quot;the&quot; =&gt; 1, &quot;their&quot; =&gt; 1, &quot;then&quot; =&gt; 1, &quot;those&quot; =&gt; 1, &quot;to&quot; =&gt; 1, &quot;told&quot; =&gt; 1, &quot;too&quot; =&gt; 1, &quot;true&quot; =&gt; 1, &quot;try&quot; =&gt; 1, &quot;until&quot; =&gt; 1, &quot;url&quot; =&gt; 1, &quot;us&quot; =&gt; 1, &quot;were&quot; =&gt; 1, &quot;when&quot; =&gt; 1, &quot;whether&quot; =&gt; 1, &quot;while&quot; =&gt; 1, &quot;with&quot; =&gt; 1, &quot;within&quot; =&gt; 1, &quot;yes&quot; =&gt; 1, &quot;you&quot; =&gt; 1, &quot;youll&quot; =&gt; 1, }</td>
113
- </tr>
114
- </table>
115
- </div>
116
- </div>
117
-
118
-
119
-
120
-
121
-
122
-
123
- <!-- if method_list -->
124
- <div id="methods">
125
- <h3 class="section-bar">Public Instance methods</h3>
126
-
127
- <div id="method-M000001" class="method-detail">
128
- <a name="M000001"></a>
129
-
130
- <div class="method-heading">
131
- <a href="WordHash.src/M000001.html" target="Code" class="method-signature"
132
- onclick="popupCode('WordHash.src/M000001.html');return false;">
133
- <span class="method-name">without_punctuation</span><span class="method-args">()</span>
134
- </a>
135
- </div>
136
-
137
- <div class="method-description">
138
- <p>
139
- Removes common punctuation symbols, returning a new string. E.g.,
140
- </p>
141
- <pre>
142
- &quot;Hello (greeting's), with {braces} &lt; &gt;...?&quot;.without_punctuation
143
- =&gt; &quot;Hello greetings with braces &quot;
144
- </pre>
145
- </div>
146
- </div>
147
-
148
- <div id="method-M000002" class="method-detail">
149
- <a name="M000002"></a>
150
-
151
- <div class="method-heading">
152
- <a href="WordHash.src/M000002.html" target="Code" class="method-signature"
153
- onclick="popupCode('WordHash.src/M000002.html');return false;">
154
- <span class="method-name">word_hash</span><span class="method-args">()</span>
155
- </a>
156
- </div>
157
-
158
- <div class="method-description">
159
- <p>
160
- Return a Hash of strings =&gt; ints. Each word in the string is stemmed,
161
- interned, and indexes to its frequency in the document.
162
- </p>
163
- </div>
164
- </div>
165
-
166
-
167
- </div>
168
-
169
-
170
- </div>
171
-
172
-
173
- <div id="validator-badges">
174
- <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p>
175
- </div>
176
-
177
- </body>
178
- </html>
@@ -1,28 +0,0 @@
1
- <?xml version="1.0" encoding="iso-8859-1"?>
2
- <!DOCTYPE html
3
- PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
4
- "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
5
-
6
- <html>
7
- <head>
8
- <title>word_hash (Classifier::WordHash)</title>
9
- <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
10
- <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" />
11
- </head>
12
- <body class="standalone-code">
13
- <pre><span class="ruby-comment cmt"># File lib/classifier/string_extensions/word_hash.rb, line 20</span>
14
- <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">word_hash</span>
15
- <span class="ruby-identifier">d</span> = <span class="ruby-constant">Hash</span>.<span class="ruby-identifier">new</span>
16
- <span class="ruby-identifier">corpus</span> = <span class="ruby-identifier">without_punctuation</span>
17
- (<span class="ruby-identifier">corpus</span>.<span class="ruby-identifier">split</span> <span class="ruby-operator">+</span> <span class="ruby-identifier">gsub</span>(<span class="ruby-regexp re">/[\w+]/</span>,<span class="ruby-value str">&quot;&quot;</span>).<span class="ruby-identifier">split</span>).<span class="ruby-identifier">each</span> <span class="ruby-keyword kw">do</span> <span class="ruby-operator">|</span><span class="ruby-identifier">word</span><span class="ruby-operator">|</span>
18
- <span class="ruby-identifier">item</span> = <span class="ruby-identifier">word</span>.<span class="ruby-identifier">downcase</span>
19
- <span class="ruby-identifier">key</span> = <span class="ruby-identifier">item</span>.<span class="ruby-identifier">stem</span>.<span class="ruby-identifier">intern</span>
20
- <span class="ruby-keyword kw">if</span> <span class="ruby-operator">!</span>(<span class="ruby-identifier">word</span> <span class="ruby-operator">=~</span> <span class="ruby-regexp re">/[\w+]/</span>) <span class="ruby-operator">||</span> <span class="ruby-identifier">word</span>.<span class="ruby-identifier">length</span> <span class="ruby-operator">&gt;</span> <span class="ruby-value">2</span>
21
- <span class="ruby-identifier">d</span>[<span class="ruby-identifier">key</span>] <span class="ruby-operator">||=</span> <span class="ruby-value">0</span>
22
- <span class="ruby-identifier">d</span>[<span class="ruby-identifier">key</span>] <span class="ruby-operator">+=</span> <span class="ruby-value">1</span>
23
- <span class="ruby-keyword kw">end</span> <span class="ruby-keyword kw">unless</span> <span class="ruby-constant">CORPUS_SKIP_WORDS</span>[<span class="ruby-identifier">item</span>]
24
- <span class="ruby-keyword kw">end</span>
25
- <span class="ruby-keyword kw">return</span> <span class="ruby-identifier">d</span>
26
- <span class="ruby-keyword kw">end</span></pre>
27
- </body>
28
- </html>
@@ -1,199 +0,0 @@
1
- # = Author
2
- #
3
- # Greg Fast, gdf@speakeasy.net
4
- #
5
- # = Copyright
6
- #
7
- # Copyright 2005 Greg Fast <gdf@speakeasy.net>
8
-
9
- module Classifier
10
-
11
- #
12
- # Porter stemmer in Ruby.
13
- #
14
- # This is the Porter stemming algorithm, ported to Ruby from the
15
- # version coded up in Perl. It's easy to follow against the rules
16
- # in the original paper in:
17
- #
18
- # Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
19
- # no. 3, pp 130-137,
20
- #
21
- # See also http://www.tartarus.org/~martin/PorterStemmer
22
- #
23
- # Send comments to raypereda@hotmail.com
24
- #
25
- module Stemmable
26
-
27
- STEP_2_LIST = {
28
- 'ational'=>'ate', 'tional'=>'tion', 'enci'=>'ence', 'anci'=>'ance',
29
- 'izer'=>'ize', 'bli'=>'ble',
30
- 'alli'=>'al', 'entli'=>'ent', 'eli'=>'e', 'ousli'=>'ous',
31
- 'ization'=>'ize', 'ation'=>'ate',
32
- 'ator'=>'ate', 'alism'=>'al', 'iveness'=>'ive', 'fulness'=>'ful',
33
- 'ousness'=>'ous', 'aliti'=>'al',
34
- 'iviti'=>'ive', 'biliti'=>'ble', 'logi'=>'log'
35
- }
36
-
37
- STEP_3_LIST = {
38
- 'icate'=>'ic', 'ative'=>'', 'alize'=>'al', 'iciti'=>'ic',
39
- 'ical'=>'ic', 'ful'=>'', 'ness'=>''
40
- }
41
-
42
-
43
- SUFFIX_1_REGEXP = /(
44
- ational |
45
- tional |
46
- enci |
47
- anci |
48
- izer |
49
- bli |
50
- alli |
51
- entli |
52
- eli |
53
- ousli |
54
- ization |
55
- ation |
56
- ator |
57
- alism |
58
- iveness |
59
- fulness |
60
- ousness |
61
- aliti |
62
- iviti |
63
- biliti |
64
- logi)$/x
65
-
66
-
67
- SUFFIX_2_REGEXP = /(
68
- al |
69
- ance |
70
- ence |
71
- er |
72
- ic |
73
- able |
74
- ible |
75
- ant |
76
- ement |
77
- ment |
78
- ent |
79
- ou |
80
- ism |
81
- ate |
82
- iti |
83
- ous |
84
- ive |
85
- ize)$/x
86
-
87
-
88
- C = "[^aeiou]" # consonant
89
- V = "[aeiouy]" # vowel
90
- CC = "#{C}(?>[^aeiouy]*)" # consonant sequence
91
- VV = "#{V}(?>[aeiou]*)" # vowel sequence
92
-
93
- MGR0 = /^(#{CC})?#{VV}#{CC}/o # [cc]vvcc... is m>0
94
- MEQ1 = /^(#{CC})?#{VV}#{CC}(#{VV})?$/o # [cc]vvcc[vv] is m=1
95
- MGR1 = /^(#{CC})?#{VV}#{CC}#{VV}#{CC}/o # [cc]vvccvvcc... is m>1
96
- VOWEL_IN_STEM = /^(#{CC})?#{V}/o # vowel in stem
97
-
98
- #
99
- # Stems the word contained in the current object. E.g.,
100
- # "actually".stem_porter
101
- # => "actual"
102
- def stem_porter
103
-
104
- # make a copy of the given object and convert it to a string.
105
- w = self.dup.to_str
106
-
107
- return w if w.length < 3
108
-
109
- # now map initial y to Y so that the patterns never treat it as vowel
110
- w[0] = 'Y' if w[0] == ?y
111
-
112
- # Step 1a
113
- if w =~ /(ss|i)es$/
114
- w = $` + $1
115
- elsif w =~ /([^s])s$/
116
- w = $` + $1
117
- end
118
-
119
- # Step 1b
120
- if w =~ /eed$/
121
- w.chop! if $` =~ MGR0
122
- elsif w =~ /(ed|ing)$/
123
- stem = $`
124
- if stem =~ VOWEL_IN_STEM
125
- w = stem
126
- case w
127
- when /(at|bl|iz)$/ then w << "e"
128
- when /([^aeiouylsz])\1$/ then w.chop!
129
- when /^#{CC}#{V}[^aeiouwxy]$/o then w << "e"
130
- end
131
- end
132
- end
133
-
134
- if w =~ /y$/
135
- stem = $`
136
- w = stem + "i" if stem =~ VOWEL_IN_STEM
137
- end
138
-
139
- # Step 2
140
- if w =~ SUFFIX_1_REGEXP
141
- stem = $`
142
- suffix = $1
143
- # print "stem= " + stem + "\n" + "suffix=" + suffix + "\n"
144
- if stem =~ MGR0
145
- w = stem + STEP_2_LIST[suffix]
146
- end
147
- end
148
-
149
- # Step 3
150
- if w =~ /(icate|ative|alize|iciti|ical|ful|ness)$/
151
- stem = $`
152
- suffix = $1
153
- if stem =~ MGR0
154
- w = stem + STEP_3_LIST[suffix]
155
- end
156
- end
157
-
158
- # Step 4
159
- if w =~ SUFFIX_2_REGEXP
160
- stem = $`
161
- if stem =~ MGR1
162
- w = stem
163
- end
164
- elsif w =~ /(s|t)(ion)$/
165
- stem = $` + $1
166
- if stem =~ MGR1
167
- w = stem
168
- end
169
- end
170
-
171
- # Step 5
172
- if w =~ /e$/
173
- stem = $`
174
- if (stem =~ MGR1) ||
175
- (stem =~ MEQ1 && stem !~ /^#{CC}#{V}[^aeiouwxy]$/o)
176
- w = stem
177
- end
178
- end
179
-
180
- if w =~ /ll$/ && w =~ MGR1
181
- w.chop!
182
- end
183
-
184
- # and turn initial Y back to y
185
- w[0] = 'y' if w[0] == ?Y
186
-
187
- w
188
- end
189
-
190
-
191
- #
192
- # make the stem_porter the default stem method, just in case we
193
- # feel like having multiple stemmers available later.
194
- #
195
- alias stem stem_porter
196
-
197
- end
198
-
199
- end