classifier 1.2.0 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +361 -273
- data/README +6 -5
- data/Rakefile +12 -2
- data/bin/summarize.rb +11 -0
- data/doc/classes/Array.html +139 -0
- data/doc/classes/Array.src/M000003.html +18 -0
- data/doc/classes/Classifier.html +5 -5
- data/doc/classes/Classifier/Bayes.html +43 -43
- data/doc/classes/Classifier/Bayes.src/{M000023.html → M000038.html} +0 -0
- data/doc/classes/Classifier/Bayes.src/{M000024.html → M000039.html} +0 -0
- data/doc/classes/Classifier/Bayes.src/{M000025.html → M000040.html} +0 -0
- data/doc/classes/Classifier/Bayes.src/{M000026.html → M000041.html} +0 -0
- data/doc/classes/Classifier/Bayes.src/{M000027.html → M000042.html} +0 -0
- data/doc/classes/Classifier/Bayes.src/{M000028.html → M000043.html} +0 -0
- data/doc/classes/Classifier/Bayes.src/{M000029.html → M000044.html} +0 -0
- data/doc/classes/Classifier/ContentNode.html +23 -28
- data/doc/classes/Classifier/ContentNode.src/M000046.html +19 -0
- data/doc/classes/Classifier/ContentNode.src/{M000032.html → M000047.html} +1 -1
- data/doc/classes/Classifier/ContentNode.src/{M000033.html → M000048.html} +1 -1
- data/doc/classes/Classifier/ContentNode.src/M000049.html +49 -0
- data/doc/classes/Classifier/LSI.html +158 -68
- data/doc/classes/Classifier/LSI.src/M000022.html +6 -17
- data/doc/classes/Classifier/LSI.src/{M000012.html → M000023.html} +2 -2
- data/doc/classes/Classifier/LSI.src/{M000013.html → M000024.html} +3 -2
- data/doc/classes/Classifier/LSI.src/{M000014.html → M000025.html} +1 -1
- data/doc/classes/Classifier/LSI.src/M000026.html +19 -0
- data/doc/classes/Classifier/LSI.src/{M000015.html → M000027.html} +1 -1
- data/doc/classes/Classifier/LSI.src/{M000016.html → M000028.html} +1 -1
- data/doc/classes/Classifier/LSI.src/M000029.html +19 -0
- data/doc/classes/Classifier/LSI.src/M000030.html +43 -0
- data/doc/classes/Classifier/LSI.src/M000031.html +23 -0
- data/doc/classes/Classifier/LSI.src/{M000018.html → M000032.html} +7 -3
- data/doc/classes/Classifier/LSI.src/{M000019.html → M000033.html} +6 -2
- data/doc/classes/Classifier/LSI.src/{M000020.html → M000034.html} +2 -4
- data/doc/classes/Classifier/LSI.src/{M000021.html → M000035.html} +1 -1
- data/doc/classes/Classifier/LSI.src/M000036.html +31 -0
- data/doc/classes/Classifier/LSI.src/M000037.html +21 -0
- data/doc/classes/Classifier/WordList.html +37 -22
- data/doc/classes/Classifier/WordList.src/{M000007.html → M000017.html} +2 -2
- data/doc/classes/Classifier/WordList.src/{M000008.html → M000018.html} +1 -1
- data/doc/classes/Classifier/WordList.src/{M000009.html → M000019.html} +1 -1
- data/doc/classes/Classifier/WordList.src/M000020.html +18 -0
- data/doc/classes/Classifier/WordList.src/{M000010.html → M000021.html} +1 -1
- data/doc/classes/GSL.html +2 -1
- data/doc/classes/GSL/Matrix.html +126 -0
- data/doc/classes/GSL/Vector.html +10 -10
- data/doc/classes/GSL/Vector.src/{M000005.html → M000015.html} +0 -0
- data/doc/classes/GSL/Vector.src/{M000006.html → M000016.html} +0 -0
- data/doc/classes/Matrix.html +184 -0
- data/doc/classes/Matrix.src/M000004.html +18 -0
- data/doc/classes/Matrix.src/M000005.html +76 -0
- data/doc/classes/Matrix.src/M000006.html +18 -0
- data/doc/classes/Object.html +7 -7
- data/doc/classes/Object.src/{M000001.html → M000007.html} +1 -1
- data/doc/classes/String.html +90 -20
- data/doc/classes/String.src/{M000002.html → M000008.html} +0 -0
- data/doc/classes/String.src/{M000003.html → M000009.html} +0 -0
- data/doc/classes/String.src/{M000004.html → M000010.html} +0 -0
- data/doc/classes/String.src/M000011.html +18 -0
- data/doc/classes/String.src/M000012.html +18 -0
- data/doc/classes/String.src/M000013.html +18 -0
- data/doc/classes/String.src/M000014.html +18 -0
- data/doc/classes/Vector.html +154 -0
- data/doc/classes/Vector.src/M000001.html +22 -0
- data/doc/classes/Vector.src/M000002.html +25 -0
- data/doc/created.rid +1 -1
- data/doc/files/README.html +14 -8
- data/doc/files/lib/classifier/bayes_rb.html +1 -1
- data/doc/files/lib/classifier/{string_extensions_rb.html → extensions/string_rb.html} +5 -5
- data/doc/files/lib/classifier/extensions/vector_rb.html +120 -0
- data/doc/files/lib/classifier/extensions/vector_serialize_rb.html +1 -1
- data/doc/files/lib/classifier/extensions/word_hash_rb.html +1 -1
- data/doc/files/lib/classifier/lsi/content_node_rb.html +2 -2
- data/doc/files/lib/classifier/lsi/summary_rb.html +115 -0
- data/doc/files/lib/classifier/{extensions → lsi}/word_list_rb.html +3 -3
- data/doc/files/lib/classifier/lsi_rb.html +5 -3
- data/doc/files/lib/classifier_rb.html +2 -2
- data/doc/fr_class_index.html +4 -0
- data/doc/fr_file_index.html +4 -2
- data/doc/fr_method_index.html +49 -34
- data/doc/index.html +2 -2
- data/lib/classifier.rb +1 -1
- data/lib/classifier/{string_extensions.rb → extensions/string.rb} +0 -0
- data/lib/classifier/extensions/vector.rb +106 -0
- data/lib/classifier/extensions/vector_serialize.rb +6 -0
- data/lib/classifier/lsi.rb +101 -31
- data/lib/classifier/lsi/content_node.rb +28 -23
- data/lib/classifier/lsi/summary.rb +31 -0
- data/lib/classifier/{extensions → lsi}/word_list.rb +7 -2
- data/test/{string_extensions → extensions}/word_hash_test.rb +0 -0
- data/test/lsi/lsi_test.rb +36 -1
- metadata +68 -41
- data/doc/classes/Classifier/ContentNode.src/M000031.html +0 -21
- data/doc/classes/Classifier/ContentNode.src/M000034.html +0 -41
- data/doc/classes/Classifier/LSI.src/M000011.html +0 -20
- data/doc/classes/Classifier/LSI.src/M000017.html +0 -32
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
@@ -83,7 +83,7 @@
|
|
|
83
83
|
<div id="description">
|
|
84
84
|
<p>
|
|
85
85
|
This is an internal data structure class for the <a href="LSI.html">LSI</a>
|
|
86
|
-
node. Save for <a href="ContentNode.html#
|
|
86
|
+
node. Save for <a href="ContentNode.html#M000049">raw_vector_with</a>, it
|
|
87
87
|
should be fairly straightforward to understand. You should never have to
|
|
88
88
|
use it directly.
|
|
89
89
|
</p>
|
|
@@ -97,10 +97,10 @@ use it directly.
|
|
|
97
97
|
<h3 class="section-bar">Methods</h3>
|
|
98
98
|
|
|
99
99
|
<div class="name-list">
|
|
100
|
-
<a href="#
|
|
101
|
-
<a href="#
|
|
102
|
-
<a href="#
|
|
103
|
-
<a href="#
|
|
100
|
+
<a href="#M000046">new</a>
|
|
101
|
+
<a href="#M000049">raw_vector_with</a>
|
|
102
|
+
<a href="#M000048">search_norm</a>
|
|
103
|
+
<a href="#M000047">search_vector</a>
|
|
104
104
|
</div>
|
|
105
105
|
</div>
|
|
106
106
|
|
|
@@ -145,14 +145,9 @@ use it directly.
|
|
|
145
145
|
<td class="context-item-value"> [RW] </td>
|
|
146
146
|
<td class="context-item-desc"></td>
|
|
147
147
|
</tr>
|
|
148
|
-
<tr class="top-aligned-row context-row">
|
|
149
|
-
<td class="context-item-name">source</td>
|
|
150
|
-
<td class="context-item-value"> [R] </td>
|
|
151
|
-
<td class="context-item-desc"></td>
|
|
152
|
-
</tr>
|
|
153
148
|
<tr class="top-aligned-row context-row">
|
|
154
149
|
<td class="context-item-name">word_hash</td>
|
|
155
|
-
<td class="context-item-value"> [
|
|
150
|
+
<td class="context-item-value"> [R] </td>
|
|
156
151
|
<td class="context-item-desc"></td>
|
|
157
152
|
</tr>
|
|
158
153
|
</table>
|
|
@@ -165,13 +160,13 @@ use it directly.
|
|
|
165
160
|
<div id="methods">
|
|
166
161
|
<h3 class="section-bar">Public Class methods</h3>
|
|
167
162
|
|
|
168
|
-
<div id="method-
|
|
169
|
-
<a name="
|
|
163
|
+
<div id="method-M000046" class="method-detail">
|
|
164
|
+
<a name="M000046"></a>
|
|
170
165
|
|
|
171
166
|
<div class="method-heading">
|
|
172
|
-
<a href="ContentNode.src/
|
|
173
|
-
onclick="popupCode('ContentNode.src/
|
|
174
|
-
<span class="method-name">new</span><span class="method-args">(
|
|
167
|
+
<a href="ContentNode.src/M000046.html" target="Code" class="method-signature"
|
|
168
|
+
onclick="popupCode('ContentNode.src/M000046.html');return false;">
|
|
169
|
+
<span class="method-name">new</span><span class="method-args">( word_hash, *categories )</span>
|
|
175
170
|
</a>
|
|
176
171
|
</div>
|
|
177
172
|
|
|
@@ -185,12 +180,12 @@ source.to_s
|
|
|
185
180
|
|
|
186
181
|
<h3 class="section-bar">Public Instance methods</h3>
|
|
187
182
|
|
|
188
|
-
<div id="method-
|
|
189
|
-
<a name="
|
|
183
|
+
<div id="method-M000049" class="method-detail">
|
|
184
|
+
<a name="M000049"></a>
|
|
190
185
|
|
|
191
186
|
<div class="method-heading">
|
|
192
|
-
<a href="ContentNode.src/
|
|
193
|
-
onclick="popupCode('ContentNode.src/
|
|
187
|
+
<a href="ContentNode.src/M000049.html" target="Code" class="method-signature"
|
|
188
|
+
onclick="popupCode('ContentNode.src/M000049.html');return false;">
|
|
194
189
|
<span class="method-name">raw_vector_with</span><span class="method-args">( word_list )</span>
|
|
195
190
|
</a>
|
|
196
191
|
</div>
|
|
@@ -203,12 +198,12 @@ mapping the vector space.
|
|
|
203
198
|
</div>
|
|
204
199
|
</div>
|
|
205
200
|
|
|
206
|
-
<div id="method-
|
|
207
|
-
<a name="
|
|
201
|
+
<div id="method-M000048" class="method-detail">
|
|
202
|
+
<a name="M000048"></a>
|
|
208
203
|
|
|
209
204
|
<div class="method-heading">
|
|
210
|
-
<a href="ContentNode.src/
|
|
211
|
-
onclick="popupCode('ContentNode.src/
|
|
205
|
+
<a href="ContentNode.src/M000048.html" target="Code" class="method-signature"
|
|
206
|
+
onclick="popupCode('ContentNode.src/M000048.html');return false;">
|
|
212
207
|
<span class="method-name">search_norm</span><span class="method-args">()</span>
|
|
213
208
|
</a>
|
|
214
209
|
</div>
|
|
@@ -220,12 +215,12 @@ Use this to fetch the appropriate search vector in normalized form.
|
|
|
220
215
|
</div>
|
|
221
216
|
</div>
|
|
222
217
|
|
|
223
|
-
<div id="method-
|
|
224
|
-
<a name="
|
|
218
|
+
<div id="method-M000047" class="method-detail">
|
|
219
|
+
<a name="M000047"></a>
|
|
225
220
|
|
|
226
221
|
<div class="method-heading">
|
|
227
|
-
<a href="ContentNode.src/
|
|
228
|
-
onclick="popupCode('ContentNode.src/
|
|
222
|
+
<a href="ContentNode.src/M000047.html" target="Code" class="method-signature"
|
|
223
|
+
onclick="popupCode('ContentNode.src/M000047.html');return false;">
|
|
229
224
|
<span class="method-name">search_vector</span><span class="method-args">()</span>
|
|
230
225
|
</a>
|
|
231
226
|
</div>
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
<?xml version="1.0" encoding="iso-8859-1"?>
|
|
2
|
+
<!DOCTYPE html
|
|
3
|
+
PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
|
|
4
|
+
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
|
5
|
+
|
|
6
|
+
<html>
|
|
7
|
+
<head>
|
|
8
|
+
<title>new (Classifier::ContentNode)</title>
|
|
9
|
+
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
|
|
10
|
+
<link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" />
|
|
11
|
+
</head>
|
|
12
|
+
<body class="standalone-code">
|
|
13
|
+
<pre><span class="ruby-comment cmt"># File lib/classifier/lsi/content_node.rb, line 18</span>
|
|
14
|
+
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">initialize</span>( <span class="ruby-identifier">word_hash</span>, <span class="ruby-operator">*</span><span class="ruby-identifier">categories</span> )
|
|
15
|
+
<span class="ruby-ivar">@categories</span> = <span class="ruby-identifier">categories</span> <span class="ruby-operator">||</span> []
|
|
16
|
+
<span class="ruby-ivar">@word_hash</span> = <span class="ruby-identifier">word_hash</span>
|
|
17
|
+
<span class="ruby-keyword kw">end</span></pre>
|
|
18
|
+
</body>
|
|
19
|
+
</html>
|
|
@@ -10,7 +10,7 @@
|
|
|
10
10
|
<link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" />
|
|
11
11
|
</head>
|
|
12
12
|
<body class="standalone-code">
|
|
13
|
-
<pre><span class="ruby-comment cmt"># File lib/classifier/lsi/content_node.rb, line
|
|
13
|
+
<pre><span class="ruby-comment cmt"># File lib/classifier/lsi/content_node.rb, line 24</span>
|
|
14
14
|
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">search_vector</span>
|
|
15
15
|
<span class="ruby-ivar">@lsi_vector</span> <span class="ruby-operator">||</span> <span class="ruby-ivar">@raw_vector</span>
|
|
16
16
|
<span class="ruby-keyword kw">end</span></pre>
|
|
@@ -10,7 +10,7 @@
|
|
|
10
10
|
<link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" />
|
|
11
11
|
</head>
|
|
12
12
|
<body class="standalone-code">
|
|
13
|
-
<pre><span class="ruby-comment cmt"># File lib/classifier/lsi/content_node.rb, line
|
|
13
|
+
<pre><span class="ruby-comment cmt"># File lib/classifier/lsi/content_node.rb, line 29</span>
|
|
14
14
|
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">search_norm</span>
|
|
15
15
|
<span class="ruby-ivar">@lsi_norm</span> <span class="ruby-operator">||</span> <span class="ruby-ivar">@raw_norm</span>
|
|
16
16
|
<span class="ruby-keyword kw">end</span></pre>
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
<?xml version="1.0" encoding="iso-8859-1"?>
|
|
2
|
+
<!DOCTYPE html
|
|
3
|
+
PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
|
|
4
|
+
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
|
5
|
+
|
|
6
|
+
<html>
|
|
7
|
+
<head>
|
|
8
|
+
<title>raw_vector_with (Classifier::ContentNode)</title>
|
|
9
|
+
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
|
|
10
|
+
<link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" />
|
|
11
|
+
</head>
|
|
12
|
+
<body class="standalone-code">
|
|
13
|
+
<pre><span class="ruby-comment cmt"># File lib/classifier/lsi/content_node.rb, line 35</span>
|
|
14
|
+
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">raw_vector_with</span>( <span class="ruby-identifier">word_list</span> )
|
|
15
|
+
<span class="ruby-keyword kw">if</span> <span class="ruby-identifier">$GSL</span>
|
|
16
|
+
<span class="ruby-identifier">vec</span> = <span class="ruby-constant">Vector</span>.<span class="ruby-identifier">new</span>(<span class="ruby-identifier">word_list</span>.<span class="ruby-identifier">size</span>)
|
|
17
|
+
<span class="ruby-keyword kw">else</span>
|
|
18
|
+
<span class="ruby-identifier">vec</span> = <span class="ruby-constant">Array</span>.<span class="ruby-identifier">new</span>(<span class="ruby-identifier">word_list</span>.<span class="ruby-identifier">size</span>, <span class="ruby-value">0</span>)
|
|
19
|
+
<span class="ruby-keyword kw">end</span>
|
|
20
|
+
|
|
21
|
+
<span class="ruby-ivar">@word_hash</span>.<span class="ruby-identifier">each_key</span> <span class="ruby-keyword kw">do</span> <span class="ruby-operator">|</span><span class="ruby-identifier">word</span><span class="ruby-operator">|</span>
|
|
22
|
+
<span class="ruby-identifier">vec</span>[<span class="ruby-identifier">word_list</span>[<span class="ruby-identifier">word</span>]] = <span class="ruby-ivar">@word_hash</span>[<span class="ruby-identifier">word</span>] <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">word_list</span>[<span class="ruby-identifier">word</span>]
|
|
23
|
+
<span class="ruby-keyword kw">end</span>
|
|
24
|
+
|
|
25
|
+
<span class="ruby-comment cmt"># Perform the scaling transform</span>
|
|
26
|
+
<span class="ruby-identifier">total_words</span> = <span class="ruby-identifier">vec</span>.<span class="ruby-identifier">sum</span>
|
|
27
|
+
|
|
28
|
+
<span class="ruby-comment cmt"># Perform first-order association transform if this vector has more</span>
|
|
29
|
+
<span class="ruby-comment cmt"># than one word in it. </span>
|
|
30
|
+
<span class="ruby-keyword kw">if</span> <span class="ruby-identifier">total_words</span> <span class="ruby-operator">></span> <span class="ruby-value">1.0</span>
|
|
31
|
+
<span class="ruby-identifier">weighted_total</span> = <span class="ruby-value">0</span><span class="ruby-value">.0</span>
|
|
32
|
+
<span class="ruby-identifier">vec</span>.<span class="ruby-identifier">each</span> <span class="ruby-keyword kw">do</span> <span class="ruby-operator">|</span><span class="ruby-identifier">term</span><span class="ruby-operator">|</span>
|
|
33
|
+
<span class="ruby-keyword kw">if</span> ( <span class="ruby-identifier">term</span> <span class="ruby-operator">></span> <span class="ruby-value">0</span> )
|
|
34
|
+
<span class="ruby-identifier">weighted_total</span> <span class="ruby-operator">+=</span> (( <span class="ruby-identifier">term</span> <span class="ruby-operator">/</span> <span class="ruby-identifier">total_words</span> ) <span class="ruby-operator">*</span> <span class="ruby-constant">Math</span>.<span class="ruby-identifier">log</span>( <span class="ruby-identifier">term</span> <span class="ruby-operator">/</span> <span class="ruby-identifier">total_words</span> ))
|
|
35
|
+
<span class="ruby-keyword kw">end</span>
|
|
36
|
+
<span class="ruby-keyword kw">end</span>
|
|
37
|
+
<span class="ruby-identifier">vec</span> = <span class="ruby-identifier">vec</span>.<span class="ruby-identifier">collect</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">val</span><span class="ruby-operator">|</span> <span class="ruby-constant">Math</span>.<span class="ruby-identifier">log</span>( <span class="ruby-identifier">val</span> <span class="ruby-operator">+</span> <span class="ruby-value">1</span> ) <span class="ruby-operator">/</span> <span class="ruby-operator">-</span><span class="ruby-identifier">weighted_total</span> }
|
|
38
|
+
<span class="ruby-keyword kw">end</span>
|
|
39
|
+
|
|
40
|
+
<span class="ruby-keyword kw">if</span> <span class="ruby-identifier">$GSL</span>
|
|
41
|
+
<span class="ruby-ivar">@raw_norm</span> = <span class="ruby-identifier">vec</span>.<span class="ruby-identifier">normalize</span>
|
|
42
|
+
<span class="ruby-ivar">@raw_vector</span> = <span class="ruby-identifier">vec</span>
|
|
43
|
+
<span class="ruby-keyword kw">else</span>
|
|
44
|
+
<span class="ruby-ivar">@raw_norm</span> = <span class="ruby-constant">Vector</span>[<span class="ruby-operator">*</span><span class="ruby-identifier">vec</span>].<span class="ruby-identifier">normalize</span>
|
|
45
|
+
<span class="ruby-ivar">@raw_vector</span> = <span class="ruby-constant">Vector</span>[<span class="ruby-operator">*</span><span class="ruby-identifier">vec</span>]
|
|
46
|
+
<span class="ruby-keyword kw">end</span>
|
|
47
|
+
<span class="ruby-keyword kw">end</span></pre>
|
|
48
|
+
</body>
|
|
49
|
+
</html>
|
|
@@ -97,18 +97,22 @@ href="http://en.wikipedia.org/wiki/Latent_Semantic_Indexing">Wikipedia</a>.
|
|
|
97
97
|
<h3 class="section-bar">Methods</h3>
|
|
98
98
|
|
|
99
99
|
<div class="name-list">
|
|
100
|
-
<a href="#
|
|
101
|
-
<a href="#
|
|
102
|
-
<a href="#
|
|
103
|
-
<a href="#
|
|
104
|
-
<a href="#
|
|
105
|
-
<a href="#
|
|
106
|
-
<a href="#
|
|
107
|
-
<a href="#
|
|
108
|
-
<a href="#
|
|
109
|
-
<a href="#
|
|
110
|
-
<a href="#
|
|
111
|
-
<a href="#
|
|
100
|
+
<a href="#M000025"><<</a>
|
|
101
|
+
<a href="#M000024">add_item</a>
|
|
102
|
+
<a href="#M000030">build_index</a>
|
|
103
|
+
<a href="#M000029">categories_for</a>
|
|
104
|
+
<a href="#M000026">categories_for</a>
|
|
105
|
+
<a href="#M000036">classify</a>
|
|
106
|
+
<a href="#M000035">find_related</a>
|
|
107
|
+
<a href="#M000037">highest_ranked_stems</a>
|
|
108
|
+
<a href="#M000031">highest_relative_content</a>
|
|
109
|
+
<a href="#M000028">items</a>
|
|
110
|
+
<a href="#M000023">needs_rebuild?</a>
|
|
111
|
+
<a href="#M000022">new</a>
|
|
112
|
+
<a href="#M000032">proximity_array_for_content</a>
|
|
113
|
+
<a href="#M000033">proximity_norms_for_content</a>
|
|
114
|
+
<a href="#M000027">remove_item</a>
|
|
115
|
+
<a href="#M000034">search</a>
|
|
112
116
|
</div>
|
|
113
117
|
</div>
|
|
114
118
|
|
|
@@ -128,6 +132,11 @@ href="http://en.wikipedia.org/wiki/Latent_Semantic_Indexing">Wikipedia</a>.
|
|
|
128
132
|
|
|
129
133
|
<div class="name-list">
|
|
130
134
|
<table>
|
|
135
|
+
<tr class="top-aligned-row context-row">
|
|
136
|
+
<td class="context-item-name">auto_rebuild</td>
|
|
137
|
+
<td class="context-item-value"> [RW] </td>
|
|
138
|
+
<td class="context-item-desc"></td>
|
|
139
|
+
</tr>
|
|
131
140
|
<tr class="top-aligned-row context-row">
|
|
132
141
|
<td class="context-item-name">word_list</td>
|
|
133
142
|
<td class="context-item-value"> [R] </td>
|
|
@@ -143,12 +152,12 @@ href="http://en.wikipedia.org/wiki/Latent_Semantic_Indexing">Wikipedia</a>.
|
|
|
143
152
|
<div id="methods">
|
|
144
153
|
<h3 class="section-bar">Public Class methods</h3>
|
|
145
154
|
|
|
146
|
-
<div id="method-
|
|
147
|
-
<a name="
|
|
155
|
+
<div id="method-M000022" class="method-detail">
|
|
156
|
+
<a name="M000022"></a>
|
|
148
157
|
|
|
149
158
|
<div class="method-heading">
|
|
150
|
-
<a href="LSI.src/
|
|
151
|
-
onclick="popupCode('LSI.src/
|
|
159
|
+
<a href="LSI.src/M000022.html" target="Code" class="method-signature"
|
|
160
|
+
onclick="popupCode('LSI.src/M000022.html');return false;">
|
|
152
161
|
<span class="method-name">new</span><span class="method-args">(options = {})</span>
|
|
153
162
|
</a>
|
|
154
163
|
</div>
|
|
@@ -156,7 +165,7 @@ href="http://en.wikipedia.org/wiki/Latent_Semantic_Indexing">Wikipedia</a>.
|
|
|
156
165
|
<div class="method-description">
|
|
157
166
|
<p>
|
|
158
167
|
Create a fresh index. If you want to call <a
|
|
159
|
-
href="LSI.html#
|
|
168
|
+
href="LSI.html#M000030">build_index</a> manually, use
|
|
160
169
|
</p>
|
|
161
170
|
<pre>
|
|
162
171
|
Classifier::LSI.new :auto_rebuild => false
|
|
@@ -166,31 +175,31 @@ href="LSI.html#M000017">build_index</a> manually, use
|
|
|
166
175
|
|
|
167
176
|
<h3 class="section-bar">Public Instance methods</h3>
|
|
168
177
|
|
|
169
|
-
<div id="method-
|
|
170
|
-
<a name="
|
|
178
|
+
<div id="method-M000025" class="method-detail">
|
|
179
|
+
<a name="M000025"></a>
|
|
171
180
|
|
|
172
181
|
<div class="method-heading">
|
|
173
|
-
<a href="LSI.src/
|
|
174
|
-
onclick="popupCode('LSI.src/
|
|
182
|
+
<a href="LSI.src/M000025.html" target="Code" class="method-signature"
|
|
183
|
+
onclick="popupCode('LSI.src/M000025.html');return false;">
|
|
175
184
|
<span class="method-name"><<</span><span class="method-args">( item )</span>
|
|
176
185
|
</a>
|
|
177
186
|
</div>
|
|
178
187
|
|
|
179
188
|
<div class="method-description">
|
|
180
189
|
<p>
|
|
181
|
-
A less flexible shorthand for <a href="LSI.html#
|
|
190
|
+
A less flexible shorthand for <a href="LSI.html#M000024">add_item</a> that
|
|
182
191
|
assumes you are passing in a string with no categories. item will be duck
|
|
183
192
|
typed via to_s .
|
|
184
193
|
</p>
|
|
185
194
|
</div>
|
|
186
195
|
</div>
|
|
187
196
|
|
|
188
|
-
<div id="method-
|
|
189
|
-
<a name="
|
|
197
|
+
<div id="method-M000024" class="method-detail">
|
|
198
|
+
<a name="M000024"></a>
|
|
190
199
|
|
|
191
200
|
<div class="method-heading">
|
|
192
|
-
<a href="LSI.src/
|
|
193
|
-
onclick="popupCode('LSI.src/
|
|
201
|
+
<a href="LSI.src/M000024.html" target="Code" class="method-signature"
|
|
202
|
+
onclick="popupCode('LSI.src/M000024.html');return false;">
|
|
194
203
|
<span class="method-name">add_item</span><span class="method-args">( item, *categories, &block )</span>
|
|
195
204
|
</a>
|
|
196
205
|
</div>
|
|
@@ -216,12 +225,12 @@ For example:
|
|
|
216
225
|
</div>
|
|
217
226
|
</div>
|
|
218
227
|
|
|
219
|
-
<div id="method-
|
|
220
|
-
<a name="
|
|
228
|
+
<div id="method-M000030" class="method-detail">
|
|
229
|
+
<a name="M000030"></a>
|
|
221
230
|
|
|
222
231
|
<div class="method-heading">
|
|
223
|
-
<a href="LSI.src/
|
|
224
|
-
onclick="popupCode('LSI.src/
|
|
232
|
+
<a href="LSI.src/M000030.html" target="Code" class="method-signature"
|
|
233
|
+
onclick="popupCode('LSI.src/M000030.html');return false;">
|
|
225
234
|
<span class="method-name">build_index</span><span class="method-args">( cutoff=0.75 )</span>
|
|
226
235
|
</a>
|
|
227
236
|
</div>
|
|
@@ -229,7 +238,7 @@ For example:
|
|
|
229
238
|
<div class="method-description">
|
|
230
239
|
<p>
|
|
231
240
|
This function rebuilds the index if <a
|
|
232
|
-
href="LSI.html#
|
|
241
|
+
href="LSI.html#M000023">needs_rebuild?</a> returns true. For very large
|
|
233
242
|
document spaces, this indexing operation may take some time to complete, so
|
|
234
243
|
it may be wise to place the operation in another thread.
|
|
235
244
|
</p>
|
|
@@ -249,12 +258,50 @@ engine.
|
|
|
249
258
|
</div>
|
|
250
259
|
</div>
|
|
251
260
|
|
|
252
|
-
<div id="method-
|
|
253
|
-
<a name="
|
|
261
|
+
<div id="method-M000029" class="method-detail">
|
|
262
|
+
<a name="M000029"></a>
|
|
254
263
|
|
|
255
264
|
<div class="method-heading">
|
|
256
|
-
<a href="LSI.src/
|
|
257
|
-
onclick="popupCode('LSI.src/
|
|
265
|
+
<a href="LSI.src/M000029.html" target="Code" class="method-signature"
|
|
266
|
+
onclick="popupCode('LSI.src/M000029.html');return false;">
|
|
267
|
+
<span class="method-name">categories_for</span><span class="method-args">(item)</span>
|
|
268
|
+
</a>
|
|
269
|
+
</div>
|
|
270
|
+
|
|
271
|
+
<div class="method-description">
|
|
272
|
+
<p>
|
|
273
|
+
Returns the categories for a given indexed item. You are free to add and
|
|
274
|
+
remove items from this as you see fit. It does not invalidate an index to
|
|
275
|
+
change its categories.
|
|
276
|
+
</p>
|
|
277
|
+
</div>
|
|
278
|
+
</div>
|
|
279
|
+
|
|
280
|
+
<div id="method-M000026" class="method-detail">
|
|
281
|
+
<a name="M000026"></a>
|
|
282
|
+
|
|
283
|
+
<div class="method-heading">
|
|
284
|
+
<a href="LSI.src/M000026.html" target="Code" class="method-signature"
|
|
285
|
+
onclick="popupCode('LSI.src/M000026.html');return false;">
|
|
286
|
+
<span class="method-name">categories_for</span><span class="method-args">(item)</span>
|
|
287
|
+
</a>
|
|
288
|
+
</div>
|
|
289
|
+
|
|
290
|
+
<div class="method-description">
|
|
291
|
+
<p>
|
|
292
|
+
Returns the categories for a given indexed item. You are free to add and
|
|
293
|
+
remove items from this as you see fit. It does not invalidate an index to
|
|
294
|
+
change its categories.
|
|
295
|
+
</p>
|
|
296
|
+
</div>
|
|
297
|
+
</div>
|
|
298
|
+
|
|
299
|
+
<div id="method-M000036" class="method-detail">
|
|
300
|
+
<a name="M000036"></a>
|
|
301
|
+
|
|
302
|
+
<div class="method-heading">
|
|
303
|
+
<a href="LSI.src/M000036.html" target="Code" class="method-signature"
|
|
304
|
+
onclick="popupCode('LSI.src/M000036.html');return false;">
|
|
258
305
|
<span class="method-name">classify</span><span class="method-args">( doc, cutoff=0.30, &block )</span>
|
|
259
306
|
</a>
|
|
260
307
|
</div>
|
|
@@ -263,7 +310,7 @@ engine.
|
|
|
263
310
|
<p>
|
|
264
311
|
This function uses a voting system to categorize documents, based on the
|
|
265
312
|
categories of other documents. It uses the same logic as the <a
|
|
266
|
-
href="LSI.html#
|
|
313
|
+
href="LSI.html#M000035">find_related</a> function to find related
|
|
267
314
|
documents, then returns the most obvious category from this list.
|
|
268
315
|
</p>
|
|
269
316
|
<p>
|
|
@@ -274,12 +321,12 @@ the document is in. This may not always make sense.
|
|
|
274
321
|
</div>
|
|
275
322
|
</div>
|
|
276
323
|
|
|
277
|
-
<div id="method-
|
|
278
|
-
<a name="
|
|
324
|
+
<div id="method-M000035" class="method-detail">
|
|
325
|
+
<a name="M000035"></a>
|
|
279
326
|
|
|
280
327
|
<div class="method-heading">
|
|
281
|
-
<a href="LSI.src/
|
|
282
|
-
onclick="popupCode('LSI.src/
|
|
328
|
+
<a href="LSI.src/M000035.html" target="Code" class="method-signature"
|
|
329
|
+
onclick="popupCode('LSI.src/M000035.html');return false;">
|
|
283
330
|
<span class="method-name">find_related</span><span class="method-args">( doc, max_nearest=3, &block )</span>
|
|
284
331
|
</a>
|
|
285
332
|
</div>
|
|
@@ -301,12 +348,55 @@ each other in an essay.
|
|
|
301
348
|
</div>
|
|
302
349
|
</div>
|
|
303
350
|
|
|
304
|
-
<div id="method-
|
|
305
|
-
<a name="
|
|
351
|
+
<div id="method-M000037" class="method-detail">
|
|
352
|
+
<a name="M000037"></a>
|
|
353
|
+
|
|
354
|
+
<div class="method-heading">
|
|
355
|
+
<a href="LSI.src/M000037.html" target="Code" class="method-signature"
|
|
356
|
+
onclick="popupCode('LSI.src/M000037.html');return false;">
|
|
357
|
+
<span class="method-name">highest_ranked_stems</span><span class="method-args">( doc, count=3 )</span>
|
|
358
|
+
</a>
|
|
359
|
+
</div>
|
|
360
|
+
|
|
361
|
+
<div class="method-description">
|
|
362
|
+
<p>
|
|
363
|
+
Prototype, only works on indexed documents. I have no clue if this is going
|
|
364
|
+
to work, but in theory it’s supposed to.
|
|
365
|
+
</p>
|
|
366
|
+
</div>
|
|
367
|
+
</div>
|
|
368
|
+
|
|
369
|
+
<div id="method-M000031" class="method-detail">
|
|
370
|
+
<a name="M000031"></a>
|
|
371
|
+
|
|
372
|
+
<div class="method-heading">
|
|
373
|
+
<a href="LSI.src/M000031.html" target="Code" class="method-signature"
|
|
374
|
+
onclick="popupCode('LSI.src/M000031.html');return false;">
|
|
375
|
+
<span class="method-name">highest_relative_content</span><span class="method-args">( max_chunks=10 )</span>
|
|
376
|
+
</a>
|
|
377
|
+
</div>
|
|
378
|
+
|
|
379
|
+
<div class="method-description">
|
|
380
|
+
<p>
|
|
381
|
+
This method returns max_chunks entries, ordered by their average semantic
|
|
382
|
+
rating. Essentially, the average distance of each entry from all other
|
|
383
|
+
entries is calculated, the highest are returned.
|
|
384
|
+
</p>
|
|
385
|
+
<p>
|
|
386
|
+
This can be used to build a summary service, or to provide more information
|
|
387
|
+
about your dataset’s general content. For example, if you were to use
|
|
388
|
+
categorize on the results of this data, you could gather information on
|
|
389
|
+
what your dataset is generally about.
|
|
390
|
+
</p>
|
|
391
|
+
</div>
|
|
392
|
+
</div>
|
|
393
|
+
|
|
394
|
+
<div id="method-M000028" class="method-detail">
|
|
395
|
+
<a name="M000028"></a>
|
|
306
396
|
|
|
307
397
|
<div class="method-heading">
|
|
308
|
-
<a href="LSI.src/
|
|
309
|
-
onclick="popupCode('LSI.src/
|
|
398
|
+
<a href="LSI.src/M000028.html" target="Code" class="method-signature"
|
|
399
|
+
onclick="popupCode('LSI.src/M000028.html');return false;">
|
|
310
400
|
<span class="method-name">items</span><span class="method-args">()</span>
|
|
311
401
|
</a>
|
|
312
402
|
</div>
|
|
@@ -318,12 +408,12 @@ Returns an array of items that are indexed.
|
|
|
318
408
|
</div>
|
|
319
409
|
</div>
|
|
320
410
|
|
|
321
|
-
<div id="method-
|
|
322
|
-
<a name="
|
|
411
|
+
<div id="method-M000023" class="method-detail">
|
|
412
|
+
<a name="M000023"></a>
|
|
323
413
|
|
|
324
414
|
<div class="method-heading">
|
|
325
|
-
<a href="LSI.src/
|
|
326
|
-
onclick="popupCode('LSI.src/
|
|
415
|
+
<a href="LSI.src/M000023.html" target="Code" class="method-signature"
|
|
416
|
+
onclick="popupCode('LSI.src/M000023.html');return false;">
|
|
327
417
|
<span class="method-name">needs_rebuild?</span><span class="method-args">()</span>
|
|
328
418
|
</a>
|
|
329
419
|
</div>
|
|
@@ -337,12 +427,12 @@ classification and cluster detection.
|
|
|
337
427
|
</div>
|
|
338
428
|
</div>
|
|
339
429
|
|
|
340
|
-
<div id="method-
|
|
341
|
-
<a name="
|
|
430
|
+
<div id="method-M000032" class="method-detail">
|
|
431
|
+
<a name="M000032"></a>
|
|
342
432
|
|
|
343
433
|
<div class="method-heading">
|
|
344
|
-
<a href="LSI.src/
|
|
345
|
-
onclick="popupCode('LSI.src/
|
|
434
|
+
<a href="LSI.src/M000032.html" target="Code" class="method-signature"
|
|
435
|
+
onclick="popupCode('LSI.src/M000032.html');return false;">
|
|
346
436
|
<span class="method-name">proximity_array_for_content</span><span class="method-args">( doc, &block )</span>
|
|
347
437
|
</a>
|
|
348
438
|
</div>
|
|
@@ -350,7 +440,7 @@ classification and cluster detection.
|
|
|
350
440
|
<div class="method-description">
|
|
351
441
|
<p>
|
|
352
442
|
This function is the primitive that <a
|
|
353
|
-
href="LSI.html#
|
|
443
|
+
href="LSI.html#M000035">find_related</a> and classify build upon. It
|
|
354
444
|
returns an array of 2-element arrays. The first element of this array is a
|
|
355
445
|
document, and the second is its "score", defining how
|
|
356
446
|
"close" it is to other indexed items.
|
|
@@ -363,25 +453,25 @@ meaningful between indexes.
|
|
|
363
453
|
<p>
|
|
364
454
|
The parameter doc is the content to compare. If that content is not
|
|
365
455
|
indexed, you can pass an optional block to define how to create the text
|
|
366
|
-
data. See <a href="LSI.html#
|
|
456
|
+
data. See <a href="LSI.html#M000024">add_item</a> for examples of how this
|
|
367
457
|
works.
|
|
368
458
|
</p>
|
|
369
459
|
</div>
|
|
370
460
|
</div>
|
|
371
461
|
|
|
372
|
-
<div id="method-
|
|
373
|
-
<a name="
|
|
462
|
+
<div id="method-M000033" class="method-detail">
|
|
463
|
+
<a name="M000033"></a>
|
|
374
464
|
|
|
375
465
|
<div class="method-heading">
|
|
376
|
-
<a href="LSI.src/
|
|
377
|
-
onclick="popupCode('LSI.src/
|
|
466
|
+
<a href="LSI.src/M000033.html" target="Code" class="method-signature"
|
|
467
|
+
onclick="popupCode('LSI.src/M000033.html');return false;">
|
|
378
468
|
<span class="method-name">proximity_norms_for_content</span><span class="method-args">( doc, &block )</span>
|
|
379
469
|
</a>
|
|
380
470
|
</div>
|
|
381
471
|
|
|
382
472
|
<div class="method-description">
|
|
383
473
|
<p>
|
|
384
|
-
Similar to <a href="LSI.html#
|
|
474
|
+
Similar to <a href="LSI.html#M000032">proximity_array_for_content</a>, this
|
|
385
475
|
function takes similar arguments and returns a similar array. However, it
|
|
386
476
|
uses the normalized calculated vectors instead of their full versions. This
|
|
387
477
|
is useful when you’re trying to perform operations on content that is
|
|
@@ -391,12 +481,12 @@ primitive.
|
|
|
391
481
|
</div>
|
|
392
482
|
</div>
|
|
393
483
|
|
|
394
|
-
<div id="method-
|
|
395
|
-
<a name="
|
|
484
|
+
<div id="method-M000027" class="method-detail">
|
|
485
|
+
<a name="M000027"></a>
|
|
396
486
|
|
|
397
487
|
<div class="method-heading">
|
|
398
|
-
<a href="LSI.src/
|
|
399
|
-
onclick="popupCode('LSI.src/
|
|
488
|
+
<a href="LSI.src/M000027.html" target="Code" class="method-signature"
|
|
489
|
+
onclick="popupCode('LSI.src/M000027.html');return false;">
|
|
400
490
|
<span class="method-name">remove_item</span><span class="method-args">( item )</span>
|
|
401
491
|
</a>
|
|
402
492
|
</div>
|
|
@@ -408,12 +498,12 @@ Removes an item from the database, if it is indexed.
|
|
|
408
498
|
</div>
|
|
409
499
|
</div>
|
|
410
500
|
|
|
411
|
-
<div id="method-
|
|
412
|
-
<a name="
|
|
501
|
+
<div id="method-M000034" class="method-detail">
|
|
502
|
+
<a name="M000034"></a>
|
|
413
503
|
|
|
414
504
|
<div class="method-heading">
|
|
415
|
-
<a href="LSI.src/
|
|
416
|
-
onclick="popupCode('LSI.src/
|
|
505
|
+
<a href="LSI.src/M000034.html" target="Code" class="method-signature"
|
|
506
|
+
onclick="popupCode('LSI.src/M000034.html');return false;">
|
|
417
507
|
<span class="method-name">search</span><span class="method-args">( string, max_nearest=3 )</span>
|
|
418
508
|
</a>
|
|
419
509
|
</div>
|
|
@@ -421,7 +511,7 @@ Removes an item from the database, if it is indexed.
|
|
|
421
511
|
<div class="method-description">
|
|
422
512
|
<p>
|
|
423
513
|
This function allows for text-based search of your index. Unlike other
|
|
424
|
-
functions like <a href="LSI.html#
|
|
514
|
+
functions like <a href="LSI.html#M000035">find_related</a> and classify,
|
|
425
515
|
search only takes short strings. It will also ignore factors like repeated
|
|
426
516
|
words. It is best for short, google-like search terms. A search will first
|
|
427
517
|
prioritize lexical relationships, then semantic ones.
|