classifier 1.2.0 → 1.3.0
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +361 -273
- data/README +6 -5
- data/Rakefile +12 -2
- data/bin/summarize.rb +11 -0
- data/doc/classes/Array.html +139 -0
- data/doc/classes/Array.src/M000003.html +18 -0
- data/doc/classes/Classifier.html +5 -5
- data/doc/classes/Classifier/Bayes.html +43 -43
- data/doc/classes/Classifier/Bayes.src/{M000023.html → M000038.html} +0 -0
- data/doc/classes/Classifier/Bayes.src/{M000024.html → M000039.html} +0 -0
- data/doc/classes/Classifier/Bayes.src/{M000025.html → M000040.html} +0 -0
- data/doc/classes/Classifier/Bayes.src/{M000026.html → M000041.html} +0 -0
- data/doc/classes/Classifier/Bayes.src/{M000027.html → M000042.html} +0 -0
- data/doc/classes/Classifier/Bayes.src/{M000028.html → M000043.html} +0 -0
- data/doc/classes/Classifier/Bayes.src/{M000029.html → M000044.html} +0 -0
- data/doc/classes/Classifier/ContentNode.html +23 -28
- data/doc/classes/Classifier/ContentNode.src/M000046.html +19 -0
- data/doc/classes/Classifier/ContentNode.src/{M000032.html → M000047.html} +1 -1
- data/doc/classes/Classifier/ContentNode.src/{M000033.html → M000048.html} +1 -1
- data/doc/classes/Classifier/ContentNode.src/M000049.html +49 -0
- data/doc/classes/Classifier/LSI.html +158 -68
- data/doc/classes/Classifier/LSI.src/M000022.html +6 -17
- data/doc/classes/Classifier/LSI.src/{M000012.html → M000023.html} +2 -2
- data/doc/classes/Classifier/LSI.src/{M000013.html → M000024.html} +3 -2
- data/doc/classes/Classifier/LSI.src/{M000014.html → M000025.html} +1 -1
- data/doc/classes/Classifier/LSI.src/M000026.html +19 -0
- data/doc/classes/Classifier/LSI.src/{M000015.html → M000027.html} +1 -1
- data/doc/classes/Classifier/LSI.src/{M000016.html → M000028.html} +1 -1
- data/doc/classes/Classifier/LSI.src/M000029.html +19 -0
- data/doc/classes/Classifier/LSI.src/M000030.html +43 -0
- data/doc/classes/Classifier/LSI.src/M000031.html +23 -0
- data/doc/classes/Classifier/LSI.src/{M000018.html → M000032.html} +7 -3
- data/doc/classes/Classifier/LSI.src/{M000019.html → M000033.html} +6 -2
- data/doc/classes/Classifier/LSI.src/{M000020.html → M000034.html} +2 -4
- data/doc/classes/Classifier/LSI.src/{M000021.html → M000035.html} +1 -1
- data/doc/classes/Classifier/LSI.src/M000036.html +31 -0
- data/doc/classes/Classifier/LSI.src/M000037.html +21 -0
- data/doc/classes/Classifier/WordList.html +37 -22
- data/doc/classes/Classifier/WordList.src/{M000007.html → M000017.html} +2 -2
- data/doc/classes/Classifier/WordList.src/{M000008.html → M000018.html} +1 -1
- data/doc/classes/Classifier/WordList.src/{M000009.html → M000019.html} +1 -1
- data/doc/classes/Classifier/WordList.src/M000020.html +18 -0
- data/doc/classes/Classifier/WordList.src/{M000010.html → M000021.html} +1 -1
- data/doc/classes/GSL.html +2 -1
- data/doc/classes/GSL/Matrix.html +126 -0
- data/doc/classes/GSL/Vector.html +10 -10
- data/doc/classes/GSL/Vector.src/{M000005.html → M000015.html} +0 -0
- data/doc/classes/GSL/Vector.src/{M000006.html → M000016.html} +0 -0
- data/doc/classes/Matrix.html +184 -0
- data/doc/classes/Matrix.src/M000004.html +18 -0
- data/doc/classes/Matrix.src/M000005.html +76 -0
- data/doc/classes/Matrix.src/M000006.html +18 -0
- data/doc/classes/Object.html +7 -7
- data/doc/classes/Object.src/{M000001.html → M000007.html} +1 -1
- data/doc/classes/String.html +90 -20
- data/doc/classes/String.src/{M000002.html → M000008.html} +0 -0
- data/doc/classes/String.src/{M000003.html → M000009.html} +0 -0
- data/doc/classes/String.src/{M000004.html → M000010.html} +0 -0
- data/doc/classes/String.src/M000011.html +18 -0
- data/doc/classes/String.src/M000012.html +18 -0
- data/doc/classes/String.src/M000013.html +18 -0
- data/doc/classes/String.src/M000014.html +18 -0
- data/doc/classes/Vector.html +154 -0
- data/doc/classes/Vector.src/M000001.html +22 -0
- data/doc/classes/Vector.src/M000002.html +25 -0
- data/doc/created.rid +1 -1
- data/doc/files/README.html +14 -8
- data/doc/files/lib/classifier/bayes_rb.html +1 -1
- data/doc/files/lib/classifier/{string_extensions_rb.html → extensions/string_rb.html} +5 -5
- data/doc/files/lib/classifier/extensions/vector_rb.html +120 -0
- data/doc/files/lib/classifier/extensions/vector_serialize_rb.html +1 -1
- data/doc/files/lib/classifier/extensions/word_hash_rb.html +1 -1
- data/doc/files/lib/classifier/lsi/content_node_rb.html +2 -2
- data/doc/files/lib/classifier/lsi/summary_rb.html +115 -0
- data/doc/files/lib/classifier/{extensions → lsi}/word_list_rb.html +3 -3
- data/doc/files/lib/classifier/lsi_rb.html +5 -3
- data/doc/files/lib/classifier_rb.html +2 -2
- data/doc/fr_class_index.html +4 -0
- data/doc/fr_file_index.html +4 -2
- data/doc/fr_method_index.html +49 -34
- data/doc/index.html +2 -2
- data/lib/classifier.rb +1 -1
- data/lib/classifier/{string_extensions.rb → extensions/string.rb} +0 -0
- data/lib/classifier/extensions/vector.rb +106 -0
- data/lib/classifier/extensions/vector_serialize.rb +6 -0
- data/lib/classifier/lsi.rb +101 -31
- data/lib/classifier/lsi/content_node.rb +28 -23
- data/lib/classifier/lsi/summary.rb +31 -0
- data/lib/classifier/{extensions → lsi}/word_list.rb +7 -2
- data/test/{string_extensions → extensions}/word_hash_test.rb +0 -0
- data/test/lsi/lsi_test.rb +36 -1
- metadata +68 -41
- data/doc/classes/Classifier/ContentNode.src/M000031.html +0 -21
- data/doc/classes/Classifier/ContentNode.src/M000034.html +0 -41
- data/doc/classes/Classifier/LSI.src/M000011.html +0 -20
- data/doc/classes/Classifier/LSI.src/M000017.html +0 -32
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
@@ -83,7 +83,7 @@
|
|
83
83
|
<div id="description">
|
84
84
|
<p>
|
85
85
|
This is an internal data structure class for the <a href="LSI.html">LSI</a>
|
86
|
-
node. Save for <a href="ContentNode.html#
|
86
|
+
node. Save for <a href="ContentNode.html#M000049">raw_vector_with</a>, it
|
87
87
|
should be fairly straightforward to understand. You should never have to
|
88
88
|
use it directly.
|
89
89
|
</p>
|
@@ -97,10 +97,10 @@ use it directly.
|
|
97
97
|
<h3 class="section-bar">Methods</h3>
|
98
98
|
|
99
99
|
<div class="name-list">
|
100
|
-
<a href="#
|
101
|
-
<a href="#
|
102
|
-
<a href="#
|
103
|
-
<a href="#
|
100
|
+
<a href="#M000046">new</a>
|
101
|
+
<a href="#M000049">raw_vector_with</a>
|
102
|
+
<a href="#M000048">search_norm</a>
|
103
|
+
<a href="#M000047">search_vector</a>
|
104
104
|
</div>
|
105
105
|
</div>
|
106
106
|
|
@@ -145,14 +145,9 @@ use it directly.
|
|
145
145
|
<td class="context-item-value"> [RW] </td>
|
146
146
|
<td class="context-item-desc"></td>
|
147
147
|
</tr>
|
148
|
-
<tr class="top-aligned-row context-row">
|
149
|
-
<td class="context-item-name">source</td>
|
150
|
-
<td class="context-item-value"> [R] </td>
|
151
|
-
<td class="context-item-desc"></td>
|
152
|
-
</tr>
|
153
148
|
<tr class="top-aligned-row context-row">
|
154
149
|
<td class="context-item-name">word_hash</td>
|
155
|
-
<td class="context-item-value"> [
|
150
|
+
<td class="context-item-value"> [R] </td>
|
156
151
|
<td class="context-item-desc"></td>
|
157
152
|
</tr>
|
158
153
|
</table>
|
@@ -165,13 +160,13 @@ use it directly.
|
|
165
160
|
<div id="methods">
|
166
161
|
<h3 class="section-bar">Public Class methods</h3>
|
167
162
|
|
168
|
-
<div id="method-
|
169
|
-
<a name="
|
163
|
+
<div id="method-M000046" class="method-detail">
|
164
|
+
<a name="M000046"></a>
|
170
165
|
|
171
166
|
<div class="method-heading">
|
172
|
-
<a href="ContentNode.src/
|
173
|
-
onclick="popupCode('ContentNode.src/
|
174
|
-
<span class="method-name">new</span><span class="method-args">(
|
167
|
+
<a href="ContentNode.src/M000046.html" target="Code" class="method-signature"
|
168
|
+
onclick="popupCode('ContentNode.src/M000046.html');return false;">
|
169
|
+
<span class="method-name">new</span><span class="method-args">( word_hash, *categories )</span>
|
175
170
|
</a>
|
176
171
|
</div>
|
177
172
|
|
@@ -185,12 +180,12 @@ source.to_s
|
|
185
180
|
|
186
181
|
<h3 class="section-bar">Public Instance methods</h3>
|
187
182
|
|
188
|
-
<div id="method-
|
189
|
-
<a name="
|
183
|
+
<div id="method-M000049" class="method-detail">
|
184
|
+
<a name="M000049"></a>
|
190
185
|
|
191
186
|
<div class="method-heading">
|
192
|
-
<a href="ContentNode.src/
|
193
|
-
onclick="popupCode('ContentNode.src/
|
187
|
+
<a href="ContentNode.src/M000049.html" target="Code" class="method-signature"
|
188
|
+
onclick="popupCode('ContentNode.src/M000049.html');return false;">
|
194
189
|
<span class="method-name">raw_vector_with</span><span class="method-args">( word_list )</span>
|
195
190
|
</a>
|
196
191
|
</div>
|
@@ -203,12 +198,12 @@ mapping the vector space.
|
|
203
198
|
</div>
|
204
199
|
</div>
|
205
200
|
|
206
|
-
<div id="method-
|
207
|
-
<a name="
|
201
|
+
<div id="method-M000048" class="method-detail">
|
202
|
+
<a name="M000048"></a>
|
208
203
|
|
209
204
|
<div class="method-heading">
|
210
|
-
<a href="ContentNode.src/
|
211
|
-
onclick="popupCode('ContentNode.src/
|
205
|
+
<a href="ContentNode.src/M000048.html" target="Code" class="method-signature"
|
206
|
+
onclick="popupCode('ContentNode.src/M000048.html');return false;">
|
212
207
|
<span class="method-name">search_norm</span><span class="method-args">()</span>
|
213
208
|
</a>
|
214
209
|
</div>
|
@@ -220,12 +215,12 @@ Use this to fetch the appropriate search vector in normalized form.
|
|
220
215
|
</div>
|
221
216
|
</div>
|
222
217
|
|
223
|
-
<div id="method-
|
224
|
-
<a name="
|
218
|
+
<div id="method-M000047" class="method-detail">
|
219
|
+
<a name="M000047"></a>
|
225
220
|
|
226
221
|
<div class="method-heading">
|
227
|
-
<a href="ContentNode.src/
|
228
|
-
onclick="popupCode('ContentNode.src/
|
222
|
+
<a href="ContentNode.src/M000047.html" target="Code" class="method-signature"
|
223
|
+
onclick="popupCode('ContentNode.src/M000047.html');return false;">
|
229
224
|
<span class="method-name">search_vector</span><span class="method-args">()</span>
|
230
225
|
</a>
|
231
226
|
</div>
|
@@ -0,0 +1,19 @@
|
|
1
|
+
<?xml version="1.0" encoding="iso-8859-1"?>
|
2
|
+
<!DOCTYPE html
|
3
|
+
PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
|
4
|
+
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
5
|
+
|
6
|
+
<html>
|
7
|
+
<head>
|
8
|
+
<title>new (Classifier::ContentNode)</title>
|
9
|
+
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
|
10
|
+
<link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" />
|
11
|
+
</head>
|
12
|
+
<body class="standalone-code">
|
13
|
+
<pre><span class="ruby-comment cmt"># File lib/classifier/lsi/content_node.rb, line 18</span>
|
14
|
+
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">initialize</span>( <span class="ruby-identifier">word_hash</span>, <span class="ruby-operator">*</span><span class="ruby-identifier">categories</span> )
|
15
|
+
<span class="ruby-ivar">@categories</span> = <span class="ruby-identifier">categories</span> <span class="ruby-operator">||</span> []
|
16
|
+
<span class="ruby-ivar">@word_hash</span> = <span class="ruby-identifier">word_hash</span>
|
17
|
+
<span class="ruby-keyword kw">end</span></pre>
|
18
|
+
</body>
|
19
|
+
</html>
|
@@ -10,7 +10,7 @@
|
|
10
10
|
<link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" />
|
11
11
|
</head>
|
12
12
|
<body class="standalone-code">
|
13
|
-
<pre><span class="ruby-comment cmt"># File lib/classifier/lsi/content_node.rb, line
|
13
|
+
<pre><span class="ruby-comment cmt"># File lib/classifier/lsi/content_node.rb, line 24</span>
|
14
14
|
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">search_vector</span>
|
15
15
|
<span class="ruby-ivar">@lsi_vector</span> <span class="ruby-operator">||</span> <span class="ruby-ivar">@raw_vector</span>
|
16
16
|
<span class="ruby-keyword kw">end</span></pre>
|
@@ -10,7 +10,7 @@
|
|
10
10
|
<link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" />
|
11
11
|
</head>
|
12
12
|
<body class="standalone-code">
|
13
|
-
<pre><span class="ruby-comment cmt"># File lib/classifier/lsi/content_node.rb, line
|
13
|
+
<pre><span class="ruby-comment cmt"># File lib/classifier/lsi/content_node.rb, line 29</span>
|
14
14
|
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">search_norm</span>
|
15
15
|
<span class="ruby-ivar">@lsi_norm</span> <span class="ruby-operator">||</span> <span class="ruby-ivar">@raw_norm</span>
|
16
16
|
<span class="ruby-keyword kw">end</span></pre>
|
@@ -0,0 +1,49 @@
|
|
1
|
+
<?xml version="1.0" encoding="iso-8859-1"?>
|
2
|
+
<!DOCTYPE html
|
3
|
+
PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
|
4
|
+
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
5
|
+
|
6
|
+
<html>
|
7
|
+
<head>
|
8
|
+
<title>raw_vector_with (Classifier::ContentNode)</title>
|
9
|
+
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
|
10
|
+
<link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" />
|
11
|
+
</head>
|
12
|
+
<body class="standalone-code">
|
13
|
+
<pre><span class="ruby-comment cmt"># File lib/classifier/lsi/content_node.rb, line 35</span>
|
14
|
+
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">raw_vector_with</span>( <span class="ruby-identifier">word_list</span> )
|
15
|
+
<span class="ruby-keyword kw">if</span> <span class="ruby-identifier">$GSL</span>
|
16
|
+
<span class="ruby-identifier">vec</span> = <span class="ruby-constant">Vector</span>.<span class="ruby-identifier">new</span>(<span class="ruby-identifier">word_list</span>.<span class="ruby-identifier">size</span>)
|
17
|
+
<span class="ruby-keyword kw">else</span>
|
18
|
+
<span class="ruby-identifier">vec</span> = <span class="ruby-constant">Array</span>.<span class="ruby-identifier">new</span>(<span class="ruby-identifier">word_list</span>.<span class="ruby-identifier">size</span>, <span class="ruby-value">0</span>)
|
19
|
+
<span class="ruby-keyword kw">end</span>
|
20
|
+
|
21
|
+
<span class="ruby-ivar">@word_hash</span>.<span class="ruby-identifier">each_key</span> <span class="ruby-keyword kw">do</span> <span class="ruby-operator">|</span><span class="ruby-identifier">word</span><span class="ruby-operator">|</span>
|
22
|
+
<span class="ruby-identifier">vec</span>[<span class="ruby-identifier">word_list</span>[<span class="ruby-identifier">word</span>]] = <span class="ruby-ivar">@word_hash</span>[<span class="ruby-identifier">word</span>] <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">word_list</span>[<span class="ruby-identifier">word</span>]
|
23
|
+
<span class="ruby-keyword kw">end</span>
|
24
|
+
|
25
|
+
<span class="ruby-comment cmt"># Perform the scaling transform</span>
|
26
|
+
<span class="ruby-identifier">total_words</span> = <span class="ruby-identifier">vec</span>.<span class="ruby-identifier">sum</span>
|
27
|
+
|
28
|
+
<span class="ruby-comment cmt"># Perform first-order association transform if this vector has more</span>
|
29
|
+
<span class="ruby-comment cmt"># than one word in it. </span>
|
30
|
+
<span class="ruby-keyword kw">if</span> <span class="ruby-identifier">total_words</span> <span class="ruby-operator">></span> <span class="ruby-value">1.0</span>
|
31
|
+
<span class="ruby-identifier">weighted_total</span> = <span class="ruby-value">0</span><span class="ruby-value">.0</span>
|
32
|
+
<span class="ruby-identifier">vec</span>.<span class="ruby-identifier">each</span> <span class="ruby-keyword kw">do</span> <span class="ruby-operator">|</span><span class="ruby-identifier">term</span><span class="ruby-operator">|</span>
|
33
|
+
<span class="ruby-keyword kw">if</span> ( <span class="ruby-identifier">term</span> <span class="ruby-operator">></span> <span class="ruby-value">0</span> )
|
34
|
+
<span class="ruby-identifier">weighted_total</span> <span class="ruby-operator">+=</span> (( <span class="ruby-identifier">term</span> <span class="ruby-operator">/</span> <span class="ruby-identifier">total_words</span> ) <span class="ruby-operator">*</span> <span class="ruby-constant">Math</span>.<span class="ruby-identifier">log</span>( <span class="ruby-identifier">term</span> <span class="ruby-operator">/</span> <span class="ruby-identifier">total_words</span> ))
|
35
|
+
<span class="ruby-keyword kw">end</span>
|
36
|
+
<span class="ruby-keyword kw">end</span>
|
37
|
+
<span class="ruby-identifier">vec</span> = <span class="ruby-identifier">vec</span>.<span class="ruby-identifier">collect</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">val</span><span class="ruby-operator">|</span> <span class="ruby-constant">Math</span>.<span class="ruby-identifier">log</span>( <span class="ruby-identifier">val</span> <span class="ruby-operator">+</span> <span class="ruby-value">1</span> ) <span class="ruby-operator">/</span> <span class="ruby-operator">-</span><span class="ruby-identifier">weighted_total</span> }
|
38
|
+
<span class="ruby-keyword kw">end</span>
|
39
|
+
|
40
|
+
<span class="ruby-keyword kw">if</span> <span class="ruby-identifier">$GSL</span>
|
41
|
+
<span class="ruby-ivar">@raw_norm</span> = <span class="ruby-identifier">vec</span>.<span class="ruby-identifier">normalize</span>
|
42
|
+
<span class="ruby-ivar">@raw_vector</span> = <span class="ruby-identifier">vec</span>
|
43
|
+
<span class="ruby-keyword kw">else</span>
|
44
|
+
<span class="ruby-ivar">@raw_norm</span> = <span class="ruby-constant">Vector</span>[<span class="ruby-operator">*</span><span class="ruby-identifier">vec</span>].<span class="ruby-identifier">normalize</span>
|
45
|
+
<span class="ruby-ivar">@raw_vector</span> = <span class="ruby-constant">Vector</span>[<span class="ruby-operator">*</span><span class="ruby-identifier">vec</span>]
|
46
|
+
<span class="ruby-keyword kw">end</span>
|
47
|
+
<span class="ruby-keyword kw">end</span></pre>
|
48
|
+
</body>
|
49
|
+
</html>
|
@@ -97,18 +97,22 @@ href="http://en.wikipedia.org/wiki/Latent_Semantic_Indexing">Wikipedia</a>.
|
|
97
97
|
<h3 class="section-bar">Methods</h3>
|
98
98
|
|
99
99
|
<div class="name-list">
|
100
|
-
<a href="#
|
101
|
-
<a href="#
|
102
|
-
<a href="#
|
103
|
-
<a href="#
|
104
|
-
<a href="#
|
105
|
-
<a href="#
|
106
|
-
<a href="#
|
107
|
-
<a href="#
|
108
|
-
<a href="#
|
109
|
-
<a href="#
|
110
|
-
<a href="#
|
111
|
-
<a href="#
|
100
|
+
<a href="#M000025"><<</a>
|
101
|
+
<a href="#M000024">add_item</a>
|
102
|
+
<a href="#M000030">build_index</a>
|
103
|
+
<a href="#M000029">categories_for</a>
|
104
|
+
<a href="#M000026">categories_for</a>
|
105
|
+
<a href="#M000036">classify</a>
|
106
|
+
<a href="#M000035">find_related</a>
|
107
|
+
<a href="#M000037">highest_ranked_stems</a>
|
108
|
+
<a href="#M000031">highest_relative_content</a>
|
109
|
+
<a href="#M000028">items</a>
|
110
|
+
<a href="#M000023">needs_rebuild?</a>
|
111
|
+
<a href="#M000022">new</a>
|
112
|
+
<a href="#M000032">proximity_array_for_content</a>
|
113
|
+
<a href="#M000033">proximity_norms_for_content</a>
|
114
|
+
<a href="#M000027">remove_item</a>
|
115
|
+
<a href="#M000034">search</a>
|
112
116
|
</div>
|
113
117
|
</div>
|
114
118
|
|
@@ -128,6 +132,11 @@ href="http://en.wikipedia.org/wiki/Latent_Semantic_Indexing">Wikipedia</a>.
|
|
128
132
|
|
129
133
|
<div class="name-list">
|
130
134
|
<table>
|
135
|
+
<tr class="top-aligned-row context-row">
|
136
|
+
<td class="context-item-name">auto_rebuild</td>
|
137
|
+
<td class="context-item-value"> [RW] </td>
|
138
|
+
<td class="context-item-desc"></td>
|
139
|
+
</tr>
|
131
140
|
<tr class="top-aligned-row context-row">
|
132
141
|
<td class="context-item-name">word_list</td>
|
133
142
|
<td class="context-item-value"> [R] </td>
|
@@ -143,12 +152,12 @@ href="http://en.wikipedia.org/wiki/Latent_Semantic_Indexing">Wikipedia</a>.
|
|
143
152
|
<div id="methods">
|
144
153
|
<h3 class="section-bar">Public Class methods</h3>
|
145
154
|
|
146
|
-
<div id="method-
|
147
|
-
<a name="
|
155
|
+
<div id="method-M000022" class="method-detail">
|
156
|
+
<a name="M000022"></a>
|
148
157
|
|
149
158
|
<div class="method-heading">
|
150
|
-
<a href="LSI.src/
|
151
|
-
onclick="popupCode('LSI.src/
|
159
|
+
<a href="LSI.src/M000022.html" target="Code" class="method-signature"
|
160
|
+
onclick="popupCode('LSI.src/M000022.html');return false;">
|
152
161
|
<span class="method-name">new</span><span class="method-args">(options = {})</span>
|
153
162
|
</a>
|
154
163
|
</div>
|
@@ -156,7 +165,7 @@ href="http://en.wikipedia.org/wiki/Latent_Semantic_Indexing">Wikipedia</a>.
|
|
156
165
|
<div class="method-description">
|
157
166
|
<p>
|
158
167
|
Create a fresh index. If you want to call <a
|
159
|
-
href="LSI.html#
|
168
|
+
href="LSI.html#M000030">build_index</a> manually, use
|
160
169
|
</p>
|
161
170
|
<pre>
|
162
171
|
Classifier::LSI.new :auto_rebuild => false
|
@@ -166,31 +175,31 @@ href="LSI.html#M000017">build_index</a> manually, use
|
|
166
175
|
|
167
176
|
<h3 class="section-bar">Public Instance methods</h3>
|
168
177
|
|
169
|
-
<div id="method-
|
170
|
-
<a name="
|
178
|
+
<div id="method-M000025" class="method-detail">
|
179
|
+
<a name="M000025"></a>
|
171
180
|
|
172
181
|
<div class="method-heading">
|
173
|
-
<a href="LSI.src/
|
174
|
-
onclick="popupCode('LSI.src/
|
182
|
+
<a href="LSI.src/M000025.html" target="Code" class="method-signature"
|
183
|
+
onclick="popupCode('LSI.src/M000025.html');return false;">
|
175
184
|
<span class="method-name"><<</span><span class="method-args">( item )</span>
|
176
185
|
</a>
|
177
186
|
</div>
|
178
187
|
|
179
188
|
<div class="method-description">
|
180
189
|
<p>
|
181
|
-
A less flexible shorthand for <a href="LSI.html#
|
190
|
+
A less flexible shorthand for <a href="LSI.html#M000024">add_item</a> that
|
182
191
|
assumes you are passing in a string with no categories. item will be duck
|
183
192
|
typed via to_s .
|
184
193
|
</p>
|
185
194
|
</div>
|
186
195
|
</div>
|
187
196
|
|
188
|
-
<div id="method-
|
189
|
-
<a name="
|
197
|
+
<div id="method-M000024" class="method-detail">
|
198
|
+
<a name="M000024"></a>
|
190
199
|
|
191
200
|
<div class="method-heading">
|
192
|
-
<a href="LSI.src/
|
193
|
-
onclick="popupCode('LSI.src/
|
201
|
+
<a href="LSI.src/M000024.html" target="Code" class="method-signature"
|
202
|
+
onclick="popupCode('LSI.src/M000024.html');return false;">
|
194
203
|
<span class="method-name">add_item</span><span class="method-args">( item, *categories, &block )</span>
|
195
204
|
</a>
|
196
205
|
</div>
|
@@ -216,12 +225,12 @@ For example:
|
|
216
225
|
</div>
|
217
226
|
</div>
|
218
227
|
|
219
|
-
<div id="method-
|
220
|
-
<a name="
|
228
|
+
<div id="method-M000030" class="method-detail">
|
229
|
+
<a name="M000030"></a>
|
221
230
|
|
222
231
|
<div class="method-heading">
|
223
|
-
<a href="LSI.src/
|
224
|
-
onclick="popupCode('LSI.src/
|
232
|
+
<a href="LSI.src/M000030.html" target="Code" class="method-signature"
|
233
|
+
onclick="popupCode('LSI.src/M000030.html');return false;">
|
225
234
|
<span class="method-name">build_index</span><span class="method-args">( cutoff=0.75 )</span>
|
226
235
|
</a>
|
227
236
|
</div>
|
@@ -229,7 +238,7 @@ For example:
|
|
229
238
|
<div class="method-description">
|
230
239
|
<p>
|
231
240
|
This function rebuilds the index if <a
|
232
|
-
href="LSI.html#
|
241
|
+
href="LSI.html#M000023">needs_rebuild?</a> returns true. For very large
|
233
242
|
document spaces, this indexing operation may take some time to complete, so
|
234
243
|
it may be wise to place the operation in another thread.
|
235
244
|
</p>
|
@@ -249,12 +258,50 @@ engine.
|
|
249
258
|
</div>
|
250
259
|
</div>
|
251
260
|
|
252
|
-
<div id="method-
|
253
|
-
<a name="
|
261
|
+
<div id="method-M000029" class="method-detail">
|
262
|
+
<a name="M000029"></a>
|
254
263
|
|
255
264
|
<div class="method-heading">
|
256
|
-
<a href="LSI.src/
|
257
|
-
onclick="popupCode('LSI.src/
|
265
|
+
<a href="LSI.src/M000029.html" target="Code" class="method-signature"
|
266
|
+
onclick="popupCode('LSI.src/M000029.html');return false;">
|
267
|
+
<span class="method-name">categories_for</span><span class="method-args">(item)</span>
|
268
|
+
</a>
|
269
|
+
</div>
|
270
|
+
|
271
|
+
<div class="method-description">
|
272
|
+
<p>
|
273
|
+
Returns the categories for a given indexed item. You are free to add and
|
274
|
+
remove items from this as you see fit. It does not invalidate an index to
|
275
|
+
change its categories.
|
276
|
+
</p>
|
277
|
+
</div>
|
278
|
+
</div>
|
279
|
+
|
280
|
+
<div id="method-M000026" class="method-detail">
|
281
|
+
<a name="M000026"></a>
|
282
|
+
|
283
|
+
<div class="method-heading">
|
284
|
+
<a href="LSI.src/M000026.html" target="Code" class="method-signature"
|
285
|
+
onclick="popupCode('LSI.src/M000026.html');return false;">
|
286
|
+
<span class="method-name">categories_for</span><span class="method-args">(item)</span>
|
287
|
+
</a>
|
288
|
+
</div>
|
289
|
+
|
290
|
+
<div class="method-description">
|
291
|
+
<p>
|
292
|
+
Returns the categories for a given indexed item. You are free to add and
|
293
|
+
remove items from this as you see fit. It does not invalidate an index to
|
294
|
+
change its categories.
|
295
|
+
</p>
|
296
|
+
</div>
|
297
|
+
</div>
|
298
|
+
|
299
|
+
<div id="method-M000036" class="method-detail">
|
300
|
+
<a name="M000036"></a>
|
301
|
+
|
302
|
+
<div class="method-heading">
|
303
|
+
<a href="LSI.src/M000036.html" target="Code" class="method-signature"
|
304
|
+
onclick="popupCode('LSI.src/M000036.html');return false;">
|
258
305
|
<span class="method-name">classify</span><span class="method-args">( doc, cutoff=0.30, &block )</span>
|
259
306
|
</a>
|
260
307
|
</div>
|
@@ -263,7 +310,7 @@ engine.
|
|
263
310
|
<p>
|
264
311
|
This function uses a voting system to categorize documents, based on the
|
265
312
|
categories of other documents. It uses the same logic as the <a
|
266
|
-
href="LSI.html#
|
313
|
+
href="LSI.html#M000035">find_related</a> function to find related
|
267
314
|
documents, then returns the most obvious category from this list.
|
268
315
|
</p>
|
269
316
|
<p>
|
@@ -274,12 +321,12 @@ the document is in. This may not always make sense.
|
|
274
321
|
</div>
|
275
322
|
</div>
|
276
323
|
|
277
|
-
<div id="method-
|
278
|
-
<a name="
|
324
|
+
<div id="method-M000035" class="method-detail">
|
325
|
+
<a name="M000035"></a>
|
279
326
|
|
280
327
|
<div class="method-heading">
|
281
|
-
<a href="LSI.src/
|
282
|
-
onclick="popupCode('LSI.src/
|
328
|
+
<a href="LSI.src/M000035.html" target="Code" class="method-signature"
|
329
|
+
onclick="popupCode('LSI.src/M000035.html');return false;">
|
283
330
|
<span class="method-name">find_related</span><span class="method-args">( doc, max_nearest=3, &block )</span>
|
284
331
|
</a>
|
285
332
|
</div>
|
@@ -301,12 +348,55 @@ each other in an essay.
|
|
301
348
|
</div>
|
302
349
|
</div>
|
303
350
|
|
304
|
-
<div id="method-
|
305
|
-
<a name="
|
351
|
+
<div id="method-M000037" class="method-detail">
|
352
|
+
<a name="M000037"></a>
|
353
|
+
|
354
|
+
<div class="method-heading">
|
355
|
+
<a href="LSI.src/M000037.html" target="Code" class="method-signature"
|
356
|
+
onclick="popupCode('LSI.src/M000037.html');return false;">
|
357
|
+
<span class="method-name">highest_ranked_stems</span><span class="method-args">( doc, count=3 )</span>
|
358
|
+
</a>
|
359
|
+
</div>
|
360
|
+
|
361
|
+
<div class="method-description">
|
362
|
+
<p>
|
363
|
+
Prototype, only works on indexed documents. I have no clue if this is going
|
364
|
+
to work, but in theory it’s supposed to.
|
365
|
+
</p>
|
366
|
+
</div>
|
367
|
+
</div>
|
368
|
+
|
369
|
+
<div id="method-M000031" class="method-detail">
|
370
|
+
<a name="M000031"></a>
|
371
|
+
|
372
|
+
<div class="method-heading">
|
373
|
+
<a href="LSI.src/M000031.html" target="Code" class="method-signature"
|
374
|
+
onclick="popupCode('LSI.src/M000031.html');return false;">
|
375
|
+
<span class="method-name">highest_relative_content</span><span class="method-args">( max_chunks=10 )</span>
|
376
|
+
</a>
|
377
|
+
</div>
|
378
|
+
|
379
|
+
<div class="method-description">
|
380
|
+
<p>
|
381
|
+
This method returns max_chunks entries, ordered by their average semantic
|
382
|
+
rating. Essentially, the average distance of each entry from all other
|
383
|
+
entries is calculated, the highest are returned.
|
384
|
+
</p>
|
385
|
+
<p>
|
386
|
+
This can be used to build a summary service, or to provide more information
|
387
|
+
about your dataset’s general content. For example, if you were to use
|
388
|
+
categorize on the results of this data, you could gather information on
|
389
|
+
what your dataset is generally about.
|
390
|
+
</p>
|
391
|
+
</div>
|
392
|
+
</div>
|
393
|
+
|
394
|
+
<div id="method-M000028" class="method-detail">
|
395
|
+
<a name="M000028"></a>
|
306
396
|
|
307
397
|
<div class="method-heading">
|
308
|
-
<a href="LSI.src/
|
309
|
-
onclick="popupCode('LSI.src/
|
398
|
+
<a href="LSI.src/M000028.html" target="Code" class="method-signature"
|
399
|
+
onclick="popupCode('LSI.src/M000028.html');return false;">
|
310
400
|
<span class="method-name">items</span><span class="method-args">()</span>
|
311
401
|
</a>
|
312
402
|
</div>
|
@@ -318,12 +408,12 @@ Returns an array of items that are indexed.
|
|
318
408
|
</div>
|
319
409
|
</div>
|
320
410
|
|
321
|
-
<div id="method-
|
322
|
-
<a name="
|
411
|
+
<div id="method-M000023" class="method-detail">
|
412
|
+
<a name="M000023"></a>
|
323
413
|
|
324
414
|
<div class="method-heading">
|
325
|
-
<a href="LSI.src/
|
326
|
-
onclick="popupCode('LSI.src/
|
415
|
+
<a href="LSI.src/M000023.html" target="Code" class="method-signature"
|
416
|
+
onclick="popupCode('LSI.src/M000023.html');return false;">
|
327
417
|
<span class="method-name">needs_rebuild?</span><span class="method-args">()</span>
|
328
418
|
</a>
|
329
419
|
</div>
|
@@ -337,12 +427,12 @@ classification and cluster detection.
|
|
337
427
|
</div>
|
338
428
|
</div>
|
339
429
|
|
340
|
-
<div id="method-
|
341
|
-
<a name="
|
430
|
+
<div id="method-M000032" class="method-detail">
|
431
|
+
<a name="M000032"></a>
|
342
432
|
|
343
433
|
<div class="method-heading">
|
344
|
-
<a href="LSI.src/
|
345
|
-
onclick="popupCode('LSI.src/
|
434
|
+
<a href="LSI.src/M000032.html" target="Code" class="method-signature"
|
435
|
+
onclick="popupCode('LSI.src/M000032.html');return false;">
|
346
436
|
<span class="method-name">proximity_array_for_content</span><span class="method-args">( doc, &block )</span>
|
347
437
|
</a>
|
348
438
|
</div>
|
@@ -350,7 +440,7 @@ classification and cluster detection.
|
|
350
440
|
<div class="method-description">
|
351
441
|
<p>
|
352
442
|
This function is the primitive that <a
|
353
|
-
href="LSI.html#
|
443
|
+
href="LSI.html#M000035">find_related</a> and classify build upon. It
|
354
444
|
returns an array of 2-element arrays. The first element of this array is a
|
355
445
|
document, and the second is its "score", defining how
|
356
446
|
"close" it is to other indexed items.
|
@@ -363,25 +453,25 @@ meaningful between indexes.
|
|
363
453
|
<p>
|
364
454
|
The parameter doc is the content to compare. If that content is not
|
365
455
|
indexed, you can pass an optional block to define how to create the text
|
366
|
-
data. See <a href="LSI.html#
|
456
|
+
data. See <a href="LSI.html#M000024">add_item</a> for examples of how this
|
367
457
|
works.
|
368
458
|
</p>
|
369
459
|
</div>
|
370
460
|
</div>
|
371
461
|
|
372
|
-
<div id="method-
|
373
|
-
<a name="
|
462
|
+
<div id="method-M000033" class="method-detail">
|
463
|
+
<a name="M000033"></a>
|
374
464
|
|
375
465
|
<div class="method-heading">
|
376
|
-
<a href="LSI.src/
|
377
|
-
onclick="popupCode('LSI.src/
|
466
|
+
<a href="LSI.src/M000033.html" target="Code" class="method-signature"
|
467
|
+
onclick="popupCode('LSI.src/M000033.html');return false;">
|
378
468
|
<span class="method-name">proximity_norms_for_content</span><span class="method-args">( doc, &block )</span>
|
379
469
|
</a>
|
380
470
|
</div>
|
381
471
|
|
382
472
|
<div class="method-description">
|
383
473
|
<p>
|
384
|
-
Similar to <a href="LSI.html#
|
474
|
+
Similar to <a href="LSI.html#M000032">proximity_array_for_content</a>, this
|
385
475
|
function takes similar arguments and returns a similar array. However, it
|
386
476
|
uses the normalized calculated vectors instead of their full versions. This
|
387
477
|
is useful when you’re trying to perform operations on content that is
|
@@ -391,12 +481,12 @@ primitive.
|
|
391
481
|
</div>
|
392
482
|
</div>
|
393
483
|
|
394
|
-
<div id="method-
|
395
|
-
<a name="
|
484
|
+
<div id="method-M000027" class="method-detail">
|
485
|
+
<a name="M000027"></a>
|
396
486
|
|
397
487
|
<div class="method-heading">
|
398
|
-
<a href="LSI.src/
|
399
|
-
onclick="popupCode('LSI.src/
|
488
|
+
<a href="LSI.src/M000027.html" target="Code" class="method-signature"
|
489
|
+
onclick="popupCode('LSI.src/M000027.html');return false;">
|
400
490
|
<span class="method-name">remove_item</span><span class="method-args">( item )</span>
|
401
491
|
</a>
|
402
492
|
</div>
|
@@ -408,12 +498,12 @@ Removes an item from the database, if it is indexed.
|
|
408
498
|
</div>
|
409
499
|
</div>
|
410
500
|
|
411
|
-
<div id="method-
|
412
|
-
<a name="
|
501
|
+
<div id="method-M000034" class="method-detail">
|
502
|
+
<a name="M000034"></a>
|
413
503
|
|
414
504
|
<div class="method-heading">
|
415
|
-
<a href="LSI.src/
|
416
|
-
onclick="popupCode('LSI.src/
|
505
|
+
<a href="LSI.src/M000034.html" target="Code" class="method-signature"
|
506
|
+
onclick="popupCode('LSI.src/M000034.html');return false;">
|
417
507
|
<span class="method-name">search</span><span class="method-args">( string, max_nearest=3 )</span>
|
418
508
|
</a>
|
419
509
|
</div>
|
@@ -421,7 +511,7 @@ Removes an item from the database, if it is indexed.
|
|
421
511
|
<div class="method-description">
|
422
512
|
<p>
|
423
513
|
This function allows for text-based search of your index. Unlike other
|
424
|
-
functions like <a href="LSI.html#
|
514
|
+
functions like <a href="LSI.html#M000035">find_related</a> and classify,
|
425
515
|
search only takes short strings. It will also ignore factors like repeated
|
426
516
|
words. It is best for short, google-like search terms. A search will first
|
427
517
|
prioritize lexical relationships, then semantic ones.
|