classifier 1.3.0 → 1.3.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (87) hide show
  1. data/README +2 -1
  2. data/Rakefile +3 -4
  3. data/bin/bayes.rb +1 -1
  4. data/bin/summarize.rb +7 -2
  5. data/lib/classifier/extensions/string.rb +2 -2
  6. data/lib/classifier/extensions/vector_serialize.rb +2 -2
  7. data/lib/classifier/lsi.rb +2 -2
  8. data/lib/classifier/lsi/content_node.rb +1 -1
  9. metadata +73 -159
  10. data/doc/classes/Array.html +0 -139
  11. data/doc/classes/Array.src/M000003.html +0 -18
  12. data/doc/classes/Classifier.html +0 -140
  13. data/doc/classes/Classifier/Bayes.html +0 -317
  14. data/doc/classes/Classifier/Bayes.src/M000038.html +0 -20
  15. data/doc/classes/Classifier/Bayes.src/M000039.html +0 -23
  16. data/doc/classes/Classifier/Bayes.src/M000040.html +0 -30
  17. data/doc/classes/Classifier/Bayes.src/M000041.html +0 -27
  18. data/doc/classes/Classifier/Bayes.src/M000042.html +0 -18
  19. data/doc/classes/Classifier/Bayes.src/M000043.html +0 -25
  20. data/doc/classes/Classifier/Bayes.src/M000044.html +0 -18
  21. data/doc/classes/Classifier/ContentNode.html +0 -247
  22. data/doc/classes/Classifier/ContentNode.src/M000046.html +0 -19
  23. data/doc/classes/Classifier/ContentNode.src/M000047.html +0 -18
  24. data/doc/classes/Classifier/ContentNode.src/M000048.html +0 -18
  25. data/doc/classes/Classifier/ContentNode.src/M000049.html +0 -49
  26. data/doc/classes/Classifier/LSI.html +0 -539
  27. data/doc/classes/Classifier/LSI.src/M000022.html +0 -20
  28. data/doc/classes/Classifier/LSI.src/M000023.html +0 -18
  29. data/doc/classes/Classifier/LSI.src/M000024.html +0 -21
  30. data/doc/classes/Classifier/LSI.src/M000025.html +0 -18
  31. data/doc/classes/Classifier/LSI.src/M000026.html +0 -19
  32. data/doc/classes/Classifier/LSI.src/M000027.html +0 -21
  33. data/doc/classes/Classifier/LSI.src/M000028.html +0 -18
  34. data/doc/classes/Classifier/LSI.src/M000029.html +0 -19
  35. data/doc/classes/Classifier/LSI.src/M000030.html +0 -43
  36. data/doc/classes/Classifier/LSI.src/M000031.html +0 -23
  37. data/doc/classes/Classifier/LSI.src/M000032.html +0 -30
  38. data/doc/classes/Classifier/LSI.src/M000033.html +0 -30
  39. data/doc/classes/Classifier/LSI.src/M000034.html +0 -21
  40. data/doc/classes/Classifier/LSI.src/M000035.html +0 -21
  41. data/doc/classes/Classifier/LSI.src/M000036.html +0 -31
  42. data/doc/classes/Classifier/LSI.src/M000037.html +0 -21
  43. data/doc/classes/Classifier/WordList.html +0 -217
  44. data/doc/classes/Classifier/WordList.src/M000017.html +0 -18
  45. data/doc/classes/Classifier/WordList.src/M000018.html +0 -19
  46. data/doc/classes/Classifier/WordList.src/M000019.html +0 -19
  47. data/doc/classes/Classifier/WordList.src/M000020.html +0 -18
  48. data/doc/classes/Classifier/WordList.src/M000021.html +0 -18
  49. data/doc/classes/GSL.html +0 -112
  50. data/doc/classes/GSL/Matrix.html +0 -126
  51. data/doc/classes/GSL/Vector.html +0 -156
  52. data/doc/classes/GSL/Vector.src/M000015.html +0 -18
  53. data/doc/classes/GSL/Vector.src/M000016.html +0 -19
  54. data/doc/classes/Matrix.html +0 -184
  55. data/doc/classes/Matrix.src/M000004.html +0 -18
  56. data/doc/classes/Matrix.src/M000005.html +0 -76
  57. data/doc/classes/Matrix.src/M000006.html +0 -18
  58. data/doc/classes/Object.html +0 -139
  59. data/doc/classes/Object.src/M000007.html +0 -16
  60. data/doc/classes/String.html +0 -275
  61. data/doc/classes/String.src/M000008.html +0 -18
  62. data/doc/classes/String.src/M000009.html +0 -18
  63. data/doc/classes/String.src/M000010.html +0 -18
  64. data/doc/classes/String.src/M000011.html +0 -18
  65. data/doc/classes/String.src/M000012.html +0 -18
  66. data/doc/classes/String.src/M000013.html +0 -18
  67. data/doc/classes/String.src/M000014.html +0 -18
  68. data/doc/classes/Vector.html +0 -154
  69. data/doc/classes/Vector.src/M000001.html +0 -22
  70. data/doc/classes/Vector.src/M000002.html +0 -25
  71. data/doc/created.rid +0 -1
  72. data/doc/files/README.html +0 -252
  73. data/doc/files/lib/classifier/bayes_rb.html +0 -115
  74. data/doc/files/lib/classifier/extensions/string_rb.html +0 -122
  75. data/doc/files/lib/classifier/extensions/vector_rb.html +0 -120
  76. data/doc/files/lib/classifier/extensions/vector_serialize_rb.html +0 -101
  77. data/doc/files/lib/classifier/extensions/word_hash_rb.html +0 -115
  78. data/doc/files/lib/classifier/lsi/content_node_rb.html +0 -115
  79. data/doc/files/lib/classifier/lsi/summary_rb.html +0 -115
  80. data/doc/files/lib/classifier/lsi/word_list_rb.html +0 -115
  81. data/doc/files/lib/classifier/lsi_rb.html +0 -127
  82. data/doc/files/lib/classifier_rb.html +0 -125
  83. data/doc/fr_class_index.html +0 -39
  84. data/doc/fr_file_index.html +0 -37
  85. data/doc/fr_method_index.html +0 -75
  86. data/doc/index.html +0 -24
  87. data/doc/rdoc-style.css +0 -208
@@ -1,18 +0,0 @@
1
- <?xml version="1.0" encoding="iso-8859-1"?>
2
- <!DOCTYPE html
3
- PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
4
- "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
5
-
6
- <html>
7
- <head>
8
- <title>search_norm (Classifier::ContentNode)</title>
9
- <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
10
- <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" />
11
- </head>
12
- <body class="standalone-code">
13
- <pre><span class="ruby-comment cmt"># File lib/classifier/lsi/content_node.rb, line 29</span>
14
- <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">search_norm</span>
15
- <span class="ruby-ivar">@lsi_norm</span> <span class="ruby-operator">||</span> <span class="ruby-ivar">@raw_norm</span>
16
- <span class="ruby-keyword kw">end</span></pre>
17
- </body>
18
- </html>
@@ -1,49 +0,0 @@
1
- <?xml version="1.0" encoding="iso-8859-1"?>
2
- <!DOCTYPE html
3
- PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
4
- "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
5
-
6
- <html>
7
- <head>
8
- <title>raw_vector_with (Classifier::ContentNode)</title>
9
- <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
10
- <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" />
11
- </head>
12
- <body class="standalone-code">
13
- <pre><span class="ruby-comment cmt"># File lib/classifier/lsi/content_node.rb, line 35</span>
14
- <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">raw_vector_with</span>( <span class="ruby-identifier">word_list</span> )
15
- <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">$GSL</span>
16
- <span class="ruby-identifier">vec</span> = <span class="ruby-constant">Vector</span>.<span class="ruby-identifier">new</span>(<span class="ruby-identifier">word_list</span>.<span class="ruby-identifier">size</span>)
17
- <span class="ruby-keyword kw">else</span>
18
- <span class="ruby-identifier">vec</span> = <span class="ruby-constant">Array</span>.<span class="ruby-identifier">new</span>(<span class="ruby-identifier">word_list</span>.<span class="ruby-identifier">size</span>, <span class="ruby-value">0</span>)
19
- <span class="ruby-keyword kw">end</span>
20
-
21
- <span class="ruby-ivar">@word_hash</span>.<span class="ruby-identifier">each_key</span> <span class="ruby-keyword kw">do</span> <span class="ruby-operator">|</span><span class="ruby-identifier">word</span><span class="ruby-operator">|</span>
22
- <span class="ruby-identifier">vec</span>[<span class="ruby-identifier">word_list</span>[<span class="ruby-identifier">word</span>]] = <span class="ruby-ivar">@word_hash</span>[<span class="ruby-identifier">word</span>] <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">word_list</span>[<span class="ruby-identifier">word</span>]
23
- <span class="ruby-keyword kw">end</span>
24
-
25
- <span class="ruby-comment cmt"># Perform the scaling transform</span>
26
- <span class="ruby-identifier">total_words</span> = <span class="ruby-identifier">vec</span>.<span class="ruby-identifier">sum</span>
27
-
28
- <span class="ruby-comment cmt"># Perform first-order association transform if this vector has more</span>
29
- <span class="ruby-comment cmt"># than one word in it. </span>
30
- <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">total_words</span> <span class="ruby-operator">&gt;</span> <span class="ruby-value">1.0</span>
31
- <span class="ruby-identifier">weighted_total</span> = <span class="ruby-value">0</span><span class="ruby-value">.0</span>
32
- <span class="ruby-identifier">vec</span>.<span class="ruby-identifier">each</span> <span class="ruby-keyword kw">do</span> <span class="ruby-operator">|</span><span class="ruby-identifier">term</span><span class="ruby-operator">|</span>
33
- <span class="ruby-keyword kw">if</span> ( <span class="ruby-identifier">term</span> <span class="ruby-operator">&gt;</span> <span class="ruby-value">0</span> )
34
- <span class="ruby-identifier">weighted_total</span> <span class="ruby-operator">+=</span> (( <span class="ruby-identifier">term</span> <span class="ruby-operator">/</span> <span class="ruby-identifier">total_words</span> ) <span class="ruby-operator">*</span> <span class="ruby-constant">Math</span>.<span class="ruby-identifier">log</span>( <span class="ruby-identifier">term</span> <span class="ruby-operator">/</span> <span class="ruby-identifier">total_words</span> ))
35
- <span class="ruby-keyword kw">end</span>
36
- <span class="ruby-keyword kw">end</span>
37
- <span class="ruby-identifier">vec</span> = <span class="ruby-identifier">vec</span>.<span class="ruby-identifier">collect</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">val</span><span class="ruby-operator">|</span> <span class="ruby-constant">Math</span>.<span class="ruby-identifier">log</span>( <span class="ruby-identifier">val</span> <span class="ruby-operator">+</span> <span class="ruby-value">1</span> ) <span class="ruby-operator">/</span> <span class="ruby-operator">-</span><span class="ruby-identifier">weighted_total</span> }
38
- <span class="ruby-keyword kw">end</span>
39
-
40
- <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">$GSL</span>
41
- <span class="ruby-ivar">@raw_norm</span> = <span class="ruby-identifier">vec</span>.<span class="ruby-identifier">normalize</span>
42
- <span class="ruby-ivar">@raw_vector</span> = <span class="ruby-identifier">vec</span>
43
- <span class="ruby-keyword kw">else</span>
44
- <span class="ruby-ivar">@raw_norm</span> = <span class="ruby-constant">Vector</span>[<span class="ruby-operator">*</span><span class="ruby-identifier">vec</span>].<span class="ruby-identifier">normalize</span>
45
- <span class="ruby-ivar">@raw_vector</span> = <span class="ruby-constant">Vector</span>[<span class="ruby-operator">*</span><span class="ruby-identifier">vec</span>]
46
- <span class="ruby-keyword kw">end</span>
47
- <span class="ruby-keyword kw">end</span></pre>
48
- </body>
49
- </html>
@@ -1,539 +0,0 @@
1
- <?xml version="1.0" encoding="iso-8859-1"?>
2
- <!DOCTYPE html
3
- PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
4
- "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
5
-
6
- <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
7
- <head>
8
- <title>Class: Classifier::LSI</title>
9
- <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
10
- <meta http-equiv="Content-Script-Type" content="text/javascript" />
11
- <link rel="stylesheet" href="../.././rdoc-style.css" type="text/css" media="screen" />
12
- <script type="text/javascript">
13
- // <![CDATA[
14
-
15
- function popupCode( url ) {
16
- window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400")
17
- }
18
-
19
- function toggleCode( id ) {
20
- if ( document.getElementById )
21
- elem = document.getElementById( id );
22
- else if ( document.all )
23
- elem = eval( "document.all." + id );
24
- else
25
- return false;
26
-
27
- elemStyle = elem.style;
28
-
29
- if ( elemStyle.display != "block" ) {
30
- elemStyle.display = "block"
31
- } else {
32
- elemStyle.display = "none"
33
- }
34
-
35
- return true;
36
- }
37
-
38
- // Make codeblocks hidden by default
39
- document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }</style>" )
40
-
41
- // ]]>
42
- </script>
43
-
44
- </head>
45
- <body>
46
-
47
-
48
-
49
- <div id="classHeader">
50
- <table class="header-table">
51
- <tr class="top-aligned-row">
52
- <td><strong>Class</strong></td>
53
- <td class="class-name-in-header">Classifier::LSI</td>
54
- </tr>
55
- <tr class="top-aligned-row">
56
- <td><strong>In:</strong></td>
57
- <td>
58
- <a href="../../files/lib/classifier/lsi_rb.html">
59
- lib/classifier/lsi.rb
60
- </a>
61
- <br />
62
- </td>
63
- </tr>
64
-
65
- <tr class="top-aligned-row">
66
- <td><strong>Parent:</strong></td>
67
- <td>
68
- <a href="../Object.html">
69
- Object
70
- </a>
71
- </td>
72
- </tr>
73
- </table>
74
- </div>
75
- <!-- banner header -->
76
-
77
- <div id="bodyContent">
78
-
79
-
80
-
81
- <div id="contextContent">
82
-
83
- <div id="description">
84
- <p>
85
- This class implements a Latent Semantic Indexer, which can search, classify
86
- and cluster data based on underlying semantic relations. For more
87
- information on the algorithms used, please consult <a
88
- href="http://en.wikipedia.org/wiki/Latent_Semantic_Indexing">Wikipedia</a>.
89
- </p>
90
-
91
- </div>
92
-
93
-
94
- </div>
95
-
96
- <div id="method-list">
97
- <h3 class="section-bar">Methods</h3>
98
-
99
- <div class="name-list">
100
- <a href="#M000025">&lt;&lt;</a>&nbsp;&nbsp;
101
- <a href="#M000024">add_item</a>&nbsp;&nbsp;
102
- <a href="#M000030">build_index</a>&nbsp;&nbsp;
103
- <a href="#M000029">categories_for</a>&nbsp;&nbsp;
104
- <a href="#M000026">categories_for</a>&nbsp;&nbsp;
105
- <a href="#M000036">classify</a>&nbsp;&nbsp;
106
- <a href="#M000035">find_related</a>&nbsp;&nbsp;
107
- <a href="#M000037">highest_ranked_stems</a>&nbsp;&nbsp;
108
- <a href="#M000031">highest_relative_content</a>&nbsp;&nbsp;
109
- <a href="#M000028">items</a>&nbsp;&nbsp;
110
- <a href="#M000023">needs_rebuild?</a>&nbsp;&nbsp;
111
- <a href="#M000022">new</a>&nbsp;&nbsp;
112
- <a href="#M000032">proximity_array_for_content</a>&nbsp;&nbsp;
113
- <a href="#M000033">proximity_norms_for_content</a>&nbsp;&nbsp;
114
- <a href="#M000027">remove_item</a>&nbsp;&nbsp;
115
- <a href="#M000034">search</a>&nbsp;&nbsp;
116
- </div>
117
- </div>
118
-
119
- </div>
120
-
121
-
122
- <!-- if includes -->
123
-
124
- <div id="section">
125
-
126
-
127
-
128
-
129
-
130
- <div id="attribute-list">
131
- <h3 class="section-bar">Attributes</h3>
132
-
133
- <div class="name-list">
134
- <table>
135
- <tr class="top-aligned-row context-row">
136
- <td class="context-item-name">auto_rebuild</td>
137
- <td class="context-item-value">&nbsp;[RW]&nbsp;</td>
138
- <td class="context-item-desc"></td>
139
- </tr>
140
- <tr class="top-aligned-row context-row">
141
- <td class="context-item-name">word_list</td>
142
- <td class="context-item-value">&nbsp;[R]&nbsp;</td>
143
- <td class="context-item-desc"></td>
144
- </tr>
145
- </table>
146
- </div>
147
- </div>
148
-
149
-
150
-
151
- <!-- if method_list -->
152
- <div id="methods">
153
- <h3 class="section-bar">Public Class methods</h3>
154
-
155
- <div id="method-M000022" class="method-detail">
156
- <a name="M000022"></a>
157
-
158
- <div class="method-heading">
159
- <a href="LSI.src/M000022.html" target="Code" class="method-signature"
160
- onclick="popupCode('LSI.src/M000022.html');return false;">
161
- <span class="method-name">new</span><span class="method-args">(options = {})</span>
162
- </a>
163
- </div>
164
-
165
- <div class="method-description">
166
- <p>
167
- Create a fresh index. If you want to call <a
168
- href="LSI.html#M000030">build_index</a> manually, use
169
- </p>
170
- <pre>
171
- Classifier::LSI.new :auto_rebuild =&gt; false
172
- </pre>
173
- </div>
174
- </div>
175
-
176
- <h3 class="section-bar">Public Instance methods</h3>
177
-
178
- <div id="method-M000025" class="method-detail">
179
- <a name="M000025"></a>
180
-
181
- <div class="method-heading">
182
- <a href="LSI.src/M000025.html" target="Code" class="method-signature"
183
- onclick="popupCode('LSI.src/M000025.html');return false;">
184
- <span class="method-name">&lt;&lt;</span><span class="method-args">( item )</span>
185
- </a>
186
- </div>
187
-
188
- <div class="method-description">
189
- <p>
190
- A less flexible shorthand for <a href="LSI.html#M000024">add_item</a> that
191
- assumes you are passing in a string with no categories. item will be duck
192
- typed via to_s .
193
- </p>
194
- </div>
195
- </div>
196
-
197
- <div id="method-M000024" class="method-detail">
198
- <a name="M000024"></a>
199
-
200
- <div class="method-heading">
201
- <a href="LSI.src/M000024.html" target="Code" class="method-signature"
202
- onclick="popupCode('LSI.src/M000024.html');return false;">
203
- <span class="method-name">add_item</span><span class="method-args">( item, *categories, &amp;block )</span>
204
- </a>
205
- </div>
206
-
207
- <div class="method-description">
208
- <p>
209
- Adds an item to the index. item is assumed to be a string, but any item may
210
- be indexed so long as it responds to to_s or if you provide an optional
211
- block explaining how the indexer can fetch fresh string data. This optional
212
- block is passed the item, so the item may only be a reference to a URL or
213
- file name.
214
- </p>
215
- <p>
216
- For example:
217
- </p>
218
- <pre>
219
- lsi = Classifier::LSI.new
220
- lsi.add_item &quot;This is just plain text&quot;
221
- lsi.add_item &quot;/home/me/filename.txt&quot; { |x| File.read x }
222
- ar = ActiveRecordObject.find( :all )
223
- lsi.add_item ar, *ar.categories { |x| ar.content }
224
- </pre>
225
- </div>
226
- </div>
227
-
228
- <div id="method-M000030" class="method-detail">
229
- <a name="M000030"></a>
230
-
231
- <div class="method-heading">
232
- <a href="LSI.src/M000030.html" target="Code" class="method-signature"
233
- onclick="popupCode('LSI.src/M000030.html');return false;">
234
- <span class="method-name">build_index</span><span class="method-args">( cutoff=0.75 )</span>
235
- </a>
236
- </div>
237
-
238
- <div class="method-description">
239
- <p>
240
- This function rebuilds the index if <a
241
- href="LSI.html#M000023">needs_rebuild?</a> returns true. For very large
242
- document spaces, this indexing operation may take some time to complete, so
243
- it may be wise to place the operation in another thread.
244
- </p>
245
- <p>
246
- As a rule, indexing will be fairly swift on modern machines until you have
247
- well over 500 documents indexed, or have an incredibly diverse vocabulary
248
- for your documents.
249
- </p>
250
- <p>
251
- The optional parameter &quot;cutoff&quot; is a tuning parameter. When the
252
- index is built, a certain number of s-values are discarded from the system.
253
- The cutoff parameter tells the indexer how many of these values to keep. A
254
- value of 1 for cutoff means that no semantic analysis will take place,
255
- turning the <a href="LSI.html">LSI</a> class into a simple vector search
256
- engine.
257
- </p>
258
- </div>
259
- </div>
260
-
261
- <div id="method-M000029" class="method-detail">
262
- <a name="M000029"></a>
263
-
264
- <div class="method-heading">
265
- <a href="LSI.src/M000029.html" target="Code" class="method-signature"
266
- onclick="popupCode('LSI.src/M000029.html');return false;">
267
- <span class="method-name">categories_for</span><span class="method-args">(item)</span>
268
- </a>
269
- </div>
270
-
271
- <div class="method-description">
272
- <p>
273
- Returns the categories for a given indexed item. You are free to add and
274
- remove items from this as you see fit. It does not invalidate an index to
275
- change its categories.
276
- </p>
277
- </div>
278
- </div>
279
-
280
- <div id="method-M000026" class="method-detail">
281
- <a name="M000026"></a>
282
-
283
- <div class="method-heading">
284
- <a href="LSI.src/M000026.html" target="Code" class="method-signature"
285
- onclick="popupCode('LSI.src/M000026.html');return false;">
286
- <span class="method-name">categories_for</span><span class="method-args">(item)</span>
287
- </a>
288
- </div>
289
-
290
- <div class="method-description">
291
- <p>
292
- Returns the categories for a given indexed item. You are free to add and
293
- remove items from this as you see fit. It does not invalidate an index to
294
- change its categories.
295
- </p>
296
- </div>
297
- </div>
298
-
299
- <div id="method-M000036" class="method-detail">
300
- <a name="M000036"></a>
301
-
302
- <div class="method-heading">
303
- <a href="LSI.src/M000036.html" target="Code" class="method-signature"
304
- onclick="popupCode('LSI.src/M000036.html');return false;">
305
- <span class="method-name">classify</span><span class="method-args">( doc, cutoff=0.30, &amp;block )</span>
306
- </a>
307
- </div>
308
-
309
- <div class="method-description">
310
- <p>
311
- This function uses a voting system to categorize documents, based on the
312
- categories of other documents. It uses the same logic as the <a
313
- href="LSI.html#M000035">find_related</a> function to find related
314
- documents, then returns the most obvious category from this list.
315
- </p>
316
- <p>
317
- cutoff signifies the number of documents to consider when classifying text.
318
- A cutoff of 1 means that every document in the index votes on what category
319
- the document is in. This may not always make sense.
320
- </p>
321
- </div>
322
- </div>
323
-
324
- <div id="method-M000035" class="method-detail">
325
- <a name="M000035"></a>
326
-
327
- <div class="method-heading">
328
- <a href="LSI.src/M000035.html" target="Code" class="method-signature"
329
- onclick="popupCode('LSI.src/M000035.html');return false;">
330
- <span class="method-name">find_related</span><span class="method-args">( doc, max_nearest=3, &amp;block )</span>
331
- </a>
332
- </div>
333
-
334
- <div class="method-description">
335
- <p>
336
- This function takes content and finds other documents that are semantically
337
- &quot;close&quot;, returning an array of documents sorted from most to
338
- least relevant. max_nearest specifies the number of documents to return. A
339
- value of 0 means that it returns all the indexed documents, sorted by
340
- relevance.
341
- </p>
342
- <p>
343
- This is particularly useful for identifying clusters in your document space.
344
- For example you may want to identify several &quot;What&#8217;s
345
- Related&quot; items for weblog articles, or find paragraphs that relate to
346
- each other in an essay.
347
- </p>
348
- </div>
349
- </div>
350
-
351
- <div id="method-M000037" class="method-detail">
352
- <a name="M000037"></a>
353
-
354
- <div class="method-heading">
355
- <a href="LSI.src/M000037.html" target="Code" class="method-signature"
356
- onclick="popupCode('LSI.src/M000037.html');return false;">
357
- <span class="method-name">highest_ranked_stems</span><span class="method-args">( doc, count=3 )</span>
358
- </a>
359
- </div>
360
-
361
- <div class="method-description">
362
- <p>
363
- Prototype, only works on indexed documents. I have no clue if this is going
364
- to work, but in theory it&#8217;s supposed to.
365
- </p>
366
- </div>
367
- </div>
368
-
369
- <div id="method-M000031" class="method-detail">
370
- <a name="M000031"></a>
371
-
372
- <div class="method-heading">
373
- <a href="LSI.src/M000031.html" target="Code" class="method-signature"
374
- onclick="popupCode('LSI.src/M000031.html');return false;">
375
- <span class="method-name">highest_relative_content</span><span class="method-args">( max_chunks=10 )</span>
376
- </a>
377
- </div>
378
-
379
- <div class="method-description">
380
- <p>
381
- This method returns max_chunks entries, ordered by their average semantic
382
- rating. Essentially, the average distance of each entry from all other
383
- entries is calculated, the highest are returned.
384
- </p>
385
- <p>
386
- This can be used to build a summary service, or to provide more information
387
- about your dataset&#8217;s general content. For example, if you were to use
388
- categorize on the results of this data, you could gather information on
389
- what your dataset is generally about.
390
- </p>
391
- </div>
392
- </div>
393
-
394
- <div id="method-M000028" class="method-detail">
395
- <a name="M000028"></a>
396
-
397
- <div class="method-heading">
398
- <a href="LSI.src/M000028.html" target="Code" class="method-signature"
399
- onclick="popupCode('LSI.src/M000028.html');return false;">
400
- <span class="method-name">items</span><span class="method-args">()</span>
401
- </a>
402
- </div>
403
-
404
- <div class="method-description">
405
- <p>
406
- Returns an array of items that are indexed.
407
- </p>
408
- </div>
409
- </div>
410
-
411
- <div id="method-M000023" class="method-detail">
412
- <a name="M000023"></a>
413
-
414
- <div class="method-heading">
415
- <a href="LSI.src/M000023.html" target="Code" class="method-signature"
416
- onclick="popupCode('LSI.src/M000023.html');return false;">
417
- <span class="method-name">needs_rebuild?</span><span class="method-args">()</span>
418
- </a>
419
- </div>
420
-
421
- <div class="method-description">
422
- <p>
423
- Returns true if the index needs to be rebuilt. The index needs to be built
424
- after all information is added, but before you start using it for search,
425
- classification and cluster detection.
426
- </p>
427
- </div>
428
- </div>
429
-
430
- <div id="method-M000032" class="method-detail">
431
- <a name="M000032"></a>
432
-
433
- <div class="method-heading">
434
- <a href="LSI.src/M000032.html" target="Code" class="method-signature"
435
- onclick="popupCode('LSI.src/M000032.html');return false;">
436
- <span class="method-name">proximity_array_for_content</span><span class="method-args">( doc, &amp;block )</span>
437
- </a>
438
- </div>
439
-
440
- <div class="method-description">
441
- <p>
442
- This function is the primitive that <a
443
- href="LSI.html#M000035">find_related</a> and classify build upon. It
444
- returns an array of 2-element arrays. The first element of this array is a
445
- document, and the second is its &quot;score&quot;, defining how
446
- &quot;close&quot; it is to other indexed items.
447
- </p>
448
- <p>
449
- These values are somewhat arbitrary, having to do with the vector space
450
- created by your content, so the magnitude is interpretable but not always
451
- meaningful between indexes.
452
- </p>
453
- <p>
454
- The parameter doc is the content to compare. If that content is not
455
- indexed, you can pass an optional block to define how to create the text
456
- data. See <a href="LSI.html#M000024">add_item</a> for examples of how this
457
- works.
458
- </p>
459
- </div>
460
- </div>
461
-
462
- <div id="method-M000033" class="method-detail">
463
- <a name="M000033"></a>
464
-
465
- <div class="method-heading">
466
- <a href="LSI.src/M000033.html" target="Code" class="method-signature"
467
- onclick="popupCode('LSI.src/M000033.html');return false;">
468
- <span class="method-name">proximity_norms_for_content</span><span class="method-args">( doc, &amp;block )</span>
469
- </a>
470
- </div>
471
-
472
- <div class="method-description">
473
- <p>
474
- Similar to <a href="LSI.html#M000032">proximity_array_for_content</a>, this
475
- function takes similar arguments and returns a similar array. However, it
476
- uses the normalized calculated vectors instead of their full versions. This
477
- is useful when you&#8217;re trying to perform operations on content that is
478
- much smaller than the text you&#8217;re working with. search uses this
479
- primitive.
480
- </p>
481
- </div>
482
- </div>
483
-
484
- <div id="method-M000027" class="method-detail">
485
- <a name="M000027"></a>
486
-
487
- <div class="method-heading">
488
- <a href="LSI.src/M000027.html" target="Code" class="method-signature"
489
- onclick="popupCode('LSI.src/M000027.html');return false;">
490
- <span class="method-name">remove_item</span><span class="method-args">( item )</span>
491
- </a>
492
- </div>
493
-
494
- <div class="method-description">
495
- <p>
496
- Removes an item from the database, if it is indexed.
497
- </p>
498
- </div>
499
- </div>
500
-
501
- <div id="method-M000034" class="method-detail">
502
- <a name="M000034"></a>
503
-
504
- <div class="method-heading">
505
- <a href="LSI.src/M000034.html" target="Code" class="method-signature"
506
- onclick="popupCode('LSI.src/M000034.html');return false;">
507
- <span class="method-name">search</span><span class="method-args">( string, max_nearest=3 )</span>
508
- </a>
509
- </div>
510
-
511
- <div class="method-description">
512
- <p>
513
- This function allows for text-based search of your index. Unlike other
514
- functions like <a href="LSI.html#M000035">find_related</a> and classify,
515
- search only takes short strings. It will also ignore factors like repeated
516
- words. It is best for short, google-like search terms. A search will first
517
- prioritize lexical relationships, then semantic ones.
518
- </p>
519
- <p>
520
- While this may seem backwards compared to the other functions that <a
521
- href="LSI.html">LSI</a> supports, it is actually the same algorithm, just
522
- applied on a smaller document.
523
- </p>
524
- </div>
525
- </div>
526
-
527
-
528
- </div>
529
-
530
-
531
- </div>
532
-
533
-
534
- <div id="validator-badges">
535
- <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p>
536
- </div>
537
-
538
- </body>
539
- </html>