classifier 1.3.0 → 1.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87) hide show
  1. data/README +2 -1
  2. data/Rakefile +3 -4
  3. data/bin/bayes.rb +1 -1
  4. data/bin/summarize.rb +7 -2
  5. data/lib/classifier/extensions/string.rb +2 -2
  6. data/lib/classifier/extensions/vector_serialize.rb +2 -2
  7. data/lib/classifier/lsi.rb +2 -2
  8. data/lib/classifier/lsi/content_node.rb +1 -1
  9. metadata +73 -159
  10. data/doc/classes/Array.html +0 -139
  11. data/doc/classes/Array.src/M000003.html +0 -18
  12. data/doc/classes/Classifier.html +0 -140
  13. data/doc/classes/Classifier/Bayes.html +0 -317
  14. data/doc/classes/Classifier/Bayes.src/M000038.html +0 -20
  15. data/doc/classes/Classifier/Bayes.src/M000039.html +0 -23
  16. data/doc/classes/Classifier/Bayes.src/M000040.html +0 -30
  17. data/doc/classes/Classifier/Bayes.src/M000041.html +0 -27
  18. data/doc/classes/Classifier/Bayes.src/M000042.html +0 -18
  19. data/doc/classes/Classifier/Bayes.src/M000043.html +0 -25
  20. data/doc/classes/Classifier/Bayes.src/M000044.html +0 -18
  21. data/doc/classes/Classifier/ContentNode.html +0 -247
  22. data/doc/classes/Classifier/ContentNode.src/M000046.html +0 -19
  23. data/doc/classes/Classifier/ContentNode.src/M000047.html +0 -18
  24. data/doc/classes/Classifier/ContentNode.src/M000048.html +0 -18
  25. data/doc/classes/Classifier/ContentNode.src/M000049.html +0 -49
  26. data/doc/classes/Classifier/LSI.html +0 -539
  27. data/doc/classes/Classifier/LSI.src/M000022.html +0 -20
  28. data/doc/classes/Classifier/LSI.src/M000023.html +0 -18
  29. data/doc/classes/Classifier/LSI.src/M000024.html +0 -21
  30. data/doc/classes/Classifier/LSI.src/M000025.html +0 -18
  31. data/doc/classes/Classifier/LSI.src/M000026.html +0 -19
  32. data/doc/classes/Classifier/LSI.src/M000027.html +0 -21
  33. data/doc/classes/Classifier/LSI.src/M000028.html +0 -18
  34. data/doc/classes/Classifier/LSI.src/M000029.html +0 -19
  35. data/doc/classes/Classifier/LSI.src/M000030.html +0 -43
  36. data/doc/classes/Classifier/LSI.src/M000031.html +0 -23
  37. data/doc/classes/Classifier/LSI.src/M000032.html +0 -30
  38. data/doc/classes/Classifier/LSI.src/M000033.html +0 -30
  39. data/doc/classes/Classifier/LSI.src/M000034.html +0 -21
  40. data/doc/classes/Classifier/LSI.src/M000035.html +0 -21
  41. data/doc/classes/Classifier/LSI.src/M000036.html +0 -31
  42. data/doc/classes/Classifier/LSI.src/M000037.html +0 -21
  43. data/doc/classes/Classifier/WordList.html +0 -217
  44. data/doc/classes/Classifier/WordList.src/M000017.html +0 -18
  45. data/doc/classes/Classifier/WordList.src/M000018.html +0 -19
  46. data/doc/classes/Classifier/WordList.src/M000019.html +0 -19
  47. data/doc/classes/Classifier/WordList.src/M000020.html +0 -18
  48. data/doc/classes/Classifier/WordList.src/M000021.html +0 -18
  49. data/doc/classes/GSL.html +0 -112
  50. data/doc/classes/GSL/Matrix.html +0 -126
  51. data/doc/classes/GSL/Vector.html +0 -156
  52. data/doc/classes/GSL/Vector.src/M000015.html +0 -18
  53. data/doc/classes/GSL/Vector.src/M000016.html +0 -19
  54. data/doc/classes/Matrix.html +0 -184
  55. data/doc/classes/Matrix.src/M000004.html +0 -18
  56. data/doc/classes/Matrix.src/M000005.html +0 -76
  57. data/doc/classes/Matrix.src/M000006.html +0 -18
  58. data/doc/classes/Object.html +0 -139
  59. data/doc/classes/Object.src/M000007.html +0 -16
  60. data/doc/classes/String.html +0 -275
  61. data/doc/classes/String.src/M000008.html +0 -18
  62. data/doc/classes/String.src/M000009.html +0 -18
  63. data/doc/classes/String.src/M000010.html +0 -18
  64. data/doc/classes/String.src/M000011.html +0 -18
  65. data/doc/classes/String.src/M000012.html +0 -18
  66. data/doc/classes/String.src/M000013.html +0 -18
  67. data/doc/classes/String.src/M000014.html +0 -18
  68. data/doc/classes/Vector.html +0 -154
  69. data/doc/classes/Vector.src/M000001.html +0 -22
  70. data/doc/classes/Vector.src/M000002.html +0 -25
  71. data/doc/created.rid +0 -1
  72. data/doc/files/README.html +0 -252
  73. data/doc/files/lib/classifier/bayes_rb.html +0 -115
  74. data/doc/files/lib/classifier/extensions/string_rb.html +0 -122
  75. data/doc/files/lib/classifier/extensions/vector_rb.html +0 -120
  76. data/doc/files/lib/classifier/extensions/vector_serialize_rb.html +0 -101
  77. data/doc/files/lib/classifier/extensions/word_hash_rb.html +0 -115
  78. data/doc/files/lib/classifier/lsi/content_node_rb.html +0 -115
  79. data/doc/files/lib/classifier/lsi/summary_rb.html +0 -115
  80. data/doc/files/lib/classifier/lsi/word_list_rb.html +0 -115
  81. data/doc/files/lib/classifier/lsi_rb.html +0 -127
  82. data/doc/files/lib/classifier_rb.html +0 -125
  83. data/doc/fr_class_index.html +0 -39
  84. data/doc/fr_file_index.html +0 -37
  85. data/doc/fr_method_index.html +0 -75
  86. data/doc/index.html +0 -24
  87. data/doc/rdoc-style.css +0 -208
@@ -1,18 +0,0 @@
1
- <?xml version="1.0" encoding="iso-8859-1"?>
2
- <!DOCTYPE html
3
- PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
4
- "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
5
-
6
- <html>
7
- <head>
8
- <title>search_norm (Classifier::ContentNode)</title>
9
- <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
10
- <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" />
11
- </head>
12
- <body class="standalone-code">
13
- <pre><span class="ruby-comment cmt"># File lib/classifier/lsi/content_node.rb, line 29</span>
14
- <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">search_norm</span>
15
- <span class="ruby-ivar">@lsi_norm</span> <span class="ruby-operator">||</span> <span class="ruby-ivar">@raw_norm</span>
16
- <span class="ruby-keyword kw">end</span></pre>
17
- </body>
18
- </html>
@@ -1,49 +0,0 @@
1
- <?xml version="1.0" encoding="iso-8859-1"?>
2
- <!DOCTYPE html
3
- PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
4
- "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
5
-
6
- <html>
7
- <head>
8
- <title>raw_vector_with (Classifier::ContentNode)</title>
9
- <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
10
- <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" />
11
- </head>
12
- <body class="standalone-code">
13
- <pre><span class="ruby-comment cmt"># File lib/classifier/lsi/content_node.rb, line 35</span>
14
- <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">raw_vector_with</span>( <span class="ruby-identifier">word_list</span> )
15
- <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">$GSL</span>
16
- <span class="ruby-identifier">vec</span> = <span class="ruby-constant">Vector</span>.<span class="ruby-identifier">new</span>(<span class="ruby-identifier">word_list</span>.<span class="ruby-identifier">size</span>)
17
- <span class="ruby-keyword kw">else</span>
18
- <span class="ruby-identifier">vec</span> = <span class="ruby-constant">Array</span>.<span class="ruby-identifier">new</span>(<span class="ruby-identifier">word_list</span>.<span class="ruby-identifier">size</span>, <span class="ruby-value">0</span>)
19
- <span class="ruby-keyword kw">end</span>
20
-
21
- <span class="ruby-ivar">@word_hash</span>.<span class="ruby-identifier">each_key</span> <span class="ruby-keyword kw">do</span> <span class="ruby-operator">|</span><span class="ruby-identifier">word</span><span class="ruby-operator">|</span>
22
- <span class="ruby-identifier">vec</span>[<span class="ruby-identifier">word_list</span>[<span class="ruby-identifier">word</span>]] = <span class="ruby-ivar">@word_hash</span>[<span class="ruby-identifier">word</span>] <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">word_list</span>[<span class="ruby-identifier">word</span>]
23
- <span class="ruby-keyword kw">end</span>
24
-
25
- <span class="ruby-comment cmt"># Perform the scaling transform</span>
26
- <span class="ruby-identifier">total_words</span> = <span class="ruby-identifier">vec</span>.<span class="ruby-identifier">sum</span>
27
-
28
- <span class="ruby-comment cmt"># Perform first-order association transform if this vector has more</span>
29
- <span class="ruby-comment cmt"># than one word in it. </span>
30
- <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">total_words</span> <span class="ruby-operator">&gt;</span> <span class="ruby-value">1.0</span>
31
- <span class="ruby-identifier">weighted_total</span> = <span class="ruby-value">0</span><span class="ruby-value">.0</span>
32
- <span class="ruby-identifier">vec</span>.<span class="ruby-identifier">each</span> <span class="ruby-keyword kw">do</span> <span class="ruby-operator">|</span><span class="ruby-identifier">term</span><span class="ruby-operator">|</span>
33
- <span class="ruby-keyword kw">if</span> ( <span class="ruby-identifier">term</span> <span class="ruby-operator">&gt;</span> <span class="ruby-value">0</span> )
34
- <span class="ruby-identifier">weighted_total</span> <span class="ruby-operator">+=</span> (( <span class="ruby-identifier">term</span> <span class="ruby-operator">/</span> <span class="ruby-identifier">total_words</span> ) <span class="ruby-operator">*</span> <span class="ruby-constant">Math</span>.<span class="ruby-identifier">log</span>( <span class="ruby-identifier">term</span> <span class="ruby-operator">/</span> <span class="ruby-identifier">total_words</span> ))
35
- <span class="ruby-keyword kw">end</span>
36
- <span class="ruby-keyword kw">end</span>
37
- <span class="ruby-identifier">vec</span> = <span class="ruby-identifier">vec</span>.<span class="ruby-identifier">collect</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">val</span><span class="ruby-operator">|</span> <span class="ruby-constant">Math</span>.<span class="ruby-identifier">log</span>( <span class="ruby-identifier">val</span> <span class="ruby-operator">+</span> <span class="ruby-value">1</span> ) <span class="ruby-operator">/</span> <span class="ruby-operator">-</span><span class="ruby-identifier">weighted_total</span> }
38
- <span class="ruby-keyword kw">end</span>
39
-
40
- <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">$GSL</span>
41
- <span class="ruby-ivar">@raw_norm</span> = <span class="ruby-identifier">vec</span>.<span class="ruby-identifier">normalize</span>
42
- <span class="ruby-ivar">@raw_vector</span> = <span class="ruby-identifier">vec</span>
43
- <span class="ruby-keyword kw">else</span>
44
- <span class="ruby-ivar">@raw_norm</span> = <span class="ruby-constant">Vector</span>[<span class="ruby-operator">*</span><span class="ruby-identifier">vec</span>].<span class="ruby-identifier">normalize</span>
45
- <span class="ruby-ivar">@raw_vector</span> = <span class="ruby-constant">Vector</span>[<span class="ruby-operator">*</span><span class="ruby-identifier">vec</span>]
46
- <span class="ruby-keyword kw">end</span>
47
- <span class="ruby-keyword kw">end</span></pre>
48
- </body>
49
- </html>
@@ -1,539 +0,0 @@
1
- <?xml version="1.0" encoding="iso-8859-1"?>
2
- <!DOCTYPE html
3
- PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
4
- "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
5
-
6
- <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
7
- <head>
8
- <title>Class: Classifier::LSI</title>
9
- <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
10
- <meta http-equiv="Content-Script-Type" content="text/javascript" />
11
- <link rel="stylesheet" href="../.././rdoc-style.css" type="text/css" media="screen" />
12
- <script type="text/javascript">
13
- // <![CDATA[
14
-
15
- function popupCode( url ) {
16
- window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400")
17
- }
18
-
19
- function toggleCode( id ) {
20
- if ( document.getElementById )
21
- elem = document.getElementById( id );
22
- else if ( document.all )
23
- elem = eval( "document.all." + id );
24
- else
25
- return false;
26
-
27
- elemStyle = elem.style;
28
-
29
- if ( elemStyle.display != "block" ) {
30
- elemStyle.display = "block"
31
- } else {
32
- elemStyle.display = "none"
33
- }
34
-
35
- return true;
36
- }
37
-
38
- // Make codeblocks hidden by default
39
- document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }</style>" )
40
-
41
- // ]]>
42
- </script>
43
-
44
- </head>
45
- <body>
46
-
47
-
48
-
49
- <div id="classHeader">
50
- <table class="header-table">
51
- <tr class="top-aligned-row">
52
- <td><strong>Class</strong></td>
53
- <td class="class-name-in-header">Classifier::LSI</td>
54
- </tr>
55
- <tr class="top-aligned-row">
56
- <td><strong>In:</strong></td>
57
- <td>
58
- <a href="../../files/lib/classifier/lsi_rb.html">
59
- lib/classifier/lsi.rb
60
- </a>
61
- <br />
62
- </td>
63
- </tr>
64
-
65
- <tr class="top-aligned-row">
66
- <td><strong>Parent:</strong></td>
67
- <td>
68
- <a href="../Object.html">
69
- Object
70
- </a>
71
- </td>
72
- </tr>
73
- </table>
74
- </div>
75
- <!-- banner header -->
76
-
77
- <div id="bodyContent">
78
-
79
-
80
-
81
- <div id="contextContent">
82
-
83
- <div id="description">
84
- <p>
85
- This class implements a Latent Semantic Indexer, which can search, classify
86
- and cluster data based on underlying semantic relations. For more
87
- information on the algorithms used, please consult <a
88
- href="http://en.wikipedia.org/wiki/Latent_Semantic_Indexing">Wikipedia</a>.
89
- </p>
90
-
91
- </div>
92
-
93
-
94
- </div>
95
-
96
- <div id="method-list">
97
- <h3 class="section-bar">Methods</h3>
98
-
99
- <div class="name-list">
100
- <a href="#M000025">&lt;&lt;</a>&nbsp;&nbsp;
101
- <a href="#M000024">add_item</a>&nbsp;&nbsp;
102
- <a href="#M000030">build_index</a>&nbsp;&nbsp;
103
- <a href="#M000029">categories_for</a>&nbsp;&nbsp;
104
- <a href="#M000026">categories_for</a>&nbsp;&nbsp;
105
- <a href="#M000036">classify</a>&nbsp;&nbsp;
106
- <a href="#M000035">find_related</a>&nbsp;&nbsp;
107
- <a href="#M000037">highest_ranked_stems</a>&nbsp;&nbsp;
108
- <a href="#M000031">highest_relative_content</a>&nbsp;&nbsp;
109
- <a href="#M000028">items</a>&nbsp;&nbsp;
110
- <a href="#M000023">needs_rebuild?</a>&nbsp;&nbsp;
111
- <a href="#M000022">new</a>&nbsp;&nbsp;
112
- <a href="#M000032">proximity_array_for_content</a>&nbsp;&nbsp;
113
- <a href="#M000033">proximity_norms_for_content</a>&nbsp;&nbsp;
114
- <a href="#M000027">remove_item</a>&nbsp;&nbsp;
115
- <a href="#M000034">search</a>&nbsp;&nbsp;
116
- </div>
117
- </div>
118
-
119
- </div>
120
-
121
-
122
- <!-- if includes -->
123
-
124
- <div id="section">
125
-
126
-
127
-
128
-
129
-
130
- <div id="attribute-list">
131
- <h3 class="section-bar">Attributes</h3>
132
-
133
- <div class="name-list">
134
- <table>
135
- <tr class="top-aligned-row context-row">
136
- <td class="context-item-name">auto_rebuild</td>
137
- <td class="context-item-value">&nbsp;[RW]&nbsp;</td>
138
- <td class="context-item-desc"></td>
139
- </tr>
140
- <tr class="top-aligned-row context-row">
141
- <td class="context-item-name">word_list</td>
142
- <td class="context-item-value">&nbsp;[R]&nbsp;</td>
143
- <td class="context-item-desc"></td>
144
- </tr>
145
- </table>
146
- </div>
147
- </div>
148
-
149
-
150
-
151
- <!-- if method_list -->
152
- <div id="methods">
153
- <h3 class="section-bar">Public Class methods</h3>
154
-
155
- <div id="method-M000022" class="method-detail">
156
- <a name="M000022"></a>
157
-
158
- <div class="method-heading">
159
- <a href="LSI.src/M000022.html" target="Code" class="method-signature"
160
- onclick="popupCode('LSI.src/M000022.html');return false;">
161
- <span class="method-name">new</span><span class="method-args">(options = {})</span>
162
- </a>
163
- </div>
164
-
165
- <div class="method-description">
166
- <p>
167
- Create a fresh index. If you want to call <a
168
- href="LSI.html#M000030">build_index</a> manually, use
169
- </p>
170
- <pre>
171
- Classifier::LSI.new :auto_rebuild =&gt; false
172
- </pre>
173
- </div>
174
- </div>
175
-
176
- <h3 class="section-bar">Public Instance methods</h3>
177
-
178
- <div id="method-M000025" class="method-detail">
179
- <a name="M000025"></a>
180
-
181
- <div class="method-heading">
182
- <a href="LSI.src/M000025.html" target="Code" class="method-signature"
183
- onclick="popupCode('LSI.src/M000025.html');return false;">
184
- <span class="method-name">&lt;&lt;</span><span class="method-args">( item )</span>
185
- </a>
186
- </div>
187
-
188
- <div class="method-description">
189
- <p>
190
- A less flexible shorthand for <a href="LSI.html#M000024">add_item</a> that
191
- assumes you are passing in a string with no categories. item will be duck
192
- typed via to_s .
193
- </p>
194
- </div>
195
- </div>
196
-
197
- <div id="method-M000024" class="method-detail">
198
- <a name="M000024"></a>
199
-
200
- <div class="method-heading">
201
- <a href="LSI.src/M000024.html" target="Code" class="method-signature"
202
- onclick="popupCode('LSI.src/M000024.html');return false;">
203
- <span class="method-name">add_item</span><span class="method-args">( item, *categories, &amp;block )</span>
204
- </a>
205
- </div>
206
-
207
- <div class="method-description">
208
- <p>
209
- Adds an item to the index. item is assumed to be a string, but any item may
210
- be indexed so long as it responds to to_s or if you provide an optional
211
- block explaining how the indexer can fetch fresh string data. This optional
212
- block is passed the item, so the item may only be a reference to a URL or
213
- file name.
214
- </p>
215
- <p>
216
- For example:
217
- </p>
218
- <pre>
219
- lsi = Classifier::LSI.new
220
- lsi.add_item &quot;This is just plain text&quot;
221
- lsi.add_item &quot;/home/me/filename.txt&quot; { |x| File.read x }
222
- ar = ActiveRecordObject.find( :all )
223
- lsi.add_item ar, *ar.categories { |x| ar.content }
224
- </pre>
225
- </div>
226
- </div>
227
-
228
- <div id="method-M000030" class="method-detail">
229
- <a name="M000030"></a>
230
-
231
- <div class="method-heading">
232
- <a href="LSI.src/M000030.html" target="Code" class="method-signature"
233
- onclick="popupCode('LSI.src/M000030.html');return false;">
234
- <span class="method-name">build_index</span><span class="method-args">( cutoff=0.75 )</span>
235
- </a>
236
- </div>
237
-
238
- <div class="method-description">
239
- <p>
240
- This function rebuilds the index if <a
241
- href="LSI.html#M000023">needs_rebuild?</a> returns true. For very large
242
- document spaces, this indexing operation may take some time to complete, so
243
- it may be wise to place the operation in another thread.
244
- </p>
245
- <p>
246
- As a rule, indexing will be fairly swift on modern machines until you have
247
- well over 500 documents indexed, or have an incredibly diverse vocabulary
248
- for your documents.
249
- </p>
250
- <p>
251
- The optional parameter &quot;cutoff&quot; is a tuning parameter. When the
252
- index is built, a certain number of s-values are discarded from the system.
253
- The cutoff parameter tells the indexer how many of these values to keep. A
254
- value of 1 for cutoff means that no semantic analysis will take place,
255
- turning the <a href="LSI.html">LSI</a> class into a simple vector search
256
- engine.
257
- </p>
258
- </div>
259
- </div>
260
-
261
- <div id="method-M000029" class="method-detail">
262
- <a name="M000029"></a>
263
-
264
- <div class="method-heading">
265
- <a href="LSI.src/M000029.html" target="Code" class="method-signature"
266
- onclick="popupCode('LSI.src/M000029.html');return false;">
267
- <span class="method-name">categories_for</span><span class="method-args">(item)</span>
268
- </a>
269
- </div>
270
-
271
- <div class="method-description">
272
- <p>
273
- Returns the categories for a given indexed item. You are free to add and
274
- remove items from this as you see fit. It does not invalidate an index to
275
- change its categories.
276
- </p>
277
- </div>
278
- </div>
279
-
280
- <div id="method-M000026" class="method-detail">
281
- <a name="M000026"></a>
282
-
283
- <div class="method-heading">
284
- <a href="LSI.src/M000026.html" target="Code" class="method-signature"
285
- onclick="popupCode('LSI.src/M000026.html');return false;">
286
- <span class="method-name">categories_for</span><span class="method-args">(item)</span>
287
- </a>
288
- </div>
289
-
290
- <div class="method-description">
291
- <p>
292
- Returns the categories for a given indexed item. You are free to add and
293
- remove items from this as you see fit. It does not invalidate an index to
294
- change its categories.
295
- </p>
296
- </div>
297
- </div>
298
-
299
- <div id="method-M000036" class="method-detail">
300
- <a name="M000036"></a>
301
-
302
- <div class="method-heading">
303
- <a href="LSI.src/M000036.html" target="Code" class="method-signature"
304
- onclick="popupCode('LSI.src/M000036.html');return false;">
305
- <span class="method-name">classify</span><span class="method-args">( doc, cutoff=0.30, &amp;block )</span>
306
- </a>
307
- </div>
308
-
309
- <div class="method-description">
310
- <p>
311
- This function uses a voting system to categorize documents, based on the
312
- categories of other documents. It uses the same logic as the <a
313
- href="LSI.html#M000035">find_related</a> function to find related
314
- documents, then returns the most obvious category from this list.
315
- </p>
316
- <p>
317
- cutoff signifies the number of documents to consider when classifying text.
318
- A cutoff of 1 means that every document in the index votes on what category
319
- the document is in. This may not always make sense.
320
- </p>
321
- </div>
322
- </div>
323
-
324
- <div id="method-M000035" class="method-detail">
325
- <a name="M000035"></a>
326
-
327
- <div class="method-heading">
328
- <a href="LSI.src/M000035.html" target="Code" class="method-signature"
329
- onclick="popupCode('LSI.src/M000035.html');return false;">
330
- <span class="method-name">find_related</span><span class="method-args">( doc, max_nearest=3, &amp;block )</span>
331
- </a>
332
- </div>
333
-
334
- <div class="method-description">
335
- <p>
336
- This function takes content and finds other documents that are semantically
337
- &quot;close&quot;, returning an array of documents sorted from most to
338
- least relevant. max_nearest specifies the number of documents to return. A
339
- value of 0 means that it returns all the indexed documents, sorted by
340
- relevance.
341
- </p>
342
- <p>
343
- This is particularly useful for identifying clusters in your document space.
344
- For example you may want to identify several &quot;What&#8217;s
345
- Related&quot; items for weblog articles, or find paragraphs that relate to
346
- each other in an essay.
347
- </p>
348
- </div>
349
- </div>
350
-
351
- <div id="method-M000037" class="method-detail">
352
- <a name="M000037"></a>
353
-
354
- <div class="method-heading">
355
- <a href="LSI.src/M000037.html" target="Code" class="method-signature"
356
- onclick="popupCode('LSI.src/M000037.html');return false;">
357
- <span class="method-name">highest_ranked_stems</span><span class="method-args">( doc, count=3 )</span>
358
- </a>
359
- </div>
360
-
361
- <div class="method-description">
362
- <p>
363
- Prototype, only works on indexed documents. I have no clue if this is going
364
- to work, but in theory it&#8217;s supposed to.
365
- </p>
366
- </div>
367
- </div>
368
-
369
- <div id="method-M000031" class="method-detail">
370
- <a name="M000031"></a>
371
-
372
- <div class="method-heading">
373
- <a href="LSI.src/M000031.html" target="Code" class="method-signature"
374
- onclick="popupCode('LSI.src/M000031.html');return false;">
375
- <span class="method-name">highest_relative_content</span><span class="method-args">( max_chunks=10 )</span>
376
- </a>
377
- </div>
378
-
379
- <div class="method-description">
380
- <p>
381
- This method returns max_chunks entries, ordered by their average semantic
382
- rating. Essentially, the average distance of each entry from all other
383
- entries is calculated, the highest are returned.
384
- </p>
385
- <p>
386
- This can be used to build a summary service, or to provide more information
387
- about your dataset&#8217;s general content. For example, if you were to use
388
- categorize on the results of this data, you could gather information on
389
- what your dataset is generally about.
390
- </p>
391
- </div>
392
- </div>
393
-
394
- <div id="method-M000028" class="method-detail">
395
- <a name="M000028"></a>
396
-
397
- <div class="method-heading">
398
- <a href="LSI.src/M000028.html" target="Code" class="method-signature"
399
- onclick="popupCode('LSI.src/M000028.html');return false;">
400
- <span class="method-name">items</span><span class="method-args">()</span>
401
- </a>
402
- </div>
403
-
404
- <div class="method-description">
405
- <p>
406
- Returns an array of items that are indexed.
407
- </p>
408
- </div>
409
- </div>
410
-
411
- <div id="method-M000023" class="method-detail">
412
- <a name="M000023"></a>
413
-
414
- <div class="method-heading">
415
- <a href="LSI.src/M000023.html" target="Code" class="method-signature"
416
- onclick="popupCode('LSI.src/M000023.html');return false;">
417
- <span class="method-name">needs_rebuild?</span><span class="method-args">()</span>
418
- </a>
419
- </div>
420
-
421
- <div class="method-description">
422
- <p>
423
- Returns true if the index needs to be rebuilt. The index needs to be built
424
- after all information is added, but before you start using it for search,
425
- classification and cluster detection.
426
- </p>
427
- </div>
428
- </div>
429
-
430
- <div id="method-M000032" class="method-detail">
431
- <a name="M000032"></a>
432
-
433
- <div class="method-heading">
434
- <a href="LSI.src/M000032.html" target="Code" class="method-signature"
435
- onclick="popupCode('LSI.src/M000032.html');return false;">
436
- <span class="method-name">proximity_array_for_content</span><span class="method-args">( doc, &amp;block )</span>
437
- </a>
438
- </div>
439
-
440
- <div class="method-description">
441
- <p>
442
- This function is the primitive that <a
443
- href="LSI.html#M000035">find_related</a> and classify build upon. It
444
- returns an array of 2-element arrays. The first element of this array is a
445
- document, and the second is its &quot;score&quot;, defining how
446
- &quot;close&quot; it is to other indexed items.
447
- </p>
448
- <p>
449
- These values are somewhat arbitrary, having to do with the vector space
450
- created by your content, so the magnitude is interpretable but not always
451
- meaningful between indexes.
452
- </p>
453
- <p>
454
- The parameter doc is the content to compare. If that content is not
455
- indexed, you can pass an optional block to define how to create the text
456
- data. See <a href="LSI.html#M000024">add_item</a> for examples of how this
457
- works.
458
- </p>
459
- </div>
460
- </div>
461
-
462
- <div id="method-M000033" class="method-detail">
463
- <a name="M000033"></a>
464
-
465
- <div class="method-heading">
466
- <a href="LSI.src/M000033.html" target="Code" class="method-signature"
467
- onclick="popupCode('LSI.src/M000033.html');return false;">
468
- <span class="method-name">proximity_norms_for_content</span><span class="method-args">( doc, &amp;block )</span>
469
- </a>
470
- </div>
471
-
472
- <div class="method-description">
473
- <p>
474
- Similar to <a href="LSI.html#M000032">proximity_array_for_content</a>, this
475
- function takes similar arguments and returns a similar array. However, it
476
- uses the normalized calculated vectors instead of their full versions. This
477
- is useful when you&#8217;re trying to perform operations on content that is
478
- much smaller than the text you&#8217;re working with. search uses this
479
- primitive.
480
- </p>
481
- </div>
482
- </div>
483
-
484
- <div id="method-M000027" class="method-detail">
485
- <a name="M000027"></a>
486
-
487
- <div class="method-heading">
488
- <a href="LSI.src/M000027.html" target="Code" class="method-signature"
489
- onclick="popupCode('LSI.src/M000027.html');return false;">
490
- <span class="method-name">remove_item</span><span class="method-args">( item )</span>
491
- </a>
492
- </div>
493
-
494
- <div class="method-description">
495
- <p>
496
- Removes an item from the database, if it is indexed.
497
- </p>
498
- </div>
499
- </div>
500
-
501
- <div id="method-M000034" class="method-detail">
502
- <a name="M000034"></a>
503
-
504
- <div class="method-heading">
505
- <a href="LSI.src/M000034.html" target="Code" class="method-signature"
506
- onclick="popupCode('LSI.src/M000034.html');return false;">
507
- <span class="method-name">search</span><span class="method-args">( string, max_nearest=3 )</span>
508
- </a>
509
- </div>
510
-
511
- <div class="method-description">
512
- <p>
513
- This function allows for text-based search of your index. Unlike other
514
- functions like <a href="LSI.html#M000035">find_related</a> and classify,
515
- search only takes short strings. It will also ignore factors like repeated
516
- words. It is best for short, google-like search terms. A search will first
517
- prioritize lexical relationships, then semantic ones.
518
- </p>
519
- <p>
520
- While this may seem backwards compared to the other functions that <a
521
- href="LSI.html">LSI</a> supports, it is actually the same algorithm, just
522
- applied on a smaller document.
523
- </p>
524
- </div>
525
- </div>
526
-
527
-
528
- </div>
529
-
530
-
531
- </div>
532
-
533
-
534
- <div id="validator-badges">
535
- <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p>
536
- </div>
537
-
538
- </body>
539
- </html>