classifier 1.1.1 → 1.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (77) hide show
  1. data/LICENSE +341 -0
  2. data/README +59 -6
  3. data/Rakefile +16 -4
  4. data/bin/bayes.rb +8 -2
  5. data/doc/classes/Classifier.html +15 -10
  6. data/doc/classes/Classifier/Bayes.html +68 -38
  7. data/doc/classes/Classifier/Bayes.src/{M000005.html → M000023.html} +1 -1
  8. data/doc/classes/Classifier/Bayes.src/{M000006.html → M000024.html} +1 -1
  9. data/doc/classes/Classifier/Bayes.src/M000025.html +30 -0
  10. data/doc/classes/Classifier/Bayes.src/{M000007.html → M000026.html} +1 -1
  11. data/doc/classes/Classifier/Bayes.src/{M000008.html → M000027.html} +1 -1
  12. data/doc/classes/Classifier/Bayes.src/{M000009.html → M000028.html} +4 -4
  13. data/doc/classes/Classifier/Bayes.src/{M000010.html → M000029.html} +2 -2
  14. data/doc/classes/Classifier/ContentNode.html +252 -0
  15. data/doc/classes/Classifier/ContentNode.src/M000031.html +21 -0
  16. data/doc/classes/Classifier/ContentNode.src/M000032.html +18 -0
  17. data/doc/classes/Classifier/ContentNode.src/M000033.html +18 -0
  18. data/doc/classes/Classifier/ContentNode.src/M000034.html +41 -0
  19. data/doc/classes/Classifier/LSI.html +449 -0
  20. data/doc/classes/Classifier/LSI.src/M000011.html +20 -0
  21. data/doc/classes/Classifier/LSI.src/M000012.html +18 -0
  22. data/doc/classes/Classifier/LSI.src/M000013.html +20 -0
  23. data/doc/classes/Classifier/LSI.src/M000014.html +18 -0
  24. data/doc/classes/Classifier/LSI.src/M000015.html +21 -0
  25. data/doc/classes/Classifier/LSI.src/M000016.html +18 -0
  26. data/doc/classes/Classifier/LSI.src/M000017.html +32 -0
  27. data/doc/classes/Classifier/LSI.src/M000018.html +26 -0
  28. data/doc/classes/Classifier/LSI.src/M000019.html +26 -0
  29. data/doc/classes/Classifier/LSI.src/M000020.html +23 -0
  30. data/doc/classes/Classifier/LSI.src/M000021.html +21 -0
  31. data/doc/classes/Classifier/LSI.src/M000022.html +31 -0
  32. data/doc/classes/Classifier/WordList.html +202 -0
  33. data/doc/classes/Classifier/WordList.src/M000007.html +18 -0
  34. data/doc/classes/Classifier/WordList.src/M000008.html +19 -0
  35. data/doc/classes/Classifier/WordList.src/M000009.html +19 -0
  36. data/doc/classes/Classifier/WordList.src/M000010.html +18 -0
  37. data/doc/classes/GSL.html +111 -0
  38. data/doc/classes/GSL/Vector.html +156 -0
  39. data/doc/classes/GSL/Vector.src/M000005.html +18 -0
  40. data/doc/classes/GSL/Vector.src/M000006.html +19 -0
  41. data/doc/classes/Object.html +139 -0
  42. data/doc/classes/Object.src/M000001.html +16 -0
  43. data/doc/classes/String.html +95 -9
  44. data/doc/classes/{Classifier/WordHash.src/M000001.html → String.src/M000002.html} +3 -3
  45. data/doc/classes/String.src/M000003.html +18 -0
  46. data/doc/classes/String.src/M000004.html +18 -0
  47. data/doc/created.rid +1 -1
  48. data/doc/files/README.html +102 -12
  49. data/doc/files/lib/classifier/bayes_rb.html +1 -1
  50. data/doc/files/lib/classifier/{string_extensions/porter_stemmer_rb.html → extensions/vector_serialize_rb.html} +4 -15
  51. data/doc/files/lib/classifier/{string_extensions → extensions}/word_hash_rb.html +2 -2
  52. data/doc/files/lib/classifier/extensions/word_list_rb.html +115 -0
  53. data/doc/files/lib/classifier/lsi/content_node_rb.html +115 -0
  54. data/doc/files/lib/classifier/lsi_rb.html +125 -0
  55. data/doc/files/lib/classifier/string_extensions_rb.html +2 -3
  56. data/doc/files/lib/classifier_rb.html +3 -1
  57. data/doc/fr_class_index.html +6 -2
  58. data/doc/fr_file_index.html +5 -2
  59. data/doc/fr_method_index.html +34 -11
  60. data/lib/classifier.rb +3 -1
  61. data/lib/classifier/bayes.rb +34 -9
  62. data/lib/classifier/extensions/vector_serialize.rb +14 -0
  63. data/lib/classifier/extensions/word_hash.rb +125 -0
  64. data/lib/classifier/extensions/word_list.rb +31 -0
  65. data/lib/classifier/lsi.rb +248 -0
  66. data/lib/classifier/lsi/content_node.rb +67 -0
  67. data/lib/classifier/string_extensions.rb +10 -5
  68. data/test/bayes/bayesian_test.rb +2 -2
  69. data/test/lsi/lsi_test.rb +88 -0
  70. data/test/string_extensions/word_hash_test.rb +7 -5
  71. metadata +79 -24
  72. data/doc/classes/Classifier/Stemmable.html +0 -243
  73. data/doc/classes/Classifier/Stemmable.src/M000003.html +0 -102
  74. data/doc/classes/Classifier/WordHash.html +0 -178
  75. data/doc/classes/Classifier/WordHash.src/M000002.html +0 -28
  76. data/lib/classifier/string_extensions/porter_stemmer.rb +0 -199
  77. data/lib/classifier/string_extensions/word_hash.rb +0 -119
@@ -0,0 +1,21 @@
1
+ <?xml version="1.0" encoding="iso-8859-1"?>
2
+ <!DOCTYPE html
3
+ PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
4
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
5
+
6
+ <html>
7
+ <head>
8
+ <title>new (Classifier::ContentNode)</title>
9
+ <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
10
+ <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" />
11
+ </head>
12
+ <body class="standalone-code">
13
+ <pre><span class="ruby-comment cmt"># File lib/classifier/lsi/content_node.rb, line 19</span>
14
+ <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">initialize</span>( <span class="ruby-identifier">source</span>, <span class="ruby-identifier">categories</span>=<span class="ruby-keyword kw">nil</span>, <span class="ruby-identifier">text_proc</span>=<span class="ruby-keyword kw">nil</span> )
15
+ <span class="ruby-identifier">text_proc</span> = <span class="ruby-identifier">text_proc</span> <span class="ruby-operator">||</span> (<span class="ruby-identifier">proc</span> {<span class="ruby-operator">|</span><span class="ruby-identifier">x</span><span class="ruby-operator">|</span> <span class="ruby-identifier">x</span>.<span class="ruby-identifier">to_s</span>})
16
+ <span class="ruby-ivar">@categories</span> = <span class="ruby-identifier">categories</span> <span class="ruby-operator">||</span> []
17
+ <span class="ruby-ivar">@source</span> = <span class="ruby-identifier">source</span>
18
+ <span class="ruby-ivar">@word_hash</span> = <span class="ruby-identifier">text_proc</span>.<span class="ruby-identifier">call</span>( <span class="ruby-ivar">@source</span> ).<span class="ruby-identifier">clean_word_hash</span>
19
+ <span class="ruby-keyword kw">end</span></pre>
20
+ </body>
21
+ </html>
@@ -0,0 +1,18 @@
1
+ <?xml version="1.0" encoding="iso-8859-1"?>
2
+ <!DOCTYPE html
3
+ PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
4
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
5
+
6
+ <html>
7
+ <head>
8
+ <title>search_vector (Classifier::ContentNode)</title>
9
+ <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
10
+ <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" />
11
+ </head>
12
+ <body class="standalone-code">
13
+ <pre><span class="ruby-comment cmt"># File lib/classifier/lsi/content_node.rb, line 27</span>
14
+ <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">search_vector</span>
15
+ <span class="ruby-ivar">@lsi_vector</span> <span class="ruby-operator">||</span> <span class="ruby-ivar">@raw_vector</span>
16
+ <span class="ruby-keyword kw">end</span></pre>
17
+ </body>
18
+ </html>
@@ -0,0 +1,18 @@
1
+ <?xml version="1.0" encoding="iso-8859-1"?>
2
+ <!DOCTYPE html
3
+ PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
4
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
5
+
6
+ <html>
7
+ <head>
8
+ <title>search_norm (Classifier::ContentNode)</title>
9
+ <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
10
+ <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" />
11
+ </head>
12
+ <body class="standalone-code">
13
+ <pre><span class="ruby-comment cmt"># File lib/classifier/lsi/content_node.rb, line 32</span>
14
+ <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">search_norm</span>
15
+ <span class="ruby-ivar">@lsi_norm</span> <span class="ruby-operator">||</span> <span class="ruby-ivar">@raw_norm</span>
16
+ <span class="ruby-keyword kw">end</span></pre>
17
+ </body>
18
+ </html>
@@ -0,0 +1,41 @@
1
+ <?xml version="1.0" encoding="iso-8859-1"?>
2
+ <!DOCTYPE html
3
+ PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
4
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
5
+
6
+ <html>
7
+ <head>
8
+ <title>raw_vector_with (Classifier::ContentNode)</title>
9
+ <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
10
+ <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" />
11
+ </head>
12
+ <body class="standalone-code">
13
+ <pre><span class="ruby-comment cmt"># File lib/classifier/lsi/content_node.rb, line 38</span>
14
+ <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">raw_vector_with</span>( <span class="ruby-identifier">word_list</span> )
15
+ <span class="ruby-identifier">vec</span> = <span class="ruby-constant">Array</span>.<span class="ruby-identifier">new</span>(<span class="ruby-identifier">word_list</span>.<span class="ruby-identifier">size</span>, <span class="ruby-value">0</span>)
16
+
17
+ <span class="ruby-ivar">@word_hash</span>.<span class="ruby-identifier">each_key</span> <span class="ruby-keyword kw">do</span> <span class="ruby-operator">|</span><span class="ruby-identifier">word</span><span class="ruby-operator">|</span>
18
+ <span class="ruby-identifier">vec</span>[<span class="ruby-identifier">word_list</span>[<span class="ruby-identifier">word</span>]] = <span class="ruby-ivar">@word_hash</span>[<span class="ruby-identifier">word</span>] <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">word_list</span>[<span class="ruby-identifier">word</span>]
19
+ <span class="ruby-keyword kw">end</span>
20
+
21
+ <span class="ruby-comment cmt"># Perform the scaling transform</span>
22
+ <span class="ruby-identifier">total_words</span> = <span class="ruby-identifier">vec</span>.<span class="ruby-identifier">inject</span>(<span class="ruby-value">0</span>) { <span class="ruby-operator">|</span><span class="ruby-identifier">sum</span>,<span class="ruby-identifier">term</span><span class="ruby-operator">|</span> <span class="ruby-identifier">sum</span> <span class="ruby-operator">+=</span> <span class="ruby-identifier">term</span> }.<span class="ruby-identifier">to_f</span>
23
+
24
+ <span class="ruby-comment cmt"># Perform first-order association transform if this vector has more</span>
25
+ <span class="ruby-comment cmt"># than one word in it. </span>
26
+ <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">total_words</span> <span class="ruby-operator">&gt;</span> <span class="ruby-value">1.0</span>
27
+ <span class="ruby-identifier">weighted_total</span> = <span class="ruby-identifier">vec</span>.<span class="ruby-identifier">inject</span>(<span class="ruby-value">0</span><span class="ruby-value">.0</span>) <span class="ruby-keyword kw">do</span> <span class="ruby-operator">|</span><span class="ruby-identifier">sum</span>,<span class="ruby-identifier">term</span><span class="ruby-operator">|</span>
28
+ <span class="ruby-keyword kw">if</span>( <span class="ruby-identifier">term</span> <span class="ruby-operator">&gt;</span> <span class="ruby-value">0</span> )
29
+ <span class="ruby-identifier">sum</span> <span class="ruby-operator">+=</span> (( <span class="ruby-identifier">term</span> <span class="ruby-operator">/</span> <span class="ruby-identifier">total_words</span> ) <span class="ruby-operator">*</span> <span class="ruby-constant">Math</span>.<span class="ruby-identifier">log</span>( <span class="ruby-identifier">term</span> <span class="ruby-operator">/</span> <span class="ruby-identifier">total_words</span> ))
30
+ <span class="ruby-keyword kw">else</span>
31
+ <span class="ruby-identifier">sum</span>
32
+ <span class="ruby-keyword kw">end</span>
33
+ <span class="ruby-keyword kw">end</span>
34
+ <span class="ruby-identifier">vec</span>.<span class="ruby-identifier">map!</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">val</span><span class="ruby-operator">|</span> <span class="ruby-constant">Math</span>.<span class="ruby-identifier">log</span>( <span class="ruby-identifier">val</span> <span class="ruby-operator">+</span> <span class="ruby-value">1</span> ) <span class="ruby-operator">/</span> <span class="ruby-operator">-</span><span class="ruby-identifier">weighted_total</span> }
35
+ <span class="ruby-keyword kw">end</span>
36
+
37
+ <span class="ruby-ivar">@raw_norm</span> = <span class="ruby-constant">GSL</span><span class="ruby-operator">::</span><span class="ruby-constant">Vector</span>.<span class="ruby-identifier">new</span>( <span class="ruby-identifier">vec</span> ).<span class="ruby-identifier">normalize</span>
38
+ <span class="ruby-ivar">@raw_vector</span> = <span class="ruby-constant">GSL</span><span class="ruby-operator">::</span><span class="ruby-constant">Vector</span>.<span class="ruby-identifier">new</span>( <span class="ruby-identifier">vec</span> )
39
+ <span class="ruby-keyword kw">end</span></pre>
40
+ </body>
41
+ </html>
@@ -0,0 +1,449 @@
1
+ <?xml version="1.0" encoding="iso-8859-1"?>
2
+ <!DOCTYPE html
3
+ PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
4
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
5
+
6
+ <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
7
+ <head>
8
+ <title>Class: Classifier::LSI</title>
9
+ <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
10
+ <meta http-equiv="Content-Script-Type" content="text/javascript" />
11
+ <link rel="stylesheet" href="../.././rdoc-style.css" type="text/css" media="screen" />
12
+ <script type="text/javascript">
13
+ // <![CDATA[
14
+
15
+ function popupCode( url ) {
16
+ window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400")
17
+ }
18
+
19
+ function toggleCode( id ) {
20
+ if ( document.getElementById )
21
+ elem = document.getElementById( id );
22
+ else if ( document.all )
23
+ elem = eval( "document.all." + id );
24
+ else
25
+ return false;
26
+
27
+ elemStyle = elem.style;
28
+
29
+ if ( elemStyle.display != "block" ) {
30
+ elemStyle.display = "block"
31
+ } else {
32
+ elemStyle.display = "none"
33
+ }
34
+
35
+ return true;
36
+ }
37
+
38
+ // Make codeblocks hidden by default
39
+ document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }</style>" )
40
+
41
+ // ]]>
42
+ </script>
43
+
44
+ </head>
45
+ <body>
46
+
47
+
48
+
49
+ <div id="classHeader">
50
+ <table class="header-table">
51
+ <tr class="top-aligned-row">
52
+ <td><strong>Class</strong></td>
53
+ <td class="class-name-in-header">Classifier::LSI</td>
54
+ </tr>
55
+ <tr class="top-aligned-row">
56
+ <td><strong>In:</strong></td>
57
+ <td>
58
+ <a href="../../files/lib/classifier/lsi_rb.html">
59
+ lib/classifier/lsi.rb
60
+ </a>
61
+ <br />
62
+ </td>
63
+ </tr>
64
+
65
+ <tr class="top-aligned-row">
66
+ <td><strong>Parent:</strong></td>
67
+ <td>
68
+ <a href="../Object.html">
69
+ Object
70
+ </a>
71
+ </td>
72
+ </tr>
73
+ </table>
74
+ </div>
75
+ <!-- banner header -->
76
+
77
+ <div id="bodyContent">
78
+
79
+
80
+
81
+ <div id="contextContent">
82
+
83
+ <div id="description">
84
+ <p>
85
+ This class implements a Latent Semantic Indexer, which can search, classify
86
+ and cluster data based on underlying semantic relations. For more
87
+ information on the algorithms used, please consult <a
88
+ href="http://en.wikipedia.org/wiki/Latent_Semantic_Indexing">Wikipedia</a>.
89
+ </p>
90
+
91
+ </div>
92
+
93
+
94
+ </div>
95
+
96
+ <div id="method-list">
97
+ <h3 class="section-bar">Methods</h3>
98
+
99
+ <div class="name-list">
100
+ <a href="#M000014">&lt;&lt;</a>&nbsp;&nbsp;
101
+ <a href="#M000013">add_item</a>&nbsp;&nbsp;
102
+ <a href="#M000017">build_index</a>&nbsp;&nbsp;
103
+ <a href="#M000022">classify</a>&nbsp;&nbsp;
104
+ <a href="#M000021">find_related</a>&nbsp;&nbsp;
105
+ <a href="#M000016">items</a>&nbsp;&nbsp;
106
+ <a href="#M000012">needs_rebuild?</a>&nbsp;&nbsp;
107
+ <a href="#M000011">new</a>&nbsp;&nbsp;
108
+ <a href="#M000018">proximity_array_for_content</a>&nbsp;&nbsp;
109
+ <a href="#M000019">proximity_norms_for_content</a>&nbsp;&nbsp;
110
+ <a href="#M000015">remove_item</a>&nbsp;&nbsp;
111
+ <a href="#M000020">search</a>&nbsp;&nbsp;
112
+ </div>
113
+ </div>
114
+
115
+ </div>
116
+
117
+
118
+ <!-- if includes -->
119
+
120
+ <div id="section">
121
+
122
+
123
+
124
+
125
+
126
+ <div id="attribute-list">
127
+ <h3 class="section-bar">Attributes</h3>
128
+
129
+ <div class="name-list">
130
+ <table>
131
+ <tr class="top-aligned-row context-row">
132
+ <td class="context-item-name">word_list</td>
133
+ <td class="context-item-value">&nbsp;[R]&nbsp;</td>
134
+ <td class="context-item-desc"></td>
135
+ </tr>
136
+ </table>
137
+ </div>
138
+ </div>
139
+
140
+
141
+
142
+ <!-- if method_list -->
143
+ <div id="methods">
144
+ <h3 class="section-bar">Public Class methods</h3>
145
+
146
+ <div id="method-M000011" class="method-detail">
147
+ <a name="M000011"></a>
148
+
149
+ <div class="method-heading">
150
+ <a href="LSI.src/M000011.html" target="Code" class="method-signature"
151
+ onclick="popupCode('LSI.src/M000011.html');return false;">
152
+ <span class="method-name">new</span><span class="method-args">(options = {})</span>
153
+ </a>
154
+ </div>
155
+
156
+ <div class="method-description">
157
+ <p>
158
+ Create a fresh index. If you want to call <a
159
+ href="LSI.html#M000017">build_index</a> manually, use
160
+ </p>
161
+ <pre>
162
+ Classifier::LSI.new :auto_rebuild =&gt; false
163
+ </pre>
164
+ </div>
165
+ </div>
166
+
167
+ <h3 class="section-bar">Public Instance methods</h3>
168
+
169
+ <div id="method-M000014" class="method-detail">
170
+ <a name="M000014"></a>
171
+
172
+ <div class="method-heading">
173
+ <a href="LSI.src/M000014.html" target="Code" class="method-signature"
174
+ onclick="popupCode('LSI.src/M000014.html');return false;">
175
+ <span class="method-name">&lt;&lt;</span><span class="method-args">( item )</span>
176
+ </a>
177
+ </div>
178
+
179
+ <div class="method-description">
180
+ <p>
181
+ A less flexible shorthand for <a href="LSI.html#M000013">add_item</a> that
182
+ assumes you are passing in a string with no categories. item will be duck
183
+ typed via to_s .
184
+ </p>
185
+ </div>
186
+ </div>
187
+
188
+ <div id="method-M000013" class="method-detail">
189
+ <a name="M000013"></a>
190
+
191
+ <div class="method-heading">
192
+ <a href="LSI.src/M000013.html" target="Code" class="method-signature"
193
+ onclick="popupCode('LSI.src/M000013.html');return false;">
194
+ <span class="method-name">add_item</span><span class="method-args">( item, *categories, &amp;block )</span>
195
+ </a>
196
+ </div>
197
+
198
+ <div class="method-description">
199
+ <p>
200
+ Adds an item to the index. item is assumed to be a string, but any item may
201
+ be indexed so long as it responds to to_s or if you provide an optional
202
+ block explaining how the indexer can fetch fresh string data. This optional
203
+ block is passed the item, so the item may only be a reference to a URL or
204
+ file name.
205
+ </p>
206
+ <p>
207
+ For example:
208
+ </p>
209
+ <pre>
210
+ lsi = Classifier::LSI.new
211
+ lsi.add_item &quot;This is just plain text&quot;
212
+ lsi.add_item &quot;/home/me/filename.txt&quot; { |x| File.read x }
213
+ ar = ActiveRecordObject.find( :all )
214
+ lsi.add_item ar, *ar.categories { |x| ar.content }
215
+ </pre>
216
+ </div>
217
+ </div>
218
+
219
+ <div id="method-M000017" class="method-detail">
220
+ <a name="M000017"></a>
221
+
222
+ <div class="method-heading">
223
+ <a href="LSI.src/M000017.html" target="Code" class="method-signature"
224
+ onclick="popupCode('LSI.src/M000017.html');return false;">
225
+ <span class="method-name">build_index</span><span class="method-args">( cutoff=0.75 )</span>
226
+ </a>
227
+ </div>
228
+
229
+ <div class="method-description">
230
+ <p>
231
+ This function rebuilds the index if <a
232
+ href="LSI.html#M000012">needs_rebuild?</a> returns true. For very large
233
+ document spaces, this indexing operation may take some time to complete, so
234
+ it may be wise to place the operation in another thread.
235
+ </p>
236
+ <p>
237
+ As a rule, indexing will be fairly swift on modern machines until you have
238
+ well over 500 documents indexed, or have an incredibly diverse vocabulary
239
+ for your documents.
240
+ </p>
241
+ <p>
242
+ The optional parameter &quot;cutoff&quot; is a tuning parameter. When the
243
+ index is built, a certain number of s-values are discarded from the system.
244
+ The cutoff parameter tells the indexer how many of these values to keep. A
245
+ value of 1 for cutoff means that no semantic analysis will take place,
246
+ turning the <a href="LSI.html">LSI</a> class into a simple vector search
247
+ engine.
248
+ </p>
249
+ </div>
250
+ </div>
251
+
252
+ <div id="method-M000022" class="method-detail">
253
+ <a name="M000022"></a>
254
+
255
+ <div class="method-heading">
256
+ <a href="LSI.src/M000022.html" target="Code" class="method-signature"
257
+ onclick="popupCode('LSI.src/M000022.html');return false;">
258
+ <span class="method-name">classify</span><span class="method-args">( doc, cutoff=0.30, &amp;block )</span>
259
+ </a>
260
+ </div>
261
+
262
+ <div class="method-description">
263
+ <p>
264
+ This function uses a voting system to categorize documents, based on the
265
+ categories of other documents. It uses the same logic as the <a
266
+ href="LSI.html#M000021">find_related</a> function to find related
267
+ documents, then returns the most obvious category from this list.
268
+ </p>
269
+ <p>
270
+ cutoff signifies the number of documents to consider when classifying text.
271
+ A cutoff of 1 means that every document in the index votes on what category
272
+ the document is in. This may not always make sense.
273
+ </p>
274
+ </div>
275
+ </div>
276
+
277
+ <div id="method-M000021" class="method-detail">
278
+ <a name="M000021"></a>
279
+
280
+ <div class="method-heading">
281
+ <a href="LSI.src/M000021.html" target="Code" class="method-signature"
282
+ onclick="popupCode('LSI.src/M000021.html');return false;">
283
+ <span class="method-name">find_related</span><span class="method-args">( doc, max_nearest=3, &amp;block )</span>
284
+ </a>
285
+ </div>
286
+
287
+ <div class="method-description">
288
+ <p>
289
+ This function takes content and finds other documents that are semantically
290
+ &quot;close&quot;, returning an array of documents sorted from most to
291
+ least relevant. max_nearest specifies the number of documents to return. A
292
+ value of 0 means that it returns all the indexed documents, sorted by
293
+ relevance.
294
+ </p>
295
+ <p>
296
+ This is particularly useful for identifying clusters in your document space.
297
+ For example you may want to identify several &quot;What&#8217;s
298
+ Related&quot; items for weblog articles, or find paragraphs that relate to
299
+ each other in an essay.
300
+ </p>
301
+ </div>
302
+ </div>
303
+
304
+ <div id="method-M000016" class="method-detail">
305
+ <a name="M000016"></a>
306
+
307
+ <div class="method-heading">
308
+ <a href="LSI.src/M000016.html" target="Code" class="method-signature"
309
+ onclick="popupCode('LSI.src/M000016.html');return false;">
310
+ <span class="method-name">items</span><span class="method-args">()</span>
311
+ </a>
312
+ </div>
313
+
314
+ <div class="method-description">
315
+ <p>
316
+ Returns an array of items that are indexed.
317
+ </p>
318
+ </div>
319
+ </div>
320
+
321
+ <div id="method-M000012" class="method-detail">
322
+ <a name="M000012"></a>
323
+
324
+ <div class="method-heading">
325
+ <a href="LSI.src/M000012.html" target="Code" class="method-signature"
326
+ onclick="popupCode('LSI.src/M000012.html');return false;">
327
+ <span class="method-name">needs_rebuild?</span><span class="method-args">()</span>
328
+ </a>
329
+ </div>
330
+
331
+ <div class="method-description">
332
+ <p>
333
+ Returns true if the index needs to be rebuilt. The index needs to be built
334
+ after all information is added, but before you start using it for search,
335
+ classification and cluster detection.
336
+ </p>
337
+ </div>
338
+ </div>
339
+
340
+ <div id="method-M000018" class="method-detail">
341
+ <a name="M000018"></a>
342
+
343
+ <div class="method-heading">
344
+ <a href="LSI.src/M000018.html" target="Code" class="method-signature"
345
+ onclick="popupCode('LSI.src/M000018.html');return false;">
346
+ <span class="method-name">proximity_array_for_content</span><span class="method-args">( doc, &amp;block )</span>
347
+ </a>
348
+ </div>
349
+
350
+ <div class="method-description">
351
+ <p>
352
+ This function is the primitive that <a
353
+ href="LSI.html#M000021">find_related</a> and classify build upon. It
354
+ returns an array of 2-element arrays. The first element of this array is a
355
+ document, and the second is its &quot;score&quot;, defining how
356
+ &quot;close&quot; it is to other indexed items.
357
+ </p>
358
+ <p>
359
+ These values are somewhat arbitrary, having to do with the vector space
360
+ created by your content, so the magnitude is interpretable but not always
361
+ meaningful between indexes.
362
+ </p>
363
+ <p>
364
+ The parameter doc is the content to compare. If that content is not
365
+ indexed, you can pass an optional block to define how to create the text
366
+ data. See <a href="LSI.html#M000013">add_item</a> for examples of how this
367
+ works.
368
+ </p>
369
+ </div>
370
+ </div>
371
+
372
+ <div id="method-M000019" class="method-detail">
373
+ <a name="M000019"></a>
374
+
375
+ <div class="method-heading">
376
+ <a href="LSI.src/M000019.html" target="Code" class="method-signature"
377
+ onclick="popupCode('LSI.src/M000019.html');return false;">
378
+ <span class="method-name">proximity_norms_for_content</span><span class="method-args">( doc, &amp;block )</span>
379
+ </a>
380
+ </div>
381
+
382
+ <div class="method-description">
383
+ <p>
384
+ Similar to <a href="LSI.html#M000018">proximity_array_for_content</a>, this
385
+ function takes similar arguments and returns a similar array. However, it
386
+ uses the normalized calculated vectors instead of their full versions. This
387
+ is useful when you&#8217;re trying to perform operations on content that is
388
+ much smaller than the text you&#8217;re working with. search uses this
389
+ primitive.
390
+ </p>
391
+ </div>
392
+ </div>
393
+
394
+ <div id="method-M000015" class="method-detail">
395
+ <a name="M000015"></a>
396
+
397
+ <div class="method-heading">
398
+ <a href="LSI.src/M000015.html" target="Code" class="method-signature"
399
+ onclick="popupCode('LSI.src/M000015.html');return false;">
400
+ <span class="method-name">remove_item</span><span class="method-args">( item )</span>
401
+ </a>
402
+ </div>
403
+
404
+ <div class="method-description">
405
+ <p>
406
+ Removes an item from the database, if it is indexed.
407
+ </p>
408
+ </div>
409
+ </div>
410
+
411
+ <div id="method-M000020" class="method-detail">
412
+ <a name="M000020"></a>
413
+
414
+ <div class="method-heading">
415
+ <a href="LSI.src/M000020.html" target="Code" class="method-signature"
416
+ onclick="popupCode('LSI.src/M000020.html');return false;">
417
+ <span class="method-name">search</span><span class="method-args">( string, max_nearest=3 )</span>
418
+ </a>
419
+ </div>
420
+
421
+ <div class="method-description">
422
+ <p>
423
+ This function allows for text-based search of your index. Unlike other
424
+ functions like <a href="LSI.html#M000021">find_related</a> and classify,
425
+ search only takes short strings. It will also ignore factors like repeated
426
+ words. It is best for short, google-like search terms. A search will first
427
+ prioritize lexical relationships, then semantic ones.
428
+ </p>
429
+ <p>
430
+ While this may seem backwards compared to the other functions that <a
431
+ href="LSI.html">LSI</a> supports, it is actually the same algorithm, just
432
+ applied on a smaller document.
433
+ </p>
434
+ </div>
435
+ </div>
436
+
437
+
438
+ </div>
439
+
440
+
441
+ </div>
442
+
443
+
444
+ <div id="validator-badges">
445
+ <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p>
446
+ </div>
447
+
448
+ </body>
449
+ </html>