RubyGems - classifier - Versions diffs - 1.2.0 → 1.3.0 - Mend

classifier 1.2.0 → 1.3.0

Files changed (96) hide show

data/LICENSE +361 -273
data/README +6 -5
data/Rakefile +12 -2
data/bin/summarize.rb +11 -0
data/doc/classes/Array.html +139 -0
data/doc/classes/Array.src/M000003.html +18 -0
data/doc/classes/Classifier.html +5 -5
data/doc/classes/Classifier/Bayes.html +43 -43
data/doc/classes/Classifier/Bayes.src/{M000023.html → M000038.html} +0 -0
data/doc/classes/Classifier/Bayes.src/{M000024.html → M000039.html} +0 -0
data/doc/classes/Classifier/Bayes.src/{M000025.html → M000040.html} +0 -0
data/doc/classes/Classifier/Bayes.src/{M000026.html → M000041.html} +0 -0
data/doc/classes/Classifier/Bayes.src/{M000027.html → M000042.html} +0 -0
data/doc/classes/Classifier/Bayes.src/{M000028.html → M000043.html} +0 -0
data/doc/classes/Classifier/Bayes.src/{M000029.html → M000044.html} +0 -0
data/doc/classes/Classifier/ContentNode.html +23 -28
data/doc/classes/Classifier/ContentNode.src/M000046.html +19 -0
data/doc/classes/Classifier/ContentNode.src/{M000032.html → M000047.html} +1 -1
data/doc/classes/Classifier/ContentNode.src/{M000033.html → M000048.html} +1 -1
data/doc/classes/Classifier/ContentNode.src/M000049.html +49 -0
data/doc/classes/Classifier/LSI.html +158 -68
data/doc/classes/Classifier/LSI.src/M000022.html +6 -17
data/doc/classes/Classifier/LSI.src/{M000012.html → M000023.html} +2 -2
data/doc/classes/Classifier/LSI.src/{M000013.html → M000024.html} +3 -2
data/doc/classes/Classifier/LSI.src/{M000014.html → M000025.html} +1 -1
data/doc/classes/Classifier/LSI.src/M000026.html +19 -0
data/doc/classes/Classifier/LSI.src/{M000015.html → M000027.html} +1 -1
data/doc/classes/Classifier/LSI.src/{M000016.html → M000028.html} +1 -1
data/doc/classes/Classifier/LSI.src/M000029.html +19 -0
data/doc/classes/Classifier/LSI.src/M000030.html +43 -0
data/doc/classes/Classifier/LSI.src/M000031.html +23 -0
data/doc/classes/Classifier/LSI.src/{M000018.html → M000032.html} +7 -3
data/doc/classes/Classifier/LSI.src/{M000019.html → M000033.html} +6 -2
data/doc/classes/Classifier/LSI.src/{M000020.html → M000034.html} +2 -4
data/doc/classes/Classifier/LSI.src/{M000021.html → M000035.html} +1 -1
data/doc/classes/Classifier/LSI.src/M000036.html +31 -0
data/doc/classes/Classifier/LSI.src/M000037.html +21 -0
data/doc/classes/Classifier/WordList.html +37 -22
data/doc/classes/Classifier/WordList.src/{M000007.html → M000017.html} +2 -2
data/doc/classes/Classifier/WordList.src/{M000008.html → M000018.html} +1 -1
data/doc/classes/Classifier/WordList.src/{M000009.html → M000019.html} +1 -1
data/doc/classes/Classifier/WordList.src/M000020.html +18 -0
data/doc/classes/Classifier/WordList.src/{M000010.html → M000021.html} +1 -1
data/doc/classes/GSL.html +2 -1
data/doc/classes/GSL/Matrix.html +126 -0
data/doc/classes/GSL/Vector.html +10 -10
data/doc/classes/GSL/Vector.src/{M000005.html → M000015.html} +0 -0
data/doc/classes/GSL/Vector.src/{M000006.html → M000016.html} +0 -0
data/doc/classes/Matrix.html +184 -0
data/doc/classes/Matrix.src/M000004.html +18 -0
data/doc/classes/Matrix.src/M000005.html +76 -0
data/doc/classes/Matrix.src/M000006.html +18 -0
data/doc/classes/Object.html +7 -7
data/doc/classes/Object.src/{M000001.html → M000007.html} +1 -1
data/doc/classes/String.html +90 -20
data/doc/classes/String.src/{M000002.html → M000008.html} +0 -0
data/doc/classes/String.src/{M000003.html → M000009.html} +0 -0
data/doc/classes/String.src/{M000004.html → M000010.html} +0 -0
data/doc/classes/String.src/M000011.html +18 -0
data/doc/classes/String.src/M000012.html +18 -0
data/doc/classes/String.src/M000013.html +18 -0
data/doc/classes/String.src/M000014.html +18 -0
data/doc/classes/Vector.html +154 -0
data/doc/classes/Vector.src/M000001.html +22 -0
data/doc/classes/Vector.src/M000002.html +25 -0
data/doc/created.rid +1 -1
data/doc/files/README.html +14 -8
data/doc/files/lib/classifier/bayes_rb.html +1 -1
data/doc/files/lib/classifier/{string_extensions_rb.html → extensions/string_rb.html} +5 -5
data/doc/files/lib/classifier/extensions/vector_rb.html +120 -0
data/doc/files/lib/classifier/extensions/vector_serialize_rb.html +1 -1
data/doc/files/lib/classifier/extensions/word_hash_rb.html +1 -1
data/doc/files/lib/classifier/lsi/content_node_rb.html +2 -2
data/doc/files/lib/classifier/lsi/summary_rb.html +115 -0
data/doc/files/lib/classifier/{extensions → lsi}/word_list_rb.html +3 -3
data/doc/files/lib/classifier/lsi_rb.html +5 -3
data/doc/files/lib/classifier_rb.html +2 -2
data/doc/fr_class_index.html +4 -0
data/doc/fr_file_index.html +4 -2
data/doc/fr_method_index.html +49 -34
data/doc/index.html +2 -2
data/lib/classifier.rb +1 -1
data/lib/classifier/{string_extensions.rb → extensions/string.rb} +0 -0
data/lib/classifier/extensions/vector.rb +106 -0
data/lib/classifier/extensions/vector_serialize.rb +6 -0
data/lib/classifier/lsi.rb +101 -31
data/lib/classifier/lsi/content_node.rb +28 -23
data/lib/classifier/lsi/summary.rb +31 -0
data/lib/classifier/{extensions → lsi}/word_list.rb +7 -2
data/test/{string_extensions → extensions}/word_hash_test.rb +0 -0
data/test/lsi/lsi_test.rb +36 -1
metadata +68 -41
data/doc/classes/Classifier/ContentNode.src/M000031.html +0 -21
data/doc/classes/Classifier/ContentNode.src/M000034.html +0 -41
data/doc/classes/Classifier/LSI.src/M000011.html +0 -20
data/doc/classes/Classifier/LSI.src/M000017.html +0 -32

data/doc/files/lib/classifier/lsi_rb.html CHANGED Viewed

@@ -56,7 +56,7 @@
     </tr>
     <tr class="top-aligned-row">
       <td><strong>Last Update:</strong></td>
-      <td>Sun Apr 24 21:34:06 PDT 2005</td>
+      <td>Thu May 05 01:50:06 PDT 2005</td>
     </tr>
     </table>
   </div>
@@ -76,7 +76,7 @@
 <tr><td valign="top">Copyright:</td><td>Copyright &#169; 2005 David Fayram II
 </td></tr>
-<tr><td valign="top">License:</td><td>GPL
+<tr><td valign="top">License:</td><td>LGPL
 </td></tr>
 </table>
@@ -88,9 +88,11 @@
       <div class="name-list">
       gsl&nbsp;&nbsp;
-      classifier/extensions/word_list&nbsp;&nbsp;
       classifier/extensions/vector_serialize&nbsp;&nbsp;
+      classifier/extensions/vector&nbsp;&nbsp;
+      classifier/lsi/word_list&nbsp;&nbsp;
       classifier/lsi/content_node&nbsp;&nbsp;
+      classifier/lsi/summary&nbsp;&nbsp;
       </div>
     </div>

data/doc/files/lib/classifier_rb.html CHANGED Viewed

@@ -56,7 +56,7 @@
     </tr>
     <tr class="top-aligned-row">
       <td><strong>Last Update:</strong></td>
-      <td>Sun Apr 24 02:08:49 PDT 2005</td>
+      <td>Thu May 05 01:21:16 PDT 2005</td>
     </tr>
     </table>
   </div>
@@ -88,7 +88,7 @@
       <div class="name-list">
       rubygems&nbsp;&nbsp;
-      classifier/string_extensions&nbsp;&nbsp;
+      classifier/extensions/string&nbsp;&nbsp;
       classifier/bayes&nbsp;&nbsp;
       classifier/lsi&nbsp;&nbsp;
       </div>

data/doc/fr_class_index.html CHANGED Viewed

@@ -20,15 +20,19 @@
 <div id="index">
   <h1 class="section-bar">Classes</h1>
   <div id="index-entries">
+    <a href="classes/Array.html">Array</a><br />
     <a href="classes/Classifier.html">Classifier</a><br />
     <a href="classes/Classifier/Bayes.html">Classifier::Bayes</a><br />
     <a href="classes/Classifier/ContentNode.html">Classifier::ContentNode</a><br />
     <a href="classes/Classifier/LSI.html">Classifier::LSI</a><br />
     <a href="classes/Classifier/WordList.html">Classifier::WordList</a><br />
     <a href="classes/GSL.html">GSL</a><br />
+    <a href="classes/GSL/Matrix.html">GSL::Matrix</a><br />
     <a href="classes/GSL/Vector.html">GSL::Vector</a><br />
+    <a href="classes/Matrix.html">Matrix</a><br />
     <a href="classes/Object.html">Object</a><br />
     <a href="classes/String.html">String</a><br />
+    <a href="classes/Vector.html">Vector</a><br />
   </div>
 </div>
 </body>

data/doc/fr_file_index.html CHANGED Viewed

@@ -23,12 +23,14 @@
     <a href="files/README.html">README</a><br />
     <a href="files/lib/classifier_rb.html">lib/classifier.rb</a><br />
     <a href="files/lib/classifier/bayes_rb.html">lib/classifier/bayes.rb</a><br />
+    <a href="files/lib/classifier/extensions/string_rb.html">lib/classifier/extensions/string.rb</a><br />
+    <a href="files/lib/classifier/extensions/vector_rb.html">lib/classifier/extensions/vector.rb</a><br />
     <a href="files/lib/classifier/extensions/vector_serialize_rb.html">lib/classifier/extensions/vector_serialize.rb</a><br />
     <a href="files/lib/classifier/extensions/word_hash_rb.html">lib/classifier/extensions/word_hash.rb</a><br />
-    <a href="files/lib/classifier/extensions/word_list_rb.html">lib/classifier/extensions/word_list.rb</a><br />
     <a href="files/lib/classifier/lsi_rb.html">lib/classifier/lsi.rb</a><br />
     <a href="files/lib/classifier/lsi/content_node_rb.html">lib/classifier/lsi/content_node.rb</a><br />
-    <a href="files/lib/classifier/string_extensions_rb.html">lib/classifier/string_extensions.rb</a><br />
+    <a href="files/lib/classifier/lsi/summary_rb.html">lib/classifier/lsi/summary.rb</a><br />
+    <a href="files/lib/classifier/lsi/word_list_rb.html">lib/classifier/lsi/word_list.rb</a><br />
   </div>
 </div>
 </body>

data/doc/fr_method_index.html CHANGED Viewed

@@ -20,40 +20,55 @@
 <div id="index">
   <h1 class="section-bar">Methods</h1>
   <div id="index-entries">
-    <a href="classes/Classifier/LSI.html#M000014"><< (Classifier::LSI)</a><br />
-    <a href="classes/Classifier/WordList.html#M000009">[] (Classifier::WordList)</a><br />
-    <a href="classes/GSL/Vector.html#M000005">_dump (GSL::Vector)</a><br />
-    <a href="classes/GSL/Vector.html#M000006">_load (GSL::Vector)</a><br />
-    <a href="classes/Classifier/Bayes.html#M000029">add_category (Classifier::Bayes)</a><br />
-    <a href="classes/Classifier/LSI.html#M000013">add_item (Classifier::LSI)</a><br />
-    <a href="classes/Classifier/WordList.html#M000008">add_word (Classifier::WordList)</a><br />
-    <a href="classes/Classifier/Bayes.html#M000030">append_category (Classifier::Bayes)</a><br />
-    <a href="classes/Classifier/LSI.html#M000017">build_index (Classifier::LSI)</a><br />
-    <a href="classes/Classifier/Bayes.html#M000026">classifications (Classifier::Bayes)</a><br />
-    <a href="classes/Classifier/LSI.html#M000022">classify (Classifier::LSI)</a><br />
-    <a href="classes/Classifier/Bayes.html#M000027">classify (Classifier::Bayes)</a><br />
-    <a href="classes/String.html#M000004">clean_word_hash (String)</a><br />
-    <a href="classes/Classifier/LSI.html#M000021">find_related (Classifier::LSI)</a><br />
-    <a href="classes/Classifier/LSI.html#M000016">items (Classifier::LSI)</a><br />
-    <a href="classes/Classifier/Bayes.html#M000028">method_missing (Classifier::Bayes)</a><br />
-    <a href="classes/Classifier/LSI.html#M000012">needs_rebuild? (Classifier::LSI)</a><br />
-    <a href="classes/Classifier/Bayes.html#M000023">new (Classifier::Bayes)</a><br />
-    <a href="classes/Classifier/LSI.html#M000011">new (Classifier::LSI)</a><br />
-    <a href="classes/Classifier/ContentNode.html#M000031">new (Classifier::ContentNode)</a><br />
-    <a href="classes/Classifier/WordList.html#M000007">new (Classifier::WordList)</a><br />
-    <a href="classes/Object.html#M000001">prepare_category_name (Object)</a><br />
-    <a href="classes/Classifier/LSI.html#M000018">proximity_array_for_content (Classifier::LSI)</a><br />
-    <a href="classes/Classifier/LSI.html#M000019">proximity_norms_for_content (Classifier::LSI)</a><br />
-    <a href="classes/Classifier/ContentNode.html#M000034">raw_vector_with (Classifier::ContentNode)</a><br />
-    <a href="classes/Classifier/LSI.html#M000015">remove_item (Classifier::LSI)</a><br />
-    <a href="classes/Classifier/LSI.html#M000020">search (Classifier::LSI)</a><br />
-    <a href="classes/Classifier/ContentNode.html#M000033">search_norm (Classifier::ContentNode)</a><br />
-    <a href="classes/Classifier/ContentNode.html#M000032">search_vector (Classifier::ContentNode)</a><br />
-    <a href="classes/Classifier/WordList.html#M000010">size (Classifier::WordList)</a><br />
-    <a href="classes/Classifier/Bayes.html#M000024">train (Classifier::Bayes)</a><br />
-    <a href="classes/Classifier/Bayes.html#M000025">untrain (Classifier::Bayes)</a><br />
-    <a href="classes/String.html#M000002">without_punctuation (String)</a><br />
-    <a href="classes/String.html#M000003">word_hash (String)</a><br />
+    <a href="classes/Classifier/LSI.html#M000025"><< (Classifier::LSI)</a><br />
+    <a href="classes/Matrix.html#M000005">SV_decomp (Matrix)</a><br />
+    <a href="classes/Classifier/WordList.html#M000019">[] (Classifier::WordList)</a><br />
+    <a href="classes/Matrix.html#M000006">[]= (Matrix)</a><br />
+    <a href="classes/GSL/Vector.html#M000015">_dump (GSL::Vector)</a><br />
+    <a href="classes/GSL/Vector.html#M000016">_load (GSL::Vector)</a><br />
+    <a href="classes/Classifier/Bayes.html#M000044">add_category (Classifier::Bayes)</a><br />
+    <a href="classes/Classifier/LSI.html#M000024">add_item (Classifier::LSI)</a><br />
+    <a href="classes/Classifier/WordList.html#M000018">add_word (Classifier::WordList)</a><br />
+    <a href="classes/Classifier/Bayes.html#M000045">append_category (Classifier::Bayes)</a><br />
+    <a href="classes/Classifier/LSI.html#M000030">build_index (Classifier::LSI)</a><br />
+    <a href="classes/Classifier/LSI.html#M000029">categories_for (Classifier::LSI)</a><br />
+    <a href="classes/Classifier/LSI.html#M000026">categories_for (Classifier::LSI)</a><br />
+    <a href="classes/Classifier/Bayes.html#M000041">classifications (Classifier::Bayes)</a><br />
+    <a href="classes/Classifier/LSI.html#M000036">classify (Classifier::LSI)</a><br />
+    <a href="classes/Classifier/Bayes.html#M000042">classify (Classifier::Bayes)</a><br />
+    <a href="classes/String.html#M000010">clean_word_hash (String)</a><br />
+    <a href="classes/Matrix.html#M000004">diag (Matrix)</a><br />
+    <a href="classes/Classifier/LSI.html#M000035">find_related (Classifier::LSI)</a><br />
+    <a href="classes/Classifier/LSI.html#M000037">highest_ranked_stems (Classifier::LSI)</a><br />
+    <a href="classes/Classifier/LSI.html#M000031">highest_relative_content (Classifier::LSI)</a><br />
+    <a href="classes/Classifier/LSI.html#M000028">items (Classifier::LSI)</a><br />
+    <a href="classes/Vector.html#M000001">magnitude (Vector)</a><br />
+    <a href="classes/Classifier/Bayes.html#M000043">method_missing (Classifier::Bayes)</a><br />
+    <a href="classes/Classifier/LSI.html#M000023">needs_rebuild? (Classifier::LSI)</a><br />
+    <a href="classes/Classifier/ContentNode.html#M000046">new (Classifier::ContentNode)</a><br />
+    <a href="classes/Classifier/Bayes.html#M000038">new (Classifier::Bayes)</a><br />
+    <a href="classes/Classifier/LSI.html#M000022">new (Classifier::LSI)</a><br />
+    <a href="classes/Classifier/WordList.html#M000017">new (Classifier::WordList)</a><br />
+    <a href="classes/Vector.html#M000002">normalize (Vector)</a><br />
+    <a href="classes/String.html#M000012">paragraph_summary (String)</a><br />
+    <a href="classes/Object.html#M000007">prepare_category_name (Object)</a><br />
+    <a href="classes/Classifier/LSI.html#M000032">proximity_array_for_content (Classifier::LSI)</a><br />
+    <a href="classes/Classifier/LSI.html#M000033">proximity_norms_for_content (Classifier::LSI)</a><br />
+    <a href="classes/Classifier/ContentNode.html#M000049">raw_vector_with (Classifier::ContentNode)</a><br />
+    <a href="classes/Classifier/LSI.html#M000027">remove_item (Classifier::LSI)</a><br />
+    <a href="classes/Classifier/LSI.html#M000034">search (Classifier::LSI)</a><br />
+    <a href="classes/Classifier/ContentNode.html#M000048">search_norm (Classifier::ContentNode)</a><br />
+    <a href="classes/Classifier/ContentNode.html#M000047">search_vector (Classifier::ContentNode)</a><br />
+    <a href="classes/Classifier/WordList.html#M000021">size (Classifier::WordList)</a><br />
+    <a href="classes/String.html#M000014">split_paragraphs (String)</a><br />
+    <a href="classes/String.html#M000013">split_sentences (String)</a><br />
+    <a href="classes/Array.html#M000003">sum (Array)</a><br />
+    <a href="classes/String.html#M000011">summary (String)</a><br />
+    <a href="classes/Classifier/Bayes.html#M000039">train (Classifier::Bayes)</a><br />
+    <a href="classes/Classifier/Bayes.html#M000040">untrain (Classifier::Bayes)</a><br />
+    <a href="classes/String.html#M000008">without_punctuation (String)</a><br />
+    <a href="classes/Classifier/WordList.html#M000020">word_for_index (Classifier::WordList)</a><br />
+    <a href="classes/String.html#M000009">word_hash (String)</a><br />
   </div>
 </div>
 </body>

data/doc/index.html CHANGED Viewed

@@ -5,12 +5,12 @@
 <!--
-    RDoc Documentation
+    Ruby Classifier - Bayesian and LSI classification library
   -->
 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
 <head>
-  <title>RDoc Documentation</title>
+  <title>Ruby Classifier - Bayesian and LSI classification library</title>
   <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
 </head>
 <frameset rows="20%, 80%">

data/lib/classifier.rb CHANGED Viewed

@@ -25,6 +25,6 @@
 # License::   LGPL
 require 'rubygems'
-require 'classifier/string_extensions'
+require 'classifier/extensions/string'
 require 'classifier/bayes'
 require 'classifier/lsi'

data/lib/classifier/{string_extensions.rb → extensions/string.rb} RENAMED Viewed

File without changes

data/lib/classifier/extensions/vector.rb ADDED Viewed

@@ -0,0 +1,106 @@
+# Author::    Ernest Ellingson
+# Copyright:: Copyright (c) 2005
+# These are extensions to the std-lib 'matrix' to allow an all ruby SVD
+require 'matrix'
+require 'mathn'
+class Array
+   def sum
+      inject(0) { |sum,term| sum += term  }.to_f
+   end
+end
+class Vector
+  def magnitude
+    sumsqs = 0.0
+    self.size.times do |i|
+      sumsqs += self[i] ** 2.0
+    end
+    Math.sqrt(sumsqs)
+  end
+  def normalize
+    nv = []
+    mag = self.magnitude
+    self.size.times do |i|
+      nv << (self[i] / mag)
+    end
+    Vector[*nv]
+  end
+end
+class Matrix
+  def Matrix.diag(s)
+     Matrix.diagonal(*s)
+  end
+  alias :trans :transpose
+  def SV_decomp(maxSweeps = 20)
+    if self.row_size >= self.column_size
+      q = self.trans * self
+    else
+      q = self * self.trans
+    end
+    qrot    = q.dup
+    v       = Matrix.identity(q.row_size)
+    azrot   = nil
+    mzrot   = nil
+    cnt     = 0
+    s_old   = nil
+    mu      = nil
+    while true do
+      cnt += 1
+      for row in (0...qrot.row_size-1) do
+        for col in (1..qrot.row_size-1) do
+          next if row == col
+          h = Math.atan((2 * qrot[row,col])/(qrot[row,row]-qrot[col,col]))/2.0
+          hcos = Math.cos(h)
+          hsin = Math.sin(h)
+          mzrot = Matrix.identity(qrot.row_size)
+          mzrot[row,row] = hcos
+          mzrot[row,col] = -hsin
+          mzrot[col,row] = hsin
+          mzrot[col,col] = hcos
+          qrot = mzrot.trans * qrot * mzrot
+          v = v * mzrot
+        end
+      end
+      s_old = qrot.dup if cnt == 1
+      sum_qrot = 0.0
+      if cnt > 1
+        qrot.row_size.times do |r|
+          sum_qrot += (qrot[r,r]-s_old[r,r]).abs if (qrot[r,r]-s_old[r,r]).abs > 0.001
+        end
+        s_old = qrot.dup
+      end
+      break if (sum_qrot <= 0.001 and cnt > 1) or cnt >= maxSweeps
+    end # of do while true
+    s = []
+    qrot.row_size.times do |r|
+      s << Math.sqrt(qrot[r,r])
+    end
+    #puts "cnt = #{cnt}"
+    if self.row_size >= self.column_size
+      mu = self *  v * Matrix.diagonal(*s).inverse
+      return [mu, v, s]
+    else
+      puts v.row_size
+      puts v.column_size
+      puts self.row_size
+      puts self.column_size
+      puts s.size
+      mu = (self.trans * v *  Matrix.diagonal(*s).inverse)
+      return [mu, v, s]
+    end
+  end
+  def []=(i,j,val)
+    @rows[i][j] = val
+  end
+end

data/lib/classifier/extensions/vector_serialize.rb CHANGED Viewed

@@ -11,4 +11,10 @@ module GSL
     end
   end
+  class Matrix
+     class <<self
+        alias :diag :diagonal
+     end
+  end
 end

data/lib/classifier/lsi.rb CHANGED Viewed

@@ -1,14 +1,22 @@
 # Author::    David Fayram  (mailto:dfayram@lensmen.net)
 # Copyright:: Copyright (c) 2005 David Fayram II
-# License::   GPL
+# License::   LGPL
 begin
-require 'gsl' # requires http://rb-gsl.rubyforge.org/
-require 'classifier/extensions/word_list'
-require 'classifier/extensions/vector_serialize'
+   raise LoadError if ENV['NATIVE_VECTOR'] == "true" # to test the native vector class, try `rake test NATIVE_VECTOR=true`
+   require 'gsl' # requires http://rb-gsl.rubyforge.org/
+   require 'classifier/extensions/vector_serialize'
+   $GSL = true
+rescue LoadError
+	warn "Notice: for 10x faster LSI support, please install http://rb-gsl.rubyforge.org/"
+	require 'classifier/extensions/vector'
+end
+require 'classifier/lsi/word_list'
 require 'classifier/lsi/content_node'
+require 'classifier/lsi/summary'
 module Classifier
@@ -18,6 +26,7 @@ module Classifier
   class LSI
     attr_reader :word_list
+    attr_accessor :auto_rebuild
     # Create a fresh index.
     # If you want to call #build_index manually, use
@@ -33,7 +42,7 @@ module Classifier
     # to be built after all information is added, but before you start
     # using it for search, classification and cluster detection.
     def needs_rebuild?
-      @version != @built_at_version
+      (@items.keys.size > 1) && (@version != @built_at_version)
     end
     # Adds an item to the index. item is assumed to be a string, but
@@ -50,7 +59,8 @@ module Classifier
     #   lsi.add_item ar, *ar.categories { |x| ar.content }
     #
     def add_item( item, *categories, &block )
-      @items[item] = ContentNode.new(item, categories, block)
+      clean_word_hash = block ? block.call(item).clean_word_hash : item.to_s.clean_word_hash
+      @items[item] = ContentNode.new(clean_word_hash, *categories)
       @version += 1
       build_index if @auto_rebuild
     end
@@ -63,6 +73,13 @@ module Classifier
       add_item item
     end
+    # Returns the categories for a given indexed item. You are free to add and remove
+    # items from this as you see fit. It does not invalidate an index to change its categories.
+    def categories_for(item)
+      return [] unless @items[item]
+      return @items[item].categories
+    end
     # Removes an item from the database, if it is indexed.
     #
     def remove_item( item )
@@ -77,6 +94,13 @@ module Classifier
       @items.keys
     end
+    # Returns the categories for a given indexed item. You are free to add and remove
+    # items from this as you see fit. It does not invalidate an index to change its categories.
+    def categories_for(item)
+      return [] unless @items[item]
+      return @items[item].categories
+    end
     # This function rebuilds the index if needs_rebuild? returns true.
     # For very large document spaces, this indexing operation may take some
     # time to complete, so it may be wise to place the operation in another
@@ -97,18 +121,46 @@ module Classifier
       doc_list = @items.values
       tda = doc_list.collect { |node| node.raw_vector_with( @word_list ) }
-      tdm = GSL::Matrix.new( *tda ).trans
-      ntdm = build_reduced_matrix(tdm, cutoff)
-      ntdm.size[1].times do |col|
-        vec = GSL::Vector.new( ntdm.column(col) ).row
-        doc_list[col].lsi_vector = vec
-        doc_list[col].lsi_norm = vec.normalize
-      end
+      if $GSL
+         tdm = GSL::Matrix.new(*tda).trans
+         ntdm = build_reduced_matrix(tdm, cutoff)
+         ntdm.size[1].times do |col|
+           vec = GSL::Vector.new( ntdm.column(col) ).row
+           doc_list[col].lsi_vector = vec
+           doc_list[col].lsi_norm = vec.normalize
+         end
+      else
+         tdm = Matrix.rows(tda).trans
+         ntdm = build_reduced_matrix(tdm, cutoff)
+         ntdm.row_size.times do |col|
+           doc_list[col].lsi_vector = ntdm.column(col) if doc_list[col]
+           doc_list[col].lsi_norm = ntdm.column(col).normalize  if doc_list[col]
+         end
+      end
       @built_at_version = @version
     end
+    # This method returns max_chunks entries, ordered by their average semantic rating.
+    # Essentially, the average distance of each entry from all other entries is calculated,
+    # the highest are returned.
+    #
+    # This can be used to build a summary service, or to provide more information about
+    # your dataset's general content. For example, if you were to use categorize on the
+    # results of this data, you could gather information on what your dataset is generally
+    # about.
+    def highest_relative_content( max_chunks=10 )
+       return [] if needs_rebuild?
+       avg_density = Hash.new
+       @items.each_key { |x| avg_density[x] = proximity_array_for_content(x).inject(0.0) { |x,y| x + y[1]} }
+       avg_density.keys.sort_by { |x| avg_density[x] }.reverse[0..max_chunks-1].map
+    end
     # This function is the primitive that find_related and classify
     # build upon. It returns an array of 2-element arrays. The first element
     # of this array is a document, and the second is its "score", defining
@@ -123,11 +175,15 @@ module Classifier
     # text data. See add_item for examples of how this works.
     def proximity_array_for_content( doc, &block )
       return [] if needs_rebuild?
       content_node = node_for_content( doc, &block )
       result =
         @items.keys.collect do |item|
-          val = content_node.search_vector * @items[item].search_vector.col
+          if $GSL
+             val = content_node.search_vector * @items[item].search_vector.col
+          else
+             val = (Matrix[content_node.search_vector] * @items[item].search_vector)[0]
+          end
           [item, val]
         end
       result.sort_by { |x| x[1] }.reverse
@@ -144,7 +200,11 @@ module Classifier
       content_node = node_for_content( doc, &block )
       result =
         @items.keys.collect do |item|
-          val = content_node.search_norm * @items[item].search_norm.col
+          if $GSL
+            val = content_node.search_norm * @items[item].search_norm.col
+          else
+            val = (Matrix[content_node.search_norm] * @items[item].search_norm)[0]
+          end
           [item, val]
         end
       result.sort_by { |x| x[1] }.reverse
@@ -159,9 +219,7 @@ module Classifier
     # it is actually the same algorithm, just applied on a smaller document.
     def search( string, max_nearest=3 )
       return [] if needs_rebuild?
-      carry =
-        proximity_norms_for_content( string )
+      carry = proximity_norms_for_content( string )
       result = carry.collect { |x| x[0] }
       return result[0..max_nearest-1]
     end
@@ -208,29 +266,44 @@ module Classifier
       return ranking[-1]
     end
+    # Prototype, only works on indexed documents.
+    # I have no clue if this is going to work, but in theory
+    # it's supposed to.
+    def highest_ranked_stems( doc, count=3 )
+      raise "Requested stem ranking on non-indexed content!" unless @items[doc]
+      arr = node_for_content(doc).lsi_vector.to_a
+      top_n = arr.sort.reverse[0..count-1]
+      return top_n.collect { |x| @word_list.word_for_index(arr.index(x))}
+    end
     private
     def build_reduced_matrix( matrix, cutoff=0.75 )
       # TODO: Check that M>=N on these dimensions! Transpose helps assure this
       u, v, s = matrix.SV_decomp
       # TODO: Better than 75% term, please. :\
       s_cutoff = s.sort.reverse[(s.size * cutoff).round - 1]
       s.size.times do |ord|
         s[ord] = 0.0 if s[ord] < s_cutoff
       end
       # Reconstruct the term document matrix, only with reduced rank
-      u * Matrix.diagonal( s ) * v.trans
+      u * Matrix.diag( s ) * v.trans
     end
     def node_for_content(item, &block)
       if @items[item]
         return @items[item]
       else
-        cn = ContentNode.new(item, &block) # make the node and extract the data
-        cn.raw_vector_with( @word_list ) # make the lsi raw and norm vectors
-      end
+        clean_word_hash = block ? block.call(item).clean_word_hash : item.to_s.clean_word_hash
-      cn
+        cn = ContentNode.new(clean_word_hash, &block) # make the node and extract the data
+        unless needs_rebuild?
+          cn.raw_vector_with( @word_list ) # make the lsi raw and norm vectors
+        end
+      end
+      return cn
     end
     def make_word_list
@@ -243,6 +316,3 @@ module Classifier
   end
 end
-rescue LoadError
-	$stderr.puts "For LSI support, you need to install http://rb-gsl.rubyforge.org/"
-end