classifier 1.2.0 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96) hide show
  1. data/LICENSE +361 -273
  2. data/README +6 -5
  3. data/Rakefile +12 -2
  4. data/bin/summarize.rb +11 -0
  5. data/doc/classes/Array.html +139 -0
  6. data/doc/classes/Array.src/M000003.html +18 -0
  7. data/doc/classes/Classifier.html +5 -5
  8. data/doc/classes/Classifier/Bayes.html +43 -43
  9. data/doc/classes/Classifier/Bayes.src/{M000023.html → M000038.html} +0 -0
  10. data/doc/classes/Classifier/Bayes.src/{M000024.html → M000039.html} +0 -0
  11. data/doc/classes/Classifier/Bayes.src/{M000025.html → M000040.html} +0 -0
  12. data/doc/classes/Classifier/Bayes.src/{M000026.html → M000041.html} +0 -0
  13. data/doc/classes/Classifier/Bayes.src/{M000027.html → M000042.html} +0 -0
  14. data/doc/classes/Classifier/Bayes.src/{M000028.html → M000043.html} +0 -0
  15. data/doc/classes/Classifier/Bayes.src/{M000029.html → M000044.html} +0 -0
  16. data/doc/classes/Classifier/ContentNode.html +23 -28
  17. data/doc/classes/Classifier/ContentNode.src/M000046.html +19 -0
  18. data/doc/classes/Classifier/ContentNode.src/{M000032.html → M000047.html} +1 -1
  19. data/doc/classes/Classifier/ContentNode.src/{M000033.html → M000048.html} +1 -1
  20. data/doc/classes/Classifier/ContentNode.src/M000049.html +49 -0
  21. data/doc/classes/Classifier/LSI.html +158 -68
  22. data/doc/classes/Classifier/LSI.src/M000022.html +6 -17
  23. data/doc/classes/Classifier/LSI.src/{M000012.html → M000023.html} +2 -2
  24. data/doc/classes/Classifier/LSI.src/{M000013.html → M000024.html} +3 -2
  25. data/doc/classes/Classifier/LSI.src/{M000014.html → M000025.html} +1 -1
  26. data/doc/classes/Classifier/LSI.src/M000026.html +19 -0
  27. data/doc/classes/Classifier/LSI.src/{M000015.html → M000027.html} +1 -1
  28. data/doc/classes/Classifier/LSI.src/{M000016.html → M000028.html} +1 -1
  29. data/doc/classes/Classifier/LSI.src/M000029.html +19 -0
  30. data/doc/classes/Classifier/LSI.src/M000030.html +43 -0
  31. data/doc/classes/Classifier/LSI.src/M000031.html +23 -0
  32. data/doc/classes/Classifier/LSI.src/{M000018.html → M000032.html} +7 -3
  33. data/doc/classes/Classifier/LSI.src/{M000019.html → M000033.html} +6 -2
  34. data/doc/classes/Classifier/LSI.src/{M000020.html → M000034.html} +2 -4
  35. data/doc/classes/Classifier/LSI.src/{M000021.html → M000035.html} +1 -1
  36. data/doc/classes/Classifier/LSI.src/M000036.html +31 -0
  37. data/doc/classes/Classifier/LSI.src/M000037.html +21 -0
  38. data/doc/classes/Classifier/WordList.html +37 -22
  39. data/doc/classes/Classifier/WordList.src/{M000007.html → M000017.html} +2 -2
  40. data/doc/classes/Classifier/WordList.src/{M000008.html → M000018.html} +1 -1
  41. data/doc/classes/Classifier/WordList.src/{M000009.html → M000019.html} +1 -1
  42. data/doc/classes/Classifier/WordList.src/M000020.html +18 -0
  43. data/doc/classes/Classifier/WordList.src/{M000010.html → M000021.html} +1 -1
  44. data/doc/classes/GSL.html +2 -1
  45. data/doc/classes/GSL/Matrix.html +126 -0
  46. data/doc/classes/GSL/Vector.html +10 -10
  47. data/doc/classes/GSL/Vector.src/{M000005.html → M000015.html} +0 -0
  48. data/doc/classes/GSL/Vector.src/{M000006.html → M000016.html} +0 -0
  49. data/doc/classes/Matrix.html +184 -0
  50. data/doc/classes/Matrix.src/M000004.html +18 -0
  51. data/doc/classes/Matrix.src/M000005.html +76 -0
  52. data/doc/classes/Matrix.src/M000006.html +18 -0
  53. data/doc/classes/Object.html +7 -7
  54. data/doc/classes/Object.src/{M000001.html → M000007.html} +1 -1
  55. data/doc/classes/String.html +90 -20
  56. data/doc/classes/String.src/{M000002.html → M000008.html} +0 -0
  57. data/doc/classes/String.src/{M000003.html → M000009.html} +0 -0
  58. data/doc/classes/String.src/{M000004.html → M000010.html} +0 -0
  59. data/doc/classes/String.src/M000011.html +18 -0
  60. data/doc/classes/String.src/M000012.html +18 -0
  61. data/doc/classes/String.src/M000013.html +18 -0
  62. data/doc/classes/String.src/M000014.html +18 -0
  63. data/doc/classes/Vector.html +154 -0
  64. data/doc/classes/Vector.src/M000001.html +22 -0
  65. data/doc/classes/Vector.src/M000002.html +25 -0
  66. data/doc/created.rid +1 -1
  67. data/doc/files/README.html +14 -8
  68. data/doc/files/lib/classifier/bayes_rb.html +1 -1
  69. data/doc/files/lib/classifier/{string_extensions_rb.html → extensions/string_rb.html} +5 -5
  70. data/doc/files/lib/classifier/extensions/vector_rb.html +120 -0
  71. data/doc/files/lib/classifier/extensions/vector_serialize_rb.html +1 -1
  72. data/doc/files/lib/classifier/extensions/word_hash_rb.html +1 -1
  73. data/doc/files/lib/classifier/lsi/content_node_rb.html +2 -2
  74. data/doc/files/lib/classifier/lsi/summary_rb.html +115 -0
  75. data/doc/files/lib/classifier/{extensions → lsi}/word_list_rb.html +3 -3
  76. data/doc/files/lib/classifier/lsi_rb.html +5 -3
  77. data/doc/files/lib/classifier_rb.html +2 -2
  78. data/doc/fr_class_index.html +4 -0
  79. data/doc/fr_file_index.html +4 -2
  80. data/doc/fr_method_index.html +49 -34
  81. data/doc/index.html +2 -2
  82. data/lib/classifier.rb +1 -1
  83. data/lib/classifier/{string_extensions.rb → extensions/string.rb} +0 -0
  84. data/lib/classifier/extensions/vector.rb +106 -0
  85. data/lib/classifier/extensions/vector_serialize.rb +6 -0
  86. data/lib/classifier/lsi.rb +101 -31
  87. data/lib/classifier/lsi/content_node.rb +28 -23
  88. data/lib/classifier/lsi/summary.rb +31 -0
  89. data/lib/classifier/{extensions → lsi}/word_list.rb +7 -2
  90. data/test/{string_extensions → extensions}/word_hash_test.rb +0 -0
  91. data/test/lsi/lsi_test.rb +36 -1
  92. metadata +68 -41
  93. data/doc/classes/Classifier/ContentNode.src/M000031.html +0 -21
  94. data/doc/classes/Classifier/ContentNode.src/M000034.html +0 -41
  95. data/doc/classes/Classifier/LSI.src/M000011.html +0 -20
  96. data/doc/classes/Classifier/LSI.src/M000017.html +0 -32
@@ -56,7 +56,7 @@
56
56
  </tr>
57
57
  <tr class="top-aligned-row">
58
58
  <td><strong>Last Update:</strong></td>
59
- <td>Sun Apr 24 21:34:06 PDT 2005</td>
59
+ <td>Thu May 05 01:50:06 PDT 2005</td>
60
60
  </tr>
61
61
  </table>
62
62
  </div>
@@ -76,7 +76,7 @@
76
76
  <tr><td valign="top">Copyright:</td><td>Copyright &#169; 2005 David Fayram II
77
77
 
78
78
  </td></tr>
79
- <tr><td valign="top">License:</td><td>GPL
79
+ <tr><td valign="top">License:</td><td>LGPL
80
80
 
81
81
  </td></tr>
82
82
  </table>
@@ -88,9 +88,11 @@
88
88
 
89
89
  <div class="name-list">
90
90
  gsl&nbsp;&nbsp;
91
- classifier/extensions/word_list&nbsp;&nbsp;
92
91
  classifier/extensions/vector_serialize&nbsp;&nbsp;
92
+ classifier/extensions/vector&nbsp;&nbsp;
93
+ classifier/lsi/word_list&nbsp;&nbsp;
93
94
  classifier/lsi/content_node&nbsp;&nbsp;
95
+ classifier/lsi/summary&nbsp;&nbsp;
94
96
  </div>
95
97
  </div>
96
98
 
@@ -56,7 +56,7 @@
56
56
  </tr>
57
57
  <tr class="top-aligned-row">
58
58
  <td><strong>Last Update:</strong></td>
59
- <td>Sun Apr 24 02:08:49 PDT 2005</td>
59
+ <td>Thu May 05 01:21:16 PDT 2005</td>
60
60
  </tr>
61
61
  </table>
62
62
  </div>
@@ -88,7 +88,7 @@
88
88
 
89
89
  <div class="name-list">
90
90
  rubygems&nbsp;&nbsp;
91
- classifier/string_extensions&nbsp;&nbsp;
91
+ classifier/extensions/string&nbsp;&nbsp;
92
92
  classifier/bayes&nbsp;&nbsp;
93
93
  classifier/lsi&nbsp;&nbsp;
94
94
  </div>
@@ -20,15 +20,19 @@
20
20
  <div id="index">
21
21
  <h1 class="section-bar">Classes</h1>
22
22
  <div id="index-entries">
23
+ <a href="classes/Array.html">Array</a><br />
23
24
  <a href="classes/Classifier.html">Classifier</a><br />
24
25
  <a href="classes/Classifier/Bayes.html">Classifier::Bayes</a><br />
25
26
  <a href="classes/Classifier/ContentNode.html">Classifier::ContentNode</a><br />
26
27
  <a href="classes/Classifier/LSI.html">Classifier::LSI</a><br />
27
28
  <a href="classes/Classifier/WordList.html">Classifier::WordList</a><br />
28
29
  <a href="classes/GSL.html">GSL</a><br />
30
+ <a href="classes/GSL/Matrix.html">GSL::Matrix</a><br />
29
31
  <a href="classes/GSL/Vector.html">GSL::Vector</a><br />
32
+ <a href="classes/Matrix.html">Matrix</a><br />
30
33
  <a href="classes/Object.html">Object</a><br />
31
34
  <a href="classes/String.html">String</a><br />
35
+ <a href="classes/Vector.html">Vector</a><br />
32
36
  </div>
33
37
  </div>
34
38
  </body>
@@ -23,12 +23,14 @@
23
23
  <a href="files/README.html">README</a><br />
24
24
  <a href="files/lib/classifier_rb.html">lib/classifier.rb</a><br />
25
25
  <a href="files/lib/classifier/bayes_rb.html">lib/classifier/bayes.rb</a><br />
26
+ <a href="files/lib/classifier/extensions/string_rb.html">lib/classifier/extensions/string.rb</a><br />
27
+ <a href="files/lib/classifier/extensions/vector_rb.html">lib/classifier/extensions/vector.rb</a><br />
26
28
  <a href="files/lib/classifier/extensions/vector_serialize_rb.html">lib/classifier/extensions/vector_serialize.rb</a><br />
27
29
  <a href="files/lib/classifier/extensions/word_hash_rb.html">lib/classifier/extensions/word_hash.rb</a><br />
28
- <a href="files/lib/classifier/extensions/word_list_rb.html">lib/classifier/extensions/word_list.rb</a><br />
29
30
  <a href="files/lib/classifier/lsi_rb.html">lib/classifier/lsi.rb</a><br />
30
31
  <a href="files/lib/classifier/lsi/content_node_rb.html">lib/classifier/lsi/content_node.rb</a><br />
31
- <a href="files/lib/classifier/string_extensions_rb.html">lib/classifier/string_extensions.rb</a><br />
32
+ <a href="files/lib/classifier/lsi/summary_rb.html">lib/classifier/lsi/summary.rb</a><br />
33
+ <a href="files/lib/classifier/lsi/word_list_rb.html">lib/classifier/lsi/word_list.rb</a><br />
32
34
  </div>
33
35
  </div>
34
36
  </body>
@@ -20,40 +20,55 @@
20
20
  <div id="index">
21
21
  <h1 class="section-bar">Methods</h1>
22
22
  <div id="index-entries">
23
- <a href="classes/Classifier/LSI.html#M000014"><< (Classifier::LSI)</a><br />
24
- <a href="classes/Classifier/WordList.html#M000009">[] (Classifier::WordList)</a><br />
25
- <a href="classes/GSL/Vector.html#M000005">_dump (GSL::Vector)</a><br />
26
- <a href="classes/GSL/Vector.html#M000006">_load (GSL::Vector)</a><br />
27
- <a href="classes/Classifier/Bayes.html#M000029">add_category (Classifier::Bayes)</a><br />
28
- <a href="classes/Classifier/LSI.html#M000013">add_item (Classifier::LSI)</a><br />
29
- <a href="classes/Classifier/WordList.html#M000008">add_word (Classifier::WordList)</a><br />
30
- <a href="classes/Classifier/Bayes.html#M000030">append_category (Classifier::Bayes)</a><br />
31
- <a href="classes/Classifier/LSI.html#M000017">build_index (Classifier::LSI)</a><br />
32
- <a href="classes/Classifier/Bayes.html#M000026">classifications (Classifier::Bayes)</a><br />
33
- <a href="classes/Classifier/LSI.html#M000022">classify (Classifier::LSI)</a><br />
34
- <a href="classes/Classifier/Bayes.html#M000027">classify (Classifier::Bayes)</a><br />
35
- <a href="classes/String.html#M000004">clean_word_hash (String)</a><br />
36
- <a href="classes/Classifier/LSI.html#M000021">find_related (Classifier::LSI)</a><br />
37
- <a href="classes/Classifier/LSI.html#M000016">items (Classifier::LSI)</a><br />
38
- <a href="classes/Classifier/Bayes.html#M000028">method_missing (Classifier::Bayes)</a><br />
39
- <a href="classes/Classifier/LSI.html#M000012">needs_rebuild? (Classifier::LSI)</a><br />
40
- <a href="classes/Classifier/Bayes.html#M000023">new (Classifier::Bayes)</a><br />
41
- <a href="classes/Classifier/LSI.html#M000011">new (Classifier::LSI)</a><br />
42
- <a href="classes/Classifier/ContentNode.html#M000031">new (Classifier::ContentNode)</a><br />
43
- <a href="classes/Classifier/WordList.html#M000007">new (Classifier::WordList)</a><br />
44
- <a href="classes/Object.html#M000001">prepare_category_name (Object)</a><br />
45
- <a href="classes/Classifier/LSI.html#M000018">proximity_array_for_content (Classifier::LSI)</a><br />
46
- <a href="classes/Classifier/LSI.html#M000019">proximity_norms_for_content (Classifier::LSI)</a><br />
47
- <a href="classes/Classifier/ContentNode.html#M000034">raw_vector_with (Classifier::ContentNode)</a><br />
48
- <a href="classes/Classifier/LSI.html#M000015">remove_item (Classifier::LSI)</a><br />
49
- <a href="classes/Classifier/LSI.html#M000020">search (Classifier::LSI)</a><br />
50
- <a href="classes/Classifier/ContentNode.html#M000033">search_norm (Classifier::ContentNode)</a><br />
51
- <a href="classes/Classifier/ContentNode.html#M000032">search_vector (Classifier::ContentNode)</a><br />
52
- <a href="classes/Classifier/WordList.html#M000010">size (Classifier::WordList)</a><br />
53
- <a href="classes/Classifier/Bayes.html#M000024">train (Classifier::Bayes)</a><br />
54
- <a href="classes/Classifier/Bayes.html#M000025">untrain (Classifier::Bayes)</a><br />
55
- <a href="classes/String.html#M000002">without_punctuation (String)</a><br />
56
- <a href="classes/String.html#M000003">word_hash (String)</a><br />
23
+ <a href="classes/Classifier/LSI.html#M000025"><< (Classifier::LSI)</a><br />
24
+ <a href="classes/Matrix.html#M000005">SV_decomp (Matrix)</a><br />
25
+ <a href="classes/Classifier/WordList.html#M000019">[] (Classifier::WordList)</a><br />
26
+ <a href="classes/Matrix.html#M000006">[]= (Matrix)</a><br />
27
+ <a href="classes/GSL/Vector.html#M000015">_dump (GSL::Vector)</a><br />
28
+ <a href="classes/GSL/Vector.html#M000016">_load (GSL::Vector)</a><br />
29
+ <a href="classes/Classifier/Bayes.html#M000044">add_category (Classifier::Bayes)</a><br />
30
+ <a href="classes/Classifier/LSI.html#M000024">add_item (Classifier::LSI)</a><br />
31
+ <a href="classes/Classifier/WordList.html#M000018">add_word (Classifier::WordList)</a><br />
32
+ <a href="classes/Classifier/Bayes.html#M000045">append_category (Classifier::Bayes)</a><br />
33
+ <a href="classes/Classifier/LSI.html#M000030">build_index (Classifier::LSI)</a><br />
34
+ <a href="classes/Classifier/LSI.html#M000029">categories_for (Classifier::LSI)</a><br />
35
+ <a href="classes/Classifier/LSI.html#M000026">categories_for (Classifier::LSI)</a><br />
36
+ <a href="classes/Classifier/Bayes.html#M000041">classifications (Classifier::Bayes)</a><br />
37
+ <a href="classes/Classifier/LSI.html#M000036">classify (Classifier::LSI)</a><br />
38
+ <a href="classes/Classifier/Bayes.html#M000042">classify (Classifier::Bayes)</a><br />
39
+ <a href="classes/String.html#M000010">clean_word_hash (String)</a><br />
40
+ <a href="classes/Matrix.html#M000004">diag (Matrix)</a><br />
41
+ <a href="classes/Classifier/LSI.html#M000035">find_related (Classifier::LSI)</a><br />
42
+ <a href="classes/Classifier/LSI.html#M000037">highest_ranked_stems (Classifier::LSI)</a><br />
43
+ <a href="classes/Classifier/LSI.html#M000031">highest_relative_content (Classifier::LSI)</a><br />
44
+ <a href="classes/Classifier/LSI.html#M000028">items (Classifier::LSI)</a><br />
45
+ <a href="classes/Vector.html#M000001">magnitude (Vector)</a><br />
46
+ <a href="classes/Classifier/Bayes.html#M000043">method_missing (Classifier::Bayes)</a><br />
47
+ <a href="classes/Classifier/LSI.html#M000023">needs_rebuild? (Classifier::LSI)</a><br />
48
+ <a href="classes/Classifier/ContentNode.html#M000046">new (Classifier::ContentNode)</a><br />
49
+ <a href="classes/Classifier/Bayes.html#M000038">new (Classifier::Bayes)</a><br />
50
+ <a href="classes/Classifier/LSI.html#M000022">new (Classifier::LSI)</a><br />
51
+ <a href="classes/Classifier/WordList.html#M000017">new (Classifier::WordList)</a><br />
52
+ <a href="classes/Vector.html#M000002">normalize (Vector)</a><br />
53
+ <a href="classes/String.html#M000012">paragraph_summary (String)</a><br />
54
+ <a href="classes/Object.html#M000007">prepare_category_name (Object)</a><br />
55
+ <a href="classes/Classifier/LSI.html#M000032">proximity_array_for_content (Classifier::LSI)</a><br />
56
+ <a href="classes/Classifier/LSI.html#M000033">proximity_norms_for_content (Classifier::LSI)</a><br />
57
+ <a href="classes/Classifier/ContentNode.html#M000049">raw_vector_with (Classifier::ContentNode)</a><br />
58
+ <a href="classes/Classifier/LSI.html#M000027">remove_item (Classifier::LSI)</a><br />
59
+ <a href="classes/Classifier/LSI.html#M000034">search (Classifier::LSI)</a><br />
60
+ <a href="classes/Classifier/ContentNode.html#M000048">search_norm (Classifier::ContentNode)</a><br />
61
+ <a href="classes/Classifier/ContentNode.html#M000047">search_vector (Classifier::ContentNode)</a><br />
62
+ <a href="classes/Classifier/WordList.html#M000021">size (Classifier::WordList)</a><br />
63
+ <a href="classes/String.html#M000014">split_paragraphs (String)</a><br />
64
+ <a href="classes/String.html#M000013">split_sentences (String)</a><br />
65
+ <a href="classes/Array.html#M000003">sum (Array)</a><br />
66
+ <a href="classes/String.html#M000011">summary (String)</a><br />
67
+ <a href="classes/Classifier/Bayes.html#M000039">train (Classifier::Bayes)</a><br />
68
+ <a href="classes/Classifier/Bayes.html#M000040">untrain (Classifier::Bayes)</a><br />
69
+ <a href="classes/String.html#M000008">without_punctuation (String)</a><br />
70
+ <a href="classes/Classifier/WordList.html#M000020">word_for_index (Classifier::WordList)</a><br />
71
+ <a href="classes/String.html#M000009">word_hash (String)</a><br />
57
72
  </div>
58
73
  </div>
59
74
  </body>
data/doc/index.html CHANGED
@@ -5,12 +5,12 @@
5
5
 
6
6
  <!--
7
7
 
8
- RDoc Documentation
8
+ Ruby Classifier - Bayesian and LSI classification library
9
9
 
10
10
  -->
11
11
  <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
12
12
  <head>
13
- <title>RDoc Documentation</title>
13
+ <title>Ruby Classifier - Bayesian and LSI classification library</title>
14
14
  <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
15
15
  </head>
16
16
  <frameset rows="20%, 80%">
data/lib/classifier.rb CHANGED
@@ -25,6 +25,6 @@
25
25
  # License:: LGPL
26
26
 
27
27
  require 'rubygems'
28
- require 'classifier/string_extensions'
28
+ require 'classifier/extensions/string'
29
29
  require 'classifier/bayes'
30
30
  require 'classifier/lsi'
@@ -0,0 +1,106 @@
1
+ # Author:: Ernest Ellingson
2
+ # Copyright:: Copyright (c) 2005
3
+
4
+ # These are extensions to the std-lib 'matrix' to allow an all ruby SVD
5
+
6
+ require 'matrix'
7
+ require 'mathn'
8
+
9
+ class Array
10
+ def sum
11
+ inject(0) { |sum,term| sum += term }.to_f
12
+ end
13
+ end
14
+
15
+ class Vector
16
+ def magnitude
17
+ sumsqs = 0.0
18
+ self.size.times do |i|
19
+ sumsqs += self[i] ** 2.0
20
+ end
21
+ Math.sqrt(sumsqs)
22
+ end
23
+ def normalize
24
+ nv = []
25
+ mag = self.magnitude
26
+ self.size.times do |i|
27
+
28
+ nv << (self[i] / mag)
29
+
30
+ end
31
+ Vector[*nv]
32
+ end
33
+ end
34
+
35
+ class Matrix
36
+ def Matrix.diag(s)
37
+ Matrix.diagonal(*s)
38
+ end
39
+
40
+ alias :trans :transpose
41
+
42
+ def SV_decomp(maxSweeps = 20)
43
+ if self.row_size >= self.column_size
44
+ q = self.trans * self
45
+ else
46
+ q = self * self.trans
47
+ end
48
+
49
+ qrot = q.dup
50
+ v = Matrix.identity(q.row_size)
51
+ azrot = nil
52
+ mzrot = nil
53
+ cnt = 0
54
+ s_old = nil
55
+ mu = nil
56
+
57
+ while true do
58
+ cnt += 1
59
+ for row in (0...qrot.row_size-1) do
60
+ for col in (1..qrot.row_size-1) do
61
+ next if row == col
62
+ h = Math.atan((2 * qrot[row,col])/(qrot[row,row]-qrot[col,col]))/2.0
63
+ hcos = Math.cos(h)
64
+ hsin = Math.sin(h)
65
+ mzrot = Matrix.identity(qrot.row_size)
66
+ mzrot[row,row] = hcos
67
+ mzrot[row,col] = -hsin
68
+ mzrot[col,row] = hsin
69
+ mzrot[col,col] = hcos
70
+ qrot = mzrot.trans * qrot * mzrot
71
+ v = v * mzrot
72
+ end
73
+ end
74
+ s_old = qrot.dup if cnt == 1
75
+ sum_qrot = 0.0
76
+ if cnt > 1
77
+ qrot.row_size.times do |r|
78
+ sum_qrot += (qrot[r,r]-s_old[r,r]).abs if (qrot[r,r]-s_old[r,r]).abs > 0.001
79
+ end
80
+ s_old = qrot.dup
81
+ end
82
+ break if (sum_qrot <= 0.001 and cnt > 1) or cnt >= maxSweeps
83
+ end # of do while true
84
+ s = []
85
+ qrot.row_size.times do |r|
86
+ s << Math.sqrt(qrot[r,r])
87
+ end
88
+ #puts "cnt = #{cnt}"
89
+ if self.row_size >= self.column_size
90
+ mu = self * v * Matrix.diagonal(*s).inverse
91
+ return [mu, v, s]
92
+ else
93
+ puts v.row_size
94
+ puts v.column_size
95
+ puts self.row_size
96
+ puts self.column_size
97
+ puts s.size
98
+
99
+ mu = (self.trans * v * Matrix.diagonal(*s).inverse)
100
+ return [mu, v, s]
101
+ end
102
+ end
103
+ def []=(i,j,val)
104
+ @rows[i][j] = val
105
+ end
106
+ end
@@ -11,4 +11,10 @@ module GSL
11
11
  end
12
12
 
13
13
  end
14
+
15
+ class Matrix
16
+ class <<self
17
+ alias :diag :diagonal
18
+ end
19
+ end
14
20
  end
@@ -1,14 +1,22 @@
1
1
  # Author:: David Fayram (mailto:dfayram@lensmen.net)
2
2
  # Copyright:: Copyright (c) 2005 David Fayram II
3
- # License:: GPL
3
+ # License:: LGPL
4
4
 
5
5
  begin
6
-
7
- require 'gsl' # requires http://rb-gsl.rubyforge.org/
8
-
9
- require 'classifier/extensions/word_list'
10
- require 'classifier/extensions/vector_serialize'
6
+ raise LoadError if ENV['NATIVE_VECTOR'] == "true" # to test the native vector class, try `rake test NATIVE_VECTOR=true`
7
+
8
+ require 'gsl' # requires http://rb-gsl.rubyforge.org/
9
+ require 'classifier/extensions/vector_serialize'
10
+ $GSL = true
11
+
12
+ rescue LoadError
13
+ warn "Notice: for 10x faster LSI support, please install http://rb-gsl.rubyforge.org/"
14
+ require 'classifier/extensions/vector'
15
+ end
16
+
17
+ require 'classifier/lsi/word_list'
11
18
  require 'classifier/lsi/content_node'
19
+ require 'classifier/lsi/summary'
12
20
 
13
21
  module Classifier
14
22
 
@@ -18,6 +26,7 @@ module Classifier
18
26
  class LSI
19
27
 
20
28
  attr_reader :word_list
29
+ attr_accessor :auto_rebuild
21
30
 
22
31
  # Create a fresh index.
23
32
  # If you want to call #build_index manually, use
@@ -33,7 +42,7 @@ module Classifier
33
42
  # to be built after all informaton is added, but before you start
34
43
  # using it for search, classification and cluster detection.
35
44
  def needs_rebuild?
36
- @version != @built_at_version
45
+ (@items.keys.size > 1) && (@version != @built_at_version)
37
46
  end
38
47
 
39
48
  # Adds an item to the index. item is assumed to be a string, but
@@ -50,7 +59,8 @@ module Classifier
50
59
  # lsi.add_item ar, *ar.categories { |x| ar.content }
51
60
  #
52
61
  def add_item( item, *categories, &block )
53
- @items[item] = ContentNode.new(item, categories, block)
62
+ clean_word_hash = block ? block.call(item).clean_word_hash : item.to_s.clean_word_hash
63
+ @items[item] = ContentNode.new(clean_word_hash, *categories)
54
64
  @version += 1
55
65
  build_index if @auto_rebuild
56
66
  end
@@ -63,6 +73,13 @@ module Classifier
63
73
  add_item item
64
74
  end
65
75
 
76
+ # Returns the categories for a given indexed items. You are free to add and remove
77
+ # items from this as you see fit. It does not invalide an index to change its categories.
78
+ def categories_for(item)
79
+ return [] unless @items[item]
80
+ return @items[item].categories
81
+ end
82
+
66
83
  # Removes an item from the database, if it is indexed.
67
84
  #
68
85
  def remove_item( item )
@@ -77,6 +94,13 @@ module Classifier
77
94
  @items.keys
78
95
  end
79
96
 
97
+ # Returns the categories for a given indexed items. You are free to add and remove
98
+ # items from this as you see fit. It does not invalide an index to change its categories.
99
+ def categories_for(item)
100
+ return [] unless @items[item]
101
+ return @items[item].categories
102
+ end
103
+
80
104
  # This function rebuilds the index if needs_rebuild? returns true.
81
105
  # For very large document spaces, this indexing operation may take some
82
106
  # time to complete, so it may be wise to place the operation in another
@@ -97,18 +121,46 @@ module Classifier
97
121
 
98
122
  doc_list = @items.values
99
123
  tda = doc_list.collect { |node| node.raw_vector_with( @word_list ) }
100
- tdm = GSL::Matrix.new( *tda ).trans
101
- ntdm = build_reduced_matrix(tdm, cutoff)
102
124
 
103
- ntdm.size[1].times do |col|
104
- vec = GSL::Vector.new( ntdm.column(col) ).row
105
- doc_list[col].lsi_vector = vec
106
- doc_list[col].lsi_norm = vec.normalize
107
- end
125
+ if $GSL
126
+ tdm = GSL::Matrix.new(*tda).trans
127
+ ntdm = build_reduced_matrix(tdm, cutoff)
128
+
129
+ ntdm.size[1].times do |col|
130
+ vec = GSL::Vector.new( ntdm.column(col) ).row
131
+ doc_list[col].lsi_vector = vec
132
+ doc_list[col].lsi_norm = vec.normalize
133
+ end
134
+ else
135
+ tdm = Matrix.rows(tda).trans
136
+ ntdm = build_reduced_matrix(tdm, cutoff)
108
137
 
138
+ ntdm.row_size.times do |col|
139
+ doc_list[col].lsi_vector = ntdm.column(col) if doc_list[col]
140
+ doc_list[col].lsi_norm = ntdm.column(col).normalize if doc_list[col]
141
+ end
142
+ end
143
+
109
144
  @built_at_version = @version
110
145
  end
111
-
146
+
147
+ # This method returns max_chunks entries, ordered by their average semantic rating.
148
+ # Essentially, the average distance of each entry from all other entries is calculated,
149
+ # the highest are returned.
150
+ #
151
+ # This can be used to build a summary service, or to provide more information about
152
+ # your dataset's general content. For example, if you were to use categorize on the
153
+ # results of this data, you could gather information on what your dataset is generally
154
+ # about.
155
+ def highest_relative_content( max_chunks=10 )
156
+ return [] if needs_rebuild?
157
+
158
+ avg_density = Hash.new
159
+ @items.each_key { |x| avg_density[x] = proximity_array_for_content(x).inject(0.0) { |x,y| x + y[1]} }
160
+
161
+ avg_density.keys.sort_by { |x| avg_density[x] }.reverse[0..max_chunks-1].map
162
+ end
163
+
112
164
  # This function is the primitive that find_related and classify
113
165
  # build upon. It returns an array of 2-element arrays. The first element
114
166
  # of this array is a document, and the second is its "score", defining
@@ -123,11 +175,15 @@ module Classifier
123
175
  # text data. See add_item for examples of how this works.
124
176
  def proximity_array_for_content( doc, &block )
125
177
  return [] if needs_rebuild?
126
-
178
+
127
179
  content_node = node_for_content( doc, &block )
128
180
  result =
129
181
  @items.keys.collect do |item|
130
- val = content_node.search_vector * @items[item].search_vector.col
182
+ if $GSL
183
+ val = content_node.search_vector * @items[item].search_vector.col
184
+ else
185
+ val = (Matrix[content_node.search_vector] * @items[item].search_vector)[0]
186
+ end
131
187
  [item, val]
132
188
  end
133
189
  result.sort_by { |x| x[1] }.reverse
@@ -144,7 +200,11 @@ module Classifier
144
200
  content_node = node_for_content( doc, &block )
145
201
  result =
146
202
  @items.keys.collect do |item|
147
- val = content_node.search_norm * @items[item].search_norm.col
203
+ if $GSL
204
+ val = content_node.search_norm * @items[item].search_norm.col
205
+ else
206
+ val = (Matrix[content_node.search_norm] * @items[item].search_norm)[0]
207
+ end
148
208
  [item, val]
149
209
  end
150
210
  result.sort_by { |x| x[1] }.reverse
@@ -159,9 +219,7 @@ module Classifier
159
219
  # it is actually the same algorithm, just applied on a smaller document.
160
220
  def search( string, max_nearest=3 )
161
221
  return [] if needs_rebuild?
162
-
163
- carry =
164
- proximity_norms_for_content( string )
222
+ carry = proximity_norms_for_content( string )
165
223
  result = carry.collect { |x| x[0] }
166
224
  return result[0..max_nearest-1]
167
225
  end
@@ -208,29 +266,44 @@ module Classifier
208
266
  return ranking[-1]
209
267
  end
210
268
 
269
+ # Prototype, only works on indexed documents.
270
+ # I have no clue if this is going to work, but in theory
271
+ # it's supposed to.
272
+ def highest_ranked_stems( doc, count=3 )
273
+ raise "Requested stem ranking on non-indexed content!" unless @items[doc]
274
+ arr = node_for_content(doc).lsi_vector.to_a
275
+ top_n = arr.sort.reverse[0..count-1]
276
+ return top_n.collect { |x| @word_list.word_for_index(arr.index(x))}
277
+ end
278
+
211
279
  private
212
280
  def build_reduced_matrix( matrix, cutoff=0.75 )
213
281
  # TODO: Check that M>=N on these dimensions! Transpose helps assure this
214
282
  u, v, s = matrix.SV_decomp
283
+
215
284
  # TODO: Better than 75% term, please. :\
216
285
  s_cutoff = s.sort.reverse[(s.size * cutoff).round - 1]
217
286
  s.size.times do |ord|
218
287
  s[ord] = 0.0 if s[ord] < s_cutoff
219
288
  end
220
-
221
289
  # Reconstruct the term document matrix, only with reduced rank
222
- u * Matrix.diagonal( s ) * v.trans
290
+ u * Matrix.diag( s ) * v.trans
223
291
  end
224
292
 
225
293
  def node_for_content(item, &block)
226
294
  if @items[item]
227
295
  return @items[item]
228
296
  else
229
- cn = ContentNode.new(item, &block) # make the node and extract the data
230
- cn.raw_vector_with( @word_list ) # make the lsi raw and norm vectors
231
- end
297
+ clean_word_hash = block ? block.call(item).clean_word_hash : item.to_s.clean_word_hash
232
298
 
233
- cn
299
+ cn = ContentNode.new(clean_word_hash, &block) # make the node and extract the data
300
+
301
+ unless needs_rebuild?
302
+ cn.raw_vector_with( @word_list ) # make the lsi raw and norm vectors
303
+ end
304
+ end
305
+
306
+ return cn
234
307
  end
235
308
 
236
309
  def make_word_list
@@ -243,6 +316,3 @@ module Classifier
243
316
  end
244
317
  end
245
318
 
246
- rescue LoadError
247
- $stderr.puts "For LSI support, you need to install http://rb-gsl.rubyforge.org/"
248
- end