classifier 1.2.0 → 1.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (96) hide show
  1. data/LICENSE +361 -273
  2. data/README +6 -5
  3. data/Rakefile +12 -2
  4. data/bin/summarize.rb +11 -0
  5. data/doc/classes/Array.html +139 -0
  6. data/doc/classes/Array.src/M000003.html +18 -0
  7. data/doc/classes/Classifier.html +5 -5
  8. data/doc/classes/Classifier/Bayes.html +43 -43
  9. data/doc/classes/Classifier/Bayes.src/{M000023.html → M000038.html} +0 -0
  10. data/doc/classes/Classifier/Bayes.src/{M000024.html → M000039.html} +0 -0
  11. data/doc/classes/Classifier/Bayes.src/{M000025.html → M000040.html} +0 -0
  12. data/doc/classes/Classifier/Bayes.src/{M000026.html → M000041.html} +0 -0
  13. data/doc/classes/Classifier/Bayes.src/{M000027.html → M000042.html} +0 -0
  14. data/doc/classes/Classifier/Bayes.src/{M000028.html → M000043.html} +0 -0
  15. data/doc/classes/Classifier/Bayes.src/{M000029.html → M000044.html} +0 -0
  16. data/doc/classes/Classifier/ContentNode.html +23 -28
  17. data/doc/classes/Classifier/ContentNode.src/M000046.html +19 -0
  18. data/doc/classes/Classifier/ContentNode.src/{M000032.html → M000047.html} +1 -1
  19. data/doc/classes/Classifier/ContentNode.src/{M000033.html → M000048.html} +1 -1
  20. data/doc/classes/Classifier/ContentNode.src/M000049.html +49 -0
  21. data/doc/classes/Classifier/LSI.html +158 -68
  22. data/doc/classes/Classifier/LSI.src/M000022.html +6 -17
  23. data/doc/classes/Classifier/LSI.src/{M000012.html → M000023.html} +2 -2
  24. data/doc/classes/Classifier/LSI.src/{M000013.html → M000024.html} +3 -2
  25. data/doc/classes/Classifier/LSI.src/{M000014.html → M000025.html} +1 -1
  26. data/doc/classes/Classifier/LSI.src/M000026.html +19 -0
  27. data/doc/classes/Classifier/LSI.src/{M000015.html → M000027.html} +1 -1
  28. data/doc/classes/Classifier/LSI.src/{M000016.html → M000028.html} +1 -1
  29. data/doc/classes/Classifier/LSI.src/M000029.html +19 -0
  30. data/doc/classes/Classifier/LSI.src/M000030.html +43 -0
  31. data/doc/classes/Classifier/LSI.src/M000031.html +23 -0
  32. data/doc/classes/Classifier/LSI.src/{M000018.html → M000032.html} +7 -3
  33. data/doc/classes/Classifier/LSI.src/{M000019.html → M000033.html} +6 -2
  34. data/doc/classes/Classifier/LSI.src/{M000020.html → M000034.html} +2 -4
  35. data/doc/classes/Classifier/LSI.src/{M000021.html → M000035.html} +1 -1
  36. data/doc/classes/Classifier/LSI.src/M000036.html +31 -0
  37. data/doc/classes/Classifier/LSI.src/M000037.html +21 -0
  38. data/doc/classes/Classifier/WordList.html +37 -22
  39. data/doc/classes/Classifier/WordList.src/{M000007.html → M000017.html} +2 -2
  40. data/doc/classes/Classifier/WordList.src/{M000008.html → M000018.html} +1 -1
  41. data/doc/classes/Classifier/WordList.src/{M000009.html → M000019.html} +1 -1
  42. data/doc/classes/Classifier/WordList.src/M000020.html +18 -0
  43. data/doc/classes/Classifier/WordList.src/{M000010.html → M000021.html} +1 -1
  44. data/doc/classes/GSL.html +2 -1
  45. data/doc/classes/GSL/Matrix.html +126 -0
  46. data/doc/classes/GSL/Vector.html +10 -10
  47. data/doc/classes/GSL/Vector.src/{M000005.html → M000015.html} +0 -0
  48. data/doc/classes/GSL/Vector.src/{M000006.html → M000016.html} +0 -0
  49. data/doc/classes/Matrix.html +184 -0
  50. data/doc/classes/Matrix.src/M000004.html +18 -0
  51. data/doc/classes/Matrix.src/M000005.html +76 -0
  52. data/doc/classes/Matrix.src/M000006.html +18 -0
  53. data/doc/classes/Object.html +7 -7
  54. data/doc/classes/Object.src/{M000001.html → M000007.html} +1 -1
  55. data/doc/classes/String.html +90 -20
  56. data/doc/classes/String.src/{M000002.html → M000008.html} +0 -0
  57. data/doc/classes/String.src/{M000003.html → M000009.html} +0 -0
  58. data/doc/classes/String.src/{M000004.html → M000010.html} +0 -0
  59. data/doc/classes/String.src/M000011.html +18 -0
  60. data/doc/classes/String.src/M000012.html +18 -0
  61. data/doc/classes/String.src/M000013.html +18 -0
  62. data/doc/classes/String.src/M000014.html +18 -0
  63. data/doc/classes/Vector.html +154 -0
  64. data/doc/classes/Vector.src/M000001.html +22 -0
  65. data/doc/classes/Vector.src/M000002.html +25 -0
  66. data/doc/created.rid +1 -1
  67. data/doc/files/README.html +14 -8
  68. data/doc/files/lib/classifier/bayes_rb.html +1 -1
  69. data/doc/files/lib/classifier/{string_extensions_rb.html → extensions/string_rb.html} +5 -5
  70. data/doc/files/lib/classifier/extensions/vector_rb.html +120 -0
  71. data/doc/files/lib/classifier/extensions/vector_serialize_rb.html +1 -1
  72. data/doc/files/lib/classifier/extensions/word_hash_rb.html +1 -1
  73. data/doc/files/lib/classifier/lsi/content_node_rb.html +2 -2
  74. data/doc/files/lib/classifier/lsi/summary_rb.html +115 -0
  75. data/doc/files/lib/classifier/{extensions → lsi}/word_list_rb.html +3 -3
  76. data/doc/files/lib/classifier/lsi_rb.html +5 -3
  77. data/doc/files/lib/classifier_rb.html +2 -2
  78. data/doc/fr_class_index.html +4 -0
  79. data/doc/fr_file_index.html +4 -2
  80. data/doc/fr_method_index.html +49 -34
  81. data/doc/index.html +2 -2
  82. data/lib/classifier.rb +1 -1
  83. data/lib/classifier/{string_extensions.rb → extensions/string.rb} +0 -0
  84. data/lib/classifier/extensions/vector.rb +106 -0
  85. data/lib/classifier/extensions/vector_serialize.rb +6 -0
  86. data/lib/classifier/lsi.rb +101 -31
  87. data/lib/classifier/lsi/content_node.rb +28 -23
  88. data/lib/classifier/lsi/summary.rb +31 -0
  89. data/lib/classifier/{extensions → lsi}/word_list.rb +7 -2
  90. data/test/{string_extensions → extensions}/word_hash_test.rb +0 -0
  91. data/test/lsi/lsi_test.rb +36 -1
  92. metadata +68 -41
  93. data/doc/classes/Classifier/ContentNode.src/M000031.html +0 -21
  94. data/doc/classes/Classifier/ContentNode.src/M000034.html +0 -41
  95. data/doc/classes/Classifier/LSI.src/M000011.html +0 -20
  96. data/doc/classes/Classifier/LSI.src/M000017.html +0 -32
@@ -56,7 +56,7 @@
56
56
  </tr>
57
57
  <tr class="top-aligned-row">
58
58
  <td><strong>Last Update:</strong></td>
59
- <td>Sun Apr 24 21:34:06 PDT 2005</td>
59
+ <td>Thu May 05 01:50:06 PDT 2005</td>
60
60
  </tr>
61
61
  </table>
62
62
  </div>
@@ -76,7 +76,7 @@
76
76
  <tr><td valign="top">Copyright:</td><td>Copyright &#169; 2005 David Fayram II
77
77
 
78
78
  </td></tr>
79
- <tr><td valign="top">License:</td><td>GPL
79
+ <tr><td valign="top">License:</td><td>LGPL
80
80
 
81
81
  </td></tr>
82
82
  </table>
@@ -88,9 +88,11 @@
88
88
 
89
89
  <div class="name-list">
90
90
  gsl&nbsp;&nbsp;
91
- classifier/extensions/word_list&nbsp;&nbsp;
92
91
  classifier/extensions/vector_serialize&nbsp;&nbsp;
92
+ classifier/extensions/vector&nbsp;&nbsp;
93
+ classifier/lsi/word_list&nbsp;&nbsp;
93
94
  classifier/lsi/content_node&nbsp;&nbsp;
95
+ classifier/lsi/summary&nbsp;&nbsp;
94
96
  </div>
95
97
  </div>
96
98
 
@@ -56,7 +56,7 @@
56
56
  </tr>
57
57
  <tr class="top-aligned-row">
58
58
  <td><strong>Last Update:</strong></td>
59
- <td>Sun Apr 24 02:08:49 PDT 2005</td>
59
+ <td>Thu May 05 01:21:16 PDT 2005</td>
60
60
  </tr>
61
61
  </table>
62
62
  </div>
@@ -88,7 +88,7 @@
88
88
 
89
89
  <div class="name-list">
90
90
  rubygems&nbsp;&nbsp;
91
- classifier/string_extensions&nbsp;&nbsp;
91
+ classifier/extensions/string&nbsp;&nbsp;
92
92
  classifier/bayes&nbsp;&nbsp;
93
93
  classifier/lsi&nbsp;&nbsp;
94
94
  </div>
@@ -20,15 +20,19 @@
20
20
  <div id="index">
21
21
  <h1 class="section-bar">Classes</h1>
22
22
  <div id="index-entries">
23
+ <a href="classes/Array.html">Array</a><br />
23
24
  <a href="classes/Classifier.html">Classifier</a><br />
24
25
  <a href="classes/Classifier/Bayes.html">Classifier::Bayes</a><br />
25
26
  <a href="classes/Classifier/ContentNode.html">Classifier::ContentNode</a><br />
26
27
  <a href="classes/Classifier/LSI.html">Classifier::LSI</a><br />
27
28
  <a href="classes/Classifier/WordList.html">Classifier::WordList</a><br />
28
29
  <a href="classes/GSL.html">GSL</a><br />
30
+ <a href="classes/GSL/Matrix.html">GSL::Matrix</a><br />
29
31
  <a href="classes/GSL/Vector.html">GSL::Vector</a><br />
32
+ <a href="classes/Matrix.html">Matrix</a><br />
30
33
  <a href="classes/Object.html">Object</a><br />
31
34
  <a href="classes/String.html">String</a><br />
35
+ <a href="classes/Vector.html">Vector</a><br />
32
36
  </div>
33
37
  </div>
34
38
  </body>
@@ -23,12 +23,14 @@
23
23
  <a href="files/README.html">README</a><br />
24
24
  <a href="files/lib/classifier_rb.html">lib/classifier.rb</a><br />
25
25
  <a href="files/lib/classifier/bayes_rb.html">lib/classifier/bayes.rb</a><br />
26
+ <a href="files/lib/classifier/extensions/string_rb.html">lib/classifier/extensions/string.rb</a><br />
27
+ <a href="files/lib/classifier/extensions/vector_rb.html">lib/classifier/extensions/vector.rb</a><br />
26
28
  <a href="files/lib/classifier/extensions/vector_serialize_rb.html">lib/classifier/extensions/vector_serialize.rb</a><br />
27
29
  <a href="files/lib/classifier/extensions/word_hash_rb.html">lib/classifier/extensions/word_hash.rb</a><br />
28
- <a href="files/lib/classifier/extensions/word_list_rb.html">lib/classifier/extensions/word_list.rb</a><br />
29
30
  <a href="files/lib/classifier/lsi_rb.html">lib/classifier/lsi.rb</a><br />
30
31
  <a href="files/lib/classifier/lsi/content_node_rb.html">lib/classifier/lsi/content_node.rb</a><br />
31
- <a href="files/lib/classifier/string_extensions_rb.html">lib/classifier/string_extensions.rb</a><br />
32
+ <a href="files/lib/classifier/lsi/summary_rb.html">lib/classifier/lsi/summary.rb</a><br />
33
+ <a href="files/lib/classifier/lsi/word_list_rb.html">lib/classifier/lsi/word_list.rb</a><br />
32
34
  </div>
33
35
  </div>
34
36
  </body>
@@ -20,40 +20,55 @@
20
20
  <div id="index">
21
21
  <h1 class="section-bar">Methods</h1>
22
22
  <div id="index-entries">
23
- <a href="classes/Classifier/LSI.html#M000014"><< (Classifier::LSI)</a><br />
24
- <a href="classes/Classifier/WordList.html#M000009">[] (Classifier::WordList)</a><br />
25
- <a href="classes/GSL/Vector.html#M000005">_dump (GSL::Vector)</a><br />
26
- <a href="classes/GSL/Vector.html#M000006">_load (GSL::Vector)</a><br />
27
- <a href="classes/Classifier/Bayes.html#M000029">add_category (Classifier::Bayes)</a><br />
28
- <a href="classes/Classifier/LSI.html#M000013">add_item (Classifier::LSI)</a><br />
29
- <a href="classes/Classifier/WordList.html#M000008">add_word (Classifier::WordList)</a><br />
30
- <a href="classes/Classifier/Bayes.html#M000030">append_category (Classifier::Bayes)</a><br />
31
- <a href="classes/Classifier/LSI.html#M000017">build_index (Classifier::LSI)</a><br />
32
- <a href="classes/Classifier/Bayes.html#M000026">classifications (Classifier::Bayes)</a><br />
33
- <a href="classes/Classifier/LSI.html#M000022">classify (Classifier::LSI)</a><br />
34
- <a href="classes/Classifier/Bayes.html#M000027">classify (Classifier::Bayes)</a><br />
35
- <a href="classes/String.html#M000004">clean_word_hash (String)</a><br />
36
- <a href="classes/Classifier/LSI.html#M000021">find_related (Classifier::LSI)</a><br />
37
- <a href="classes/Classifier/LSI.html#M000016">items (Classifier::LSI)</a><br />
38
- <a href="classes/Classifier/Bayes.html#M000028">method_missing (Classifier::Bayes)</a><br />
39
- <a href="classes/Classifier/LSI.html#M000012">needs_rebuild? (Classifier::LSI)</a><br />
40
- <a href="classes/Classifier/Bayes.html#M000023">new (Classifier::Bayes)</a><br />
41
- <a href="classes/Classifier/LSI.html#M000011">new (Classifier::LSI)</a><br />
42
- <a href="classes/Classifier/ContentNode.html#M000031">new (Classifier::ContentNode)</a><br />
43
- <a href="classes/Classifier/WordList.html#M000007">new (Classifier::WordList)</a><br />
44
- <a href="classes/Object.html#M000001">prepare_category_name (Object)</a><br />
45
- <a href="classes/Classifier/LSI.html#M000018">proximity_array_for_content (Classifier::LSI)</a><br />
46
- <a href="classes/Classifier/LSI.html#M000019">proximity_norms_for_content (Classifier::LSI)</a><br />
47
- <a href="classes/Classifier/ContentNode.html#M000034">raw_vector_with (Classifier::ContentNode)</a><br />
48
- <a href="classes/Classifier/LSI.html#M000015">remove_item (Classifier::LSI)</a><br />
49
- <a href="classes/Classifier/LSI.html#M000020">search (Classifier::LSI)</a><br />
50
- <a href="classes/Classifier/ContentNode.html#M000033">search_norm (Classifier::ContentNode)</a><br />
51
- <a href="classes/Classifier/ContentNode.html#M000032">search_vector (Classifier::ContentNode)</a><br />
52
- <a href="classes/Classifier/WordList.html#M000010">size (Classifier::WordList)</a><br />
53
- <a href="classes/Classifier/Bayes.html#M000024">train (Classifier::Bayes)</a><br />
54
- <a href="classes/Classifier/Bayes.html#M000025">untrain (Classifier::Bayes)</a><br />
55
- <a href="classes/String.html#M000002">without_punctuation (String)</a><br />
56
- <a href="classes/String.html#M000003">word_hash (String)</a><br />
23
+ <a href="classes/Classifier/LSI.html#M000025"><< (Classifier::LSI)</a><br />
24
+ <a href="classes/Matrix.html#M000005">SV_decomp (Matrix)</a><br />
25
+ <a href="classes/Classifier/WordList.html#M000019">[] (Classifier::WordList)</a><br />
26
+ <a href="classes/Matrix.html#M000006">[]= (Matrix)</a><br />
27
+ <a href="classes/GSL/Vector.html#M000015">_dump (GSL::Vector)</a><br />
28
+ <a href="classes/GSL/Vector.html#M000016">_load (GSL::Vector)</a><br />
29
+ <a href="classes/Classifier/Bayes.html#M000044">add_category (Classifier::Bayes)</a><br />
30
+ <a href="classes/Classifier/LSI.html#M000024">add_item (Classifier::LSI)</a><br />
31
+ <a href="classes/Classifier/WordList.html#M000018">add_word (Classifier::WordList)</a><br />
32
+ <a href="classes/Classifier/Bayes.html#M000045">append_category (Classifier::Bayes)</a><br />
33
+ <a href="classes/Classifier/LSI.html#M000030">build_index (Classifier::LSI)</a><br />
34
+ <a href="classes/Classifier/LSI.html#M000029">categories_for (Classifier::LSI)</a><br />
35
+ <a href="classes/Classifier/LSI.html#M000026">categories_for (Classifier::LSI)</a><br />
36
+ <a href="classes/Classifier/Bayes.html#M000041">classifications (Classifier::Bayes)</a><br />
37
+ <a href="classes/Classifier/LSI.html#M000036">classify (Classifier::LSI)</a><br />
38
+ <a href="classes/Classifier/Bayes.html#M000042">classify (Classifier::Bayes)</a><br />
39
+ <a href="classes/String.html#M000010">clean_word_hash (String)</a><br />
40
+ <a href="classes/Matrix.html#M000004">diag (Matrix)</a><br />
41
+ <a href="classes/Classifier/LSI.html#M000035">find_related (Classifier::LSI)</a><br />
42
+ <a href="classes/Classifier/LSI.html#M000037">highest_ranked_stems (Classifier::LSI)</a><br />
43
+ <a href="classes/Classifier/LSI.html#M000031">highest_relative_content (Classifier::LSI)</a><br />
44
+ <a href="classes/Classifier/LSI.html#M000028">items (Classifier::LSI)</a><br />
45
+ <a href="classes/Vector.html#M000001">magnitude (Vector)</a><br />
46
+ <a href="classes/Classifier/Bayes.html#M000043">method_missing (Classifier::Bayes)</a><br />
47
+ <a href="classes/Classifier/LSI.html#M000023">needs_rebuild? (Classifier::LSI)</a><br />
48
+ <a href="classes/Classifier/ContentNode.html#M000046">new (Classifier::ContentNode)</a><br />
49
+ <a href="classes/Classifier/Bayes.html#M000038">new (Classifier::Bayes)</a><br />
50
+ <a href="classes/Classifier/LSI.html#M000022">new (Classifier::LSI)</a><br />
51
+ <a href="classes/Classifier/WordList.html#M000017">new (Classifier::WordList)</a><br />
52
+ <a href="classes/Vector.html#M000002">normalize (Vector)</a><br />
53
+ <a href="classes/String.html#M000012">paragraph_summary (String)</a><br />
54
+ <a href="classes/Object.html#M000007">prepare_category_name (Object)</a><br />
55
+ <a href="classes/Classifier/LSI.html#M000032">proximity_array_for_content (Classifier::LSI)</a><br />
56
+ <a href="classes/Classifier/LSI.html#M000033">proximity_norms_for_content (Classifier::LSI)</a><br />
57
+ <a href="classes/Classifier/ContentNode.html#M000049">raw_vector_with (Classifier::ContentNode)</a><br />
58
+ <a href="classes/Classifier/LSI.html#M000027">remove_item (Classifier::LSI)</a><br />
59
+ <a href="classes/Classifier/LSI.html#M000034">search (Classifier::LSI)</a><br />
60
+ <a href="classes/Classifier/ContentNode.html#M000048">search_norm (Classifier::ContentNode)</a><br />
61
+ <a href="classes/Classifier/ContentNode.html#M000047">search_vector (Classifier::ContentNode)</a><br />
62
+ <a href="classes/Classifier/WordList.html#M000021">size (Classifier::WordList)</a><br />
63
+ <a href="classes/String.html#M000014">split_paragraphs (String)</a><br />
64
+ <a href="classes/String.html#M000013">split_sentences (String)</a><br />
65
+ <a href="classes/Array.html#M000003">sum (Array)</a><br />
66
+ <a href="classes/String.html#M000011">summary (String)</a><br />
67
+ <a href="classes/Classifier/Bayes.html#M000039">train (Classifier::Bayes)</a><br />
68
+ <a href="classes/Classifier/Bayes.html#M000040">untrain (Classifier::Bayes)</a><br />
69
+ <a href="classes/String.html#M000008">without_punctuation (String)</a><br />
70
+ <a href="classes/Classifier/WordList.html#M000020">word_for_index (Classifier::WordList)</a><br />
71
+ <a href="classes/String.html#M000009">word_hash (String)</a><br />
57
72
  </div>
58
73
  </div>
59
74
  </body>
data/doc/index.html CHANGED
@@ -5,12 +5,12 @@
5
5
 
6
6
  <!--
7
7
 
8
- RDoc Documentation
8
+ Ruby Classifier - Bayesian and LSI classification library
9
9
 
10
10
  -->
11
11
  <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
12
12
  <head>
13
- <title>RDoc Documentation</title>
13
+ <title>Ruby Classifier - Bayesian and LSI classification library</title>
14
14
  <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
15
15
  </head>
16
16
  <frameset rows="20%, 80%">
data/lib/classifier.rb CHANGED
@@ -25,6 +25,6 @@
25
25
  # License:: LGPL
26
26
 
27
27
  require 'rubygems'
28
- require 'classifier/string_extensions'
28
+ require 'classifier/extensions/string'
29
29
  require 'classifier/bayes'
30
30
  require 'classifier/lsi'
@@ -0,0 +1,106 @@
1
+ # Author:: Ernest Ellingson
2
+ # Copyright:: Copyright (c) 2005
3
+
4
+ # These are extensions to the std-lib 'matrix' to allow an all ruby SVD
5
+
6
+ require 'matrix'
7
+ require 'mathn'
8
+
9
+ class Array
10
+ def sum
11
+ inject(0) { |sum,term| sum += term }.to_f
12
+ end
13
+ end
14
+
15
+ class Vector
16
+ def magnitude
17
+ sumsqs = 0.0
18
+ self.size.times do |i|
19
+ sumsqs += self[i] ** 2.0
20
+ end
21
+ Math.sqrt(sumsqs)
22
+ end
23
+ def normalize
24
+ nv = []
25
+ mag = self.magnitude
26
+ self.size.times do |i|
27
+
28
+ nv << (self[i] / mag)
29
+
30
+ end
31
+ Vector[*nv]
32
+ end
33
+ end
34
+
35
+ class Matrix
36
+ def Matrix.diag(s)
37
+ Matrix.diagonal(*s)
38
+ end
39
+
40
+ alias :trans :transpose
41
+
42
+ def SV_decomp(maxSweeps = 20)
43
+ if self.row_size >= self.column_size
44
+ q = self.trans * self
45
+ else
46
+ q = self * self.trans
47
+ end
48
+
49
+ qrot = q.dup
50
+ v = Matrix.identity(q.row_size)
51
+ azrot = nil
52
+ mzrot = nil
53
+ cnt = 0
54
+ s_old = nil
55
+ mu = nil
56
+
57
+ while true do
58
+ cnt += 1
59
+ for row in (0...qrot.row_size-1) do
60
+ for col in (1..qrot.row_size-1) do
61
+ next if row == col
62
+ h = Math.atan((2 * qrot[row,col])/(qrot[row,row]-qrot[col,col]))/2.0
63
+ hcos = Math.cos(h)
64
+ hsin = Math.sin(h)
65
+ mzrot = Matrix.identity(qrot.row_size)
66
+ mzrot[row,row] = hcos
67
+ mzrot[row,col] = -hsin
68
+ mzrot[col,row] = hsin
69
+ mzrot[col,col] = hcos
70
+ qrot = mzrot.trans * qrot * mzrot
71
+ v = v * mzrot
72
+ end
73
+ end
74
+ s_old = qrot.dup if cnt == 1
75
+ sum_qrot = 0.0
76
+ if cnt > 1
77
+ qrot.row_size.times do |r|
78
+ sum_qrot += (qrot[r,r]-s_old[r,r]).abs if (qrot[r,r]-s_old[r,r]).abs > 0.001
79
+ end
80
+ s_old = qrot.dup
81
+ end
82
+ break if (sum_qrot <= 0.001 and cnt > 1) or cnt >= maxSweeps
83
+ end # of do while true
84
+ s = []
85
+ qrot.row_size.times do |r|
86
+ s << Math.sqrt(qrot[r,r])
87
+ end
88
+ #puts "cnt = #{cnt}"
89
+ if self.row_size >= self.column_size
90
+ mu = self * v * Matrix.diagonal(*s).inverse
91
+ return [mu, v, s]
92
+ else
93
+ puts v.row_size
94
+ puts v.column_size
95
+ puts self.row_size
96
+ puts self.column_size
97
+ puts s.size
98
+
99
+ mu = (self.trans * v * Matrix.diagonal(*s).inverse)
100
+ return [mu, v, s]
101
+ end
102
+ end
103
+ def []=(i,j,val)
104
+ @rows[i][j] = val
105
+ end
106
+ end
@@ -11,4 +11,10 @@ module GSL
11
11
  end
12
12
 
13
13
  end
14
+
15
+ class Matrix
16
+ class <<self
17
+ alias :diag :diagonal
18
+ end
19
+ end
14
20
  end
@@ -1,14 +1,22 @@
1
1
  # Author:: David Fayram (mailto:dfayram@lensmen.net)
2
2
  # Copyright:: Copyright (c) 2005 David Fayram II
3
- # License:: GPL
3
+ # License:: LGPL
4
4
 
5
5
  begin
6
-
7
- require 'gsl' # requires http://rb-gsl.rubyforge.org/
8
-
9
- require 'classifier/extensions/word_list'
10
- require 'classifier/extensions/vector_serialize'
6
+ raise LoadError if ENV['NATIVE_VECTOR'] == "true" # to test the native vector class, try `rake test NATIVE_VECTOR=true`
7
+
8
+ require 'gsl' # requires http://rb-gsl.rubyforge.org/
9
+ require 'classifier/extensions/vector_serialize'
10
+ $GSL = true
11
+
12
+ rescue LoadError
13
+ warn "Notice: for 10x faster LSI support, please install http://rb-gsl.rubyforge.org/"
14
+ require 'classifier/extensions/vector'
15
+ end
16
+
17
+ require 'classifier/lsi/word_list'
11
18
  require 'classifier/lsi/content_node'
19
+ require 'classifier/lsi/summary'
12
20
 
13
21
  module Classifier
14
22
 
@@ -18,6 +26,7 @@ module Classifier
18
26
  class LSI
19
27
 
20
28
  attr_reader :word_list
29
+ attr_accessor :auto_rebuild
21
30
 
22
31
  # Create a fresh index.
23
32
  # If you want to call #build_index manually, use
@@ -33,7 +42,7 @@ module Classifier
33
42
  # to be built after all informaton is added, but before you start
34
43
  # using it for search, classification and cluster detection.
35
44
  def needs_rebuild?
36
- @version != @built_at_version
45
+ (@items.keys.size > 1) && (@version != @built_at_version)
37
46
  end
38
47
 
39
48
  # Adds an item to the index. item is assumed to be a string, but
@@ -50,7 +59,8 @@ module Classifier
50
59
  # lsi.add_item ar, *ar.categories { |x| ar.content }
51
60
  #
52
61
  def add_item( item, *categories, &block )
53
- @items[item] = ContentNode.new(item, categories, block)
62
+ clean_word_hash = block ? block.call(item).clean_word_hash : item.to_s.clean_word_hash
63
+ @items[item] = ContentNode.new(clean_word_hash, *categories)
54
64
  @version += 1
55
65
  build_index if @auto_rebuild
56
66
  end
@@ -63,6 +73,13 @@ module Classifier
63
73
  add_item item
64
74
  end
65
75
 
76
+ # Returns the categories for a given indexed items. You are free to add and remove
77
+ # items from this as you see fit. It does not invalide an index to change its categories.
78
+ def categories_for(item)
79
+ return [] unless @items[item]
80
+ return @items[item].categories
81
+ end
82
+
66
83
  # Removes an item from the database, if it is indexed.
67
84
  #
68
85
  def remove_item( item )
@@ -77,6 +94,13 @@ module Classifier
77
94
  @items.keys
78
95
  end
79
96
 
97
+ # Returns the categories for a given indexed items. You are free to add and remove
98
+ # items from this as you see fit. It does not invalide an index to change its categories.
99
+ def categories_for(item)
100
+ return [] unless @items[item]
101
+ return @items[item].categories
102
+ end
103
+
80
104
  # This function rebuilds the index if needs_rebuild? returns true.
81
105
  # For very large document spaces, this indexing operation may take some
82
106
  # time to complete, so it may be wise to place the operation in another
@@ -97,18 +121,46 @@ module Classifier
97
121
 
98
122
  doc_list = @items.values
99
123
  tda = doc_list.collect { |node| node.raw_vector_with( @word_list ) }
100
- tdm = GSL::Matrix.new( *tda ).trans
101
- ntdm = build_reduced_matrix(tdm, cutoff)
102
124
 
103
- ntdm.size[1].times do |col|
104
- vec = GSL::Vector.new( ntdm.column(col) ).row
105
- doc_list[col].lsi_vector = vec
106
- doc_list[col].lsi_norm = vec.normalize
107
- end
125
+ if $GSL
126
+ tdm = GSL::Matrix.new(*tda).trans
127
+ ntdm = build_reduced_matrix(tdm, cutoff)
128
+
129
+ ntdm.size[1].times do |col|
130
+ vec = GSL::Vector.new( ntdm.column(col) ).row
131
+ doc_list[col].lsi_vector = vec
132
+ doc_list[col].lsi_norm = vec.normalize
133
+ end
134
+ else
135
+ tdm = Matrix.rows(tda).trans
136
+ ntdm = build_reduced_matrix(tdm, cutoff)
108
137
 
138
+ ntdm.row_size.times do |col|
139
+ doc_list[col].lsi_vector = ntdm.column(col) if doc_list[col]
140
+ doc_list[col].lsi_norm = ntdm.column(col).normalize if doc_list[col]
141
+ end
142
+ end
143
+
109
144
  @built_at_version = @version
110
145
  end
111
-
146
+
147
+ # This method returns max_chunks entries, ordered by their average semantic rating.
148
+ # Essentially, the average distance of each entry from all other entries is calculated,
149
+ # the highest are returned.
150
+ #
151
+ # This can be used to build a summary service, or to provide more information about
152
+ # your dataset's general content. For example, if you were to use categorize on the
153
+ # results of this data, you could gather information on what your dataset is generally
154
+ # about.
155
+ def highest_relative_content( max_chunks=10 )
156
+ return [] if needs_rebuild?
157
+
158
+ avg_density = Hash.new
159
+ @items.each_key { |x| avg_density[x] = proximity_array_for_content(x).inject(0.0) { |x,y| x + y[1]} }
160
+
161
+ avg_density.keys.sort_by { |x| avg_density[x] }.reverse[0..max_chunks-1].map
162
+ end
163
+
112
164
  # This function is the primitive that find_related and classify
113
165
  # build upon. It returns an array of 2-element arrays. The first element
114
166
  # of this array is a document, and the second is its "score", defining
@@ -123,11 +175,15 @@ module Classifier
123
175
  # text data. See add_item for examples of how this works.
124
176
  def proximity_array_for_content( doc, &block )
125
177
  return [] if needs_rebuild?
126
-
178
+
127
179
  content_node = node_for_content( doc, &block )
128
180
  result =
129
181
  @items.keys.collect do |item|
130
- val = content_node.search_vector * @items[item].search_vector.col
182
+ if $GSL
183
+ val = content_node.search_vector * @items[item].search_vector.col
184
+ else
185
+ val = (Matrix[content_node.search_vector] * @items[item].search_vector)[0]
186
+ end
131
187
  [item, val]
132
188
  end
133
189
  result.sort_by { |x| x[1] }.reverse
@@ -144,7 +200,11 @@ module Classifier
144
200
  content_node = node_for_content( doc, &block )
145
201
  result =
146
202
  @items.keys.collect do |item|
147
- val = content_node.search_norm * @items[item].search_norm.col
203
+ if $GSL
204
+ val = content_node.search_norm * @items[item].search_norm.col
205
+ else
206
+ val = (Matrix[content_node.search_norm] * @items[item].search_norm)[0]
207
+ end
148
208
  [item, val]
149
209
  end
150
210
  result.sort_by { |x| x[1] }.reverse
@@ -159,9 +219,7 @@ module Classifier
159
219
  # it is actually the same algorithm, just applied on a smaller document.
160
220
  def search( string, max_nearest=3 )
161
221
  return [] if needs_rebuild?
162
-
163
- carry =
164
- proximity_norms_for_content( string )
222
+ carry = proximity_norms_for_content( string )
165
223
  result = carry.collect { |x| x[0] }
166
224
  return result[0..max_nearest-1]
167
225
  end
@@ -208,29 +266,44 @@ module Classifier
208
266
  return ranking[-1]
209
267
  end
210
268
 
269
+ # Prototype, only works on indexed documents.
270
+ # I have no clue if this is going to work, but in theory
271
+ # it's supposed to.
272
+ def highest_ranked_stems( doc, count=3 )
273
+ raise "Requested stem ranking on non-indexed content!" unless @items[doc]
274
+ arr = node_for_content(doc).lsi_vector.to_a
275
+ top_n = arr.sort.reverse[0..count-1]
276
+ return top_n.collect { |x| @word_list.word_for_index(arr.index(x))}
277
+ end
278
+
211
279
  private
212
280
  def build_reduced_matrix( matrix, cutoff=0.75 )
213
281
  # TODO: Check that M>=N on these dimensions! Transpose helps assure this
214
282
  u, v, s = matrix.SV_decomp
283
+
215
284
  # TODO: Better than 75% term, please. :\
216
285
  s_cutoff = s.sort.reverse[(s.size * cutoff).round - 1]
217
286
  s.size.times do |ord|
218
287
  s[ord] = 0.0 if s[ord] < s_cutoff
219
288
  end
220
-
221
289
  # Reconstruct the term document matrix, only with reduced rank
222
- u * Matrix.diagonal( s ) * v.trans
290
+ u * Matrix.diag( s ) * v.trans
223
291
  end
224
292
 
225
293
  def node_for_content(item, &block)
226
294
  if @items[item]
227
295
  return @items[item]
228
296
  else
229
- cn = ContentNode.new(item, &block) # make the node and extract the data
230
- cn.raw_vector_with( @word_list ) # make the lsi raw and norm vectors
231
- end
297
+ clean_word_hash = block ? block.call(item).clean_word_hash : item.to_s.clean_word_hash
232
298
 
233
- cn
299
+ cn = ContentNode.new(clean_word_hash, &block) # make the node and extract the data
300
+
301
+ unless needs_rebuild?
302
+ cn.raw_vector_with( @word_list ) # make the lsi raw and norm vectors
303
+ end
304
+ end
305
+
306
+ return cn
234
307
  end
235
308
 
236
309
  def make_word_list
@@ -243,6 +316,3 @@ module Classifier
243
316
  end
244
317
  end
245
318
 
246
- rescue LoadError
247
- $stderr.puts "For LSI support, you need to install http://rb-gsl.rubyforge.org/"
248
- end