classifier 1.2.0 → 1.3.0
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +361 -273
- data/README +6 -5
- data/Rakefile +12 -2
- data/bin/summarize.rb +11 -0
- data/doc/classes/Array.html +139 -0
- data/doc/classes/Array.src/M000003.html +18 -0
- data/doc/classes/Classifier.html +5 -5
- data/doc/classes/Classifier/Bayes.html +43 -43
- data/doc/classes/Classifier/Bayes.src/{M000023.html → M000038.html} +0 -0
- data/doc/classes/Classifier/Bayes.src/{M000024.html → M000039.html} +0 -0
- data/doc/classes/Classifier/Bayes.src/{M000025.html → M000040.html} +0 -0
- data/doc/classes/Classifier/Bayes.src/{M000026.html → M000041.html} +0 -0
- data/doc/classes/Classifier/Bayes.src/{M000027.html → M000042.html} +0 -0
- data/doc/classes/Classifier/Bayes.src/{M000028.html → M000043.html} +0 -0
- data/doc/classes/Classifier/Bayes.src/{M000029.html → M000044.html} +0 -0
- data/doc/classes/Classifier/ContentNode.html +23 -28
- data/doc/classes/Classifier/ContentNode.src/M000046.html +19 -0
- data/doc/classes/Classifier/ContentNode.src/{M000032.html → M000047.html} +1 -1
- data/doc/classes/Classifier/ContentNode.src/{M000033.html → M000048.html} +1 -1
- data/doc/classes/Classifier/ContentNode.src/M000049.html +49 -0
- data/doc/classes/Classifier/LSI.html +158 -68
- data/doc/classes/Classifier/LSI.src/M000022.html +6 -17
- data/doc/classes/Classifier/LSI.src/{M000012.html → M000023.html} +2 -2
- data/doc/classes/Classifier/LSI.src/{M000013.html → M000024.html} +3 -2
- data/doc/classes/Classifier/LSI.src/{M000014.html → M000025.html} +1 -1
- data/doc/classes/Classifier/LSI.src/M000026.html +19 -0
- data/doc/classes/Classifier/LSI.src/{M000015.html → M000027.html} +1 -1
- data/doc/classes/Classifier/LSI.src/{M000016.html → M000028.html} +1 -1
- data/doc/classes/Classifier/LSI.src/M000029.html +19 -0
- data/doc/classes/Classifier/LSI.src/M000030.html +43 -0
- data/doc/classes/Classifier/LSI.src/M000031.html +23 -0
- data/doc/classes/Classifier/LSI.src/{M000018.html → M000032.html} +7 -3
- data/doc/classes/Classifier/LSI.src/{M000019.html → M000033.html} +6 -2
- data/doc/classes/Classifier/LSI.src/{M000020.html → M000034.html} +2 -4
- data/doc/classes/Classifier/LSI.src/{M000021.html → M000035.html} +1 -1
- data/doc/classes/Classifier/LSI.src/M000036.html +31 -0
- data/doc/classes/Classifier/LSI.src/M000037.html +21 -0
- data/doc/classes/Classifier/WordList.html +37 -22
- data/doc/classes/Classifier/WordList.src/{M000007.html → M000017.html} +2 -2
- data/doc/classes/Classifier/WordList.src/{M000008.html → M000018.html} +1 -1
- data/doc/classes/Classifier/WordList.src/{M000009.html → M000019.html} +1 -1
- data/doc/classes/Classifier/WordList.src/M000020.html +18 -0
- data/doc/classes/Classifier/WordList.src/{M000010.html → M000021.html} +1 -1
- data/doc/classes/GSL.html +2 -1
- data/doc/classes/GSL/Matrix.html +126 -0
- data/doc/classes/GSL/Vector.html +10 -10
- data/doc/classes/GSL/Vector.src/{M000005.html → M000015.html} +0 -0
- data/doc/classes/GSL/Vector.src/{M000006.html → M000016.html} +0 -0
- data/doc/classes/Matrix.html +184 -0
- data/doc/classes/Matrix.src/M000004.html +18 -0
- data/doc/classes/Matrix.src/M000005.html +76 -0
- data/doc/classes/Matrix.src/M000006.html +18 -0
- data/doc/classes/Object.html +7 -7
- data/doc/classes/Object.src/{M000001.html → M000007.html} +1 -1
- data/doc/classes/String.html +90 -20
- data/doc/classes/String.src/{M000002.html → M000008.html} +0 -0
- data/doc/classes/String.src/{M000003.html → M000009.html} +0 -0
- data/doc/classes/String.src/{M000004.html → M000010.html} +0 -0
- data/doc/classes/String.src/M000011.html +18 -0
- data/doc/classes/String.src/M000012.html +18 -0
- data/doc/classes/String.src/M000013.html +18 -0
- data/doc/classes/String.src/M000014.html +18 -0
- data/doc/classes/Vector.html +154 -0
- data/doc/classes/Vector.src/M000001.html +22 -0
- data/doc/classes/Vector.src/M000002.html +25 -0
- data/doc/created.rid +1 -1
- data/doc/files/README.html +14 -8
- data/doc/files/lib/classifier/bayes_rb.html +1 -1
- data/doc/files/lib/classifier/{string_extensions_rb.html → extensions/string_rb.html} +5 -5
- data/doc/files/lib/classifier/extensions/vector_rb.html +120 -0
- data/doc/files/lib/classifier/extensions/vector_serialize_rb.html +1 -1
- data/doc/files/lib/classifier/extensions/word_hash_rb.html +1 -1
- data/doc/files/lib/classifier/lsi/content_node_rb.html +2 -2
- data/doc/files/lib/classifier/lsi/summary_rb.html +115 -0
- data/doc/files/lib/classifier/{extensions → lsi}/word_list_rb.html +3 -3
- data/doc/files/lib/classifier/lsi_rb.html +5 -3
- data/doc/files/lib/classifier_rb.html +2 -2
- data/doc/fr_class_index.html +4 -0
- data/doc/fr_file_index.html +4 -2
- data/doc/fr_method_index.html +49 -34
- data/doc/index.html +2 -2
- data/lib/classifier.rb +1 -1
- data/lib/classifier/{string_extensions.rb → extensions/string.rb} +0 -0
- data/lib/classifier/extensions/vector.rb +106 -0
- data/lib/classifier/extensions/vector_serialize.rb +6 -0
- data/lib/classifier/lsi.rb +101 -31
- data/lib/classifier/lsi/content_node.rb +28 -23
- data/lib/classifier/lsi/summary.rb +31 -0
- data/lib/classifier/{extensions → lsi}/word_list.rb +7 -2
- data/test/{string_extensions → extensions}/word_hash_test.rb +0 -0
- data/test/lsi/lsi_test.rb +36 -1
- metadata +68 -41
- data/doc/classes/Classifier/ContentNode.src/M000031.html +0 -21
- data/doc/classes/Classifier/ContentNode.src/M000034.html +0 -41
- data/doc/classes/Classifier/LSI.src/M000011.html +0 -20
- data/doc/classes/Classifier/LSI.src/M000017.html +0 -32
@@ -56,7 +56,7 @@
|
|
56
56
|
</tr>
|
57
57
|
<tr class="top-aligned-row">
|
58
58
|
<td><strong>Last Update:</strong></td>
|
59
|
-
<td>
|
59
|
+
<td>Thu May 05 01:50:06 PDT 2005</td>
|
60
60
|
</tr>
|
61
61
|
</table>
|
62
62
|
</div>
|
@@ -76,7 +76,7 @@
|
|
76
76
|
<tr><td valign="top">Copyright:</td><td>Copyright © 2005 David Fayram II
|
77
77
|
|
78
78
|
</td></tr>
|
79
|
-
<tr><td valign="top">License:</td><td>
|
79
|
+
<tr><td valign="top">License:</td><td>LGPL
|
80
80
|
|
81
81
|
</td></tr>
|
82
82
|
</table>
|
@@ -88,9 +88,11 @@
|
|
88
88
|
|
89
89
|
<div class="name-list">
|
90
90
|
gsl
|
91
|
-
classifier/extensions/word_list
|
92
91
|
classifier/extensions/vector_serialize
|
92
|
+
classifier/extensions/vector
|
93
|
+
classifier/lsi/word_list
|
93
94
|
classifier/lsi/content_node
|
95
|
+
classifier/lsi/summary
|
94
96
|
</div>
|
95
97
|
</div>
|
96
98
|
|
@@ -56,7 +56,7 @@
|
|
56
56
|
</tr>
|
57
57
|
<tr class="top-aligned-row">
|
58
58
|
<td><strong>Last Update:</strong></td>
|
59
|
-
<td>
|
59
|
+
<td>Thu May 05 01:21:16 PDT 2005</td>
|
60
60
|
</tr>
|
61
61
|
</table>
|
62
62
|
</div>
|
@@ -88,7 +88,7 @@
|
|
88
88
|
|
89
89
|
<div class="name-list">
|
90
90
|
rubygems
|
91
|
-
classifier/
|
91
|
+
classifier/extensions/string
|
92
92
|
classifier/bayes
|
93
93
|
classifier/lsi
|
94
94
|
</div>
|
data/doc/fr_class_index.html
CHANGED
@@ -20,15 +20,19 @@
|
|
20
20
|
<div id="index">
|
21
21
|
<h1 class="section-bar">Classes</h1>
|
22
22
|
<div id="index-entries">
|
23
|
+
<a href="classes/Array.html">Array</a><br />
|
23
24
|
<a href="classes/Classifier.html">Classifier</a><br />
|
24
25
|
<a href="classes/Classifier/Bayes.html">Classifier::Bayes</a><br />
|
25
26
|
<a href="classes/Classifier/ContentNode.html">Classifier::ContentNode</a><br />
|
26
27
|
<a href="classes/Classifier/LSI.html">Classifier::LSI</a><br />
|
27
28
|
<a href="classes/Classifier/WordList.html">Classifier::WordList</a><br />
|
28
29
|
<a href="classes/GSL.html">GSL</a><br />
|
30
|
+
<a href="classes/GSL/Matrix.html">GSL::Matrix</a><br />
|
29
31
|
<a href="classes/GSL/Vector.html">GSL::Vector</a><br />
|
32
|
+
<a href="classes/Matrix.html">Matrix</a><br />
|
30
33
|
<a href="classes/Object.html">Object</a><br />
|
31
34
|
<a href="classes/String.html">String</a><br />
|
35
|
+
<a href="classes/Vector.html">Vector</a><br />
|
32
36
|
</div>
|
33
37
|
</div>
|
34
38
|
</body>
|
data/doc/fr_file_index.html
CHANGED
@@ -23,12 +23,14 @@
|
|
23
23
|
<a href="files/README.html">README</a><br />
|
24
24
|
<a href="files/lib/classifier_rb.html">lib/classifier.rb</a><br />
|
25
25
|
<a href="files/lib/classifier/bayes_rb.html">lib/classifier/bayes.rb</a><br />
|
26
|
+
<a href="files/lib/classifier/extensions/string_rb.html">lib/classifier/extensions/string.rb</a><br />
|
27
|
+
<a href="files/lib/classifier/extensions/vector_rb.html">lib/classifier/extensions/vector.rb</a><br />
|
26
28
|
<a href="files/lib/classifier/extensions/vector_serialize_rb.html">lib/classifier/extensions/vector_serialize.rb</a><br />
|
27
29
|
<a href="files/lib/classifier/extensions/word_hash_rb.html">lib/classifier/extensions/word_hash.rb</a><br />
|
28
|
-
<a href="files/lib/classifier/extensions/word_list_rb.html">lib/classifier/extensions/word_list.rb</a><br />
|
29
30
|
<a href="files/lib/classifier/lsi_rb.html">lib/classifier/lsi.rb</a><br />
|
30
31
|
<a href="files/lib/classifier/lsi/content_node_rb.html">lib/classifier/lsi/content_node.rb</a><br />
|
31
|
-
<a href="files/lib/classifier/
|
32
|
+
<a href="files/lib/classifier/lsi/summary_rb.html">lib/classifier/lsi/summary.rb</a><br />
|
33
|
+
<a href="files/lib/classifier/lsi/word_list_rb.html">lib/classifier/lsi/word_list.rb</a><br />
|
32
34
|
</div>
|
33
35
|
</div>
|
34
36
|
</body>
|
data/doc/fr_method_index.html
CHANGED
@@ -20,40 +20,55 @@
|
|
20
20
|
<div id="index">
|
21
21
|
<h1 class="section-bar">Methods</h1>
|
22
22
|
<div id="index-entries">
|
23
|
-
<a href="classes/Classifier/LSI.html#
|
24
|
-
<a href="classes/
|
25
|
-
<a href="classes/
|
26
|
-
<a href="classes/
|
27
|
-
<a href="classes/
|
28
|
-
<a href="classes/
|
29
|
-
<a href="classes/Classifier/
|
30
|
-
<a href="classes/Classifier/
|
31
|
-
<a href="classes/Classifier/
|
32
|
-
<a href="classes/Classifier/Bayes.html#
|
33
|
-
<a href="classes/Classifier/LSI.html#
|
34
|
-
<a href="classes/Classifier/
|
35
|
-
<a href="classes/
|
36
|
-
<a href="classes/Classifier/
|
37
|
-
<a href="classes/Classifier/LSI.html#
|
38
|
-
<a href="classes/Classifier/Bayes.html#
|
39
|
-
<a href="classes/
|
40
|
-
<a href="classes/
|
41
|
-
<a href="classes/Classifier/LSI.html#
|
42
|
-
<a href="classes/Classifier/
|
43
|
-
<a href="classes/Classifier/
|
44
|
-
<a href="classes/
|
45
|
-
<a href="classes/
|
46
|
-
<a href="classes/Classifier/
|
47
|
-
<a href="classes/Classifier/
|
48
|
-
<a href="classes/Classifier/
|
49
|
-
<a href="classes/Classifier/
|
50
|
-
<a href="classes/Classifier/
|
51
|
-
<a href="classes/Classifier/
|
52
|
-
<a href="classes/
|
53
|
-
<a href="classes/
|
54
|
-
<a href="classes/
|
55
|
-
<a href="classes/
|
56
|
-
<a href="classes/
|
23
|
+
<a href="classes/Classifier/LSI.html#M000025"><< (Classifier::LSI)</a><br />
|
24
|
+
<a href="classes/Matrix.html#M000005">SV_decomp (Matrix)</a><br />
|
25
|
+
<a href="classes/Classifier/WordList.html#M000019">[] (Classifier::WordList)</a><br />
|
26
|
+
<a href="classes/Matrix.html#M000006">[]= (Matrix)</a><br />
|
27
|
+
<a href="classes/GSL/Vector.html#M000015">_dump (GSL::Vector)</a><br />
|
28
|
+
<a href="classes/GSL/Vector.html#M000016">_load (GSL::Vector)</a><br />
|
29
|
+
<a href="classes/Classifier/Bayes.html#M000044">add_category (Classifier::Bayes)</a><br />
|
30
|
+
<a href="classes/Classifier/LSI.html#M000024">add_item (Classifier::LSI)</a><br />
|
31
|
+
<a href="classes/Classifier/WordList.html#M000018">add_word (Classifier::WordList)</a><br />
|
32
|
+
<a href="classes/Classifier/Bayes.html#M000045">append_category (Classifier::Bayes)</a><br />
|
33
|
+
<a href="classes/Classifier/LSI.html#M000030">build_index (Classifier::LSI)</a><br />
|
34
|
+
<a href="classes/Classifier/LSI.html#M000029">categories_for (Classifier::LSI)</a><br />
|
35
|
+
<a href="classes/Classifier/LSI.html#M000026">categories_for (Classifier::LSI)</a><br />
|
36
|
+
<a href="classes/Classifier/Bayes.html#M000041">classifications (Classifier::Bayes)</a><br />
|
37
|
+
<a href="classes/Classifier/LSI.html#M000036">classify (Classifier::LSI)</a><br />
|
38
|
+
<a href="classes/Classifier/Bayes.html#M000042">classify (Classifier::Bayes)</a><br />
|
39
|
+
<a href="classes/String.html#M000010">clean_word_hash (String)</a><br />
|
40
|
+
<a href="classes/Matrix.html#M000004">diag (Matrix)</a><br />
|
41
|
+
<a href="classes/Classifier/LSI.html#M000035">find_related (Classifier::LSI)</a><br />
|
42
|
+
<a href="classes/Classifier/LSI.html#M000037">highest_ranked_stems (Classifier::LSI)</a><br />
|
43
|
+
<a href="classes/Classifier/LSI.html#M000031">highest_relative_content (Classifier::LSI)</a><br />
|
44
|
+
<a href="classes/Classifier/LSI.html#M000028">items (Classifier::LSI)</a><br />
|
45
|
+
<a href="classes/Vector.html#M000001">magnitude (Vector)</a><br />
|
46
|
+
<a href="classes/Classifier/Bayes.html#M000043">method_missing (Classifier::Bayes)</a><br />
|
47
|
+
<a href="classes/Classifier/LSI.html#M000023">needs_rebuild? (Classifier::LSI)</a><br />
|
48
|
+
<a href="classes/Classifier/ContentNode.html#M000046">new (Classifier::ContentNode)</a><br />
|
49
|
+
<a href="classes/Classifier/Bayes.html#M000038">new (Classifier::Bayes)</a><br />
|
50
|
+
<a href="classes/Classifier/LSI.html#M000022">new (Classifier::LSI)</a><br />
|
51
|
+
<a href="classes/Classifier/WordList.html#M000017">new (Classifier::WordList)</a><br />
|
52
|
+
<a href="classes/Vector.html#M000002">normalize (Vector)</a><br />
|
53
|
+
<a href="classes/String.html#M000012">paragraph_summary (String)</a><br />
|
54
|
+
<a href="classes/Object.html#M000007">prepare_category_name (Object)</a><br />
|
55
|
+
<a href="classes/Classifier/LSI.html#M000032">proximity_array_for_content (Classifier::LSI)</a><br />
|
56
|
+
<a href="classes/Classifier/LSI.html#M000033">proximity_norms_for_content (Classifier::LSI)</a><br />
|
57
|
+
<a href="classes/Classifier/ContentNode.html#M000049">raw_vector_with (Classifier::ContentNode)</a><br />
|
58
|
+
<a href="classes/Classifier/LSI.html#M000027">remove_item (Classifier::LSI)</a><br />
|
59
|
+
<a href="classes/Classifier/LSI.html#M000034">search (Classifier::LSI)</a><br />
|
60
|
+
<a href="classes/Classifier/ContentNode.html#M000048">search_norm (Classifier::ContentNode)</a><br />
|
61
|
+
<a href="classes/Classifier/ContentNode.html#M000047">search_vector (Classifier::ContentNode)</a><br />
|
62
|
+
<a href="classes/Classifier/WordList.html#M000021">size (Classifier::WordList)</a><br />
|
63
|
+
<a href="classes/String.html#M000014">split_paragraphs (String)</a><br />
|
64
|
+
<a href="classes/String.html#M000013">split_sentences (String)</a><br />
|
65
|
+
<a href="classes/Array.html#M000003">sum (Array)</a><br />
|
66
|
+
<a href="classes/String.html#M000011">summary (String)</a><br />
|
67
|
+
<a href="classes/Classifier/Bayes.html#M000039">train (Classifier::Bayes)</a><br />
|
68
|
+
<a href="classes/Classifier/Bayes.html#M000040">untrain (Classifier::Bayes)</a><br />
|
69
|
+
<a href="classes/String.html#M000008">without_punctuation (String)</a><br />
|
70
|
+
<a href="classes/Classifier/WordList.html#M000020">word_for_index (Classifier::WordList)</a><br />
|
71
|
+
<a href="classes/String.html#M000009">word_hash (String)</a><br />
|
57
72
|
</div>
|
58
73
|
</div>
|
59
74
|
</body>
|
data/doc/index.html
CHANGED
@@ -5,12 +5,12 @@
|
|
5
5
|
|
6
6
|
<!--
|
7
7
|
|
8
|
-
|
8
|
+
Ruby Classifier - Bayesian and LSI classification library
|
9
9
|
|
10
10
|
-->
|
11
11
|
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
|
12
12
|
<head>
|
13
|
-
<title>
|
13
|
+
<title>Ruby Classifier - Bayesian and LSI classification library</title>
|
14
14
|
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
|
15
15
|
</head>
|
16
16
|
<frameset rows="20%, 80%">
|
data/lib/classifier.rb
CHANGED
File without changes
|
@@ -0,0 +1,106 @@
|
|
1
|
+
# Author:: Ernest Ellingson
|
2
|
+
# Copyright:: Copyright (c) 2005
|
3
|
+
|
4
|
+
# These are extensions to the std-lib 'matrix' to allow an all ruby SVD
|
5
|
+
|
6
|
+
require 'matrix'
|
7
|
+
require 'mathn'
|
8
|
+
|
9
|
+
class Array
|
10
|
+
def sum
|
11
|
+
inject(0) { |sum,term| sum += term }.to_f
|
12
|
+
end
|
13
|
+
end
|
14
|
+
|
15
|
+
class Vector
|
16
|
+
def magnitude
|
17
|
+
sumsqs = 0.0
|
18
|
+
self.size.times do |i|
|
19
|
+
sumsqs += self[i] ** 2.0
|
20
|
+
end
|
21
|
+
Math.sqrt(sumsqs)
|
22
|
+
end
|
23
|
+
def normalize
|
24
|
+
nv = []
|
25
|
+
mag = self.magnitude
|
26
|
+
self.size.times do |i|
|
27
|
+
|
28
|
+
nv << (self[i] / mag)
|
29
|
+
|
30
|
+
end
|
31
|
+
Vector[*nv]
|
32
|
+
end
|
33
|
+
end
|
34
|
+
|
35
|
+
class Matrix
|
36
|
+
def Matrix.diag(s)
|
37
|
+
Matrix.diagonal(*s)
|
38
|
+
end
|
39
|
+
|
40
|
+
alias :trans :transpose
|
41
|
+
|
42
|
+
def SV_decomp(maxSweeps = 20)
|
43
|
+
if self.row_size >= self.column_size
|
44
|
+
q = self.trans * self
|
45
|
+
else
|
46
|
+
q = self * self.trans
|
47
|
+
end
|
48
|
+
|
49
|
+
qrot = q.dup
|
50
|
+
v = Matrix.identity(q.row_size)
|
51
|
+
azrot = nil
|
52
|
+
mzrot = nil
|
53
|
+
cnt = 0
|
54
|
+
s_old = nil
|
55
|
+
mu = nil
|
56
|
+
|
57
|
+
while true do
|
58
|
+
cnt += 1
|
59
|
+
for row in (0...qrot.row_size-1) do
|
60
|
+
for col in (1..qrot.row_size-1) do
|
61
|
+
next if row == col
|
62
|
+
h = Math.atan((2 * qrot[row,col])/(qrot[row,row]-qrot[col,col]))/2.0
|
63
|
+
hcos = Math.cos(h)
|
64
|
+
hsin = Math.sin(h)
|
65
|
+
mzrot = Matrix.identity(qrot.row_size)
|
66
|
+
mzrot[row,row] = hcos
|
67
|
+
mzrot[row,col] = -hsin
|
68
|
+
mzrot[col,row] = hsin
|
69
|
+
mzrot[col,col] = hcos
|
70
|
+
qrot = mzrot.trans * qrot * mzrot
|
71
|
+
v = v * mzrot
|
72
|
+
end
|
73
|
+
end
|
74
|
+
s_old = qrot.dup if cnt == 1
|
75
|
+
sum_qrot = 0.0
|
76
|
+
if cnt > 1
|
77
|
+
qrot.row_size.times do |r|
|
78
|
+
sum_qrot += (qrot[r,r]-s_old[r,r]).abs if (qrot[r,r]-s_old[r,r]).abs > 0.001
|
79
|
+
end
|
80
|
+
s_old = qrot.dup
|
81
|
+
end
|
82
|
+
break if (sum_qrot <= 0.001 and cnt > 1) or cnt >= maxSweeps
|
83
|
+
end # of do while true
|
84
|
+
s = []
|
85
|
+
qrot.row_size.times do |r|
|
86
|
+
s << Math.sqrt(qrot[r,r])
|
87
|
+
end
|
88
|
+
#puts "cnt = #{cnt}"
|
89
|
+
if self.row_size >= self.column_size
|
90
|
+
mu = self * v * Matrix.diagonal(*s).inverse
|
91
|
+
return [mu, v, s]
|
92
|
+
else
|
93
|
+
puts v.row_size
|
94
|
+
puts v.column_size
|
95
|
+
puts self.row_size
|
96
|
+
puts self.column_size
|
97
|
+
puts s.size
|
98
|
+
|
99
|
+
mu = (self.trans * v * Matrix.diagonal(*s).inverse)
|
100
|
+
return [mu, v, s]
|
101
|
+
end
|
102
|
+
end
|
103
|
+
def []=(i,j,val)
|
104
|
+
@rows[i][j] = val
|
105
|
+
end
|
106
|
+
end
|
data/lib/classifier/lsi.rb
CHANGED
@@ -1,14 +1,22 @@
|
|
1
1
|
# Author:: David Fayram (mailto:dfayram@lensmen.net)
|
2
2
|
# Copyright:: Copyright (c) 2005 David Fayram II
|
3
|
-
# License::
|
3
|
+
# License:: LGPL
|
4
4
|
|
5
5
|
begin
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
require 'classifier/extensions/
|
10
|
-
|
6
|
+
raise LoadError if ENV['NATIVE_VECTOR'] == "true" # to test the native vector class, try `rake test NATIVE_VECTOR=true`
|
7
|
+
|
8
|
+
require 'gsl' # requires http://rb-gsl.rubyforge.org/
|
9
|
+
require 'classifier/extensions/vector_serialize'
|
10
|
+
$GSL = true
|
11
|
+
|
12
|
+
rescue LoadError
|
13
|
+
warn "Notice: for 10x faster LSI support, please install http://rb-gsl.rubyforge.org/"
|
14
|
+
require 'classifier/extensions/vector'
|
15
|
+
end
|
16
|
+
|
17
|
+
require 'classifier/lsi/word_list'
|
11
18
|
require 'classifier/lsi/content_node'
|
19
|
+
require 'classifier/lsi/summary'
|
12
20
|
|
13
21
|
module Classifier
|
14
22
|
|
@@ -18,6 +26,7 @@ module Classifier
|
|
18
26
|
class LSI
|
19
27
|
|
20
28
|
attr_reader :word_list
|
29
|
+
attr_accessor :auto_rebuild
|
21
30
|
|
22
31
|
# Create a fresh index.
|
23
32
|
# If you want to call #build_index manually, use
|
@@ -33,7 +42,7 @@ module Classifier
|
|
33
42
|
# to be built after all information is added, but before you start
|
34
43
|
# using it for search, classification and cluster detection.
|
35
44
|
def needs_rebuild?
|
36
|
-
@version != @built_at_version
|
45
|
+
(@items.keys.size > 1) && (@version != @built_at_version)
|
37
46
|
end
|
38
47
|
|
39
48
|
# Adds an item to the index. item is assumed to be a string, but
|
@@ -50,7 +59,8 @@ module Classifier
|
|
50
59
|
# lsi.add_item ar, *ar.categories { |x| ar.content }
|
51
60
|
#
|
52
61
|
def add_item( item, *categories, &block )
|
53
|
-
|
62
|
+
clean_word_hash = block ? block.call(item).clean_word_hash : item.to_s.clean_word_hash
|
63
|
+
@items[item] = ContentNode.new(clean_word_hash, *categories)
|
54
64
|
@version += 1
|
55
65
|
build_index if @auto_rebuild
|
56
66
|
end
|
@@ -63,6 +73,13 @@ module Classifier
|
|
63
73
|
add_item item
|
64
74
|
end
|
65
75
|
|
76
|
+
# Returns the categories for a given indexed item. You are free to add and remove
|
77
|
+
# items from this as you see fit. It does not invalidate an index to change its categories.
|
78
|
+
def categories_for(item)
|
79
|
+
return [] unless @items[item]
|
80
|
+
return @items[item].categories
|
81
|
+
end
|
82
|
+
|
66
83
|
# Removes an item from the database, if it is indexed.
|
67
84
|
#
|
68
85
|
def remove_item( item )
|
@@ -77,6 +94,13 @@ module Classifier
|
|
77
94
|
@items.keys
|
78
95
|
end
|
79
96
|
|
97
|
+
# Returns the categories for a given indexed item. You are free to add and remove
|
98
|
+
# items from this as you see fit. It does not invalidate an index to change its categories.
|
99
|
+
def categories_for(item)
|
100
|
+
return [] unless @items[item]
|
101
|
+
return @items[item].categories
|
102
|
+
end
|
103
|
+
|
80
104
|
# This function rebuilds the index if needs_rebuild? returns true.
|
81
105
|
# For very large document spaces, this indexing operation may take some
|
82
106
|
# time to complete, so it may be wise to place the operation in another
|
@@ -97,18 +121,46 @@ module Classifier
|
|
97
121
|
|
98
122
|
doc_list = @items.values
|
99
123
|
tda = doc_list.collect { |node| node.raw_vector_with( @word_list ) }
|
100
|
-
tdm = GSL::Matrix.new( *tda ).trans
|
101
|
-
ntdm = build_reduced_matrix(tdm, cutoff)
|
102
124
|
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
125
|
+
if $GSL
|
126
|
+
tdm = GSL::Matrix.new(*tda).trans
|
127
|
+
ntdm = build_reduced_matrix(tdm, cutoff)
|
128
|
+
|
129
|
+
ntdm.size[1].times do |col|
|
130
|
+
vec = GSL::Vector.new( ntdm.column(col) ).row
|
131
|
+
doc_list[col].lsi_vector = vec
|
132
|
+
doc_list[col].lsi_norm = vec.normalize
|
133
|
+
end
|
134
|
+
else
|
135
|
+
tdm = Matrix.rows(tda).trans
|
136
|
+
ntdm = build_reduced_matrix(tdm, cutoff)
|
108
137
|
|
138
|
+
ntdm.row_size.times do |col|
|
139
|
+
doc_list[col].lsi_vector = ntdm.column(col) if doc_list[col]
|
140
|
+
doc_list[col].lsi_norm = ntdm.column(col).normalize if doc_list[col]
|
141
|
+
end
|
142
|
+
end
|
143
|
+
|
109
144
|
@built_at_version = @version
|
110
145
|
end
|
111
|
-
|
146
|
+
|
147
|
+
# This method returns max_chunks entries, ordered by their average semantic rating.
|
148
|
+
# Essentially, the average distance of each entry from all other entries is calculated,
|
149
|
+
# the highest are returned.
|
150
|
+
#
|
151
|
+
# This can be used to build a summary service, or to provide more information about
|
152
|
+
# your dataset's general content. For example, if you were to use categorize on the
|
153
|
+
# results of this data, you could gather information on what your dataset is generally
|
154
|
+
# about.
|
155
|
+
def highest_relative_content( max_chunks=10 )
|
156
|
+
return [] if needs_rebuild?
|
157
|
+
|
158
|
+
avg_density = Hash.new
|
159
|
+
@items.each_key { |x| avg_density[x] = proximity_array_for_content(x).inject(0.0) { |x,y| x + y[1]} }
|
160
|
+
|
161
|
+
avg_density.keys.sort_by { |x| avg_density[x] }.reverse[0..max_chunks-1].map
|
162
|
+
end
|
163
|
+
|
112
164
|
# This function is the primitive that find_related and classify
|
113
165
|
# build upon. It returns an array of 2-element arrays. The first element
|
114
166
|
# of this array is a document, and the second is its "score", defining
|
@@ -123,11 +175,15 @@ module Classifier
|
|
123
175
|
# text data. See add_item for examples of how this works.
|
124
176
|
def proximity_array_for_content( doc, &block )
|
125
177
|
return [] if needs_rebuild?
|
126
|
-
|
178
|
+
|
127
179
|
content_node = node_for_content( doc, &block )
|
128
180
|
result =
|
129
181
|
@items.keys.collect do |item|
|
130
|
-
|
182
|
+
if $GSL
|
183
|
+
val = content_node.search_vector * @items[item].search_vector.col
|
184
|
+
else
|
185
|
+
val = (Matrix[content_node.search_vector] * @items[item].search_vector)[0]
|
186
|
+
end
|
131
187
|
[item, val]
|
132
188
|
end
|
133
189
|
result.sort_by { |x| x[1] }.reverse
|
@@ -144,7 +200,11 @@ module Classifier
|
|
144
200
|
content_node = node_for_content( doc, &block )
|
145
201
|
result =
|
146
202
|
@items.keys.collect do |item|
|
147
|
-
|
203
|
+
if $GSL
|
204
|
+
val = content_node.search_norm * @items[item].search_norm.col
|
205
|
+
else
|
206
|
+
val = (Matrix[content_node.search_norm] * @items[item].search_norm)[0]
|
207
|
+
end
|
148
208
|
[item, val]
|
149
209
|
end
|
150
210
|
result.sort_by { |x| x[1] }.reverse
|
@@ -159,9 +219,7 @@ module Classifier
|
|
159
219
|
# it is actually the same algorithm, just applied on a smaller document.
|
160
220
|
def search( string, max_nearest=3 )
|
161
221
|
return [] if needs_rebuild?
|
162
|
-
|
163
|
-
carry =
|
164
|
-
proximity_norms_for_content( string )
|
222
|
+
carry = proximity_norms_for_content( string )
|
165
223
|
result = carry.collect { |x| x[0] }
|
166
224
|
return result[0..max_nearest-1]
|
167
225
|
end
|
@@ -208,29 +266,44 @@ module Classifier
|
|
208
266
|
return ranking[-1]
|
209
267
|
end
|
210
268
|
|
269
|
+
# Prototype, only works on indexed documents.
|
270
|
+
# I have no clue if this is going to work, but in theory
|
271
|
+
# it's supposed to.
|
272
|
+
def highest_ranked_stems( doc, count=3 )
|
273
|
+
raise "Requested stem ranking on non-indexed content!" unless @items[doc]
|
274
|
+
arr = node_for_content(doc).lsi_vector.to_a
|
275
|
+
top_n = arr.sort.reverse[0..count-1]
|
276
|
+
return top_n.collect { |x| @word_list.word_for_index(arr.index(x))}
|
277
|
+
end
|
278
|
+
|
211
279
|
private
|
212
280
|
def build_reduced_matrix( matrix, cutoff=0.75 )
|
213
281
|
# TODO: Check that M>=N on these dimensions! Transpose helps assure this
|
214
282
|
u, v, s = matrix.SV_decomp
|
283
|
+
|
215
284
|
# TODO: Better than 75% term, please. :\
|
216
285
|
s_cutoff = s.sort.reverse[(s.size * cutoff).round - 1]
|
217
286
|
s.size.times do |ord|
|
218
287
|
s[ord] = 0.0 if s[ord] < s_cutoff
|
219
288
|
end
|
220
|
-
|
221
289
|
# Reconstruct the term document matrix, only with reduced rank
|
222
|
-
u * Matrix.
|
290
|
+
u * Matrix.diag( s ) * v.trans
|
223
291
|
end
|
224
292
|
|
225
293
|
def node_for_content(item, &block)
|
226
294
|
if @items[item]
|
227
295
|
return @items[item]
|
228
296
|
else
|
229
|
-
|
230
|
-
cn.raw_vector_with( @word_list ) # make the lsi raw and norm vectors
|
231
|
-
end
|
297
|
+
clean_word_hash = block ? block.call(item).clean_word_hash : item.to_s.clean_word_hash
|
232
298
|
|
233
|
-
|
299
|
+
cn = ContentNode.new(clean_word_hash, &block) # make the node and extract the data
|
300
|
+
|
301
|
+
unless needs_rebuild?
|
302
|
+
cn.raw_vector_with( @word_list ) # make the lsi raw and norm vectors
|
303
|
+
end
|
304
|
+
end
|
305
|
+
|
306
|
+
return cn
|
234
307
|
end
|
235
308
|
|
236
309
|
def make_word_list
|
@@ -243,6 +316,3 @@ module Classifier
|
|
243
316
|
end
|
244
317
|
end
|
245
318
|
|
246
|
-
rescue LoadError
|
247
|
-
$stderr.puts "For LSI support, you need to install http://rb-gsl.rubyforge.org/"
|
248
|
-
end
|