classifier 1.2.0 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +361 -273
- data/README +6 -5
- data/Rakefile +12 -2
- data/bin/summarize.rb +11 -0
- data/doc/classes/Array.html +139 -0
- data/doc/classes/Array.src/M000003.html +18 -0
- data/doc/classes/Classifier.html +5 -5
- data/doc/classes/Classifier/Bayes.html +43 -43
- data/doc/classes/Classifier/Bayes.src/{M000023.html → M000038.html} +0 -0
- data/doc/classes/Classifier/Bayes.src/{M000024.html → M000039.html} +0 -0
- data/doc/classes/Classifier/Bayes.src/{M000025.html → M000040.html} +0 -0
- data/doc/classes/Classifier/Bayes.src/{M000026.html → M000041.html} +0 -0
- data/doc/classes/Classifier/Bayes.src/{M000027.html → M000042.html} +0 -0
- data/doc/classes/Classifier/Bayes.src/{M000028.html → M000043.html} +0 -0
- data/doc/classes/Classifier/Bayes.src/{M000029.html → M000044.html} +0 -0
- data/doc/classes/Classifier/ContentNode.html +23 -28
- data/doc/classes/Classifier/ContentNode.src/M000046.html +19 -0
- data/doc/classes/Classifier/ContentNode.src/{M000032.html → M000047.html} +1 -1
- data/doc/classes/Classifier/ContentNode.src/{M000033.html → M000048.html} +1 -1
- data/doc/classes/Classifier/ContentNode.src/M000049.html +49 -0
- data/doc/classes/Classifier/LSI.html +158 -68
- data/doc/classes/Classifier/LSI.src/M000022.html +6 -17
- data/doc/classes/Classifier/LSI.src/{M000012.html → M000023.html} +2 -2
- data/doc/classes/Classifier/LSI.src/{M000013.html → M000024.html} +3 -2
- data/doc/classes/Classifier/LSI.src/{M000014.html → M000025.html} +1 -1
- data/doc/classes/Classifier/LSI.src/M000026.html +19 -0
- data/doc/classes/Classifier/LSI.src/{M000015.html → M000027.html} +1 -1
- data/doc/classes/Classifier/LSI.src/{M000016.html → M000028.html} +1 -1
- data/doc/classes/Classifier/LSI.src/M000029.html +19 -0
- data/doc/classes/Classifier/LSI.src/M000030.html +43 -0
- data/doc/classes/Classifier/LSI.src/M000031.html +23 -0
- data/doc/classes/Classifier/LSI.src/{M000018.html → M000032.html} +7 -3
- data/doc/classes/Classifier/LSI.src/{M000019.html → M000033.html} +6 -2
- data/doc/classes/Classifier/LSI.src/{M000020.html → M000034.html} +2 -4
- data/doc/classes/Classifier/LSI.src/{M000021.html → M000035.html} +1 -1
- data/doc/classes/Classifier/LSI.src/M000036.html +31 -0
- data/doc/classes/Classifier/LSI.src/M000037.html +21 -0
- data/doc/classes/Classifier/WordList.html +37 -22
- data/doc/classes/Classifier/WordList.src/{M000007.html → M000017.html} +2 -2
- data/doc/classes/Classifier/WordList.src/{M000008.html → M000018.html} +1 -1
- data/doc/classes/Classifier/WordList.src/{M000009.html → M000019.html} +1 -1
- data/doc/classes/Classifier/WordList.src/M000020.html +18 -0
- data/doc/classes/Classifier/WordList.src/{M000010.html → M000021.html} +1 -1
- data/doc/classes/GSL.html +2 -1
- data/doc/classes/GSL/Matrix.html +126 -0
- data/doc/classes/GSL/Vector.html +10 -10
- data/doc/classes/GSL/Vector.src/{M000005.html → M000015.html} +0 -0
- data/doc/classes/GSL/Vector.src/{M000006.html → M000016.html} +0 -0
- data/doc/classes/Matrix.html +184 -0
- data/doc/classes/Matrix.src/M000004.html +18 -0
- data/doc/classes/Matrix.src/M000005.html +76 -0
- data/doc/classes/Matrix.src/M000006.html +18 -0
- data/doc/classes/Object.html +7 -7
- data/doc/classes/Object.src/{M000001.html → M000007.html} +1 -1
- data/doc/classes/String.html +90 -20
- data/doc/classes/String.src/{M000002.html → M000008.html} +0 -0
- data/doc/classes/String.src/{M000003.html → M000009.html} +0 -0
- data/doc/classes/String.src/{M000004.html → M000010.html} +0 -0
- data/doc/classes/String.src/M000011.html +18 -0
- data/doc/classes/String.src/M000012.html +18 -0
- data/doc/classes/String.src/M000013.html +18 -0
- data/doc/classes/String.src/M000014.html +18 -0
- data/doc/classes/Vector.html +154 -0
- data/doc/classes/Vector.src/M000001.html +22 -0
- data/doc/classes/Vector.src/M000002.html +25 -0
- data/doc/created.rid +1 -1
- data/doc/files/README.html +14 -8
- data/doc/files/lib/classifier/bayes_rb.html +1 -1
- data/doc/files/lib/classifier/{string_extensions_rb.html → extensions/string_rb.html} +5 -5
- data/doc/files/lib/classifier/extensions/vector_rb.html +120 -0
- data/doc/files/lib/classifier/extensions/vector_serialize_rb.html +1 -1
- data/doc/files/lib/classifier/extensions/word_hash_rb.html +1 -1
- data/doc/files/lib/classifier/lsi/content_node_rb.html +2 -2
- data/doc/files/lib/classifier/lsi/summary_rb.html +115 -0
- data/doc/files/lib/classifier/{extensions → lsi}/word_list_rb.html +3 -3
- data/doc/files/lib/classifier/lsi_rb.html +5 -3
- data/doc/files/lib/classifier_rb.html +2 -2
- data/doc/fr_class_index.html +4 -0
- data/doc/fr_file_index.html +4 -2
- data/doc/fr_method_index.html +49 -34
- data/doc/index.html +2 -2
- data/lib/classifier.rb +1 -1
- data/lib/classifier/{string_extensions.rb → extensions/string.rb} +0 -0
- data/lib/classifier/extensions/vector.rb +106 -0
- data/lib/classifier/extensions/vector_serialize.rb +6 -0
- data/lib/classifier/lsi.rb +101 -31
- data/lib/classifier/lsi/content_node.rb +28 -23
- data/lib/classifier/lsi/summary.rb +31 -0
- data/lib/classifier/{extensions → lsi}/word_list.rb +7 -2
- data/test/{string_extensions → extensions}/word_hash_test.rb +0 -0
- data/test/lsi/lsi_test.rb +36 -1
- metadata +68 -41
- data/doc/classes/Classifier/ContentNode.src/M000031.html +0 -21
- data/doc/classes/Classifier/ContentNode.src/M000034.html +0 -41
- data/doc/classes/Classifier/LSI.src/M000011.html +0 -20
- data/doc/classes/Classifier/LSI.src/M000017.html +0 -32
|
@@ -56,7 +56,7 @@
|
|
|
56
56
|
</tr>
|
|
57
57
|
<tr class="top-aligned-row">
|
|
58
58
|
<td><strong>Last Update:</strong></td>
|
|
59
|
-
<td>
|
|
59
|
+
<td>Thu May 05 01:50:06 PDT 2005</td>
|
|
60
60
|
</tr>
|
|
61
61
|
</table>
|
|
62
62
|
</div>
|
|
@@ -76,7 +76,7 @@
|
|
|
76
76
|
<tr><td valign="top">Copyright:</td><td>Copyright © 2005 David Fayram II
|
|
77
77
|
|
|
78
78
|
</td></tr>
|
|
79
|
-
<tr><td valign="top">License:</td><td>
|
|
79
|
+
<tr><td valign="top">License:</td><td>LGPL
|
|
80
80
|
|
|
81
81
|
</td></tr>
|
|
82
82
|
</table>
|
|
@@ -88,9 +88,11 @@
|
|
|
88
88
|
|
|
89
89
|
<div class="name-list">
|
|
90
90
|
gsl
|
|
91
|
-
classifier/extensions/word_list
|
|
92
91
|
classifier/extensions/vector_serialize
|
|
92
|
+
classifier/extensions/vector
|
|
93
|
+
classifier/lsi/word_list
|
|
93
94
|
classifier/lsi/content_node
|
|
95
|
+
classifier/lsi/summary
|
|
94
96
|
</div>
|
|
95
97
|
</div>
|
|
96
98
|
|
|
@@ -56,7 +56,7 @@
|
|
|
56
56
|
</tr>
|
|
57
57
|
<tr class="top-aligned-row">
|
|
58
58
|
<td><strong>Last Update:</strong></td>
|
|
59
|
-
<td>
|
|
59
|
+
<td>Thu May 05 01:21:16 PDT 2005</td>
|
|
60
60
|
</tr>
|
|
61
61
|
</table>
|
|
62
62
|
</div>
|
|
@@ -88,7 +88,7 @@
|
|
|
88
88
|
|
|
89
89
|
<div class="name-list">
|
|
90
90
|
rubygems
|
|
91
|
-
classifier/
|
|
91
|
+
classifier/extensions/string
|
|
92
92
|
classifier/bayes
|
|
93
93
|
classifier/lsi
|
|
94
94
|
</div>
|
data/doc/fr_class_index.html
CHANGED
|
@@ -20,15 +20,19 @@
|
|
|
20
20
|
<div id="index">
|
|
21
21
|
<h1 class="section-bar">Classes</h1>
|
|
22
22
|
<div id="index-entries">
|
|
23
|
+
<a href="classes/Array.html">Array</a><br />
|
|
23
24
|
<a href="classes/Classifier.html">Classifier</a><br />
|
|
24
25
|
<a href="classes/Classifier/Bayes.html">Classifier::Bayes</a><br />
|
|
25
26
|
<a href="classes/Classifier/ContentNode.html">Classifier::ContentNode</a><br />
|
|
26
27
|
<a href="classes/Classifier/LSI.html">Classifier::LSI</a><br />
|
|
27
28
|
<a href="classes/Classifier/WordList.html">Classifier::WordList</a><br />
|
|
28
29
|
<a href="classes/GSL.html">GSL</a><br />
|
|
30
|
+
<a href="classes/GSL/Matrix.html">GSL::Matrix</a><br />
|
|
29
31
|
<a href="classes/GSL/Vector.html">GSL::Vector</a><br />
|
|
32
|
+
<a href="classes/Matrix.html">Matrix</a><br />
|
|
30
33
|
<a href="classes/Object.html">Object</a><br />
|
|
31
34
|
<a href="classes/String.html">String</a><br />
|
|
35
|
+
<a href="classes/Vector.html">Vector</a><br />
|
|
32
36
|
</div>
|
|
33
37
|
</div>
|
|
34
38
|
</body>
|
data/doc/fr_file_index.html
CHANGED
|
@@ -23,12 +23,14 @@
|
|
|
23
23
|
<a href="files/README.html">README</a><br />
|
|
24
24
|
<a href="files/lib/classifier_rb.html">lib/classifier.rb</a><br />
|
|
25
25
|
<a href="files/lib/classifier/bayes_rb.html">lib/classifier/bayes.rb</a><br />
|
|
26
|
+
<a href="files/lib/classifier/extensions/string_rb.html">lib/classifier/extensions/string.rb</a><br />
|
|
27
|
+
<a href="files/lib/classifier/extensions/vector_rb.html">lib/classifier/extensions/vector.rb</a><br />
|
|
26
28
|
<a href="files/lib/classifier/extensions/vector_serialize_rb.html">lib/classifier/extensions/vector_serialize.rb</a><br />
|
|
27
29
|
<a href="files/lib/classifier/extensions/word_hash_rb.html">lib/classifier/extensions/word_hash.rb</a><br />
|
|
28
|
-
<a href="files/lib/classifier/extensions/word_list_rb.html">lib/classifier/extensions/word_list.rb</a><br />
|
|
29
30
|
<a href="files/lib/classifier/lsi_rb.html">lib/classifier/lsi.rb</a><br />
|
|
30
31
|
<a href="files/lib/classifier/lsi/content_node_rb.html">lib/classifier/lsi/content_node.rb</a><br />
|
|
31
|
-
<a href="files/lib/classifier/
|
|
32
|
+
<a href="files/lib/classifier/lsi/summary_rb.html">lib/classifier/lsi/summary.rb</a><br />
|
|
33
|
+
<a href="files/lib/classifier/lsi/word_list_rb.html">lib/classifier/lsi/word_list.rb</a><br />
|
|
32
34
|
</div>
|
|
33
35
|
</div>
|
|
34
36
|
</body>
|
data/doc/fr_method_index.html
CHANGED
|
@@ -20,40 +20,55 @@
|
|
|
20
20
|
<div id="index">
|
|
21
21
|
<h1 class="section-bar">Methods</h1>
|
|
22
22
|
<div id="index-entries">
|
|
23
|
-
<a href="classes/Classifier/LSI.html#
|
|
24
|
-
<a href="classes/
|
|
25
|
-
<a href="classes/
|
|
26
|
-
<a href="classes/
|
|
27
|
-
<a href="classes/
|
|
28
|
-
<a href="classes/
|
|
29
|
-
<a href="classes/Classifier/
|
|
30
|
-
<a href="classes/Classifier/
|
|
31
|
-
<a href="classes/Classifier/
|
|
32
|
-
<a href="classes/Classifier/Bayes.html#
|
|
33
|
-
<a href="classes/Classifier/LSI.html#
|
|
34
|
-
<a href="classes/Classifier/
|
|
35
|
-
<a href="classes/
|
|
36
|
-
<a href="classes/Classifier/
|
|
37
|
-
<a href="classes/Classifier/LSI.html#
|
|
38
|
-
<a href="classes/Classifier/Bayes.html#
|
|
39
|
-
<a href="classes/
|
|
40
|
-
<a href="classes/
|
|
41
|
-
<a href="classes/Classifier/LSI.html#
|
|
42
|
-
<a href="classes/Classifier/
|
|
43
|
-
<a href="classes/Classifier/
|
|
44
|
-
<a href="classes/
|
|
45
|
-
<a href="classes/
|
|
46
|
-
<a href="classes/Classifier/
|
|
47
|
-
<a href="classes/Classifier/
|
|
48
|
-
<a href="classes/Classifier/
|
|
49
|
-
<a href="classes/Classifier/
|
|
50
|
-
<a href="classes/Classifier/
|
|
51
|
-
<a href="classes/Classifier/
|
|
52
|
-
<a href="classes/
|
|
53
|
-
<a href="classes/
|
|
54
|
-
<a href="classes/
|
|
55
|
-
<a href="classes/
|
|
56
|
-
<a href="classes/
|
|
23
|
+
<a href="classes/Classifier/LSI.html#M000025"><< (Classifier::LSI)</a><br />
|
|
24
|
+
<a href="classes/Matrix.html#M000005">SV_decomp (Matrix)</a><br />
|
|
25
|
+
<a href="classes/Classifier/WordList.html#M000019">[] (Classifier::WordList)</a><br />
|
|
26
|
+
<a href="classes/Matrix.html#M000006">[]= (Matrix)</a><br />
|
|
27
|
+
<a href="classes/GSL/Vector.html#M000015">_dump (GSL::Vector)</a><br />
|
|
28
|
+
<a href="classes/GSL/Vector.html#M000016">_load (GSL::Vector)</a><br />
|
|
29
|
+
<a href="classes/Classifier/Bayes.html#M000044">add_category (Classifier::Bayes)</a><br />
|
|
30
|
+
<a href="classes/Classifier/LSI.html#M000024">add_item (Classifier::LSI)</a><br />
|
|
31
|
+
<a href="classes/Classifier/WordList.html#M000018">add_word (Classifier::WordList)</a><br />
|
|
32
|
+
<a href="classes/Classifier/Bayes.html#M000045">append_category (Classifier::Bayes)</a><br />
|
|
33
|
+
<a href="classes/Classifier/LSI.html#M000030">build_index (Classifier::LSI)</a><br />
|
|
34
|
+
<a href="classes/Classifier/LSI.html#M000029">categories_for (Classifier::LSI)</a><br />
|
|
35
|
+
<a href="classes/Classifier/LSI.html#M000026">categories_for (Classifier::LSI)</a><br />
|
|
36
|
+
<a href="classes/Classifier/Bayes.html#M000041">classifications (Classifier::Bayes)</a><br />
|
|
37
|
+
<a href="classes/Classifier/LSI.html#M000036">classify (Classifier::LSI)</a><br />
|
|
38
|
+
<a href="classes/Classifier/Bayes.html#M000042">classify (Classifier::Bayes)</a><br />
|
|
39
|
+
<a href="classes/String.html#M000010">clean_word_hash (String)</a><br />
|
|
40
|
+
<a href="classes/Matrix.html#M000004">diag (Matrix)</a><br />
|
|
41
|
+
<a href="classes/Classifier/LSI.html#M000035">find_related (Classifier::LSI)</a><br />
|
|
42
|
+
<a href="classes/Classifier/LSI.html#M000037">highest_ranked_stems (Classifier::LSI)</a><br />
|
|
43
|
+
<a href="classes/Classifier/LSI.html#M000031">highest_relative_content (Classifier::LSI)</a><br />
|
|
44
|
+
<a href="classes/Classifier/LSI.html#M000028">items (Classifier::LSI)</a><br />
|
|
45
|
+
<a href="classes/Vector.html#M000001">magnitude (Vector)</a><br />
|
|
46
|
+
<a href="classes/Classifier/Bayes.html#M000043">method_missing (Classifier::Bayes)</a><br />
|
|
47
|
+
<a href="classes/Classifier/LSI.html#M000023">needs_rebuild? (Classifier::LSI)</a><br />
|
|
48
|
+
<a href="classes/Classifier/ContentNode.html#M000046">new (Classifier::ContentNode)</a><br />
|
|
49
|
+
<a href="classes/Classifier/Bayes.html#M000038">new (Classifier::Bayes)</a><br />
|
|
50
|
+
<a href="classes/Classifier/LSI.html#M000022">new (Classifier::LSI)</a><br />
|
|
51
|
+
<a href="classes/Classifier/WordList.html#M000017">new (Classifier::WordList)</a><br />
|
|
52
|
+
<a href="classes/Vector.html#M000002">normalize (Vector)</a><br />
|
|
53
|
+
<a href="classes/String.html#M000012">paragraph_summary (String)</a><br />
|
|
54
|
+
<a href="classes/Object.html#M000007">prepare_category_name (Object)</a><br />
|
|
55
|
+
<a href="classes/Classifier/LSI.html#M000032">proximity_array_for_content (Classifier::LSI)</a><br />
|
|
56
|
+
<a href="classes/Classifier/LSI.html#M000033">proximity_norms_for_content (Classifier::LSI)</a><br />
|
|
57
|
+
<a href="classes/Classifier/ContentNode.html#M000049">raw_vector_with (Classifier::ContentNode)</a><br />
|
|
58
|
+
<a href="classes/Classifier/LSI.html#M000027">remove_item (Classifier::LSI)</a><br />
|
|
59
|
+
<a href="classes/Classifier/LSI.html#M000034">search (Classifier::LSI)</a><br />
|
|
60
|
+
<a href="classes/Classifier/ContentNode.html#M000048">search_norm (Classifier::ContentNode)</a><br />
|
|
61
|
+
<a href="classes/Classifier/ContentNode.html#M000047">search_vector (Classifier::ContentNode)</a><br />
|
|
62
|
+
<a href="classes/Classifier/WordList.html#M000021">size (Classifier::WordList)</a><br />
|
|
63
|
+
<a href="classes/String.html#M000014">split_paragraphs (String)</a><br />
|
|
64
|
+
<a href="classes/String.html#M000013">split_sentences (String)</a><br />
|
|
65
|
+
<a href="classes/Array.html#M000003">sum (Array)</a><br />
|
|
66
|
+
<a href="classes/String.html#M000011">summary (String)</a><br />
|
|
67
|
+
<a href="classes/Classifier/Bayes.html#M000039">train (Classifier::Bayes)</a><br />
|
|
68
|
+
<a href="classes/Classifier/Bayes.html#M000040">untrain (Classifier::Bayes)</a><br />
|
|
69
|
+
<a href="classes/String.html#M000008">without_punctuation (String)</a><br />
|
|
70
|
+
<a href="classes/Classifier/WordList.html#M000020">word_for_index (Classifier::WordList)</a><br />
|
|
71
|
+
<a href="classes/String.html#M000009">word_hash (String)</a><br />
|
|
57
72
|
</div>
|
|
58
73
|
</div>
|
|
59
74
|
</body>
|
data/doc/index.html
CHANGED
|
@@ -5,12 +5,12 @@
|
|
|
5
5
|
|
|
6
6
|
<!--
|
|
7
7
|
|
|
8
|
-
|
|
8
|
+
Ruby Classifier - Bayesian and LSI classification library
|
|
9
9
|
|
|
10
10
|
-->
|
|
11
11
|
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
|
|
12
12
|
<head>
|
|
13
|
-
<title>
|
|
13
|
+
<title>Ruby Classifier - Bayesian and LSI classification library</title>
|
|
14
14
|
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
|
|
15
15
|
</head>
|
|
16
16
|
<frameset rows="20%, 80%">
|
data/lib/classifier.rb
CHANGED
|
File without changes
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
# Author:: Ernest Ellingson
|
|
2
|
+
# Copyright:: Copyright (c) 2005
|
|
3
|
+
|
|
4
|
+
# These are extensions to the std-lib 'matrix' to allow an all ruby SVD
|
|
5
|
+
|
|
6
|
+
require 'matrix'
|
|
7
|
+
require 'mathn'
|
|
8
|
+
|
|
9
|
+
class Array
|
|
10
|
+
def sum
|
|
11
|
+
inject(0) { |sum,term| sum += term }.to_f
|
|
12
|
+
end
|
|
13
|
+
end
|
|
14
|
+
|
|
15
|
+
class Vector
|
|
16
|
+
def magnitude
|
|
17
|
+
sumsqs = 0.0
|
|
18
|
+
self.size.times do |i|
|
|
19
|
+
sumsqs += self[i] ** 2.0
|
|
20
|
+
end
|
|
21
|
+
Math.sqrt(sumsqs)
|
|
22
|
+
end
|
|
23
|
+
def normalize
|
|
24
|
+
nv = []
|
|
25
|
+
mag = self.magnitude
|
|
26
|
+
self.size.times do |i|
|
|
27
|
+
|
|
28
|
+
nv << (self[i] / mag)
|
|
29
|
+
|
|
30
|
+
end
|
|
31
|
+
Vector[*nv]
|
|
32
|
+
end
|
|
33
|
+
end
|
|
34
|
+
|
|
35
|
+
class Matrix
|
|
36
|
+
def Matrix.diag(s)
|
|
37
|
+
Matrix.diagonal(*s)
|
|
38
|
+
end
|
|
39
|
+
|
|
40
|
+
alias :trans :transpose
|
|
41
|
+
|
|
42
|
+
def SV_decomp(maxSweeps = 20)
|
|
43
|
+
if self.row_size >= self.column_size
|
|
44
|
+
q = self.trans * self
|
|
45
|
+
else
|
|
46
|
+
q = self * self.trans
|
|
47
|
+
end
|
|
48
|
+
|
|
49
|
+
qrot = q.dup
|
|
50
|
+
v = Matrix.identity(q.row_size)
|
|
51
|
+
azrot = nil
|
|
52
|
+
mzrot = nil
|
|
53
|
+
cnt = 0
|
|
54
|
+
s_old = nil
|
|
55
|
+
mu = nil
|
|
56
|
+
|
|
57
|
+
while true do
|
|
58
|
+
cnt += 1
|
|
59
|
+
for row in (0...qrot.row_size-1) do
|
|
60
|
+
for col in (1..qrot.row_size-1) do
|
|
61
|
+
next if row == col
|
|
62
|
+
h = Math.atan((2 * qrot[row,col])/(qrot[row,row]-qrot[col,col]))/2.0
|
|
63
|
+
hcos = Math.cos(h)
|
|
64
|
+
hsin = Math.sin(h)
|
|
65
|
+
mzrot = Matrix.identity(qrot.row_size)
|
|
66
|
+
mzrot[row,row] = hcos
|
|
67
|
+
mzrot[row,col] = -hsin
|
|
68
|
+
mzrot[col,row] = hsin
|
|
69
|
+
mzrot[col,col] = hcos
|
|
70
|
+
qrot = mzrot.trans * qrot * mzrot
|
|
71
|
+
v = v * mzrot
|
|
72
|
+
end
|
|
73
|
+
end
|
|
74
|
+
s_old = qrot.dup if cnt == 1
|
|
75
|
+
sum_qrot = 0.0
|
|
76
|
+
if cnt > 1
|
|
77
|
+
qrot.row_size.times do |r|
|
|
78
|
+
sum_qrot += (qrot[r,r]-s_old[r,r]).abs if (qrot[r,r]-s_old[r,r]).abs > 0.001
|
|
79
|
+
end
|
|
80
|
+
s_old = qrot.dup
|
|
81
|
+
end
|
|
82
|
+
break if (sum_qrot <= 0.001 and cnt > 1) or cnt >= maxSweeps
|
|
83
|
+
end # of do while true
|
|
84
|
+
s = []
|
|
85
|
+
qrot.row_size.times do |r|
|
|
86
|
+
s << Math.sqrt(qrot[r,r])
|
|
87
|
+
end
|
|
88
|
+
#puts "cnt = #{cnt}"
|
|
89
|
+
if self.row_size >= self.column_size
|
|
90
|
+
mu = self * v * Matrix.diagonal(*s).inverse
|
|
91
|
+
return [mu, v, s]
|
|
92
|
+
else
|
|
93
|
+
puts v.row_size
|
|
94
|
+
puts v.column_size
|
|
95
|
+
puts self.row_size
|
|
96
|
+
puts self.column_size
|
|
97
|
+
puts s.size
|
|
98
|
+
|
|
99
|
+
mu = (self.trans * v * Matrix.diagonal(*s).inverse)
|
|
100
|
+
return [mu, v, s]
|
|
101
|
+
end
|
|
102
|
+
end
|
|
103
|
+
def []=(i,j,val)
|
|
104
|
+
@rows[i][j] = val
|
|
105
|
+
end
|
|
106
|
+
end
|
data/lib/classifier/lsi.rb
CHANGED
|
@@ -1,14 +1,22 @@
|
|
|
1
1
|
# Author:: David Fayram (mailto:dfayram@lensmen.net)
|
|
2
2
|
# Copyright:: Copyright (c) 2005 David Fayram II
|
|
3
|
-
# License::
|
|
3
|
+
# License:: LGPL
|
|
4
4
|
|
|
5
5
|
begin
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
require 'classifier/extensions/
|
|
10
|
-
|
|
6
|
+
raise LoadError if ENV['NATIVE_VECTOR'] == "true" # to test the native vector class, try `rake test NATIVE_VECTOR=true`
|
|
7
|
+
|
|
8
|
+
require 'gsl' # requires http://rb-gsl.rubyforge.org/
|
|
9
|
+
require 'classifier/extensions/vector_serialize'
|
|
10
|
+
$GSL = true
|
|
11
|
+
|
|
12
|
+
rescue LoadError
|
|
13
|
+
warn "Notice: for 10x faster LSI support, please install http://rb-gsl.rubyforge.org/"
|
|
14
|
+
require 'classifier/extensions/vector'
|
|
15
|
+
end
|
|
16
|
+
|
|
17
|
+
require 'classifier/lsi/word_list'
|
|
11
18
|
require 'classifier/lsi/content_node'
|
|
19
|
+
require 'classifier/lsi/summary'
|
|
12
20
|
|
|
13
21
|
module Classifier
|
|
14
22
|
|
|
@@ -18,6 +26,7 @@ module Classifier
|
|
|
18
26
|
class LSI
|
|
19
27
|
|
|
20
28
|
attr_reader :word_list
|
|
29
|
+
attr_accessor :auto_rebuild
|
|
21
30
|
|
|
22
31
|
# Create a fresh index.
|
|
23
32
|
# If you want to call #build_index manually, use
|
|
@@ -33,7 +42,7 @@ module Classifier
|
|
|
33
42
|
# to be built after all informaton is added, but before you start
|
|
34
43
|
# using it for search, classification and cluster detection.
|
|
35
44
|
def needs_rebuild?
|
|
36
|
-
@version != @built_at_version
|
|
45
|
+
(@items.keys.size > 1) && (@version != @built_at_version)
|
|
37
46
|
end
|
|
38
47
|
|
|
39
48
|
# Adds an item to the index. item is assumed to be a string, but
|
|
@@ -50,7 +59,8 @@ module Classifier
|
|
|
50
59
|
# lsi.add_item ar, *ar.categories { |x| ar.content }
|
|
51
60
|
#
|
|
52
61
|
def add_item( item, *categories, &block )
|
|
53
|
-
|
|
62
|
+
clean_word_hash = block ? block.call(item).clean_word_hash : item.to_s.clean_word_hash
|
|
63
|
+
@items[item] = ContentNode.new(clean_word_hash, *categories)
|
|
54
64
|
@version += 1
|
|
55
65
|
build_index if @auto_rebuild
|
|
56
66
|
end
|
|
@@ -63,6 +73,13 @@ module Classifier
|
|
|
63
73
|
add_item item
|
|
64
74
|
end
|
|
65
75
|
|
|
76
|
+
# Returns the categories for a given indexed items. You are free to add and remove
|
|
77
|
+
# items from this as you see fit. It does not invalide an index to change its categories.
|
|
78
|
+
def categories_for(item)
|
|
79
|
+
return [] unless @items[item]
|
|
80
|
+
return @items[item].categories
|
|
81
|
+
end
|
|
82
|
+
|
|
66
83
|
# Removes an item from the database, if it is indexed.
|
|
67
84
|
#
|
|
68
85
|
def remove_item( item )
|
|
@@ -77,6 +94,13 @@ module Classifier
|
|
|
77
94
|
@items.keys
|
|
78
95
|
end
|
|
79
96
|
|
|
97
|
+
# Returns the categories for a given indexed items. You are free to add and remove
|
|
98
|
+
# items from this as you see fit. It does not invalide an index to change its categories.
|
|
99
|
+
def categories_for(item)
|
|
100
|
+
return [] unless @items[item]
|
|
101
|
+
return @items[item].categories
|
|
102
|
+
end
|
|
103
|
+
|
|
80
104
|
# This function rebuilds the index if needs_rebuild? returns true.
|
|
81
105
|
# For very large document spaces, this indexing operation may take some
|
|
82
106
|
# time to complete, so it may be wise to place the operation in another
|
|
@@ -97,18 +121,46 @@ module Classifier
|
|
|
97
121
|
|
|
98
122
|
doc_list = @items.values
|
|
99
123
|
tda = doc_list.collect { |node| node.raw_vector_with( @word_list ) }
|
|
100
|
-
tdm = GSL::Matrix.new( *tda ).trans
|
|
101
|
-
ntdm = build_reduced_matrix(tdm, cutoff)
|
|
102
124
|
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
125
|
+
if $GSL
|
|
126
|
+
tdm = GSL::Matrix.new(*tda).trans
|
|
127
|
+
ntdm = build_reduced_matrix(tdm, cutoff)
|
|
128
|
+
|
|
129
|
+
ntdm.size[1].times do |col|
|
|
130
|
+
vec = GSL::Vector.new( ntdm.column(col) ).row
|
|
131
|
+
doc_list[col].lsi_vector = vec
|
|
132
|
+
doc_list[col].lsi_norm = vec.normalize
|
|
133
|
+
end
|
|
134
|
+
else
|
|
135
|
+
tdm = Matrix.rows(tda).trans
|
|
136
|
+
ntdm = build_reduced_matrix(tdm, cutoff)
|
|
108
137
|
|
|
138
|
+
ntdm.row_size.times do |col|
|
|
139
|
+
doc_list[col].lsi_vector = ntdm.column(col) if doc_list[col]
|
|
140
|
+
doc_list[col].lsi_norm = ntdm.column(col).normalize if doc_list[col]
|
|
141
|
+
end
|
|
142
|
+
end
|
|
143
|
+
|
|
109
144
|
@built_at_version = @version
|
|
110
145
|
end
|
|
111
|
-
|
|
146
|
+
|
|
147
|
+
# This method returns max_chunks entries, ordered by their average semantic rating.
|
|
148
|
+
# Essentially, the average distance of each entry from all other entries is calculated,
|
|
149
|
+
# the highest are returned.
|
|
150
|
+
#
|
|
151
|
+
# This can be used to build a summary service, or to provide more information about
|
|
152
|
+
# your dataset's general content. For example, if you were to use categorize on the
|
|
153
|
+
# results of this data, you could gather information on what your dataset is generally
|
|
154
|
+
# about.
|
|
155
|
+
def highest_relative_content( max_chunks=10 )
|
|
156
|
+
return [] if needs_rebuild?
|
|
157
|
+
|
|
158
|
+
avg_density = Hash.new
|
|
159
|
+
@items.each_key { |x| avg_density[x] = proximity_array_for_content(x).inject(0.0) { |x,y| x + y[1]} }
|
|
160
|
+
|
|
161
|
+
avg_density.keys.sort_by { |x| avg_density[x] }.reverse[0..max_chunks-1].map
|
|
162
|
+
end
|
|
163
|
+
|
|
112
164
|
# This function is the primitive that find_related and classify
|
|
113
165
|
# build upon. It returns an array of 2-element arrays. The first element
|
|
114
166
|
# of this array is a document, and the second is its "score", defining
|
|
@@ -123,11 +175,15 @@ module Classifier
|
|
|
123
175
|
# text data. See add_item for examples of how this works.
|
|
124
176
|
def proximity_array_for_content( doc, &block )
|
|
125
177
|
return [] if needs_rebuild?
|
|
126
|
-
|
|
178
|
+
|
|
127
179
|
content_node = node_for_content( doc, &block )
|
|
128
180
|
result =
|
|
129
181
|
@items.keys.collect do |item|
|
|
130
|
-
|
|
182
|
+
if $GSL
|
|
183
|
+
val = content_node.search_vector * @items[item].search_vector.col
|
|
184
|
+
else
|
|
185
|
+
val = (Matrix[content_node.search_vector] * @items[item].search_vector)[0]
|
|
186
|
+
end
|
|
131
187
|
[item, val]
|
|
132
188
|
end
|
|
133
189
|
result.sort_by { |x| x[1] }.reverse
|
|
@@ -144,7 +200,11 @@ module Classifier
|
|
|
144
200
|
content_node = node_for_content( doc, &block )
|
|
145
201
|
result =
|
|
146
202
|
@items.keys.collect do |item|
|
|
147
|
-
|
|
203
|
+
if $GSL
|
|
204
|
+
val = content_node.search_norm * @items[item].search_norm.col
|
|
205
|
+
else
|
|
206
|
+
val = (Matrix[content_node.search_norm] * @items[item].search_norm)[0]
|
|
207
|
+
end
|
|
148
208
|
[item, val]
|
|
149
209
|
end
|
|
150
210
|
result.sort_by { |x| x[1] }.reverse
|
|
@@ -159,9 +219,7 @@ module Classifier
|
|
|
159
219
|
# it is actually the same algorithm, just applied on a smaller document.
|
|
160
220
|
def search( string, max_nearest=3 )
|
|
161
221
|
return [] if needs_rebuild?
|
|
162
|
-
|
|
163
|
-
carry =
|
|
164
|
-
proximity_norms_for_content( string )
|
|
222
|
+
carry = proximity_norms_for_content( string )
|
|
165
223
|
result = carry.collect { |x| x[0] }
|
|
166
224
|
return result[0..max_nearest-1]
|
|
167
225
|
end
|
|
@@ -208,29 +266,44 @@ module Classifier
|
|
|
208
266
|
return ranking[-1]
|
|
209
267
|
end
|
|
210
268
|
|
|
269
|
+
# Prototype, only works on indexed documents.
|
|
270
|
+
# I have no clue if this is going to work, but in theory
|
|
271
|
+
# it's supposed to.
|
|
272
|
+
def highest_ranked_stems( doc, count=3 )
|
|
273
|
+
raise "Requested stem ranking on non-indexed content!" unless @items[doc]
|
|
274
|
+
arr = node_for_content(doc).lsi_vector.to_a
|
|
275
|
+
top_n = arr.sort.reverse[0..count-1]
|
|
276
|
+
return top_n.collect { |x| @word_list.word_for_index(arr.index(x))}
|
|
277
|
+
end
|
|
278
|
+
|
|
211
279
|
private
|
|
212
280
|
def build_reduced_matrix( matrix, cutoff=0.75 )
|
|
213
281
|
# TODO: Check that M>=N on these dimensions! Transpose helps assure this
|
|
214
282
|
u, v, s = matrix.SV_decomp
|
|
283
|
+
|
|
215
284
|
# TODO: Better than 75% term, please. :\
|
|
216
285
|
s_cutoff = s.sort.reverse[(s.size * cutoff).round - 1]
|
|
217
286
|
s.size.times do |ord|
|
|
218
287
|
s[ord] = 0.0 if s[ord] < s_cutoff
|
|
219
288
|
end
|
|
220
|
-
|
|
221
289
|
# Reconstruct the term document matrix, only with reduced rank
|
|
222
|
-
u * Matrix.
|
|
290
|
+
u * Matrix.diag( s ) * v.trans
|
|
223
291
|
end
|
|
224
292
|
|
|
225
293
|
def node_for_content(item, &block)
|
|
226
294
|
if @items[item]
|
|
227
295
|
return @items[item]
|
|
228
296
|
else
|
|
229
|
-
|
|
230
|
-
cn.raw_vector_with( @word_list ) # make the lsi raw and norm vectors
|
|
231
|
-
end
|
|
297
|
+
clean_word_hash = block ? block.call(item).clean_word_hash : item.to_s.clean_word_hash
|
|
232
298
|
|
|
233
|
-
|
|
299
|
+
cn = ContentNode.new(clean_word_hash, &block) # make the node and extract the data
|
|
300
|
+
|
|
301
|
+
unless needs_rebuild?
|
|
302
|
+
cn.raw_vector_with( @word_list ) # make the lsi raw and norm vectors
|
|
303
|
+
end
|
|
304
|
+
end
|
|
305
|
+
|
|
306
|
+
return cn
|
|
234
307
|
end
|
|
235
308
|
|
|
236
309
|
def make_word_list
|
|
@@ -243,6 +316,3 @@ module Classifier
|
|
|
243
316
|
end
|
|
244
317
|
end
|
|
245
318
|
|
|
246
|
-
rescue LoadError
|
|
247
|
-
$stderr.puts "For LSI support, you need to install http://rb-gsl.rubyforge.org/"
|
|
248
|
-
end
|