classifier 1.2.0 → 1.3.0
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +361 -273
- data/README +6 -5
- data/Rakefile +12 -2
- data/bin/summarize.rb +11 -0
- data/doc/classes/Array.html +139 -0
- data/doc/classes/Array.src/M000003.html +18 -0
- data/doc/classes/Classifier.html +5 -5
- data/doc/classes/Classifier/Bayes.html +43 -43
- data/doc/classes/Classifier/Bayes.src/{M000023.html → M000038.html} +0 -0
- data/doc/classes/Classifier/Bayes.src/{M000024.html → M000039.html} +0 -0
- data/doc/classes/Classifier/Bayes.src/{M000025.html → M000040.html} +0 -0
- data/doc/classes/Classifier/Bayes.src/{M000026.html → M000041.html} +0 -0
- data/doc/classes/Classifier/Bayes.src/{M000027.html → M000042.html} +0 -0
- data/doc/classes/Classifier/Bayes.src/{M000028.html → M000043.html} +0 -0
- data/doc/classes/Classifier/Bayes.src/{M000029.html → M000044.html} +0 -0
- data/doc/classes/Classifier/ContentNode.html +23 -28
- data/doc/classes/Classifier/ContentNode.src/M000046.html +19 -0
- data/doc/classes/Classifier/ContentNode.src/{M000032.html → M000047.html} +1 -1
- data/doc/classes/Classifier/ContentNode.src/{M000033.html → M000048.html} +1 -1
- data/doc/classes/Classifier/ContentNode.src/M000049.html +49 -0
- data/doc/classes/Classifier/LSI.html +158 -68
- data/doc/classes/Classifier/LSI.src/M000022.html +6 -17
- data/doc/classes/Classifier/LSI.src/{M000012.html → M000023.html} +2 -2
- data/doc/classes/Classifier/LSI.src/{M000013.html → M000024.html} +3 -2
- data/doc/classes/Classifier/LSI.src/{M000014.html → M000025.html} +1 -1
- data/doc/classes/Classifier/LSI.src/M000026.html +19 -0
- data/doc/classes/Classifier/LSI.src/{M000015.html → M000027.html} +1 -1
- data/doc/classes/Classifier/LSI.src/{M000016.html → M000028.html} +1 -1
- data/doc/classes/Classifier/LSI.src/M000029.html +19 -0
- data/doc/classes/Classifier/LSI.src/M000030.html +43 -0
- data/doc/classes/Classifier/LSI.src/M000031.html +23 -0
- data/doc/classes/Classifier/LSI.src/{M000018.html → M000032.html} +7 -3
- data/doc/classes/Classifier/LSI.src/{M000019.html → M000033.html} +6 -2
- data/doc/classes/Classifier/LSI.src/{M000020.html → M000034.html} +2 -4
- data/doc/classes/Classifier/LSI.src/{M000021.html → M000035.html} +1 -1
- data/doc/classes/Classifier/LSI.src/M000036.html +31 -0
- data/doc/classes/Classifier/LSI.src/M000037.html +21 -0
- data/doc/classes/Classifier/WordList.html +37 -22
- data/doc/classes/Classifier/WordList.src/{M000007.html → M000017.html} +2 -2
- data/doc/classes/Classifier/WordList.src/{M000008.html → M000018.html} +1 -1
- data/doc/classes/Classifier/WordList.src/{M000009.html → M000019.html} +1 -1
- data/doc/classes/Classifier/WordList.src/M000020.html +18 -0
- data/doc/classes/Classifier/WordList.src/{M000010.html → M000021.html} +1 -1
- data/doc/classes/GSL.html +2 -1
- data/doc/classes/GSL/Matrix.html +126 -0
- data/doc/classes/GSL/Vector.html +10 -10
- data/doc/classes/GSL/Vector.src/{M000005.html → M000015.html} +0 -0
- data/doc/classes/GSL/Vector.src/{M000006.html → M000016.html} +0 -0
- data/doc/classes/Matrix.html +184 -0
- data/doc/classes/Matrix.src/M000004.html +18 -0
- data/doc/classes/Matrix.src/M000005.html +76 -0
- data/doc/classes/Matrix.src/M000006.html +18 -0
- data/doc/classes/Object.html +7 -7
- data/doc/classes/Object.src/{M000001.html → M000007.html} +1 -1
- data/doc/classes/String.html +90 -20
- data/doc/classes/String.src/{M000002.html → M000008.html} +0 -0
- data/doc/classes/String.src/{M000003.html → M000009.html} +0 -0
- data/doc/classes/String.src/{M000004.html → M000010.html} +0 -0
- data/doc/classes/String.src/M000011.html +18 -0
- data/doc/classes/String.src/M000012.html +18 -0
- data/doc/classes/String.src/M000013.html +18 -0
- data/doc/classes/String.src/M000014.html +18 -0
- data/doc/classes/Vector.html +154 -0
- data/doc/classes/Vector.src/M000001.html +22 -0
- data/doc/classes/Vector.src/M000002.html +25 -0
- data/doc/created.rid +1 -1
- data/doc/files/README.html +14 -8
- data/doc/files/lib/classifier/bayes_rb.html +1 -1
- data/doc/files/lib/classifier/{string_extensions_rb.html → extensions/string_rb.html} +5 -5
- data/doc/files/lib/classifier/extensions/vector_rb.html +120 -0
- data/doc/files/lib/classifier/extensions/vector_serialize_rb.html +1 -1
- data/doc/files/lib/classifier/extensions/word_hash_rb.html +1 -1
- data/doc/files/lib/classifier/lsi/content_node_rb.html +2 -2
- data/doc/files/lib/classifier/lsi/summary_rb.html +115 -0
- data/doc/files/lib/classifier/{extensions → lsi}/word_list_rb.html +3 -3
- data/doc/files/lib/classifier/lsi_rb.html +5 -3
- data/doc/files/lib/classifier_rb.html +2 -2
- data/doc/fr_class_index.html +4 -0
- data/doc/fr_file_index.html +4 -2
- data/doc/fr_method_index.html +49 -34
- data/doc/index.html +2 -2
- data/lib/classifier.rb +1 -1
- data/lib/classifier/{string_extensions.rb → extensions/string.rb} +0 -0
- data/lib/classifier/extensions/vector.rb +106 -0
- data/lib/classifier/extensions/vector_serialize.rb +6 -0
- data/lib/classifier/lsi.rb +101 -31
- data/lib/classifier/lsi/content_node.rb +28 -23
- data/lib/classifier/lsi/summary.rb +31 -0
- data/lib/classifier/{extensions → lsi}/word_list.rb +7 -2
- data/test/{string_extensions → extensions}/word_hash_test.rb +0 -0
- data/test/lsi/lsi_test.rb +36 -1
- metadata +68 -41
- data/doc/classes/Classifier/ContentNode.src/M000031.html +0 -21
- data/doc/classes/Classifier/ContentNode.src/M000034.html +0 -41
- data/doc/classes/Classifier/LSI.src/M000011.html +0 -20
- data/doc/classes/Classifier/LSI.src/M000017.html +0 -32
@@ -1,26 +1,23 @@
|
|
1
1
|
# Author:: David Fayram (mailto:dfayram@lensmen.net)
|
2
2
|
# Copyright:: Copyright (c) 2005 David Fayram II
|
3
|
-
# License::
|
3
|
+
# License:: LGPL
|
4
4
|
|
5
5
|
module Classifier
|
6
6
|
|
7
|
-
|
8
7
|
# This is an internal data structure class for the LSI node. Save for
|
9
8
|
# raw_vector_with, it should be fairly straightforward to understand.
|
10
9
|
# You should never have to use it directly.
|
11
10
|
class ContentNode
|
12
|
-
attr_accessor :
|
13
|
-
|
14
|
-
:categories
|
15
|
-
|
16
|
-
|
11
|
+
attr_accessor :raw_vector, :raw_norm,
|
12
|
+
:lsi_vector, :lsi_norm,
|
13
|
+
:categories
|
14
|
+
|
15
|
+
attr_reader :word_hash
|
17
16
|
# If text_proc is not specified, the source will be duck-typed
|
18
17
|
# via source.to_s
|
19
|
-
def initialize( source, categories=nil, text_proc=nil )
|
20
|
-
text_proc = text_proc || (proc {|x| x.to_s})
|
18
|
+
def initialize( word_hash, *categories )
|
21
19
|
@categories = categories || []
|
22
|
-
@source = source
|
23
|
-
@word_hash = text_proc.call( @source ).clean_word_hash
|
20
|
+
@word_hash = word_hash
|
24
21
|
end
|
25
22
|
|
26
23
|
# Use this to fetch the appropriate search vector.
|
@@ -36,32 +33,40 @@ module Classifier
|
|
36
33
|
# Creates the raw vector out of word_hash using word_list as the
|
37
34
|
# key for mapping the vector space.
|
38
35
|
def raw_vector_with( word_list )
|
39
|
-
vec = Array.new(word_list.size, 0)
|
36
|
+
if $GSL
|
37
|
+
vec = Vector.new(word_list.size)
|
38
|
+
else
|
39
|
+
vec = Array.new(word_list.size, 0)
|
40
|
+
end
|
40
41
|
|
41
42
|
@word_hash.each_key do |word|
|
42
43
|
vec[word_list[word]] = @word_hash[word] if word_list[word]
|
43
44
|
end
|
44
45
|
|
45
46
|
# Perform the scaling transform
|
46
|
-
total_words = vec.inject(0) { |sum,term| sum += term }.to_f
|
47
|
+
total_words = vec.sum
|
47
48
|
|
48
49
|
# Perform first-order association transform if this vector has more
|
49
50
|
# than one word in it.
|
50
51
|
if total_words > 1.0
|
51
|
-
weighted_total = vec.inject(0.0) do |sum,term|
|
52
|
-
if( term > 0 )
|
53
|
-
sum += (( term / total_words ) * Math.log( term / total_words ))
|
54
|
-
else
|
55
|
-
sum
|
52
|
+
weighted_total = 0.0
|
53
|
+
vec.each do |term|
|
54
|
+
if ( term > 0 )
|
55
|
+
weighted_total += (( term / total_words ) * Math.log( term / total_words ))
|
56
56
|
end
|
57
57
|
end
|
58
|
-
vec.map! { |val| Math.log( val + 1 ) / -weighted_total }
|
58
|
+
vec = vec.collect { |val| Math.log( val + 1 ) / -weighted_total }
|
59
|
+
end
|
60
|
+
|
61
|
+
if $GSL
|
62
|
+
@raw_norm = vec.normalize
|
63
|
+
@raw_vector = vec
|
64
|
+
else
|
65
|
+
@raw_norm = Vector[*vec].normalize
|
66
|
+
@raw_vector = Vector[*vec]
|
59
67
|
end
|
60
|
-
|
61
|
-
@raw_norm = GSL::Vector.new( vec ).normalize
|
62
|
-
@raw_vector = GSL::Vector.new( vec )
|
63
68
|
end
|
64
69
|
|
65
70
|
end
|
66
71
|
|
67
|
-
end
|
72
|
+
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
# Author:: Lucas Carlson (mailto:lucas@rufy.com)
|
2
|
+
# Copyright:: Copyright (c) 2005 Lucas Carlson
|
3
|
+
# License:: LGPL
|
4
|
+
|
5
|
+
class String
|
6
|
+
def summary( count=10, separator=" [...] " )
|
7
|
+
perform_lsi split_sentences, count, separator
|
8
|
+
end
|
9
|
+
|
10
|
+
def paragraph_summary( count=1, separator=" [...] " )
|
11
|
+
perform_lsi split_paragraphs, count, separator
|
12
|
+
end
|
13
|
+
|
14
|
+
def split_sentences
|
15
|
+
split /(\.|\!|\?)/ # TODO: make this less primitive
|
16
|
+
end
|
17
|
+
|
18
|
+
def split_paragraphs
|
19
|
+
split /(\n\n|\r\r|\r\n\r\n)/ # TODO: make this less primitive
|
20
|
+
end
|
21
|
+
|
22
|
+
private
|
23
|
+
|
24
|
+
def perform_lsi(chunks, count, separator)
|
25
|
+
lsi = Classifier::LSI.new :auto_rebuild => false
|
26
|
+
chunks.each { |chunk| lsi << chunk unless chunk.strip.empty? || chunk.strip.split.size == 1 }
|
27
|
+
lsi.build_index
|
28
|
+
summaries = lsi.highest_relative_content count
|
29
|
+
return summaries.reject { |chunk| !summaries.include? chunk }.map { |x| x.strip }.join(separator)
|
30
|
+
end
|
31
|
+
end
|
@@ -1,13 +1,14 @@
|
|
1
1
|
# Author:: David Fayram (mailto:dfayram@lensmen.net)
|
2
2
|
# Copyright:: Copyright (c) 2005 David Fayram II
|
3
|
-
# License::
|
3
|
+
# License:: LGPL
|
4
4
|
|
5
5
|
module Classifier
|
6
6
|
# This class keeps a word => index mapping. It is used to map stemmed words
|
7
7
|
# to dimensions of a vector.
|
8
|
+
|
8
9
|
class WordList
|
9
10
|
def initialize
|
10
|
-
@location_table =
|
11
|
+
@location_table = Hash.new
|
11
12
|
end
|
12
13
|
|
13
14
|
# Adds a word (if it is new) and assigns it a unique dimension.
|
@@ -22,6 +23,10 @@ module Classifier
|
|
22
23
|
@location_table[term]
|
23
24
|
end
|
24
25
|
|
26
|
+
def word_for_index(ind)
|
27
|
+
@location_table.invert[ind]
|
28
|
+
end
|
29
|
+
|
25
30
|
# Returns the number of words mapped.
|
26
31
|
def size
|
27
32
|
@location_table.size
|
File without changes
|
data/test/lsi/lsi_test.rb
CHANGED
@@ -3,7 +3,7 @@ class LSITest < Test::Unit::TestCase
|
|
3
3
|
def setup
|
4
4
|
# we repeat principle words to help weight them.
|
5
5
|
# This test is rather delicate, since this system is mostly noise.
|
6
|
-
|
6
|
+
@str1 = "This text deals with dogs. Dogs."
|
7
7
|
@str2 = "This text involves dogs too. Dogs! "
|
8
8
|
@str3 = "This text revolves around cats. Cats."
|
9
9
|
@str4 = "This text also involves cats. Cats!"
|
@@ -23,6 +23,7 @@ class LSITest < Test::Unit::TestCase
|
|
23
23
|
def test_not_auto_rebuild
|
24
24
|
lsi = Classifier::LSI.new :auto_rebuild => false
|
25
25
|
lsi.add_item @str1, "Dog"
|
26
|
+
lsi.add_item @str2, "Dog"
|
26
27
|
assert lsi.needs_rebuild?
|
27
28
|
lsi.build_index
|
28
29
|
assert ! lsi.needs_rebuild?
|
@@ -57,6 +58,25 @@ class LSITest < Test::Unit::TestCase
|
|
57
58
|
assert_not_equal "Dog", bayes.classify( tricky_case )
|
58
59
|
end
|
59
60
|
|
61
|
+
def test_recategorize_interface
|
62
|
+
lsi = Classifier::LSI.new
|
63
|
+
lsi.add_item @str1, "Dog"
|
64
|
+
lsi.add_item @str2, "Dog"
|
65
|
+
lsi.add_item @str3, "Cat"
|
66
|
+
lsi.add_item @str4, "Cat"
|
67
|
+
lsi.add_item @str5, "Bird"
|
68
|
+
|
69
|
+
tricky_case = "This text revolves around dogs."
|
70
|
+
assert_equal "Dog", lsi.classify( tricky_case )
|
71
|
+
|
72
|
+
# Recategorize as needed.
|
73
|
+
lsi.categories_for(@str1).clear.push "Cow"
|
74
|
+
lsi.categories_for(@str2).clear.push "Cow"
|
75
|
+
|
76
|
+
assert !lsi.needs_rebuild?
|
77
|
+
assert_equal "Cow", lsi.classify( tricky_case )
|
78
|
+
end
|
79
|
+
|
60
80
|
def test_search
|
61
81
|
lsi = Classifier::LSI.new
|
62
82
|
[@str1, @str2, @str3, @str4, @str5].each { |x| lsi << x }
|
@@ -85,4 +105,19 @@ class LSITest < Test::Unit::TestCase
|
|
85
105
|
assert_equal lsi_m.find_related(@str1, 3), lsi.find_related(@str1, 3)
|
86
106
|
end
|
87
107
|
|
108
|
+
def test_keyword_search
|
109
|
+
lsi = Classifier::LSI.new
|
110
|
+
lsi.add_item @str1, "Dog"
|
111
|
+
lsi.add_item @str2, "Dog"
|
112
|
+
lsi.add_item @str3, "Cat"
|
113
|
+
lsi.add_item @str4, "Cat"
|
114
|
+
lsi.add_item @str5, "Bird"
|
115
|
+
|
116
|
+
assert_equal [:dog, :text, :deal], lsi.highest_ranked_stems(@str1)
|
117
|
+
end
|
118
|
+
|
119
|
+
def test_summary
|
120
|
+
assert_equal "This text involves dogs too [...] This text also involves cats", [@str1, @str2, @str3, @str4, @str5].join.summary(2)
|
121
|
+
end
|
122
|
+
|
88
123
|
end
|
metadata
CHANGED
@@ -1,10 +1,10 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
|
-
rubygems_version: 0.8.
|
2
|
+
rubygems_version: 0.8.10
|
3
3
|
specification_version: 1
|
4
4
|
name: classifier
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 1.2.0
|
7
|
-
date: 2005-
|
6
|
+
version: 1.3.0
|
7
|
+
date: 2005-05-05
|
8
8
|
summary: A general classifier module to allow Bayesian and other types of classifications.
|
9
9
|
require_paths:
|
10
10
|
- lib
|
@@ -33,19 +33,22 @@ files:
|
|
33
33
|
- lib/classifier/extensions
|
34
34
|
- lib/classifier/lsi
|
35
35
|
- lib/classifier/lsi.rb
|
36
|
-
- lib/classifier/
|
36
|
+
- lib/classifier/extensions/string.rb
|
37
|
+
- lib/classifier/extensions/vector.rb
|
37
38
|
- lib/classifier/extensions/vector_serialize.rb
|
38
39
|
- lib/classifier/extensions/word_hash.rb
|
39
|
-
- lib/classifier/extensions/word_list.rb
|
40
40
|
- lib/classifier/lsi/content_node.rb
|
41
|
+
- lib/classifier/lsi/summary.rb
|
42
|
+
- lib/classifier/lsi/word_list.rb
|
41
43
|
- bin/bayes.rb
|
44
|
+
- bin/summarize.rb
|
42
45
|
- test/bayes
|
46
|
+
- test/extensions
|
43
47
|
- test/lsi
|
44
|
-
- test/string_extensions
|
45
48
|
- test/test_helper.rb
|
46
49
|
- test/bayes/bayesian_test.rb
|
50
|
+
- test/extensions/word_hash_test.rb
|
47
51
|
- test/lsi/lsi_test.rb
|
48
|
-
- test/string_extensions/word_hash_test.rb
|
49
52
|
- LICENSE
|
50
53
|
- Rakefile
|
51
54
|
- README
|
@@ -57,14 +60,21 @@ files:
|
|
57
60
|
- doc/fr_method_index.html
|
58
61
|
- doc/index.html
|
59
62
|
- doc/rdoc-style.css
|
63
|
+
- doc/classes/Array.html
|
64
|
+
- doc/classes/Array.src
|
60
65
|
- doc/classes/Classifier
|
61
66
|
- doc/classes/Classifier.html
|
62
67
|
- doc/classes/GSL
|
63
68
|
- doc/classes/GSL.html
|
69
|
+
- doc/classes/Matrix.html
|
70
|
+
- doc/classes/Matrix.src
|
64
71
|
- doc/classes/Object.html
|
65
72
|
- doc/classes/Object.src
|
66
73
|
- doc/classes/String.html
|
67
74
|
- doc/classes/String.src
|
75
|
+
- doc/classes/Vector.html
|
76
|
+
- doc/classes/Vector.src
|
77
|
+
- doc/classes/Array.src/M000003.html
|
68
78
|
- doc/classes/Classifier/Bayes.html
|
69
79
|
- doc/classes/Classifier/Bayes.src
|
70
80
|
- doc/classes/Classifier/ContentNode.html
|
@@ -73,41 +83,56 @@ files:
|
|
73
83
|
- doc/classes/Classifier/LSI.src
|
74
84
|
- doc/classes/Classifier/WordList.html
|
75
85
|
- doc/classes/Classifier/WordList.src
|
76
|
-
- doc/classes/Classifier/Bayes.src/
|
77
|
-
- doc/classes/Classifier/Bayes.src/
|
78
|
-
- doc/classes/Classifier/Bayes.src/
|
79
|
-
- doc/classes/Classifier/Bayes.src/
|
80
|
-
- doc/classes/Classifier/Bayes.src/
|
81
|
-
- doc/classes/Classifier/Bayes.src/
|
82
|
-
- doc/classes/Classifier/Bayes.src/
|
83
|
-
- doc/classes/Classifier/ContentNode.src/
|
84
|
-
- doc/classes/Classifier/ContentNode.src/
|
85
|
-
- doc/classes/Classifier/ContentNode.src/
|
86
|
-
- doc/classes/Classifier/ContentNode.src/
|
87
|
-
- doc/classes/Classifier/LSI.src/M000011.html
|
88
|
-
- doc/classes/Classifier/LSI.src/M000012.html
|
89
|
-
- doc/classes/Classifier/LSI.src/M000013.html
|
90
|
-
- doc/classes/Classifier/LSI.src/M000014.html
|
91
|
-
- doc/classes/Classifier/LSI.src/M000015.html
|
92
|
-
- doc/classes/Classifier/LSI.src/M000016.html
|
93
|
-
- doc/classes/Classifier/LSI.src/M000017.html
|
94
|
-
- doc/classes/Classifier/LSI.src/M000018.html
|
95
|
-
- doc/classes/Classifier/LSI.src/M000019.html
|
96
|
-
- doc/classes/Classifier/LSI.src/M000020.html
|
97
|
-
- doc/classes/Classifier/LSI.src/M000021.html
|
86
|
+
- doc/classes/Classifier/Bayes.src/M000038.html
|
87
|
+
- doc/classes/Classifier/Bayes.src/M000039.html
|
88
|
+
- doc/classes/Classifier/Bayes.src/M000040.html
|
89
|
+
- doc/classes/Classifier/Bayes.src/M000041.html
|
90
|
+
- doc/classes/Classifier/Bayes.src/M000042.html
|
91
|
+
- doc/classes/Classifier/Bayes.src/M000043.html
|
92
|
+
- doc/classes/Classifier/Bayes.src/M000044.html
|
93
|
+
- doc/classes/Classifier/ContentNode.src/M000046.html
|
94
|
+
- doc/classes/Classifier/ContentNode.src/M000047.html
|
95
|
+
- doc/classes/Classifier/ContentNode.src/M000048.html
|
96
|
+
- doc/classes/Classifier/ContentNode.src/M000049.html
|
98
97
|
- doc/classes/Classifier/LSI.src/M000022.html
|
99
|
-
- doc/classes/Classifier/
|
100
|
-
- doc/classes/Classifier/
|
101
|
-
- doc/classes/Classifier/
|
102
|
-
- doc/classes/Classifier/
|
98
|
+
- doc/classes/Classifier/LSI.src/M000023.html
|
99
|
+
- doc/classes/Classifier/LSI.src/M000024.html
|
100
|
+
- doc/classes/Classifier/LSI.src/M000025.html
|
101
|
+
- doc/classes/Classifier/LSI.src/M000026.html
|
102
|
+
- doc/classes/Classifier/LSI.src/M000027.html
|
103
|
+
- doc/classes/Classifier/LSI.src/M000028.html
|
104
|
+
- doc/classes/Classifier/LSI.src/M000029.html
|
105
|
+
- doc/classes/Classifier/LSI.src/M000030.html
|
106
|
+
- doc/classes/Classifier/LSI.src/M000031.html
|
107
|
+
- doc/classes/Classifier/LSI.src/M000032.html
|
108
|
+
- doc/classes/Classifier/LSI.src/M000033.html
|
109
|
+
- doc/classes/Classifier/LSI.src/M000034.html
|
110
|
+
- doc/classes/Classifier/LSI.src/M000035.html
|
111
|
+
- doc/classes/Classifier/LSI.src/M000036.html
|
112
|
+
- doc/classes/Classifier/LSI.src/M000037.html
|
113
|
+
- doc/classes/Classifier/WordList.src/M000017.html
|
114
|
+
- doc/classes/Classifier/WordList.src/M000018.html
|
115
|
+
- doc/classes/Classifier/WordList.src/M000019.html
|
116
|
+
- doc/classes/Classifier/WordList.src/M000020.html
|
117
|
+
- doc/classes/Classifier/WordList.src/M000021.html
|
118
|
+
- doc/classes/GSL/Matrix.html
|
103
119
|
- doc/classes/GSL/Vector.html
|
104
120
|
- doc/classes/GSL/Vector.src
|
105
|
-
- doc/classes/GSL/Vector.src/
|
106
|
-
- doc/classes/GSL/Vector.src/
|
107
|
-
- doc/classes/
|
108
|
-
- doc/classes/
|
109
|
-
- doc/classes/
|
110
|
-
- doc/classes/
|
121
|
+
- doc/classes/GSL/Vector.src/M000015.html
|
122
|
+
- doc/classes/GSL/Vector.src/M000016.html
|
123
|
+
- doc/classes/Matrix.src/M000004.html
|
124
|
+
- doc/classes/Matrix.src/M000005.html
|
125
|
+
- doc/classes/Matrix.src/M000006.html
|
126
|
+
- doc/classes/Object.src/M000007.html
|
127
|
+
- doc/classes/String.src/M000008.html
|
128
|
+
- doc/classes/String.src/M000009.html
|
129
|
+
- doc/classes/String.src/M000010.html
|
130
|
+
- doc/classes/String.src/M000011.html
|
131
|
+
- doc/classes/String.src/M000012.html
|
132
|
+
- doc/classes/String.src/M000013.html
|
133
|
+
- doc/classes/String.src/M000014.html
|
134
|
+
- doc/classes/Vector.src/M000001.html
|
135
|
+
- doc/classes/Vector.src/M000002.html
|
111
136
|
- doc/files/lib
|
112
137
|
- doc/files/README.html
|
113
138
|
- doc/files/lib/classifier
|
@@ -116,11 +141,13 @@ files:
|
|
116
141
|
- doc/files/lib/classifier/extensions
|
117
142
|
- doc/files/lib/classifier/lsi
|
118
143
|
- doc/files/lib/classifier/lsi_rb.html
|
119
|
-
- doc/files/lib/classifier/
|
144
|
+
- doc/files/lib/classifier/extensions/string_rb.html
|
145
|
+
- doc/files/lib/classifier/extensions/vector_rb.html
|
120
146
|
- doc/files/lib/classifier/extensions/vector_serialize_rb.html
|
121
147
|
- doc/files/lib/classifier/extensions/word_hash_rb.html
|
122
|
-
- doc/files/lib/classifier/extensions/word_list_rb.html
|
123
148
|
- doc/files/lib/classifier/lsi/content_node_rb.html
|
149
|
+
- doc/files/lib/classifier/lsi/summary_rb.html
|
150
|
+
- doc/files/lib/classifier/lsi/word_list_rb.html
|
124
151
|
test_files: []
|
125
152
|
rdoc_options: []
|
126
153
|
extra_rdoc_files: []
|
@@ -1,21 +0,0 @@
|
|
1
|
-
<?xml version="1.0" encoding="iso-8859-1"?>
|
2
|
-
<!DOCTYPE html
|
3
|
-
PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
|
4
|
-
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
5
|
-
|
6
|
-
<html>
|
7
|
-
<head>
|
8
|
-
<title>new (Classifier::ContentNode)</title>
|
9
|
-
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
|
10
|
-
<link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" />
|
11
|
-
</head>
|
12
|
-
<body class="standalone-code">
|
13
|
-
<pre><span class="ruby-comment cmt"># File lib/classifier/lsi/content_node.rb, line 19</span>
|
14
|
-
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">initialize</span>( <span class="ruby-identifier">source</span>, <span class="ruby-identifier">categories</span>=<span class="ruby-keyword kw">nil</span>, <span class="ruby-identifier">text_proc</span>=<span class="ruby-keyword kw">nil</span> )
|
15
|
-
<span class="ruby-identifier">text_proc</span> = <span class="ruby-identifier">text_proc</span> <span class="ruby-operator">||</span> (<span class="ruby-identifier">proc</span> {<span class="ruby-operator">|</span><span class="ruby-identifier">x</span><span class="ruby-operator">|</span> <span class="ruby-identifier">x</span>.<span class="ruby-identifier">to_s</span>})
|
16
|
-
<span class="ruby-ivar">@categories</span> = <span class="ruby-identifier">categories</span> <span class="ruby-operator">||</span> []
|
17
|
-
<span class="ruby-ivar">@source</span> = <span class="ruby-identifier">source</span>
|
18
|
-
<span class="ruby-ivar">@word_hash</span> = <span class="ruby-identifier">text_proc</span>.<span class="ruby-identifier">call</span>( <span class="ruby-ivar">@source</span> ).<span class="ruby-identifier">clean_word_hash</span>
|
19
|
-
<span class="ruby-keyword kw">end</span></pre>
|
20
|
-
</body>
|
21
|
-
</html>
|
@@ -1,41 +0,0 @@
|
|
1
|
-
<?xml version="1.0" encoding="iso-8859-1"?>
|
2
|
-
<!DOCTYPE html
|
3
|
-
PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
|
4
|
-
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
5
|
-
|
6
|
-
<html>
|
7
|
-
<head>
|
8
|
-
<title>raw_vector_with (Classifier::ContentNode)</title>
|
9
|
-
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
|
10
|
-
<link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" />
|
11
|
-
</head>
|
12
|
-
<body class="standalone-code">
|
13
|
-
<pre><span class="ruby-comment cmt"># File lib/classifier/lsi/content_node.rb, line 38</span>
|
14
|
-
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">raw_vector_with</span>( <span class="ruby-identifier">word_list</span> )
|
15
|
-
<span class="ruby-identifier">vec</span> = <span class="ruby-constant">Array</span>.<span class="ruby-identifier">new</span>(<span class="ruby-identifier">word_list</span>.<span class="ruby-identifier">size</span>, <span class="ruby-value">0</span>)
|
16
|
-
|
17
|
-
<span class="ruby-ivar">@word_hash</span>.<span class="ruby-identifier">each_key</span> <span class="ruby-keyword kw">do</span> <span class="ruby-operator">|</span><span class="ruby-identifier">word</span><span class="ruby-operator">|</span>
|
18
|
-
<span class="ruby-identifier">vec</span>[<span class="ruby-identifier">word_list</span>[<span class="ruby-identifier">word</span>]] = <span class="ruby-ivar">@word_hash</span>[<span class="ruby-identifier">word</span>] <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">word_list</span>[<span class="ruby-identifier">word</span>]
|
19
|
-
<span class="ruby-keyword kw">end</span>
|
20
|
-
|
21
|
-
<span class="ruby-comment cmt"># Perform the scaling transform</span>
|
22
|
-
<span class="ruby-identifier">total_words</span> = <span class="ruby-identifier">vec</span>.<span class="ruby-identifier">inject</span>(<span class="ruby-value">0</span>) { <span class="ruby-operator">|</span><span class="ruby-identifier">sum</span>,<span class="ruby-identifier">term</span><span class="ruby-operator">|</span> <span class="ruby-identifier">sum</span> <span class="ruby-operator">+=</span> <span class="ruby-identifier">term</span> }.<span class="ruby-identifier">to_f</span>
|
23
|
-
|
24
|
-
<span class="ruby-comment cmt"># Perform first-order association transform if this vector has more</span>
|
25
|
-
<span class="ruby-comment cmt"># than one word in it. </span>
|
26
|
-
<span class="ruby-keyword kw">if</span> <span class="ruby-identifier">total_words</span> <span class="ruby-operator">></span> <span class="ruby-value">1.0</span>
|
27
|
-
<span class="ruby-identifier">weighted_total</span> = <span class="ruby-identifier">vec</span>.<span class="ruby-identifier">inject</span>(<span class="ruby-value">0</span><span class="ruby-value">.0</span>) <span class="ruby-keyword kw">do</span> <span class="ruby-operator">|</span><span class="ruby-identifier">sum</span>,<span class="ruby-identifier">term</span><span class="ruby-operator">|</span>
|
28
|
-
<span class="ruby-keyword kw">if</span>( <span class="ruby-identifier">term</span> <span class="ruby-operator">></span> <span class="ruby-value">0</span> )
|
29
|
-
<span class="ruby-identifier">sum</span> <span class="ruby-operator">+=</span> (( <span class="ruby-identifier">term</span> <span class="ruby-operator">/</span> <span class="ruby-identifier">total_words</span> ) <span class="ruby-operator">*</span> <span class="ruby-constant">Math</span>.<span class="ruby-identifier">log</span>( <span class="ruby-identifier">term</span> <span class="ruby-operator">/</span> <span class="ruby-identifier">total_words</span> ))
|
30
|
-
<span class="ruby-keyword kw">else</span>
|
31
|
-
<span class="ruby-identifier">sum</span>
|
32
|
-
<span class="ruby-keyword kw">end</span>
|
33
|
-
<span class="ruby-keyword kw">end</span>
|
34
|
-
<span class="ruby-identifier">vec</span>.<span class="ruby-identifier">map!</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">val</span><span class="ruby-operator">|</span> <span class="ruby-constant">Math</span>.<span class="ruby-identifier">log</span>( <span class="ruby-identifier">val</span> <span class="ruby-operator">+</span> <span class="ruby-value">1</span> ) <span class="ruby-operator">/</span> <span class="ruby-operator">-</span><span class="ruby-identifier">weighted_total</span> }
|
35
|
-
<span class="ruby-keyword kw">end</span>
|
36
|
-
|
37
|
-
<span class="ruby-ivar">@raw_norm</span> = <span class="ruby-constant">GSL</span><span class="ruby-operator">::</span><span class="ruby-constant">Vector</span>.<span class="ruby-identifier">new</span>( <span class="ruby-identifier">vec</span> ).<span class="ruby-identifier">normalize</span>
|
38
|
-
<span class="ruby-ivar">@raw_vector</span> = <span class="ruby-constant">GSL</span><span class="ruby-operator">::</span><span class="ruby-constant">Vector</span>.<span class="ruby-identifier">new</span>( <span class="ruby-identifier">vec</span> )
|
39
|
-
<span class="ruby-keyword kw">end</span></pre>
|
40
|
-
</body>
|
41
|
-
</html>
|
@@ -1,20 +0,0 @@
|
|
1
|
-
<?xml version="1.0" encoding="iso-8859-1"?>
|
2
|
-
<!DOCTYPE html
|
3
|
-
PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
|
4
|
-
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
5
|
-
|
6
|
-
<html>
|
7
|
-
<head>
|
8
|
-
<title>new (Classifier::LSI)</title>
|
9
|
-
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
|
10
|
-
<link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" />
|
11
|
-
</head>
|
12
|
-
<body class="standalone-code">
|
13
|
-
<pre><span class="ruby-comment cmt"># File lib/classifier/lsi.rb, line 26</span>
|
14
|
-
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">initialize</span>(<span class="ruby-identifier">options</span> = {})
|
15
|
-
<span class="ruby-ivar">@auto_rebuild</span> = <span class="ruby-keyword kw">true</span> <span class="ruby-keyword kw">unless</span> <span class="ruby-identifier">options</span>[<span class="ruby-identifier">:auto_rebuild</span>] <span class="ruby-operator">==</span> <span class="ruby-keyword kw">false</span>
|
16
|
-
<span class="ruby-ivar">@word_list</span>, <span class="ruby-ivar">@items</span> = <span class="ruby-constant">WordList</span>.<span class="ruby-identifier">new</span>, {}
|
17
|
-
<span class="ruby-ivar">@version</span>, <span class="ruby-ivar">@built_at_version</span> = <span class="ruby-value">0</span>, <span class="ruby-value">-1</span>
|
18
|
-
<span class="ruby-keyword kw">end</span></pre>
|
19
|
-
</body>
|
20
|
-
</html>
|