classifier 1.2.0 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +361 -273
- data/README +6 -5
- data/Rakefile +12 -2
- data/bin/summarize.rb +11 -0
- data/doc/classes/Array.html +139 -0
- data/doc/classes/Array.src/M000003.html +18 -0
- data/doc/classes/Classifier.html +5 -5
- data/doc/classes/Classifier/Bayes.html +43 -43
- data/doc/classes/Classifier/Bayes.src/{M000023.html → M000038.html} +0 -0
- data/doc/classes/Classifier/Bayes.src/{M000024.html → M000039.html} +0 -0
- data/doc/classes/Classifier/Bayes.src/{M000025.html → M000040.html} +0 -0
- data/doc/classes/Classifier/Bayes.src/{M000026.html → M000041.html} +0 -0
- data/doc/classes/Classifier/Bayes.src/{M000027.html → M000042.html} +0 -0
- data/doc/classes/Classifier/Bayes.src/{M000028.html → M000043.html} +0 -0
- data/doc/classes/Classifier/Bayes.src/{M000029.html → M000044.html} +0 -0
- data/doc/classes/Classifier/ContentNode.html +23 -28
- data/doc/classes/Classifier/ContentNode.src/M000046.html +19 -0
- data/doc/classes/Classifier/ContentNode.src/{M000032.html → M000047.html} +1 -1
- data/doc/classes/Classifier/ContentNode.src/{M000033.html → M000048.html} +1 -1
- data/doc/classes/Classifier/ContentNode.src/M000049.html +49 -0
- data/doc/classes/Classifier/LSI.html +158 -68
- data/doc/classes/Classifier/LSI.src/M000022.html +6 -17
- data/doc/classes/Classifier/LSI.src/{M000012.html → M000023.html} +2 -2
- data/doc/classes/Classifier/LSI.src/{M000013.html → M000024.html} +3 -2
- data/doc/classes/Classifier/LSI.src/{M000014.html → M000025.html} +1 -1
- data/doc/classes/Classifier/LSI.src/M000026.html +19 -0
- data/doc/classes/Classifier/LSI.src/{M000015.html → M000027.html} +1 -1
- data/doc/classes/Classifier/LSI.src/{M000016.html → M000028.html} +1 -1
- data/doc/classes/Classifier/LSI.src/M000029.html +19 -0
- data/doc/classes/Classifier/LSI.src/M000030.html +43 -0
- data/doc/classes/Classifier/LSI.src/M000031.html +23 -0
- data/doc/classes/Classifier/LSI.src/{M000018.html → M000032.html} +7 -3
- data/doc/classes/Classifier/LSI.src/{M000019.html → M000033.html} +6 -2
- data/doc/classes/Classifier/LSI.src/{M000020.html → M000034.html} +2 -4
- data/doc/classes/Classifier/LSI.src/{M000021.html → M000035.html} +1 -1
- data/doc/classes/Classifier/LSI.src/M000036.html +31 -0
- data/doc/classes/Classifier/LSI.src/M000037.html +21 -0
- data/doc/classes/Classifier/WordList.html +37 -22
- data/doc/classes/Classifier/WordList.src/{M000007.html → M000017.html} +2 -2
- data/doc/classes/Classifier/WordList.src/{M000008.html → M000018.html} +1 -1
- data/doc/classes/Classifier/WordList.src/{M000009.html → M000019.html} +1 -1
- data/doc/classes/Classifier/WordList.src/M000020.html +18 -0
- data/doc/classes/Classifier/WordList.src/{M000010.html → M000021.html} +1 -1
- data/doc/classes/GSL.html +2 -1
- data/doc/classes/GSL/Matrix.html +126 -0
- data/doc/classes/GSL/Vector.html +10 -10
- data/doc/classes/GSL/Vector.src/{M000005.html → M000015.html} +0 -0
- data/doc/classes/GSL/Vector.src/{M000006.html → M000016.html} +0 -0
- data/doc/classes/Matrix.html +184 -0
- data/doc/classes/Matrix.src/M000004.html +18 -0
- data/doc/classes/Matrix.src/M000005.html +76 -0
- data/doc/classes/Matrix.src/M000006.html +18 -0
- data/doc/classes/Object.html +7 -7
- data/doc/classes/Object.src/{M000001.html → M000007.html} +1 -1
- data/doc/classes/String.html +90 -20
- data/doc/classes/String.src/{M000002.html → M000008.html} +0 -0
- data/doc/classes/String.src/{M000003.html → M000009.html} +0 -0
- data/doc/classes/String.src/{M000004.html → M000010.html} +0 -0
- data/doc/classes/String.src/M000011.html +18 -0
- data/doc/classes/String.src/M000012.html +18 -0
- data/doc/classes/String.src/M000013.html +18 -0
- data/doc/classes/String.src/M000014.html +18 -0
- data/doc/classes/Vector.html +154 -0
- data/doc/classes/Vector.src/M000001.html +22 -0
- data/doc/classes/Vector.src/M000002.html +25 -0
- data/doc/created.rid +1 -1
- data/doc/files/README.html +14 -8
- data/doc/files/lib/classifier/bayes_rb.html +1 -1
- data/doc/files/lib/classifier/{string_extensions_rb.html → extensions/string_rb.html} +5 -5
- data/doc/files/lib/classifier/extensions/vector_rb.html +120 -0
- data/doc/files/lib/classifier/extensions/vector_serialize_rb.html +1 -1
- data/doc/files/lib/classifier/extensions/word_hash_rb.html +1 -1
- data/doc/files/lib/classifier/lsi/content_node_rb.html +2 -2
- data/doc/files/lib/classifier/lsi/summary_rb.html +115 -0
- data/doc/files/lib/classifier/{extensions → lsi}/word_list_rb.html +3 -3
- data/doc/files/lib/classifier/lsi_rb.html +5 -3
- data/doc/files/lib/classifier_rb.html +2 -2
- data/doc/fr_class_index.html +4 -0
- data/doc/fr_file_index.html +4 -2
- data/doc/fr_method_index.html +49 -34
- data/doc/index.html +2 -2
- data/lib/classifier.rb +1 -1
- data/lib/classifier/{string_extensions.rb → extensions/string.rb} +0 -0
- data/lib/classifier/extensions/vector.rb +106 -0
- data/lib/classifier/extensions/vector_serialize.rb +6 -0
- data/lib/classifier/lsi.rb +101 -31
- data/lib/classifier/lsi/content_node.rb +28 -23
- data/lib/classifier/lsi/summary.rb +31 -0
- data/lib/classifier/{extensions → lsi}/word_list.rb +7 -2
- data/test/{string_extensions → extensions}/word_hash_test.rb +0 -0
- data/test/lsi/lsi_test.rb +36 -1
- metadata +68 -41
- data/doc/classes/Classifier/ContentNode.src/M000031.html +0 -21
- data/doc/classes/Classifier/ContentNode.src/M000034.html +0 -41
- data/doc/classes/Classifier/LSI.src/M000011.html +0 -20
- data/doc/classes/Classifier/LSI.src/M000017.html +0 -32
|
@@ -1,26 +1,23 @@
|
|
|
1
1
|
# Author:: David Fayram (mailto:dfayram@lensmen.net)
|
|
2
2
|
# Copyright:: Copyright (c) 2005 David Fayram II
|
|
3
|
-
# License::
|
|
3
|
+
# License:: LGPL
|
|
4
4
|
|
|
5
5
|
module Classifier
|
|
6
6
|
|
|
7
|
-
|
|
8
7
|
# This is an internal data structure class for the LSI node. Save for
|
|
9
8
|
# raw_vector_with, it should be fairly straightforward to understand.
|
|
10
9
|
# You should never have to use it directly.
|
|
11
10
|
class ContentNode
|
|
12
|
-
attr_accessor :
|
|
13
|
-
|
|
14
|
-
:categories
|
|
15
|
-
|
|
16
|
-
|
|
11
|
+
attr_accessor :raw_vector, :raw_norm,
|
|
12
|
+
:lsi_vector, :lsi_norm,
|
|
13
|
+
:categories
|
|
14
|
+
|
|
15
|
+
attr_reader :word_hash
|
|
17
16
|
# If text_proc is not specified, the source will be duck-typed
|
|
18
17
|
# via source.to_s
|
|
19
|
-
def initialize(
|
|
20
|
-
text_proc = text_proc || (proc {|x| x.to_s})
|
|
18
|
+
def initialize( word_hash, *categories )
|
|
21
19
|
@categories = categories || []
|
|
22
|
-
@
|
|
23
|
-
@word_hash = text_proc.call( @source ).clean_word_hash
|
|
20
|
+
@word_hash = word_hash
|
|
24
21
|
end
|
|
25
22
|
|
|
26
23
|
# Use this to fetch the appropriate search vector.
|
|
@@ -36,32 +33,40 @@ module Classifier
|
|
|
36
33
|
# Creates the raw vector out of word_hash using word_list as the
|
|
37
34
|
# key for mapping the vector space.
|
|
38
35
|
def raw_vector_with( word_list )
|
|
39
|
-
|
|
36
|
+
if $GSL
|
|
37
|
+
vec = Vector.new(word_list.size)
|
|
38
|
+
else
|
|
39
|
+
vec = Array.new(word_list.size, 0)
|
|
40
|
+
end
|
|
40
41
|
|
|
41
42
|
@word_hash.each_key do |word|
|
|
42
43
|
vec[word_list[word]] = @word_hash[word] if word_list[word]
|
|
43
44
|
end
|
|
44
45
|
|
|
45
46
|
# Perform the scaling transform
|
|
46
|
-
total_words = vec.
|
|
47
|
+
total_words = vec.sum
|
|
47
48
|
|
|
48
49
|
# Perform first-order association transform if this vector has more
|
|
49
50
|
# than one word in it.
|
|
50
51
|
if total_words > 1.0
|
|
51
|
-
weighted_total =
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
sum
|
|
52
|
+
weighted_total = 0.0
|
|
53
|
+
vec.each do |term|
|
|
54
|
+
if ( term > 0 )
|
|
55
|
+
weighted_total += (( term / total_words ) * Math.log( term / total_words ))
|
|
56
56
|
end
|
|
57
57
|
end
|
|
58
|
-
vec.
|
|
58
|
+
vec = vec.collect { |val| Math.log( val + 1 ) / -weighted_total }
|
|
59
|
+
end
|
|
60
|
+
|
|
61
|
+
if $GSL
|
|
62
|
+
@raw_norm = vec.normalize
|
|
63
|
+
@raw_vector = vec
|
|
64
|
+
else
|
|
65
|
+
@raw_norm = Vector[*vec].normalize
|
|
66
|
+
@raw_vector = Vector[*vec]
|
|
59
67
|
end
|
|
60
|
-
|
|
61
|
-
@raw_norm = GSL::Vector.new( vec ).normalize
|
|
62
|
-
@raw_vector = GSL::Vector.new( vec )
|
|
63
68
|
end
|
|
64
69
|
|
|
65
70
|
end
|
|
66
71
|
|
|
67
|
-
end
|
|
72
|
+
end
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
# Author:: Lucas Carlson (mailto:lucas@rufy.com)
|
|
2
|
+
# Copyright:: Copyright (c) 2005 Lucas Carlson
|
|
3
|
+
# License:: LGPL
|
|
4
|
+
|
|
5
|
+
class String
|
|
6
|
+
def summary( count=10, separator=" [...] " )
|
|
7
|
+
perform_lsi split_sentences, count, separator
|
|
8
|
+
end
|
|
9
|
+
|
|
10
|
+
def paragraph_summary( count=1, separator=" [...] " )
|
|
11
|
+
perform_lsi split_paragraphs, count, separator
|
|
12
|
+
end
|
|
13
|
+
|
|
14
|
+
def split_sentences
|
|
15
|
+
split /(\.|\!|\?)/ # TODO: make this less primitive
|
|
16
|
+
end
|
|
17
|
+
|
|
18
|
+
def split_paragraphs
|
|
19
|
+
split /(\n\n|\r\r|\r\n\r\n)/ # TODO: make this less primitive
|
|
20
|
+
end
|
|
21
|
+
|
|
22
|
+
private
|
|
23
|
+
|
|
24
|
+
def perform_lsi(chunks, count, separator)
|
|
25
|
+
lsi = Classifier::LSI.new :auto_rebuild => false
|
|
26
|
+
chunks.each { |chunk| lsi << chunk unless chunk.strip.empty? || chunk.strip.split.size == 1 }
|
|
27
|
+
lsi.build_index
|
|
28
|
+
summaries = lsi.highest_relative_content count
|
|
29
|
+
return summaries.reject { |chunk| !summaries.include? chunk }.map { |x| x.strip }.join(separator)
|
|
30
|
+
end
|
|
31
|
+
end
|
|
@@ -1,13 +1,14 @@
|
|
|
1
1
|
# Author:: David Fayram (mailto:dfayram@lensmen.net)
|
|
2
2
|
# Copyright:: Copyright (c) 2005 David Fayram II
|
|
3
|
-
# License::
|
|
3
|
+
# License:: LGPL
|
|
4
4
|
|
|
5
5
|
module Classifier
|
|
6
6
|
# This class keeps a word => index mapping. It is used to map stemmed words
|
|
7
7
|
# to dimensions of a vector.
|
|
8
|
+
|
|
8
9
|
class WordList
|
|
9
10
|
def initialize
|
|
10
|
-
@location_table =
|
|
11
|
+
@location_table = Hash.new
|
|
11
12
|
end
|
|
12
13
|
|
|
13
14
|
# Adds a word (if it is new) and assigns it a unique dimension.
|
|
@@ -22,6 +23,10 @@ module Classifier
|
|
|
22
23
|
@location_table[term]
|
|
23
24
|
end
|
|
24
25
|
|
|
26
|
+
def word_for_index(ind)
|
|
27
|
+
@location_table.invert[ind]
|
|
28
|
+
end
|
|
29
|
+
|
|
25
30
|
# Returns the number of words mapped.
|
|
26
31
|
def size
|
|
27
32
|
@location_table.size
|
|
File without changes
|
data/test/lsi/lsi_test.rb
CHANGED
|
@@ -3,7 +3,7 @@ class LSITest < Test::Unit::TestCase
|
|
|
3
3
|
def setup
|
|
4
4
|
# we repeat principle words to help weight them.
|
|
5
5
|
# This test is rather delicate, since this system is mostly noise.
|
|
6
|
-
|
|
6
|
+
@str1 = "This text deals with dogs. Dogs."
|
|
7
7
|
@str2 = "This text involves dogs too. Dogs! "
|
|
8
8
|
@str3 = "This text revolves around cats. Cats."
|
|
9
9
|
@str4 = "This text also involves cats. Cats!"
|
|
@@ -23,6 +23,7 @@ class LSITest < Test::Unit::TestCase
|
|
|
23
23
|
def test_not_auto_rebuild
|
|
24
24
|
lsi = Classifier::LSI.new :auto_rebuild => false
|
|
25
25
|
lsi.add_item @str1, "Dog"
|
|
26
|
+
lsi.add_item @str2, "Dog"
|
|
26
27
|
assert lsi.needs_rebuild?
|
|
27
28
|
lsi.build_index
|
|
28
29
|
assert ! lsi.needs_rebuild?
|
|
@@ -57,6 +58,25 @@ class LSITest < Test::Unit::TestCase
|
|
|
57
58
|
assert_not_equal "Dog", bayes.classify( tricky_case )
|
|
58
59
|
end
|
|
59
60
|
|
|
61
|
+
def test_recategorize_interface
|
|
62
|
+
lsi = Classifier::LSI.new
|
|
63
|
+
lsi.add_item @str1, "Dog"
|
|
64
|
+
lsi.add_item @str2, "Dog"
|
|
65
|
+
lsi.add_item @str3, "Cat"
|
|
66
|
+
lsi.add_item @str4, "Cat"
|
|
67
|
+
lsi.add_item @str5, "Bird"
|
|
68
|
+
|
|
69
|
+
tricky_case = "This text revolves around dogs."
|
|
70
|
+
assert_equal "Dog", lsi.classify( tricky_case )
|
|
71
|
+
|
|
72
|
+
# Recategorize as needed.
|
|
73
|
+
lsi.categories_for(@str1).clear.push "Cow"
|
|
74
|
+
lsi.categories_for(@str2).clear.push "Cow"
|
|
75
|
+
|
|
76
|
+
assert !lsi.needs_rebuild?
|
|
77
|
+
assert_equal "Cow", lsi.classify( tricky_case )
|
|
78
|
+
end
|
|
79
|
+
|
|
60
80
|
def test_search
|
|
61
81
|
lsi = Classifier::LSI.new
|
|
62
82
|
[@str1, @str2, @str3, @str4, @str5].each { |x| lsi << x }
|
|
@@ -85,4 +105,19 @@ class LSITest < Test::Unit::TestCase
|
|
|
85
105
|
assert_equal lsi_m.find_related(@str1, 3), lsi.find_related(@str1, 3)
|
|
86
106
|
end
|
|
87
107
|
|
|
108
|
+
def test_keyword_search
|
|
109
|
+
lsi = Classifier::LSI.new
|
|
110
|
+
lsi.add_item @str1, "Dog"
|
|
111
|
+
lsi.add_item @str2, "Dog"
|
|
112
|
+
lsi.add_item @str3, "Cat"
|
|
113
|
+
lsi.add_item @str4, "Cat"
|
|
114
|
+
lsi.add_item @str5, "Bird"
|
|
115
|
+
|
|
116
|
+
assert_equal [:dog, :text, :deal], lsi.highest_ranked_stems(@str1)
|
|
117
|
+
end
|
|
118
|
+
|
|
119
|
+
def test_summary
|
|
120
|
+
assert_equal "This text involves dogs too [...] This text also involves cats", [@str1, @str2, @str3, @str4, @str5].join.summary(2)
|
|
121
|
+
end
|
|
122
|
+
|
|
88
123
|
end
|
metadata
CHANGED
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
|
-
rubygems_version: 0.8.
|
|
2
|
+
rubygems_version: 0.8.10
|
|
3
3
|
specification_version: 1
|
|
4
4
|
name: classifier
|
|
5
5
|
version: !ruby/object:Gem::Version
|
|
6
|
-
version: 1.
|
|
7
|
-
date: 2005-
|
|
6
|
+
version: 1.3.0
|
|
7
|
+
date: 2005-05-05
|
|
8
8
|
summary: A general classifier module to allow Bayesian and other types of classifications.
|
|
9
9
|
require_paths:
|
|
10
10
|
- lib
|
|
@@ -33,19 +33,22 @@ files:
|
|
|
33
33
|
- lib/classifier/extensions
|
|
34
34
|
- lib/classifier/lsi
|
|
35
35
|
- lib/classifier/lsi.rb
|
|
36
|
-
- lib/classifier/
|
|
36
|
+
- lib/classifier/extensions/string.rb
|
|
37
|
+
- lib/classifier/extensions/vector.rb
|
|
37
38
|
- lib/classifier/extensions/vector_serialize.rb
|
|
38
39
|
- lib/classifier/extensions/word_hash.rb
|
|
39
|
-
- lib/classifier/extensions/word_list.rb
|
|
40
40
|
- lib/classifier/lsi/content_node.rb
|
|
41
|
+
- lib/classifier/lsi/summary.rb
|
|
42
|
+
- lib/classifier/lsi/word_list.rb
|
|
41
43
|
- bin/bayes.rb
|
|
44
|
+
- bin/summarize.rb
|
|
42
45
|
- test/bayes
|
|
46
|
+
- test/extensions
|
|
43
47
|
- test/lsi
|
|
44
|
-
- test/string_extensions
|
|
45
48
|
- test/test_helper.rb
|
|
46
49
|
- test/bayes/bayesian_test.rb
|
|
50
|
+
- test/extensions/word_hash_test.rb
|
|
47
51
|
- test/lsi/lsi_test.rb
|
|
48
|
-
- test/string_extensions/word_hash_test.rb
|
|
49
52
|
- LICENSE
|
|
50
53
|
- Rakefile
|
|
51
54
|
- README
|
|
@@ -57,14 +60,21 @@ files:
|
|
|
57
60
|
- doc/fr_method_index.html
|
|
58
61
|
- doc/index.html
|
|
59
62
|
- doc/rdoc-style.css
|
|
63
|
+
- doc/classes/Array.html
|
|
64
|
+
- doc/classes/Array.src
|
|
60
65
|
- doc/classes/Classifier
|
|
61
66
|
- doc/classes/Classifier.html
|
|
62
67
|
- doc/classes/GSL
|
|
63
68
|
- doc/classes/GSL.html
|
|
69
|
+
- doc/classes/Matrix.html
|
|
70
|
+
- doc/classes/Matrix.src
|
|
64
71
|
- doc/classes/Object.html
|
|
65
72
|
- doc/classes/Object.src
|
|
66
73
|
- doc/classes/String.html
|
|
67
74
|
- doc/classes/String.src
|
|
75
|
+
- doc/classes/Vector.html
|
|
76
|
+
- doc/classes/Vector.src
|
|
77
|
+
- doc/classes/Array.src/M000003.html
|
|
68
78
|
- doc/classes/Classifier/Bayes.html
|
|
69
79
|
- doc/classes/Classifier/Bayes.src
|
|
70
80
|
- doc/classes/Classifier/ContentNode.html
|
|
@@ -73,41 +83,56 @@ files:
|
|
|
73
83
|
- doc/classes/Classifier/LSI.src
|
|
74
84
|
- doc/classes/Classifier/WordList.html
|
|
75
85
|
- doc/classes/Classifier/WordList.src
|
|
76
|
-
- doc/classes/Classifier/Bayes.src/
|
|
77
|
-
- doc/classes/Classifier/Bayes.src/
|
|
78
|
-
- doc/classes/Classifier/Bayes.src/
|
|
79
|
-
- doc/classes/Classifier/Bayes.src/
|
|
80
|
-
- doc/classes/Classifier/Bayes.src/
|
|
81
|
-
- doc/classes/Classifier/Bayes.src/
|
|
82
|
-
- doc/classes/Classifier/Bayes.src/
|
|
83
|
-
- doc/classes/Classifier/ContentNode.src/
|
|
84
|
-
- doc/classes/Classifier/ContentNode.src/
|
|
85
|
-
- doc/classes/Classifier/ContentNode.src/
|
|
86
|
-
- doc/classes/Classifier/ContentNode.src/
|
|
87
|
-
- doc/classes/Classifier/LSI.src/M000011.html
|
|
88
|
-
- doc/classes/Classifier/LSI.src/M000012.html
|
|
89
|
-
- doc/classes/Classifier/LSI.src/M000013.html
|
|
90
|
-
- doc/classes/Classifier/LSI.src/M000014.html
|
|
91
|
-
- doc/classes/Classifier/LSI.src/M000015.html
|
|
92
|
-
- doc/classes/Classifier/LSI.src/M000016.html
|
|
93
|
-
- doc/classes/Classifier/LSI.src/M000017.html
|
|
94
|
-
- doc/classes/Classifier/LSI.src/M000018.html
|
|
95
|
-
- doc/classes/Classifier/LSI.src/M000019.html
|
|
96
|
-
- doc/classes/Classifier/LSI.src/M000020.html
|
|
97
|
-
- doc/classes/Classifier/LSI.src/M000021.html
|
|
86
|
+
- doc/classes/Classifier/Bayes.src/M000038.html
|
|
87
|
+
- doc/classes/Classifier/Bayes.src/M000039.html
|
|
88
|
+
- doc/classes/Classifier/Bayes.src/M000040.html
|
|
89
|
+
- doc/classes/Classifier/Bayes.src/M000041.html
|
|
90
|
+
- doc/classes/Classifier/Bayes.src/M000042.html
|
|
91
|
+
- doc/classes/Classifier/Bayes.src/M000043.html
|
|
92
|
+
- doc/classes/Classifier/Bayes.src/M000044.html
|
|
93
|
+
- doc/classes/Classifier/ContentNode.src/M000046.html
|
|
94
|
+
- doc/classes/Classifier/ContentNode.src/M000047.html
|
|
95
|
+
- doc/classes/Classifier/ContentNode.src/M000048.html
|
|
96
|
+
- doc/classes/Classifier/ContentNode.src/M000049.html
|
|
98
97
|
- doc/classes/Classifier/LSI.src/M000022.html
|
|
99
|
-
- doc/classes/Classifier/
|
|
100
|
-
- doc/classes/Classifier/
|
|
101
|
-
- doc/classes/Classifier/
|
|
102
|
-
- doc/classes/Classifier/
|
|
98
|
+
- doc/classes/Classifier/LSI.src/M000023.html
|
|
99
|
+
- doc/classes/Classifier/LSI.src/M000024.html
|
|
100
|
+
- doc/classes/Classifier/LSI.src/M000025.html
|
|
101
|
+
- doc/classes/Classifier/LSI.src/M000026.html
|
|
102
|
+
- doc/classes/Classifier/LSI.src/M000027.html
|
|
103
|
+
- doc/classes/Classifier/LSI.src/M000028.html
|
|
104
|
+
- doc/classes/Classifier/LSI.src/M000029.html
|
|
105
|
+
- doc/classes/Classifier/LSI.src/M000030.html
|
|
106
|
+
- doc/classes/Classifier/LSI.src/M000031.html
|
|
107
|
+
- doc/classes/Classifier/LSI.src/M000032.html
|
|
108
|
+
- doc/classes/Classifier/LSI.src/M000033.html
|
|
109
|
+
- doc/classes/Classifier/LSI.src/M000034.html
|
|
110
|
+
- doc/classes/Classifier/LSI.src/M000035.html
|
|
111
|
+
- doc/classes/Classifier/LSI.src/M000036.html
|
|
112
|
+
- doc/classes/Classifier/LSI.src/M000037.html
|
|
113
|
+
- doc/classes/Classifier/WordList.src/M000017.html
|
|
114
|
+
- doc/classes/Classifier/WordList.src/M000018.html
|
|
115
|
+
- doc/classes/Classifier/WordList.src/M000019.html
|
|
116
|
+
- doc/classes/Classifier/WordList.src/M000020.html
|
|
117
|
+
- doc/classes/Classifier/WordList.src/M000021.html
|
|
118
|
+
- doc/classes/GSL/Matrix.html
|
|
103
119
|
- doc/classes/GSL/Vector.html
|
|
104
120
|
- doc/classes/GSL/Vector.src
|
|
105
|
-
- doc/classes/GSL/Vector.src/
|
|
106
|
-
- doc/classes/GSL/Vector.src/
|
|
107
|
-
- doc/classes/
|
|
108
|
-
- doc/classes/
|
|
109
|
-
- doc/classes/
|
|
110
|
-
- doc/classes/
|
|
121
|
+
- doc/classes/GSL/Vector.src/M000015.html
|
|
122
|
+
- doc/classes/GSL/Vector.src/M000016.html
|
|
123
|
+
- doc/classes/Matrix.src/M000004.html
|
|
124
|
+
- doc/classes/Matrix.src/M000005.html
|
|
125
|
+
- doc/classes/Matrix.src/M000006.html
|
|
126
|
+
- doc/classes/Object.src/M000007.html
|
|
127
|
+
- doc/classes/String.src/M000008.html
|
|
128
|
+
- doc/classes/String.src/M000009.html
|
|
129
|
+
- doc/classes/String.src/M000010.html
|
|
130
|
+
- doc/classes/String.src/M000011.html
|
|
131
|
+
- doc/classes/String.src/M000012.html
|
|
132
|
+
- doc/classes/String.src/M000013.html
|
|
133
|
+
- doc/classes/String.src/M000014.html
|
|
134
|
+
- doc/classes/Vector.src/M000001.html
|
|
135
|
+
- doc/classes/Vector.src/M000002.html
|
|
111
136
|
- doc/files/lib
|
|
112
137
|
- doc/files/README.html
|
|
113
138
|
- doc/files/lib/classifier
|
|
@@ -116,11 +141,13 @@ files:
|
|
|
116
141
|
- doc/files/lib/classifier/extensions
|
|
117
142
|
- doc/files/lib/classifier/lsi
|
|
118
143
|
- doc/files/lib/classifier/lsi_rb.html
|
|
119
|
-
- doc/files/lib/classifier/
|
|
144
|
+
- doc/files/lib/classifier/extensions/string_rb.html
|
|
145
|
+
- doc/files/lib/classifier/extensions/vector_rb.html
|
|
120
146
|
- doc/files/lib/classifier/extensions/vector_serialize_rb.html
|
|
121
147
|
- doc/files/lib/classifier/extensions/word_hash_rb.html
|
|
122
|
-
- doc/files/lib/classifier/extensions/word_list_rb.html
|
|
123
148
|
- doc/files/lib/classifier/lsi/content_node_rb.html
|
|
149
|
+
- doc/files/lib/classifier/lsi/summary_rb.html
|
|
150
|
+
- doc/files/lib/classifier/lsi/word_list_rb.html
|
|
124
151
|
test_files: []
|
|
125
152
|
rdoc_options: []
|
|
126
153
|
extra_rdoc_files: []
|
|
@@ -1,21 +0,0 @@
|
|
|
1
|
-
<?xml version="1.0" encoding="iso-8859-1"?>
|
|
2
|
-
<!DOCTYPE html
|
|
3
|
-
PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
|
|
4
|
-
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
|
5
|
-
|
|
6
|
-
<html>
|
|
7
|
-
<head>
|
|
8
|
-
<title>new (Classifier::ContentNode)</title>
|
|
9
|
-
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
|
|
10
|
-
<link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" />
|
|
11
|
-
</head>
|
|
12
|
-
<body class="standalone-code">
|
|
13
|
-
<pre><span class="ruby-comment cmt"># File lib/classifier/lsi/content_node.rb, line 19</span>
|
|
14
|
-
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">initialize</span>( <span class="ruby-identifier">source</span>, <span class="ruby-identifier">categories</span>=<span class="ruby-keyword kw">nil</span>, <span class="ruby-identifier">text_proc</span>=<span class="ruby-keyword kw">nil</span> )
|
|
15
|
-
<span class="ruby-identifier">text_proc</span> = <span class="ruby-identifier">text_proc</span> <span class="ruby-operator">||</span> (<span class="ruby-identifier">proc</span> {<span class="ruby-operator">|</span><span class="ruby-identifier">x</span><span class="ruby-operator">|</span> <span class="ruby-identifier">x</span>.<span class="ruby-identifier">to_s</span>})
|
|
16
|
-
<span class="ruby-ivar">@categories</span> = <span class="ruby-identifier">categories</span> <span class="ruby-operator">||</span> []
|
|
17
|
-
<span class="ruby-ivar">@source</span> = <span class="ruby-identifier">source</span>
|
|
18
|
-
<span class="ruby-ivar">@word_hash</span> = <span class="ruby-identifier">text_proc</span>.<span class="ruby-identifier">call</span>( <span class="ruby-ivar">@source</span> ).<span class="ruby-identifier">clean_word_hash</span>
|
|
19
|
-
<span class="ruby-keyword kw">end</span></pre>
|
|
20
|
-
</body>
|
|
21
|
-
</html>
|
|
@@ -1,41 +0,0 @@
|
|
|
1
|
-
<?xml version="1.0" encoding="iso-8859-1"?>
|
|
2
|
-
<!DOCTYPE html
|
|
3
|
-
PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
|
|
4
|
-
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
|
5
|
-
|
|
6
|
-
<html>
|
|
7
|
-
<head>
|
|
8
|
-
<title>raw_vector_with (Classifier::ContentNode)</title>
|
|
9
|
-
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
|
|
10
|
-
<link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" />
|
|
11
|
-
</head>
|
|
12
|
-
<body class="standalone-code">
|
|
13
|
-
<pre><span class="ruby-comment cmt"># File lib/classifier/lsi/content_node.rb, line 38</span>
|
|
14
|
-
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">raw_vector_with</span>( <span class="ruby-identifier">word_list</span> )
|
|
15
|
-
<span class="ruby-identifier">vec</span> = <span class="ruby-constant">Array</span>.<span class="ruby-identifier">new</span>(<span class="ruby-identifier">word_list</span>.<span class="ruby-identifier">size</span>, <span class="ruby-value">0</span>)
|
|
16
|
-
|
|
17
|
-
<span class="ruby-ivar">@word_hash</span>.<span class="ruby-identifier">each_key</span> <span class="ruby-keyword kw">do</span> <span class="ruby-operator">|</span><span class="ruby-identifier">word</span><span class="ruby-operator">|</span>
|
|
18
|
-
<span class="ruby-identifier">vec</span>[<span class="ruby-identifier">word_list</span>[<span class="ruby-identifier">word</span>]] = <span class="ruby-ivar">@word_hash</span>[<span class="ruby-identifier">word</span>] <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">word_list</span>[<span class="ruby-identifier">word</span>]
|
|
19
|
-
<span class="ruby-keyword kw">end</span>
|
|
20
|
-
|
|
21
|
-
<span class="ruby-comment cmt"># Perform the scaling transform</span>
|
|
22
|
-
<span class="ruby-identifier">total_words</span> = <span class="ruby-identifier">vec</span>.<span class="ruby-identifier">inject</span>(<span class="ruby-value">0</span>) { <span class="ruby-operator">|</span><span class="ruby-identifier">sum</span>,<span class="ruby-identifier">term</span><span class="ruby-operator">|</span> <span class="ruby-identifier">sum</span> <span class="ruby-operator">+=</span> <span class="ruby-identifier">term</span> }.<span class="ruby-identifier">to_f</span>
|
|
23
|
-
|
|
24
|
-
<span class="ruby-comment cmt"># Perform first-order association transform if this vector has more</span>
|
|
25
|
-
<span class="ruby-comment cmt"># than one word in it. </span>
|
|
26
|
-
<span class="ruby-keyword kw">if</span> <span class="ruby-identifier">total_words</span> <span class="ruby-operator">></span> <span class="ruby-value">1.0</span>
|
|
27
|
-
<span class="ruby-identifier">weighted_total</span> = <span class="ruby-identifier">vec</span>.<span class="ruby-identifier">inject</span>(<span class="ruby-value">0</span><span class="ruby-value">.0</span>) <span class="ruby-keyword kw">do</span> <span class="ruby-operator">|</span><span class="ruby-identifier">sum</span>,<span class="ruby-identifier">term</span><span class="ruby-operator">|</span>
|
|
28
|
-
<span class="ruby-keyword kw">if</span>( <span class="ruby-identifier">term</span> <span class="ruby-operator">></span> <span class="ruby-value">0</span> )
|
|
29
|
-
<span class="ruby-identifier">sum</span> <span class="ruby-operator">+=</span> (( <span class="ruby-identifier">term</span> <span class="ruby-operator">/</span> <span class="ruby-identifier">total_words</span> ) <span class="ruby-operator">*</span> <span class="ruby-constant">Math</span>.<span class="ruby-identifier">log</span>( <span class="ruby-identifier">term</span> <span class="ruby-operator">/</span> <span class="ruby-identifier">total_words</span> ))
|
|
30
|
-
<span class="ruby-keyword kw">else</span>
|
|
31
|
-
<span class="ruby-identifier">sum</span>
|
|
32
|
-
<span class="ruby-keyword kw">end</span>
|
|
33
|
-
<span class="ruby-keyword kw">end</span>
|
|
34
|
-
<span class="ruby-identifier">vec</span>.<span class="ruby-identifier">map!</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">val</span><span class="ruby-operator">|</span> <span class="ruby-constant">Math</span>.<span class="ruby-identifier">log</span>( <span class="ruby-identifier">val</span> <span class="ruby-operator">+</span> <span class="ruby-value">1</span> ) <span class="ruby-operator">/</span> <span class="ruby-operator">-</span><span class="ruby-identifier">weighted_total</span> }
|
|
35
|
-
<span class="ruby-keyword kw">end</span>
|
|
36
|
-
|
|
37
|
-
<span class="ruby-ivar">@raw_norm</span> = <span class="ruby-constant">GSL</span><span class="ruby-operator">::</span><span class="ruby-constant">Vector</span>.<span class="ruby-identifier">new</span>( <span class="ruby-identifier">vec</span> ).<span class="ruby-identifier">normalize</span>
|
|
38
|
-
<span class="ruby-ivar">@raw_vector</span> = <span class="ruby-constant">GSL</span><span class="ruby-operator">::</span><span class="ruby-constant">Vector</span>.<span class="ruby-identifier">new</span>( <span class="ruby-identifier">vec</span> )
|
|
39
|
-
<span class="ruby-keyword kw">end</span></pre>
|
|
40
|
-
</body>
|
|
41
|
-
</html>
|
|
@@ -1,20 +0,0 @@
|
|
|
1
|
-
<?xml version="1.0" encoding="iso-8859-1"?>
|
|
2
|
-
<!DOCTYPE html
|
|
3
|
-
PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
|
|
4
|
-
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
|
5
|
-
|
|
6
|
-
<html>
|
|
7
|
-
<head>
|
|
8
|
-
<title>new (Classifier::LSI)</title>
|
|
9
|
-
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
|
|
10
|
-
<link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" />
|
|
11
|
-
</head>
|
|
12
|
-
<body class="standalone-code">
|
|
13
|
-
<pre><span class="ruby-comment cmt"># File lib/classifier/lsi.rb, line 26</span>
|
|
14
|
-
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">initialize</span>(<span class="ruby-identifier">options</span> = {})
|
|
15
|
-
<span class="ruby-ivar">@auto_rebuild</span> = <span class="ruby-keyword kw">true</span> <span class="ruby-keyword kw">unless</span> <span class="ruby-identifier">options</span>[<span class="ruby-identifier">:auto_rebuild</span>] <span class="ruby-operator">==</span> <span class="ruby-keyword kw">false</span>
|
|
16
|
-
<span class="ruby-ivar">@word_list</span>, <span class="ruby-ivar">@items</span> = <span class="ruby-constant">WordList</span>.<span class="ruby-identifier">new</span>, {}
|
|
17
|
-
<span class="ruby-ivar">@version</span>, <span class="ruby-ivar">@built_at_version</span> = <span class="ruby-value">0</span>, <span class="ruby-value">-1</span>
|
|
18
|
-
<span class="ruby-keyword kw">end</span></pre>
|
|
19
|
-
</body>
|
|
20
|
-
</html>
|