sclust 1.0.0 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/sclust/kmean/cluster.rb +294 -0
- data/lib/sclust/kmean/doccluster.rb +83 -0
- data/lib/sclust/lda/lda.rb +243 -0
- data/lib/sclust/lda/lda2.rb +328 -0
- data/lib/sclust/util/doc.rb +134 -0
- data/lib/sclust/util/doccol.rb +187 -0
- data/lib/sclust/util/filters.rb +210 -0
- data/lib/sclust/util/rss.rb +96 -0
- data/lib/sclust/util/sparse_vector.rb +96 -0
- data/lib/sclust/util/stopwords.rb +1149 -0
- data/lib/sclust/util/weightedmovingaverage.rb +25 -0
- data/lib/sclust/util/word.rb +53 -0
- data/tests/clustertest.rb +56 -29
- data/tests/filters_test.rb +48 -0
- data/tests/ldatest.rb +75 -0
- data/tests/sparse_vector_test.rb +61 -0
- data/tests/test001.rb +49 -19
- metadata +74 -40
- data/lib/sclust/cluster.rb +0 -197
- data/lib/sclust/doc.rb +0 -92
- data/lib/sclust/doccluster.rb +0 -39
- data/lib/sclust/doccol.rb +0 -75
@@ -0,0 +1,25 @@
|
|
1
|
+
#!/usr/bin/ruby
|
2
|
+
|
3
|
+
|
4
|
+
module SClust
  module Util
    # Exponentially weighted moving average over a stream of numeric samples.
    #
    # Every call to #adjust blends the new sample into the running value:
    #
    #   value = (1 - weight) * value + weight * sample
    #
    # A weight of 0 ignores new samples; a weight of 1 tracks only the
    # latest sample.
    class WeightedMovingAverage

      # weight has a custom writer (below) so that the cached complement is
      # always kept in step with it.
      attr_reader :weight
      attr_accessor :value

      # weight::        Fraction of each new sample blended in; must lie in 0..1.
      # initial_value:: Starting value of the average (defaults to 0.0).
      #
      # Raises ArgumentError when weight is outside 0..1.
      def initialize(weight, initial_value = 0.0)
        self.weight = weight
        @value      = initial_value
      end

      # Replace the blending weight.
      #
      # The complement (1 - weight) is recomputed here; leaving it untouched
      # would make #adjust combine the new weight with the old complement,
      # so the two coefficients would no longer sum to 1.
      #
      # Raises ArgumentError (a StandardError, so a plain `rescue` sees it)
      # when weight is outside 0..1.
      def weight=(weight)
        unless weight.between?(0, 1)
          raise ArgumentError, "Weight was #{weight} but must be between 0.0 and 1.0."
        end

        @weight            = weight
        @weight_compliment = 1.0 - weight
      end

      # Blend value into the running average and return the updated average.
      def adjust(value)
        @value = (@weight_compliment * @value) + (@weight * value)
      end
    end
  end
end
|
@@ -0,0 +1,53 @@
|
|
1
|
+
#
|
2
|
+
# The MIT License
|
3
|
+
#
|
4
|
+
# Copyright (c) 2010 Samuel R. Baskinger
|
5
|
+
#
|
6
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
7
|
+
# of this software and associated documentation files (the "Software"), to deal
|
8
|
+
# in the Software without restriction, including without limitation the rights
|
9
|
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
10
|
+
# copies of the Software, and to permit persons to whom the Software is
|
11
|
+
# furnished to do so, subject to the following conditions:
|
12
|
+
#
|
13
|
+
# The above copyright notice and this permission notice shall be included in
|
14
|
+
# all copies or substantial portions of the Software.
|
15
|
+
#
|
16
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
17
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
18
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
19
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
20
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
21
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
22
|
+
# THE SOFTWARE.
|
23
|
+
#
|
24
|
+
|
25
|
+
|
26
|
+
module SClust
  module Util
    # A term together with a numeric weight and a bag of caller-defined data.
    #
    # Hashing and eql? are delegated to the term text, so a Word can be used
    # to look up entries in a Hash keyed by plain strings, and two Words with
    # the same text address the same Hash slot.
    class Word

      attr_accessor :word, :weight, :data

      # word::       The term text (defaults to "").
      # weight::     Weight attached to the term (defaults to 0.0).
      # other_data:: Arbitrary extra data, exposed through #data.
      def initialize(word="", weight=0.0, other_data={})
        @word   = word
        @weight = weight
        @data   = other_data
      end

      # Return @word.
      def to_s
        @word
      end

      # Hash on the term text only; weight and data are ignored.
      def hash
        @word.hash
      end

      # True when w carries the same term text as this Word.
      #
      # w may be another Word or the bare term. A Word argument is unwrapped
      # first: handing it straight to String#eql? always yields false, which
      # left equal-hash Words unequal and broke Word-keyed Hash lookups.
      def eql?(w)
        other_text = w.is_a?(Word) ? w.word : w
        @word.eql?(other_text)
      end
    end
  end
end
|
data/tests/clustertest.rb
CHANGED
@@ -1,50 +1,77 @@
|
|
1
|
+
#
|
2
|
+
# The MIT License
|
3
|
+
#
|
4
|
+
# Copyright (c) 2010 Samuel R. Baskinger
|
5
|
+
#
|
6
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
7
|
+
# of this software and associated documentation files (the "Software"), to deal
|
8
|
+
# in the Software without restriction, including without limitation the rights
|
9
|
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
10
|
+
# copies of the Software, and to permit persons to whom the Software is
|
11
|
+
# furnished to do so, subject to the following conditions:
|
12
|
+
#
|
13
|
+
# The above copyright notice and this permission notice shall be included in
|
14
|
+
# all copies or substantial portions of the Software.
|
15
|
+
#
|
16
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
17
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
18
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
19
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
20
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
21
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
22
|
+
# THE SOFTWARE.
|
23
|
+
#
|
24
|
+
|
1
25
|
require 'test/unit'
|
2
26
|
|
3
|
-
require 'sclust/
|
27
|
+
require 'sclust/util/doccol'
|
28
|
+
require 'sclust/kmean/doccluster'
|
29
|
+
require 'sclust/util/filters'
|
30
|
+
|
31
|
+
Log4r::StderrOutputter.new('default')
|
32
|
+
Log4r::Outputter['default'].formatter = Log4r::PatternFormatter.new( :pattern => '%d %C: %m' , :date_pattern => '[%Y-%m-%d-%H:%M:%S %Z]')
|
33
|
+
Log4r::Logger.root.level = Log4r::DEBUG
|
34
|
+
Log4r::Logger.root.add( 'default' )
|
35
|
+
|
36
|
+
require 'sclust/util/doc'
|
37
|
+
|
38
|
+
|
39
|
+
#$logger = Log4r::Logger.new($0)
|
40
|
+
#$logger.add('default')
|
41
|
+
#$logger.info("Starting")
|
42
|
+
|
4
43
|
|
5
44
|
class ClusterTest < Test::Unit::TestCase
|
6
45
|
|
7
46
|
def setup()
|
8
|
-
@dc = SClust::DocumentCollection.new()
|
9
|
-
filter = SClust::NullFilter.new()
|
10
|
-
d1 = SClust::Document.new("a b c d d e a q a b", :filter=>filter, :ngrams=>[1])
|
11
|
-
d2 = SClust::Document.new("a b d e a", :filter=>filter, :ngrams=>[1])
|
12
|
-
d3 = SClust::Document.new("bob", :filter=>filter, :ngrams=>[1])
|
13
|
-
d4 = SClust::Document.new("frank a", :filter=>filter, :ngrams=>[1])
|
14
|
-
|
15
|
-
@dc + d1
|
16
|
-
@dc + d2
|
17
|
-
@dc + d3
|
18
|
-
@dc + d4
|
19
47
|
end
|
20
48
|
|
21
49
|
def teardown()
|
22
50
|
end
|
23
51
|
|
24
52
|
def test_makecluster()
|
25
|
-
|
53
|
+
filter = SClust::Util::NullFilter.new()
|
54
|
+
d1 = SClust::Util::Document.new("a b c d d e a q a b", :filter=>filter, :ngrams=>[1])
|
55
|
+
d2 = SClust::Util::Document.new("a b d e a", :filter=>filter, :ngrams=>[1])
|
56
|
+
d3 = SClust::Util::Document.new("bob", :filter=>filter, :ngrams=>[1])
|
57
|
+
d4 = SClust::Util::Document.new("frank a", :filter=>filter, :ngrams=>[1])
|
58
|
+
|
59
|
+
c = SClust::KMean::DocumentClusterer.new()
|
26
60
|
|
27
|
-
c
|
61
|
+
c << d1
|
62
|
+
c << d2
|
63
|
+
c << d3
|
64
|
+
c << d4
|
28
65
|
|
66
|
+
c.topics = 3
|
67
|
+
|
68
|
+
c.cluster
|
69
|
+
|
29
70
|
c.each_cluster do |cl|
|
30
|
-
|
31
|
-
max = 0
|
32
|
-
|
33
|
-
0.upto(cl.center.terms.length - 1) do |i|
|
34
|
-
|
35
|
-
term = cl.center.terms[i]
|
36
|
-
value = cl.center.values[i]
|
37
|
-
|
38
|
-
max = i if ( cl.center.values[i] > cl.center.values[max] )
|
39
|
-
end
|
40
|
-
|
41
|
-
puts("Cluster: #{cl.center.terms[max]} #{cl.center.values[max]}")
|
42
|
-
|
71
|
+
puts('===================================')
|
43
72
|
cl.center.get_max_terms(3).each do |t|
|
44
73
|
puts("Got Term: #{t} with value #{cl.center.get_term_value(t)}")
|
45
74
|
end
|
46
|
-
|
47
|
-
assert(cl.center.values[max] == cl.center.get_term_value(cl.center.get_max_terms(1)[0]), "Max value was not found.")
|
48
75
|
end
|
49
76
|
end
|
50
77
|
|
@@ -0,0 +1,48 @@
|
|
1
|
+
#
|
2
|
+
# The MIT License
|
3
|
+
#
|
4
|
+
# Copyright (c) 2010 Samuel R. Baskinger
|
5
|
+
#
|
6
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
7
|
+
# of this software and associated documentation files (the "Software"), to deal
|
8
|
+
# in the Software without restriction, including without limitation the rights
|
9
|
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
10
|
+
# copies of the Software, and to permit persons to whom the Software is
|
11
|
+
# furnished to do so, subject to the following conditions:
|
12
|
+
#
|
13
|
+
# The above copyright notice and this permission notice shall be included in
|
14
|
+
# all copies or substantial portions of the Software.
|
15
|
+
#
|
16
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
17
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
18
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
19
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
20
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
21
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
22
|
+
# THE SOFTWARE.
|
23
|
+
#
|
24
|
+
|
25
|
+
require 'test/unit'
|
26
|
+
|
27
|
+
require 'sclust/util/filters'
|
28
|
+
|
29
|
+
# Unit tests for the filter classes loaded from sclust/util/filters.
class DocTests < Test::Unit::TestCase

  # No fixtures are required by these tests.
  def setup; end

  def teardown; end

  # Applying the document term filter to "aba" must yield a result whose
  # original_word is still "aba".
  def test_docfilter
    term_filter = SClust::Util::DocumentTermFilter.new
    filtered    = term_filter.apply("aba")

    assert(filtered.original_word == "aba", "did not filter out a.")
  end

  # The tokenizer must return the two tokens "hi" and "bye" both for a
  # single-space separator and for a run of mixed CR/LF characters.
  def test_tokenizer
    tokenizer = SClust::Util::TokenizerFilter.new
    expected  = [ "hi", "bye" ]

    [ "hi bye", "hi \r\n\n\rbye" ].each do |text|
      assert(tokenizer.apply(text) == expected)
    end
  end
end
|
48
|
+
|
data/tests/ldatest.rb
ADDED
@@ -0,0 +1,75 @@
|
|
1
|
+
#
|
2
|
+
# The MIT License
|
3
|
+
#
|
4
|
+
# Copyright (c) 2010 Samuel R. Baskinger
|
5
|
+
#
|
6
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
7
|
+
# of this software and associated documentation files (the "Software"), to deal
|
8
|
+
# in the Software without restriction, including without limitation the rights
|
9
|
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
10
|
+
# copies of the Software, and to permit persons to whom the Software is
|
11
|
+
# furnished to do so, subject to the following conditions:
|
12
|
+
#
|
13
|
+
# The above copyright notice and this permission notice shall be included in
|
14
|
+
# all copies or substantial portions of the Software.
|
15
|
+
#
|
16
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
17
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
18
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
19
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
20
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
21
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
22
|
+
# THE SOFTWARE.
|
23
|
+
#
|
24
|
+
|
25
|
+
require 'test/unit'
|
26
|
+
|
27
|
+
require 'sclust/lda/lda2'
|
28
|
+
require 'sclust/util/doc'
|
29
|
+
require 'log4r'
|
30
|
+
|
31
|
+
Log4r::StderrOutputter.new('default')
|
32
|
+
Log4r::Outputter['default'].formatter = Log4r::PatternFormatter.new( :pattern => '%d %C: %m' , :date_pattern => '[%Y-%m-%d-%H:%M:%S %Z]')
|
33
|
+
Log4r::Logger.root.level = Log4r::DEBUG
|
34
|
+
Log4r::Logger.root.add( 'default' )
|
35
|
+
|
36
|
+
|
37
|
+
# Smoke test for SClust::LDA2::LDA2: feeds it a small corpus, runs the model
# and prints the terms of each topic. It makes no assertions; it only
# verifies that the run completes without raising.
class DocTests < Test::Unit::TestCase

  # Every document below is built with the same NullFilter instance.
  def setup
    @null_filter = SClust::Util::NullFilter.new
  end

  def teardown; end

  def test_lda_001
    lda = SClust::LDA2::LDA2.new
    lda.topics = 4

    corpus = [
      "a b 1 z ",
      "a b 2 5 ",
      "a b 3 4 ",
      "a b c d e f g",
      "d e f z",
      "g h z",
      "h i z",
      "x y 6",
      "x y 7",
      "x y 8"
    ]

    # Documents are added in the order listed above.
    corpus.each do |text|
      lda << SClust::Util::Document.new(text, :filter => @null_filter)
    end

    lda.lda(:iterations => 100)

    # Ask for up to 100 terms per topic and dump each with its weight.
    lda.get_max_terms(100).each do |topic|
      puts("---------- Topic ---------- ")

      topic.each do |words|
        puts("\t#{words.weight} - #{words.to_s}")
      end
    end
  end
end
|
75
|
+
|
@@ -0,0 +1,61 @@
|
|
1
|
+
#
|
2
|
+
# The MIT License
|
3
|
+
#
|
4
|
+
# Copyright (c) 2010 Samuel R. Baskinger
|
5
|
+
#
|
6
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
7
|
+
# of this software and associated documentation files (the "Software"), to deal
|
8
|
+
# in the Software without restriction, including without limitation the rights
|
9
|
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
10
|
+
# copies of the Software, and to permit persons to whom the Software is
|
11
|
+
# furnished to do so, subject to the following conditions:
|
12
|
+
#
|
13
|
+
# The above copyright notice and this permission notice shall be included in
|
14
|
+
# all copies or substantial portions of the Software.
|
15
|
+
#
|
16
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
17
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
18
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
19
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
20
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
21
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
22
|
+
# THE SOFTWARE.
|
23
|
+
#
|
24
|
+
|
25
|
+
require 'test/unit'

# The sparse vector source ships under lib/sclust/util/, alongside the other
# utility files; requiring 'sclust/sparse_vector' raises LoadError.
require 'sclust/util/sparse_vector'

# Unit tests for the sparse labeled vector.
#
# NOTE(review): the class is referenced as SClust::SparseLabeledVector, while
# other files under util/ use the SClust::Util namespace. The defining file is
# not visible here -- confirm the constant name against sparse_vector.rb.
class ClusterTest < Test::Unit::TestCase

  def setup()
  end

  def teardown()
  end

  # Exercises default-value handling, key/label mapping and deletion.
  def test_spvec01()
    # The constructor argument (0) is the vector's default value.
    sp = SClust::SparseLabeledVector.new(0)

    # Two of these three writes assign the default value; the length
    # assertion below expects that only the non-default one is stored.
    sp[5] = 0
    sp.store(0, 1, "bye")
    sp.store(2, 0, "hi")

    assert(sp[0] == 1, "Could not define value.")

    assert(sp[1] == 0, "Default value not returned for unknown keys.")

    assert(sp.length == 1, "Data size was #{sp.length} instead of 1.  Assigning default value may have accidentally stored the default value.")

    assert(sp.key_map[0] == "bye", "Could not find map from key 0 to label \"bye\"")

    assert(sp.label_map["bye"] == 0, "Could not find map from label \"bye\" to key 0")

    # Key 0 holds a value; key 1 was never stored, so the second delete
    # targets an absent key.
    sp.delete(0)
    sp.delete(1)

    assert(sp[0] == 0, "Default value not returned for deleted key.")

  end

end
|
data/tests/test001.rb
CHANGED
@@ -1,5 +1,30 @@
|
|
1
|
-
|
2
|
-
|
1
|
+
#
|
2
|
+
# The MIT License
|
3
|
+
#
|
4
|
+
# Copyright (c) 2010 Samuel R. Baskinger
|
5
|
+
#
|
6
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
7
|
+
# of this software and associated documentation files (the "Software"), to deal
|
8
|
+
# in the Software without restriction, including without limitation the rights
|
9
|
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
10
|
+
# copies of the Software, and to permit persons to whom the Software is
|
11
|
+
# furnished to do so, subject to the following conditions:
|
12
|
+
#
|
13
|
+
# The above copyright notice and this permission notice shall be included in
|
14
|
+
# all copies or substantial portions of the Software.
|
15
|
+
#
|
16
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
17
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
18
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
19
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
20
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
21
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
22
|
+
# THE SOFTWARE.
|
23
|
+
#
|
24
|
+
|
25
|
+
require 'sclust/util/doc'
|
26
|
+
require 'sclust/util/doccol'
|
27
|
+
require 'sclust/util/filters'
|
3
28
|
require 'test/unit'
|
4
29
|
|
5
30
|
|
@@ -12,11 +37,11 @@ class DocTests < Test::Unit::TestCase
|
|
12
37
|
#end
|
13
38
|
|
14
39
|
def test_builddoc
|
15
|
-
d = SClust::Document.new("hi, this is a nice doc! Yup. Oh? A very nice doc, indeed.")
|
40
|
+
d = SClust::Util::Document.new("hi, this is a nice doc! Yup. Oh? A very nice doc, indeed.")
|
16
41
|
|
17
42
|
d.terms.each do |k,v|
|
18
|
-
assert(k != ".", "Period found")
|
19
|
-
assert(k != "", "Empty term found")
|
43
|
+
assert(k.original_word != ".", "Period found")
|
44
|
+
assert(k.original_word != "", "Empty term found")
|
20
45
|
#puts("#{k}=#{v}")
|
21
46
|
end
|
22
47
|
|
@@ -27,23 +52,28 @@ end
|
|
27
52
|
class DocCollectionTests < Test::Unit::TestCase
|
28
53
|
|
29
54
|
def test_collectionadd()
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
55
|
+
filter = SClust::Util::NullFilter.new()
|
56
|
+
dc = SClust::KMean::DocumentCollection.new()
|
57
|
+
d1 = SClust::Util::Document.new("a b c d d e a q a b", :filter=>filter, :ngrams => [1])
|
58
|
+
d2 = SClust::Util::Document.new("a b d e a", :filter=>filter, :ngrams => [1])
|
59
|
+
d3 = SClust::Util::Document.new("bob", :filter=>filter, :ngrams => [1])
|
60
|
+
d4 = SClust::Util::Document.new("frank a", :filter=>filter, :ngrams => [1])
|
35
61
|
|
36
|
-
dc
|
37
|
-
dc
|
38
|
-
dc
|
39
|
-
dc
|
62
|
+
dc << d1
|
63
|
+
dc << d2
|
64
|
+
dc << d3
|
65
|
+
dc << d4
|
40
66
|
|
41
67
|
dc.terms.each do |k,v|
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
68
|
+
if k == "a"
|
69
|
+
assert(v == 6, "A appers in #{v} documents out of 4.")
|
70
|
+
assert(dc.idf("a") > 0.2, "Known value for a")
|
71
|
+
assert(dc.idf("a") < 0.3, "Known value for a")
|
72
|
+
end
|
47
73
|
end
|
74
|
+
|
75
|
+
print("TERMS: ")
|
76
|
+
d1.words.each { |w| print "#{w}, " }
|
77
|
+
assert(d1.tf('a') * d1.words.size == 3)
|
48
78
|
end
|
49
79
|
end
|