sclust 1.0.0 → 2.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/sclust/kmean/cluster.rb +294 -0
- data/lib/sclust/kmean/doccluster.rb +83 -0
- data/lib/sclust/lda/lda.rb +243 -0
- data/lib/sclust/lda/lda2.rb +328 -0
- data/lib/sclust/util/doc.rb +134 -0
- data/lib/sclust/util/doccol.rb +187 -0
- data/lib/sclust/util/filters.rb +210 -0
- data/lib/sclust/util/rss.rb +96 -0
- data/lib/sclust/util/sparse_vector.rb +96 -0
- data/lib/sclust/util/stopwords.rb +1149 -0
- data/lib/sclust/util/weightedmovingaverage.rb +25 -0
- data/lib/sclust/util/word.rb +53 -0
- data/tests/clustertest.rb +56 -29
- data/tests/filters_test.rb +48 -0
- data/tests/ldatest.rb +75 -0
- data/tests/sparse_vector_test.rb +61 -0
- data/tests/test001.rb +49 -19
- metadata +74 -40
- data/lib/sclust/cluster.rb +0 -197
- data/lib/sclust/doc.rb +0 -92
- data/lib/sclust/doccluster.rb +0 -39
- data/lib/sclust/doccol.rb +0 -75
@@ -0,0 +1,25 @@
|
|
1
|
+
#!/usr/bin/ruby
|
2
|
+
|
3
|
+
|
4
|
+
module SClust
|
5
|
+
module Util
|
6
|
+
# Running average in which each new sample is blended into the current
# value using a fixed weight:
#
#   value = (1 - weight) * value + weight * sample
#
# A weight near 1.0 makes the average follow new samples closely; a weight
# near 0.0 makes it change slowly.
class WeightedMovingAverage

  # weight - the blend factor applied to each new sample (0.0..1.0).
  attr_reader :weight

  # value - the current average; may be read or overwritten directly.
  attr_accessor :value

  # Build an average.
  #
  # weight        - blend factor for new samples; must be within 0.0..1.0.
  # initial_value - starting value of the average (defaults to 0.0).
  #
  # Raises ArgumentError when weight is outside 0.0..1.0.
  def initialize(weight, initial_value = 0.0)
    self.weight = weight
    @value = initial_value
  end

  # Change the blend factor.
  #
  # The complement (1 - weight) is cached for use by #adjust, so it has to
  # be recomputed here; otherwise the two factors would stop summing to 1
  # after a weight change. The same range check as the constructor applies.
  #
  # Raises ArgumentError when weight is outside 0.0..1.0. ArgumentError is
  # an Exception, so handlers that rescue Exception still catch it.
  def weight=(weight)
    if weight > 1 || weight < 0
      raise ArgumentError, "Weight was #{weight} but must be between 0.0 and 1.0."
    end

    @weight = weight
    @weight_compliment = 1.0 - weight
  end

  # Blend a new sample into the average.
  #
  # value - the new sample.
  #
  # Returns the updated average.
  def adjust(value)
    @value = (@weight_compliment * @value) + (@weight * value)
  end
end
|
24
|
+
end
|
25
|
+
end
|
@@ -0,0 +1,53 @@
|
|
1
|
+
#
|
2
|
+
# The MIT License
|
3
|
+
#
|
4
|
+
# Copyright (c) 2010 Samuel R. Baskinger
|
5
|
+
#
|
6
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
7
|
+
# of this software and associated documentation files (the "Software"), to deal
|
8
|
+
# in the Software without restriction, including without limitation the rights
|
9
|
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
10
|
+
# copies of the Software, and to permit persons to whom the Software is
|
11
|
+
# furnished to do so, subject to the following conditions:
|
12
|
+
#
|
13
|
+
# The above copyright notice and this permission notice shall be included in
|
14
|
+
# all copies or substantial portions of the Software.
|
15
|
+
#
|
16
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
17
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
18
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
19
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
20
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
21
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
22
|
+
# THE SOFTWARE.
|
23
|
+
#
|
24
|
+
|
25
|
+
|
26
|
+
module SClust
|
27
|
+
module Util
|
28
|
+
# A term together with a numeric weight and an open-ended bag of extra data.
#
# Hashing and #eql? are based only on the term text, so Word instances with
# the same text act as the same Hash key regardless of weight or data.
class Word

  # word   - the term text.
  # weight - numeric weight attached to the term.
  # data   - caller-defined extra data.
  attr_accessor :word, :weight, :data

  # word       - the term text (defaults to "").
  # weight     - numeric weight (defaults to 0.0).
  # other_data - caller-defined extra data (defaults to a new empty Hash).
  def initialize(word = "", weight = 0.0, other_data = {})
    @word = word
    @weight = weight
    @data = other_data
  end

  # Return @word.
  def to_s
    @word
  end

  # Hash of the term text only, so it agrees with #eql?.
  def hash
    @word.hash
  end

  # Compare the term text with another Word or with a bare value.
  #
  # w - a Word (its term text is compared) or any other object (compared
  #     directly against the term text, e.g. a String).
  #
  # A Word argument is unwrapped first: handing the Word itself to the
  # text's eql? can never match, which would leave two Words with the same
  # text as distinct Hash keys even though their hashes are equal.
  #
  # Returns true when the term text is eql? to the other side.
  def eql?(w)
    other = w.is_a?(Word) ? w.word : w
    @word.eql?(other)
  end
end
|
52
|
+
end
|
53
|
+
end
|
data/tests/clustertest.rb
CHANGED
@@ -1,50 +1,77 @@
|
|
1
|
+
#
|
2
|
+
# The MIT License
|
3
|
+
#
|
4
|
+
# Copyright (c) 2010 Samuel R. Baskinger
|
5
|
+
#
|
6
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
7
|
+
# of this software and associated documentation files (the "Software"), to deal
|
8
|
+
# in the Software without restriction, including without limitation the rights
|
9
|
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
10
|
+
# copies of the Software, and to permit persons to whom the Software is
|
11
|
+
# furnished to do so, subject to the following conditions:
|
12
|
+
#
|
13
|
+
# The above copyright notice and this permission notice shall be included in
|
14
|
+
# all copies or substantial portions of the Software.
|
15
|
+
#
|
16
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
17
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
18
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
19
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
20
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
21
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
22
|
+
# THE SOFTWARE.
|
23
|
+
#
|
24
|
+
|
1
25
|
require 'test/unit'
|
2
26
|
|
3
|
-
require 'sclust/
|
27
|
+
require 'sclust/util/doccol'
|
28
|
+
require 'sclust/kmean/doccluster'
|
29
|
+
require 'sclust/util/filters'
|
30
|
+
|
31
|
+
Log4r::StderrOutputter.new('default')
|
32
|
+
Log4r::Outputter['default'].formatter = Log4r::PatternFormatter.new( :pattern => '%d %C: %m' , :date_pattern => '[%Y-%m-%d-%H:%M:%S %Z]')
|
33
|
+
Log4r::Logger.root.level = Log4r::DEBUG
|
34
|
+
Log4r::Logger.root.add( 'default' )
|
35
|
+
|
36
|
+
require 'sclust/util/doc'
|
37
|
+
|
38
|
+
|
39
|
+
#$logger = Log4r::Logger.new($0)
|
40
|
+
#$logger.add('default')
|
41
|
+
#$logger.info("Starting")
|
42
|
+
|
4
43
|
|
5
44
|
class ClusterTest < Test::Unit::TestCase
|
6
45
|
|
7
46
|
def setup()
|
8
|
-
@dc = SClust::DocumentCollection.new()
|
9
|
-
filter = SClust::NullFilter.new()
|
10
|
-
d1 = SClust::Document.new("a b c d d e a q a b", :filter=>filter, :ngrams=>[1])
|
11
|
-
d2 = SClust::Document.new("a b d e a", :filter=>filter, :ngrams=>[1])
|
12
|
-
d3 = SClust::Document.new("bob", :filter=>filter, :ngrams=>[1])
|
13
|
-
d4 = SClust::Document.new("frank a", :filter=>filter, :ngrams=>[1])
|
14
|
-
|
15
|
-
@dc + d1
|
16
|
-
@dc + d2
|
17
|
-
@dc + d3
|
18
|
-
@dc + d4
|
19
47
|
end
|
20
48
|
|
21
49
|
def teardown()
|
22
50
|
end
|
23
51
|
|
24
52
|
def test_makecluster()
|
25
|
-
|
53
|
+
filter = SClust::Util::NullFilter.new()
|
54
|
+
d1 = SClust::Util::Document.new("a b c d d e a q a b", :filter=>filter, :ngrams=>[1])
|
55
|
+
d2 = SClust::Util::Document.new("a b d e a", :filter=>filter, :ngrams=>[1])
|
56
|
+
d3 = SClust::Util::Document.new("bob", :filter=>filter, :ngrams=>[1])
|
57
|
+
d4 = SClust::Util::Document.new("frank a", :filter=>filter, :ngrams=>[1])
|
58
|
+
|
59
|
+
c = SClust::KMean::DocumentClusterer.new()
|
26
60
|
|
27
|
-
c
|
61
|
+
c << d1
|
62
|
+
c << d2
|
63
|
+
c << d3
|
64
|
+
c << d4
|
28
65
|
|
66
|
+
c.topics = 3
|
67
|
+
|
68
|
+
c.cluster
|
69
|
+
|
29
70
|
c.each_cluster do |cl|
|
30
|
-
|
31
|
-
max = 0
|
32
|
-
|
33
|
-
0.upto(cl.center.terms.length - 1) do |i|
|
34
|
-
|
35
|
-
term = cl.center.terms[i]
|
36
|
-
value = cl.center.values[i]
|
37
|
-
|
38
|
-
max = i if ( cl.center.values[i] > cl.center.values[max] )
|
39
|
-
end
|
40
|
-
|
41
|
-
puts("Cluster: #{cl.center.terms[max]} #{cl.center.values[max]}")
|
42
|
-
|
71
|
+
puts('===================================')
|
43
72
|
cl.center.get_max_terms(3).each do |t|
|
44
73
|
puts("Got Term: #{t} with value #{cl.center.get_term_value(t)}")
|
45
74
|
end
|
46
|
-
|
47
|
-
assert(cl.center.values[max] == cl.center.get_term_value(cl.center.get_max_terms(1)[0]), "Max value was not found.")
|
48
75
|
end
|
49
76
|
end
|
50
77
|
|
@@ -0,0 +1,48 @@
|
|
1
|
+
#
|
2
|
+
# The MIT License
|
3
|
+
#
|
4
|
+
# Copyright (c) 2010 Samuel R. Baskinger
|
5
|
+
#
|
6
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
7
|
+
# of this software and associated documentation files (the "Software"), to deal
|
8
|
+
# in the Software without restriction, including without limitation the rights
|
9
|
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
10
|
+
# copies of the Software, and to permit persons to whom the Software is
|
11
|
+
# furnished to do so, subject to the following conditions:
|
12
|
+
#
|
13
|
+
# The above copyright notice and this permission notice shall be included in
|
14
|
+
# all copies or substantial portions of the Software.
|
15
|
+
#
|
16
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
17
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
18
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
19
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
20
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
21
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
22
|
+
# THE SOFTWARE.
|
23
|
+
#
|
24
|
+
|
25
|
+
require 'test/unit'
|
26
|
+
|
27
|
+
require 'sclust/util/filters'
|
28
|
+
|
29
|
+
# Unit tests for the term and tokenizer filters in SClust::Util.
class DocTests < Test::Unit::TestCase

  # No fixtures are needed.
  def setup
  end

  # Nothing to clean up.
  def teardown
  end

  # Applying the document term filter to "aba" should report "aba" as the
  # original word.
  def test_docfilter
    term_filter = SClust::Util::DocumentTermFilter.new
    filtered = term_filter.apply("aba")

    assert(filtered.original_word == "aba", "did not filter out a.")
  end

  # The tokenizer should split on a single space and on mixed runs of
  # spaces, carriage returns and newlines alike.
  def test_tokenizer
    tokenizer = SClust::Util::TokenizerFilter.new
    expected_tokens = [ "hi", "bye" ]

    assert(tokenizer.apply("hi bye") == expected_tokens)
    assert(tokenizer.apply("hi \r\n\n\rbye") == expected_tokens)
  end
end
|
48
|
+
|
data/tests/ldatest.rb
ADDED
@@ -0,0 +1,75 @@
|
|
1
|
+
#
|
2
|
+
# The MIT License
|
3
|
+
#
|
4
|
+
# Copyright (c) 2010 Samuel R. Baskinger
|
5
|
+
#
|
6
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
7
|
+
# of this software and associated documentation files (the "Software"), to deal
|
8
|
+
# in the Software without restriction, including without limitation the rights
|
9
|
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
10
|
+
# copies of the Software, and to permit persons to whom the Software is
|
11
|
+
# furnished to do so, subject to the following conditions:
|
12
|
+
#
|
13
|
+
# The above copyright notice and this permission notice shall be included in
|
14
|
+
# all copies or substantial portions of the Software.
|
15
|
+
#
|
16
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
17
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
18
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
19
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
20
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
21
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
22
|
+
# THE SOFTWARE.
|
23
|
+
#
|
24
|
+
|
25
|
+
require 'test/unit'
|
26
|
+
|
27
|
+
require 'sclust/lda/lda2'
|
28
|
+
require 'sclust/util/doc'
|
29
|
+
require 'log4r'
|
30
|
+
|
31
|
+
Log4r::StderrOutputter.new('default')
|
32
|
+
Log4r::Outputter['default'].formatter = Log4r::PatternFormatter.new( :pattern => '%d %C: %m' , :date_pattern => '[%Y-%m-%d-%H:%M:%S %Z]')
|
33
|
+
Log4r::Logger.root.level = Log4r::DEBUG
|
34
|
+
Log4r::Logger.root.add( 'default' )
|
35
|
+
|
36
|
+
|
37
|
+
# Smoke test for SClust::LDA2::LDA2: feeds a small corpus, runs the sampler
# and prints the terms reported for each topic. It makes no assertions.
class DocTests < Test::Unit::TestCase

  # Every document in this test is built with the null filter.
  def setup
    @null_filter = SClust::Util::NullFilter.new
  end

  # Nothing to clean up.
  def teardown
  end

  # Build ten short documents, run 100 iterations over 4 topics and dump
  # the resulting terms per topic to stdout.
  def test_lda_001
    lda = SClust::LDA2::LDA2.new
    lda.topics = 4

    # Corpus texts, added in this order.
    corpus = [
      "a b 1 z ",
      "a b 2 5 ",
      "a b 3 4 ",
      "a b c d e f g",
      "d e f z",
      "g h z",
      "h i z",
      "x y 6",
      "x y 7",
      "x y 8"
    ]

    corpus.each do |text|
      lda << SClust::Util::Document.new(text, :filter => @null_filter)
    end

    lda.lda(:iterations => 100)

    lda.get_max_terms(100).each do |topic_terms|
      puts("---------- Topic ---------- ")

      topic_terms.each do |term|
        puts("\t#{term.weight} - #{term.to_s}")
      end
    end
  end
end
|
75
|
+
|
@@ -0,0 +1,61 @@
|
|
1
|
+
#
|
2
|
+
# The MIT License
|
3
|
+
#
|
4
|
+
# Copyright (c) 2010 Samuel R. Baskinger
|
5
|
+
#
|
6
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
7
|
+
# of this software and associated documentation files (the "Software"), to deal
|
8
|
+
# in the Software without restriction, including without limitation the rights
|
9
|
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
10
|
+
# copies of the Software, and to permit persons to whom the Software is
|
11
|
+
# furnished to do so, subject to the following conditions:
|
12
|
+
#
|
13
|
+
# The above copyright notice and this permission notice shall be included in
|
14
|
+
# all copies or substantial portions of the Software.
|
15
|
+
#
|
16
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
17
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
18
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
19
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
20
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
21
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
22
|
+
# THE SOFTWARE.
|
23
|
+
#
|
24
|
+
|
25
|
+
require 'test/unit'
|
26
|
+
|
27
|
+
require 'sclust/sparse_vector'
|
28
|
+
|
29
|
+
# Unit test for SClust::SparseLabeledVector.
#
# NOTE(review): the gem ships lib/sclust/util/sparse_vector.rb, while this
# file requires 'sclust/sparse_vector' and uses SClust::SparseLabeledVector.
# Confirm that the require path and namespace still resolve.
class ClusterTest < Test::Unit::TestCase

  # No fixtures are needed.
  def setup
  end

  # Nothing to clean up.
  def teardown
  end

  # Covers storing values, default-value reads, the key/label maps and
  # deletion on a vector whose default value is 0.
  def test_spvec01
    vector = SClust::SparseLabeledVector.new(0)

    # Two of these three writes store the default value (0). The length
    # assertion below expects only the non-default entry to be counted.
    vector[5] = 0
    vector.store(0, 1, "bye")
    vector.store(2, 0, "hi")

    assert(vector[0] == 1, "Could not define value.")

    assert(vector[1] == 0, "Default value not returned for unknown keys.")

    assert(vector.length == 1, "Data size was #{vector.length} instead of 1. Assigning default value may have accidentally stored the default value.")

    assert(vector.key_map[0] == "bye", "Could not find map from key 0 to label \"bye\"")

    assert(vector.label_map["bye"] == 0, "Could not find map from label \"bye\" to key 0")

    # Key 1 was never stored; deleting it is part of the scenario.
    vector.delete(0)
    vector.delete(1)

    assert(vector[0] == 0, "Default value not returned for deleted key.")
  end
end
|
data/tests/test001.rb
CHANGED
@@ -1,5 +1,30 @@
|
|
1
|
-
|
2
|
-
|
1
|
+
#
|
2
|
+
# The MIT License
|
3
|
+
#
|
4
|
+
# Copyright (c) 2010 Samuel R. Baskinger
|
5
|
+
#
|
6
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
7
|
+
# of this software and associated documentation files (the "Software"), to deal
|
8
|
+
# in the Software without restriction, including without limitation the rights
|
9
|
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
10
|
+
# copies of the Software, and to permit persons to whom the Software is
|
11
|
+
# furnished to do so, subject to the following conditions:
|
12
|
+
#
|
13
|
+
# The above copyright notice and this permission notice shall be included in
|
14
|
+
# all copies or substantial portions of the Software.
|
15
|
+
#
|
16
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
17
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
18
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
19
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
20
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
21
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
22
|
+
# THE SOFTWARE.
|
23
|
+
#
|
24
|
+
|
25
|
+
require 'sclust/util/doc'
|
26
|
+
require 'sclust/util/doccol'
|
27
|
+
require 'sclust/util/filters'
|
3
28
|
require 'test/unit'
|
4
29
|
|
5
30
|
|
@@ -12,11 +37,11 @@ class DocTests < Test::Unit::TestCase
|
|
12
37
|
#end
|
13
38
|
|
14
39
|
def test_builddoc
|
15
|
-
d = SClust::Document.new("hi, this is a nice doc! Yup. Oh? A very nice doc, indeed.")
|
40
|
+
d = SClust::Util::Document.new("hi, this is a nice doc! Yup. Oh? A very nice doc, indeed.")
|
16
41
|
|
17
42
|
d.terms.each do |k,v|
|
18
|
-
assert(k != ".", "Period found")
|
19
|
-
assert(k != "", "Empty term found")
|
43
|
+
assert(k.original_word != ".", "Period found")
|
44
|
+
assert(k.original_word != "", "Empty term found")
|
20
45
|
#puts("#{k}=#{v}")
|
21
46
|
end
|
22
47
|
|
@@ -27,23 +52,28 @@ end
|
|
27
52
|
class DocCollectionTests < Test::Unit::TestCase
|
28
53
|
|
29
54
|
def test_collectionadd()
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
55
|
+
filter = SClust::Util::NullFilter.new()
|
56
|
+
dc = SClust::KMean::DocumentCollection.new()
|
57
|
+
d1 = SClust::Util::Document.new("a b c d d e a q a b", :filter=>filter, :ngrams => [1])
|
58
|
+
d2 = SClust::Util::Document.new("a b d e a", :filter=>filter, :ngrams => [1])
|
59
|
+
d3 = SClust::Util::Document.new("bob", :filter=>filter, :ngrams => [1])
|
60
|
+
d4 = SClust::Util::Document.new("frank a", :filter=>filter, :ngrams => [1])
|
35
61
|
|
36
|
-
dc
|
37
|
-
dc
|
38
|
-
dc
|
39
|
-
dc
|
62
|
+
dc << d1
|
63
|
+
dc << d2
|
64
|
+
dc << d3
|
65
|
+
dc << d4
|
40
66
|
|
41
67
|
dc.terms.each do |k,v|
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
68
|
+
if k == "a"
|
69
|
+
assert(v == 6, "A appers in #{v} documents out of 4.")
|
70
|
+
assert(dc.idf("a") > 0.2, "Known value for a")
|
71
|
+
assert(dc.idf("a") < 0.3, "Known value for a")
|
72
|
+
end
|
47
73
|
end
|
74
|
+
|
75
|
+
print("TERMS: ")
|
76
|
+
d1.words.each { |w| print "#{w}, " }
|
77
|
+
assert(d1.tf('a') * d1.words.size == 3)
|
48
78
|
end
|
49
79
|
end
|