sclust 1.0.0 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,25 @@
1
+ #!/usr/bin/ruby
2
+
3
+
4
module SClust
  module Util
    # Exponentially weighted moving average.
    #
    # Every call to #adjust blends a new sample into the running value:
    #
    #   value = (1.0 - weight) * value + weight * sample
    #
    # A weight of 0.0 therefore ignores new samples, and a weight of 1.0
    # replaces the running value with the most recent sample.
    class WeightedMovingAverage
      attr_reader :weight
      attr_accessor :value

      # weight::        Share of each new sample blended into the average;
      #                 must lie between 0.0 and 1.0 inclusive.
      # initial_value:: Value of the average before any sample is seen.
      #
      # Raises ArgumentError when weight is outside 0.0..1.0.
      def initialize(weight, initial_value = 0.0)
        self.weight = weight
        @value = initial_value
      end

      # Replaces the blending weight, applying the same range check as the
      # constructor.
      #
      # Raises ArgumentError when weight is outside 0.0..1.0. ArgumentError
      # is still an Exception, so callers rescuing Exception keep working.
      def weight=(weight)
        if weight > 1 || weight < 0
          raise ArgumentError, "Weight was #{weight} but must be between 0.0 and 1.0."
        end

        @weight = weight
      end

      # Blends +value+ into the running average and returns the new average.
      #
      # The complement (1.0 - weight) is computed here rather than cached, so
      # a weight changed through #weight= takes effect on the next sample.
      def adjust(value)
        @value = ((1.0 - @weight) * @value) + (@weight * value)
      end
    end
  end
end
@@ -0,0 +1,53 @@
1
+ #
2
+ # The MIT License
3
+ #
4
+ # Copyright (c) 2010 Samuel R. Baskinger
5
+ #
6
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
7
+ # of this software and associated documentation files (the "Software"), to deal
8
+ # in the Software without restriction, including without limitation the rights
9
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ # copies of the Software, and to permit persons to whom the Software is
11
+ # furnished to do so, subject to the following conditions:
12
+ #
13
+ # The above copyright notice and this permission notice shall be included in
14
+ # all copies or substantial portions of the Software.
15
+ #
16
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22
+ # THE SOFTWARE.
23
+ #
24
+
25
+
26
module SClust
  module Util
    # A word paired with a numeric weight and a bag of arbitrary extra data.
    #
    # Hashing and #eql? are based on the word text only, so a Word can be
    # used as a Hash key and found again with an equal Word or, when the
    # Word is the lookup key, with entries stored under the plain text.
    # Object#== is left untouched.
    class Word
      attr_accessor :word, :weight, :data

      # word::       The text of the word.
      # weight::     Weight associated with the word.
      # other_data:: Any additional data the caller wants to carry along.
      def initialize(word = "", weight = 0.0, other_data = {})
        @word = word
        @weight = weight
        @data = other_data
      end

      # Return @word.
      def to_s
        @word
      end

      # Hash of the word text, so equal words land in the same Hash bucket.
      def hash
        @word.hash
      end

      # True when +w+ carries the same word text. +w+ may be another Word
      # (anything responding to #word) or the bare text itself. Comparing
      # the text directly against a Word object would always be false,
      # which made Word-keyed Hash lookups with an equal Word miss.
      def eql?(w)
        other_text = w.respond_to?(:word) ? w.word : w
        @word.eql?(other_text)
      end
    end
  end
end
@@ -1,50 +1,77 @@
1
+ #
2
+ # The MIT License
3
+ #
4
+ # Copyright (c) 2010 Samuel R. Baskinger
5
+ #
6
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
7
+ # of this software and associated documentation files (the "Software"), to deal
8
+ # in the Software without restriction, including without limitation the rights
9
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ # copies of the Software, and to permit persons to whom the Software is
11
+ # furnished to do so, subject to the following conditions:
12
+ #
13
+ # The above copyright notice and this permission notice shall be included in
14
+ # all copies or substantial portions of the Software.
15
+ #
16
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22
+ # THE SOFTWARE.
23
+ #
24
+
1
25
  require 'test/unit'
2
26
 
3
- require 'sclust/doccluster'
27
+ require 'sclust/util/doccol'
28
+ require 'sclust/kmean/doccluster'
29
+ require 'sclust/util/filters'
30
+
31
# Logging setup for this test file: create a Log4r stderr outputter named
# 'default', give it a timestamped pattern formatter, set the root logger
# level to DEBUG and attach the 'default' outputter to the root logger.
# NOTE(review): no explicit require 'log4r' is visible among this file's
# requires; presumably one of the sclust requires above loads it - confirm.
+ Log4r::StderrOutputter.new('default')
32
+ Log4r::Outputter['default'].formatter = Log4r::PatternFormatter.new( :pattern => '%d %C: %m' , :date_pattern => '[%Y-%m-%d-%H:%M:%S %Z]')
33
+ Log4r::Logger.root.level = Log4r::DEBUG
34
+ Log4r::Logger.root.add( 'default' )
35
+
36
+ require 'sclust/util/doc'
37
+
38
+
39
+ #$logger = Log4r::Logger.new($0)
40
+ #$logger.add('default')
41
+ #$logger.info("Starting")
42
+
4
43
 
5
44
  # ClusterTest hunk (old lines marked "-", new lines marked "+").
  # In the new revision setup and teardown are empty; test_makecluster builds
  # four unigram documents with a NullFilter, appends them to a
  # SClust::KMean::DocumentClusterer with <<, sets topics to 3, runs cluster,
  # and prints the top three terms of every cluster center. The old
  # "Max value was not found." assertion is removed, so the new test makes
  # no assertions of its own.
  # NOTE(review): no closing `end` for the class is visible in this hunk -
  # confirm against the full file.
  class ClusterTest < Test::Unit::TestCase
6
45
 
7
46
  def setup()
8
- @dc = SClust::DocumentCollection.new()
9
- filter = SClust::NullFilter.new()
10
- d1 = SClust::Document.new("a b c d d e a q a b", :filter=>filter, :ngrams=>[1])
11
- d2 = SClust::Document.new("a b d e a", :filter=>filter, :ngrams=>[1])
12
- d3 = SClust::Document.new("bob", :filter=>filter, :ngrams=>[1])
13
- d4 = SClust::Document.new("frank a", :filter=>filter, :ngrams=>[1])
14
-
15
- @dc + d1
16
- @dc + d2
17
- @dc + d3
18
- @dc + d4
19
47
  end
20
48
 
21
49
  def teardown()
22
50
  end
23
51
 
24
52
  def test_makecluster()
25
- c = SClust::DocumentClusterer.new(@dc)
53
+ filter = SClust::Util::NullFilter.new()
54
+ d1 = SClust::Util::Document.new("a b c d d e a q a b", :filter=>filter, :ngrams=>[1])
55
+ d2 = SClust::Util::Document.new("a b d e a", :filter=>filter, :ngrams=>[1])
56
+ d3 = SClust::Util::Document.new("bob", :filter=>filter, :ngrams=>[1])
57
+ d4 = SClust::Util::Document.new("frank a", :filter=>filter, :ngrams=>[1])
58
+
59
+ c = SClust::KMean::DocumentClusterer.new()
26
60
 
27
- c.cluster
61
+ c << d1
62
+ c << d2
63
+ c << d3
64
+ c << d4
28
65
 
66
+ c.topics = 3
67
+
68
+ c.cluster
69
+
29
70
  c.each_cluster do |cl|
30
-
31
- max = 0
32
-
33
- 0.upto(cl.center.terms.length - 1) do |i|
34
-
35
- term = cl.center.terms[i]
36
- value = cl.center.values[i]
37
-
38
- max = i if ( cl.center.values[i] > cl.center.values[max] )
39
- end
40
-
41
- puts("Cluster: #{cl.center.terms[max]} #{cl.center.values[max]}")
42
-
71
+ puts('===================================')
43
72
  cl.center.get_max_terms(3).each do |t|
44
73
  puts("Got Term: #{t} with value #{cl.center.get_term_value(t)}")
45
74
  end
46
-
47
- assert(cl.center.values[max] == cl.center.get_term_value(cl.center.get_max_terms(1)[0]), "Max value was not found.")
48
75
  end
49
76
  end
50
77
 
@@ -0,0 +1,48 @@
1
+ #
2
+ # The MIT License
3
+ #
4
+ # Copyright (c) 2010 Samuel R. Baskinger
5
+ #
6
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
7
+ # of this software and associated documentation files (the "Software"), to deal
8
+ # in the Software without restriction, including without limitation the rights
9
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ # copies of the Software, and to permit persons to whom the Software is
11
+ # furnished to do so, subject to the following conditions:
12
+ #
13
+ # The above copyright notice and this permission notice shall be included in
14
+ # all copies or substantial portions of the Software.
15
+ #
16
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22
+ # THE SOFTWARE.
23
+ #
24
+
25
+ require 'test/unit'
26
+
27
+ require 'sclust/util/filters'
28
+
29
# Tests for the term filter and the tokenizer filter in SClust::Util.
class DocTests < Test::Unit::TestCase
  # No fixtures are required by these tests.
  def setup; end

  def teardown; end

  # The object returned by DocumentTermFilter#apply must report the
  # untouched input as its original word.
  def test_docfilter
    term_filter = SClust::Util::DocumentTermFilter.new
    filtered = term_filter.apply("aba")

    assert(filtered.original_word == "aba", "did not filter out a.")
  end

  # Both a single space and a run of mixed CR/LF whitespace must split the
  # input into the same two tokens.
  def test_tokenizer
    tokenizer = SClust::Util::TokenizerFilter.new

    ["hi bye", "hi \r\n\n\rbye"].each do |text|
      assert(tokenizer.apply(text) == ["hi", "bye"])
    end
  end
end
48
+
@@ -0,0 +1,75 @@
1
+ #
2
+ # The MIT License
3
+ #
4
+ # Copyright (c) 2010 Samuel R. Baskinger
5
+ #
6
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
7
+ # of this software and associated documentation files (the "Software"), to deal
8
+ # in the Software without restriction, including without limitation the rights
9
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ # copies of the Software, and to permit persons to whom the Software is
11
+ # furnished to do so, subject to the following conditions:
12
+ #
13
+ # The above copyright notice and this permission notice shall be included in
14
+ # all copies or substantial portions of the Software.
15
+ #
16
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22
+ # THE SOFTWARE.
23
+ #
24
+
25
+ require 'test/unit'
26
+
27
+ require 'sclust/lda/lda2'
28
+ require 'sclust/util/doc'
29
+ require 'log4r'
30
+
31
# Logging setup for this test file: create a Log4r stderr outputter named
# 'default', give it a timestamped pattern formatter, set the root logger
# level to DEBUG and attach the 'default' outputter to the root logger.
+ Log4r::StderrOutputter.new('default')
32
+ Log4r::Outputter['default'].formatter = Log4r::PatternFormatter.new( :pattern => '%d %C: %m' , :date_pattern => '[%Y-%m-%d-%H:%M:%S %Z]')
33
+ Log4r::Logger.root.level = Log4r::DEBUG
34
+ Log4r::Logger.root.add( 'default' )
35
+
36
+
37
# Smoke test for SClust::LDA2::LDA2: feeds a small corpus through the
# model and prints the resulting topics. It makes no assertions.
class DocTests < Test::Unit::TestCase
  # Every document in the test is built with the same NullFilter.
  def setup
    @null_filter = SClust::Util::NullFilter.new
  end

  def teardown; end

  # Adds ten short documents, runs 100 iterations with four topics, then
  # prints the weight and text of up to 100 terms per topic.
  def test_lda_001
    lda = SClust::LDA2::LDA2.new
    lda.topics = 4

    corpus = [
      "a b 1 z ",
      "a b 2 5 ",
      "a b 3 4 ",
      "a b c d e f g",
      "d e f z",
      "g h z",
      "h i z",
      "x y 6",
      "x y 7",
      "x y 8"
    ]

    corpus.each do |text|
      lda << SClust::Util::Document.new(text, :filter => @null_filter)
    end

    lda.lda(:iterations => 100)

    lda.get_max_terms(100).each do |topic|
      puts("---------- Topic ---------- ")

      topic.each { |entry| puts("\t#{entry.weight} - #{entry.to_s}") }
    end
  end
end
75
+
@@ -0,0 +1,61 @@
1
+ #
2
+ # The MIT License
3
+ #
4
+ # Copyright (c) 2010 Samuel R. Baskinger
5
+ #
6
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
7
+ # of this software and associated documentation files (the "Software"), to deal
8
+ # in the Software without restriction, including without limitation the rights
9
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ # copies of the Software, and to permit persons to whom the Software is
11
+ # furnished to do so, subject to the following conditions:
12
+ #
13
+ # The above copyright notice and this permission notice shall be included in
14
+ # all copies or substantial portions of the Software.
15
+ #
16
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22
+ # THE SOFTWARE.
23
+ #
24
+
25
+ require 'test/unit'
26
+
27
+ require 'sclust/sparse_vector'
28
+
29
# Tests for SClust::SparseLabeledVector: default values, labels and
# deletion.
class ClusterTest < Test::Unit::TestCase
  def setup; end

  def teardown; end

  # Stores one real value and two default-valued entries, then checks that
  # only the real value is kept, that labels map both ways, and that a
  # deleted key falls back to the default.
  def test_spvec01
    vec = SClust::SparseLabeledVector.new(0)

    # Writes of the default value (0), with and without a label.
    vec[5] = 0
    vec.store(0, 1, "bye")
    vec.store(2, 0, "hi")

    assert(vec[0] == 1, "Could not define value.")
    assert(vec[1] == 0, "Default value not returned for unknown keys.")
    assert(
      vec.length == 1,
      "Data size was #{vec.length} instead of 1. Assigning default value may have accidentally stored the default value."
    )
    assert(vec.key_map[0] == "bye", "Could not find map from key 0 to label \"bye\"")
    assert(vec.label_map["bye"] == 0, "Could not find map from label \"bye\" to key 0")

    # Delete one key that was stored and one that never was.
    [0, 1].each { |key| vec.delete(key) }

    assert(vec[0] == 0, "Default value not returned for deleted key.")
  end
end
@@ -1,5 +1,30 @@
1
- require 'sclust/doc'
2
- require 'sclust/doccol'
1
+ #
2
+ # The MIT License
3
+ #
4
+ # Copyright (c) 2010 Samuel R. Baskinger
5
+ #
6
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
7
+ # of this software and associated documentation files (the "Software"), to deal
8
+ # in the Software without restriction, including without limitation the rights
9
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ # copies of the Software, and to permit persons to whom the Software is
11
+ # furnished to do so, subject to the following conditions:
12
+ #
13
+ # The above copyright notice and this permission notice shall be included in
14
+ # all copies or substantial portions of the Software.
15
+ #
16
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22
+ # THE SOFTWARE.
23
+ #
24
+
25
+ require 'sclust/util/doc'
26
+ require 'sclust/util/doccol'
27
+ require 'sclust/util/filters'
3
28
  require 'test/unit'
4
29
 
5
30
 
@@ -12,11 +37,11 @@ class DocTests < Test::Unit::TestCase
12
37
  #end
13
38
 
14
39
  def test_builddoc
15
- d = SClust::Document.new("hi, this is a nice doc! Yup. Oh? A very nice doc, indeed.")
40
+ d = SClust::Util::Document.new("hi, this is a nice doc! Yup. Oh? A very nice doc, indeed.")
16
41
 
17
42
  d.terms.each do |k,v|
18
- assert(k != ".", "Period found")
19
- assert(k != "", "Empty term found")
43
+ assert(k.original_word != ".", "Period found")
44
+ assert(k.original_word != "", "Empty term found")
20
45
  #puts("#{k}=#{v}")
21
46
  end
22
47
 
@@ -27,23 +52,28 @@ end
27
52
  # DocCollectionTests hunk (old lines marked "-", new lines marked "+").
  # In the new revision test_collectionadd builds four unigram documents
  # with a NullFilter, appends them to a SClust::KMean::DocumentCollection
  # with <<, and for the term "a" asserts a count of 6 and an idf strictly
  # between 0.2 and 0.3. It then prints d1's words and checks that
  # d1.tf('a') * d1.words.size equals 3.
  # NOTE(review): 6 equals the total occurrences of "a" in d1-d4
  # (3 + 2 + 0 + 1), so the "documents out of 4" wording in the assertion
  # message looks stale - confirm what dc.terms counts.
  class DocCollectionTests < Test::Unit::TestCase
28
53
 
29
54
  def test_collectionadd()
30
- dc = SClust::DocumentCollection.new()
31
- d1 = SClust::Document.new("a b c d d e a q a b")
32
- d2 = SClust::Document.new("a b d e a")
33
- d3 = SClust::Document.new("bob")
34
- d4 = SClust::Document.new("frank a")
55
+ filter = SClust::Util::NullFilter.new()
56
+ dc = SClust::KMean::DocumentCollection.new()
57
+ d1 = SClust::Util::Document.new("a b c d d e a q a b", :filter=>filter, :ngrams => [1])
58
+ d2 = SClust::Util::Document.new("a b d e a", :filter=>filter, :ngrams => [1])
59
+ d3 = SClust::Util::Document.new("bob", :filter=>filter, :ngrams => [1])
60
+ d4 = SClust::Util::Document.new("frank a", :filter=>filter, :ngrams => [1])
35
61
 
36
- dc + d1
37
- dc + d2
38
- dc + d3
39
- dc + d4
62
+ dc << d1
63
+ dc << d2
64
+ dc << d3
65
+ dc << d4
40
66
 
41
67
  dc.terms.each do |k,v|
42
- if k == "a"
43
- assert(v == 3, "A appers in 3 documents out of 4.")
44
- assert(dc.idf("a") > 2.2, "Known value for a")
45
- assert(dc.idf("a") < 2.3, "Known value for a")
46
- end
68
+ if k == "a"
69
+ assert(v == 6, "A appers in #{v} documents out of 4.")
70
+ assert(dc.idf("a") > 0.2, "Known value for a")
71
+ assert(dc.idf("a") < 0.3, "Known value for a")
72
+ end
47
73
  end
74
+
75
+ print("TERMS: ")
76
+ d1.words.each { |w| print "#{w}, " }
77
+ # NOTE(review): exact == on the tf product; if tf returns a Float this
+ # comparison may be sensitive to rounding - confirm.
+ assert(d1.tf('a') * d1.words.size == 3)
48
78
  end
49
79
  end
  end