sclust 1.0.0 → 2.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,25 @@
1
+ #!/usr/bin/ruby
2
+
3
+
4
module SClust
  module Util
    # Running average in which every new sample is blended into the current
    # value using a fixed weight:
    #
    #   value = (1 - weight) * value + weight * sample
    #
    # A weight of 1.0 makes the average follow the latest sample exactly; a
    # weight of 0.0 leaves the value unchanged.
    class WeightedMovingAverage

      attr_reader :weight, :value
      attr_writer :value

      # weight::        blending factor, must lie in 0.0..1.0.
      # initial_value:: starting value of the average (defaults to 0.0).
      #
      # Raises ArgumentError when the weight is out of range.
      def initialize(weight, initial_value = 0.0)
        self.weight = weight
        @value = initial_value
      end

      # Replaces the blending factor.
      #
      # The complement (1 - weight) is cached for use by #adjust, so it has to
      # be refreshed together with the weight; otherwise #adjust would keep
      # mixing the new weight with the complement of the old one.
      #
      # Raises ArgumentError (a subclass of Exception, so existing
      # `rescue Exception` handlers still match) when the weight is outside
      # 0.0..1.0.
      def weight=(weight)
        if weight > 1 || weight < 0
          raise ArgumentError, "Weight was #{weight} but must be between 0.0 and 1.0."
        end

        @weight = weight
        @weight_complement = 1.0 - weight
      end

      # Blends +value+ into the running average and returns the new average.
      def adjust(value)
        @value = (@weight_complement * @value) + (@weight * value)
      end
    end
  end
end
@@ -0,0 +1,53 @@
1
+ #
2
+ # The MIT License
3
+ #
4
+ # Copyright (c) 2010 Samuel R. Baskinger
5
+ #
6
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
7
+ # of this software and associated documentation files (the "Software"), to deal
8
+ # in the Software without restriction, including without limitation the rights
9
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ # copies of the Software, and to permit persons to whom the Software is
11
+ # furnished to do so, subject to the following conditions:
12
+ #
13
+ # The above copyright notice and this permission notice shall be included in
14
+ # all copies or substantial portions of the Software.
15
+ #
16
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22
+ # THE SOFTWARE.
23
+ #
24
+
25
+
26
module SClust
  module Util
    # A term together with a numeric weight and a free-form data hash.
    #
    # Hashing and #eql? are based solely on the wrapped word, so the weight
    # and the extra data do not take part in Hash key comparison.
    class Word

      attr_accessor :word, :weight, :data

      # word::       the term itself (defaults to "").
      # weight::     weight attached to the term (defaults to 0.0).
      # other_data:: extra data stored as-is and exposed through #data.
      def initialize(word="", weight=0.0, other_data={})
        @word   = word
        @weight = weight
        @data   = other_data
      end

      # Return @word.
      def to_s
        @word
      end

      # Hash code of the wrapped word, matching the code of the bare word.
      def hash
        @word.hash
      end

      # True when +w+ carries the same word.
      #
      # +w+ may be another Word, in which case its wrapped word is compared,
      # or a bare word, which is compared directly as before. Without the
      # unwrapping a Word would never be eql? to another Word (or to itself),
      # contradicting #hash.
      def eql?(w)
        other_word = w.is_a?(Word) ? w.word : w
        @word.eql?(other_word)
      end
    end
  end
end
@@ -1,50 +1,77 @@
1
+ #
2
+ # The MIT License
3
+ #
4
+ # Copyright (c) 2010 Samuel R. Baskinger
5
+ #
6
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
7
+ # of this software and associated documentation files (the "Software"), to deal
8
+ # in the Software without restriction, including without limitation the rights
9
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ # copies of the Software, and to permit persons to whom the Software is
11
+ # furnished to do so, subject to the following conditions:
12
+ #
13
+ # The above copyright notice and this permission notice shall be included in
14
+ # all copies or substantial portions of the Software.
15
+ #
16
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22
+ # THE SOFTWARE.
23
+ #
24
+
1
25
  require 'test/unit'
2
26
 
3
- require 'sclust/doccluster'
27
+ require 'sclust/util/doccol'
28
+ require 'sclust/kmean/doccluster'
29
+ require 'sclust/util/filters'
30
+
31
+ Log4r::StderrOutputter.new('default')
32
+ Log4r::Outputter['default'].formatter = Log4r::PatternFormatter.new( :pattern => '%d %C: %m' , :date_pattern => '[%Y-%m-%d-%H:%M:%S %Z]')
33
+ Log4r::Logger.root.level = Log4r::DEBUG
34
+ Log4r::Logger.root.add( 'default' )
35
+
36
+ require 'sclust/util/doc'
37
+
38
+
39
+ #$logger = Log4r::Logger.new($0)
40
+ #$logger.add('default')
41
+ #$logger.info("Starting")
42
+
4
43
 
5
44
  class ClusterTest < Test::Unit::TestCase
6
45
 
7
46
  def setup()
8
- @dc = SClust::DocumentCollection.new()
9
- filter = SClust::NullFilter.new()
10
- d1 = SClust::Document.new("a b c d d e a q a b", :filter=>filter, :ngrams=>[1])
11
- d2 = SClust::Document.new("a b d e a", :filter=>filter, :ngrams=>[1])
12
- d3 = SClust::Document.new("bob", :filter=>filter, :ngrams=>[1])
13
- d4 = SClust::Document.new("frank a", :filter=>filter, :ngrams=>[1])
14
-
15
- @dc + d1
16
- @dc + d2
17
- @dc + d3
18
- @dc + d4
19
47
  end
20
48
 
21
49
  def teardown()
22
50
  end
23
51
 
24
52
  def test_makecluster()
25
- c = SClust::DocumentClusterer.new(@dc)
53
+ filter = SClust::Util::NullFilter.new()
54
+ d1 = SClust::Util::Document.new("a b c d d e a q a b", :filter=>filter, :ngrams=>[1])
55
+ d2 = SClust::Util::Document.new("a b d e a", :filter=>filter, :ngrams=>[1])
56
+ d3 = SClust::Util::Document.new("bob", :filter=>filter, :ngrams=>[1])
57
+ d4 = SClust::Util::Document.new("frank a", :filter=>filter, :ngrams=>[1])
58
+
59
+ c = SClust::KMean::DocumentClusterer.new()
26
60
 
27
- c.cluster
61
+ c << d1
62
+ c << d2
63
+ c << d3
64
+ c << d4
28
65
 
66
+ c.topics = 3
67
+
68
+ c.cluster
69
+
29
70
  c.each_cluster do |cl|
30
-
31
- max = 0
32
-
33
- 0.upto(cl.center.terms.length - 1) do |i|
34
-
35
- term = cl.center.terms[i]
36
- value = cl.center.values[i]
37
-
38
- max = i if ( cl.center.values[i] > cl.center.values[max] )
39
- end
40
-
41
- puts("Cluster: #{cl.center.terms[max]} #{cl.center.values[max]}")
42
-
71
+ puts('===================================')
43
72
  cl.center.get_max_terms(3).each do |t|
44
73
  puts("Got Term: #{t} with value #{cl.center.get_term_value(t)}")
45
74
  end
46
-
47
- assert(cl.center.values[max] == cl.center.get_term_value(cl.center.get_max_terms(1)[0]), "Max value was not found.")
48
75
  end
49
76
  end
50
77
 
@@ -0,0 +1,48 @@
1
+ #
2
+ # The MIT License
3
+ #
4
+ # Copyright (c) 2010 Samuel R. Baskinger
5
+ #
6
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
7
+ # of this software and associated documentation files (the "Software"), to deal
8
+ # in the Software without restriction, including without limitation the rights
9
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ # copies of the Software, and to permit persons to whom the Software is
11
+ # furnished to do so, subject to the following conditions:
12
+ #
13
+ # The above copyright notice and this permission notice shall be included in
14
+ # all copies or substantial portions of the Software.
15
+ #
16
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22
+ # THE SOFTWARE.
23
+ #
24
+
25
+ require 'test/unit'
26
+
27
+ require 'sclust/util/filters'
28
+
29
# Tests for the filters loaded from sclust/util/filters.
class DocTests < Test::Unit::TestCase

  # These tests need no fixtures.
  def setup; end

  def teardown; end

  # The object returned by DocumentTermFilter#apply must report the input
  # string as its original word.
  def test_docfilter
    term_filter = SClust::Util::DocumentTermFilter.new
    filtered    = term_filter.apply('aba')

    assert(filtered.original_word == 'aba', 'did not filter out a.')
  end

  # TokenizerFilter#apply must yield the same two tokens whether they are
  # separated by a single space or by a mix of space, CR and LF characters.
  def test_tokenizer
    tokenizer = SClust::Util::TokenizerFilter.new

    ['hi bye', "hi \r\n\n\rbye"].each do |text|
      assert(tokenizer.apply(text) == ['hi', 'bye'])
    end
  end
end
48
+
@@ -0,0 +1,75 @@
1
+ #
2
+ # The MIT License
3
+ #
4
+ # Copyright (c) 2010 Samuel R. Baskinger
5
+ #
6
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
7
+ # of this software and associated documentation files (the "Software"), to deal
8
+ # in the Software without restriction, including without limitation the rights
9
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ # copies of the Software, and to permit persons to whom the Software is
11
+ # furnished to do so, subject to the following conditions:
12
+ #
13
+ # The above copyright notice and this permission notice shall be included in
14
+ # all copies or substantial portions of the Software.
15
+ #
16
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22
+ # THE SOFTWARE.
23
+ #
24
+
25
+ require 'test/unit'
26
+
27
+ require 'sclust/lda/lda2'
28
+ require 'sclust/util/doc'
29
+ require 'log4r'
30
+
31
# Create a stderr outputter named 'default', give it a timestamped pattern,
# set the root logger to DEBUG and attach the outputter to it by name.
Log4r::StderrOutputter.new('default')
Log4r::Outputter['default'].formatter = Log4r::PatternFormatter.new(
  :pattern      => '%d %C: %m',
  :date_pattern => '[%Y-%m-%d-%H:%M:%S %Z]'
)
Log4r::Logger.root.level = Log4r::DEBUG
Log4r::Logger.root.add('default')
35
+
36
+
37
# Smoke test for SClust::LDA2::LDA2: feeds a handful of tiny documents, runs
# the model and prints the terms reported for each topic. It makes no
# assertions; it only has to run without raising.
#
# NOTE(review): the filter tests also define a class named DocTests. If both
# files are loaded into one process the class is reopened and the later
# #setup replaces this one, leaving @null_filter unset - confirm how the
# suite is run.
class DocTests < Test::Unit::TestCase

  def setup
    @null_filter = SClust::Util::NullFilter.new
  end

  def teardown; end

  def test_lda_001
    model = SClust::LDA2::LDA2.new
    model.topics = 4

    # Document texts, added in this order. Trailing spaces are kept as-is.
    texts = [
      "a b 1 z ",
      "a b 2 5 ",
      "a b 3 4 ",
      "a b c d e f g",
      "d e f z",
      "g h z",
      "h i z",
      "x y 6",
      "x y 7",
      "x y 8"
    ]

    texts.each do |text|
      model << SClust::Util::Document.new(text, :filter => @null_filter)
    end

    model.lda(:iterations => 100)

    model.get_max_terms(100).each do |topic_words|
      puts("---------- Topic ---------- ")

      topic_words.each do |entry|
        puts("\t#{entry.weight} - #{entry.to_s}")
      end
    end
  end
end
75
+
@@ -0,0 +1,61 @@
1
+ #
2
+ # The MIT License
3
+ #
4
+ # Copyright (c) 2010 Samuel R. Baskinger
5
+ #
6
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
7
+ # of this software and associated documentation files (the "Software"), to deal
8
+ # in the Software without restriction, including without limitation the rights
9
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ # copies of the Software, and to permit persons to whom the Software is
11
+ # furnished to do so, subject to the following conditions:
12
+ #
13
+ # The above copyright notice and this permission notice shall be included in
14
+ # all copies or substantial portions of the Software.
15
+ #
16
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22
+ # THE SOFTWARE.
23
+ #
24
+
25
+ require 'test/unit'
26
+
27
+ require 'sclust/sparse_vector'
28
+
29
# Checks storing, defaulting, labelling and deleting entries of
# SClust::SparseLabeledVector.
class ClusterTest < Test::Unit::TestCase

  def setup; end

  def teardown; end

  def test_spvec01
    # Presumably the constructor argument is the default value returned for
    # missing keys; the assertions below rely on that reading.
    vector = SClust::SparseLabeledVector.new(0)

    # Two of the three writes use the value 0; the length check further down
    # expects only the non-zero entry to be counted.
    vector[5] = 0
    vector.store(0, 1, 'bye')
    vector.store(2, 0, 'hi')

    assert(vector[0] == 1, 'Could not define value.')
    assert(vector[1] == 0, 'Default value not returned for unknown keys.')
    assert(vector.length == 1, "Data size was #{vector.length} instead of 1. Assigning default value may have accidentally stored the default value.")

    # The label given to store must be reachable in both directions.
    assert(vector.key_map[0] == 'bye', 'Could not find map from key 0 to label "bye"')
    assert(vector.label_map['bye'] == 0, 'Could not find map from label "bye" to key 0')

    # Key 0 holds a value, key 1 was never stored; both are deleted.
    vector.delete(0)
    vector.delete(1)

    assert(vector[0] == 0, 'Default value not returned for deleted key.')
  end
end
@@ -1,5 +1,30 @@
1
- require 'sclust/doc'
2
- require 'sclust/doccol'
1
+ #
2
+ # The MIT License
3
+ #
4
+ # Copyright (c) 2010 Samuel R. Baskinger
5
+ #
6
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
7
+ # of this software and associated documentation files (the "Software"), to deal
8
+ # in the Software without restriction, including without limitation the rights
9
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ # copies of the Software, and to permit persons to whom the Software is
11
+ # furnished to do so, subject to the following conditions:
12
+ #
13
+ # The above copyright notice and this permission notice shall be included in
14
+ # all copies or substantial portions of the Software.
15
+ #
16
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22
+ # THE SOFTWARE.
23
+ #
24
+
25
+ require 'sclust/util/doc'
26
+ require 'sclust/util/doccol'
27
+ require 'sclust/util/filters'
3
28
  require 'test/unit'
4
29
 
5
30
 
@@ -12,11 +37,11 @@ class DocTests < Test::Unit::TestCase
12
37
  #end
13
38
 
14
39
  def test_builddoc
15
- d = SClust::Document.new("hi, this is a nice doc! Yup. Oh? A very nice doc, indeed.")
40
+ d = SClust::Util::Document.new("hi, this is a nice doc! Yup. Oh? A very nice doc, indeed.")
16
41
 
17
42
  d.terms.each do |k,v|
18
- assert(k != ".", "Period found")
19
- assert(k != "", "Empty term found")
43
+ assert(k.original_word != ".", "Period found")
44
+ assert(k.original_word != "", "Empty term found")
20
45
  #puts("#{k}=#{v}")
21
46
  end
22
47
 
@@ -27,23 +52,28 @@ end
27
52
  class DocCollectionTests < Test::Unit::TestCase
28
53
 
29
54
  def test_collectionadd()
30
- dc = SClust::DocumentCollection.new()
31
- d1 = SClust::Document.new("a b c d d e a q a b")
32
- d2 = SClust::Document.new("a b d e a")
33
- d3 = SClust::Document.new("bob")
34
- d4 = SClust::Document.new("frank a")
55
+ filter = SClust::Util::NullFilter.new()
56
+ dc = SClust::KMean::DocumentCollection.new()
57
+ d1 = SClust::Util::Document.new("a b c d d e a q a b", :filter=>filter, :ngrams => [1])
58
+ d2 = SClust::Util::Document.new("a b d e a", :filter=>filter, :ngrams => [1])
59
+ d3 = SClust::Util::Document.new("bob", :filter=>filter, :ngrams => [1])
60
+ d4 = SClust::Util::Document.new("frank a", :filter=>filter, :ngrams => [1])
35
61
 
36
- dc + d1
37
- dc + d2
38
- dc + d3
39
- dc + d4
62
+ dc << d1
63
+ dc << d2
64
+ dc << d3
65
+ dc << d4
40
66
 
41
67
  dc.terms.each do |k,v|
42
- if k == "a"
43
- assert(v == 3, "A appers in 3 documents out of 4.")
44
- assert(dc.idf("a") > 2.2, "Known value for a")
45
- assert(dc.idf("a") < 2.3, "Known value for a")
46
- end
68
+ if k == "a"
69
+ assert(v == 6, "A appers in #{v} documents out of 4.")
70
+ assert(dc.idf("a") > 0.2, "Known value for a")
71
+ assert(dc.idf("a") < 0.3, "Known value for a")
72
+ end
47
73
  end
74
+
75
+ print("TERMS: ")
76
+ d1.words.each { |w| print "#{w}, " }
77
+ assert(d1.tf('a') * d1.words.size == 3)
48
78
  end
49
79
  end