clusterer 0.1.0 → 0.1.9

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,72 @@
1
+ #--
2
+ ###Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
3
+ #
4
+ # Permission is hereby granted, free of charge, to any person obtaining
5
+ # a copy of this software and associated documentation files (the
6
+ # "Software"), to deal in the Software without restriction, including
7
+ # without limitation the rights to use, copy, modify, merge, publish,
8
+ # distribute, sublicense, and/or sell copies of the Software, and to
9
+ # permit persons to whom the Software is furnished to do so, subject to
10
+ # the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be
13
+ # included in all copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
19
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
20
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
21
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22
+
23
+ $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
24
+
25
+ require 'test/unit'
26
+ require 'clusterer'
27
+
28
+ class TestSimilarity < Test::Unit::TestCase
29
+ include Clusterer::Tokenizer
30
+ def test_simple_tokenizer
31
+ x = []
32
+ simple_tokenizer("good! morrow!! the AB called") {|w| x << w}
33
+ assert_equal 3, x.size
34
+ assert_equal "morrow".stem, x[1]
35
+ assert_equal "call", x[2]
36
+ end
37
+
38
+ def test_simple_tokenizer_with_no_stemming
39
+ x = []
40
+ simple_tokenizer("good! morrow!! the AB called", :no_stem => true) {|w| x << w}
41
+ assert_equal 3, x.size
42
+ assert_equal "morrow", x[1]
43
+ assert_equal "called", x[2]
44
+ end
45
+
46
+ def test_simple_ngram_tokenizer_1
47
+ x = []
48
+ simple_ngram_tokenizer("Good! morrow!! the AB",1) {|w| x << w}
49
+ assert_equal 2, x.size
50
+ assert_equal "morrow".stem, x[1]
51
+ end
52
+
53
+ def test_simple_ngram_tokenizer
54
+ x = []
55
+ simple_ngram_tokenizer("The cow is a cool holy animal.",:ngram => 1) {|w| x << w}
56
+ assert_equal 4, x.size
57
+ x = []
58
+ simple_ngram_tokenizer("The cow is a cool holy animal.",:ngram => 2) {|w| x << w}
59
+ assert_equal 6, x.size
60
+ assert x.include?(["holy".stem, "animal".stem].join(" "))
61
+ x = []
62
+ simple_ngram_tokenizer("The cow is a cool holy animal.",:ngram => 3) {|w| x << w}
63
+ assert_equal 7, x.size
64
+ assert x.include?(["holy".stem, "animal".stem].join(" "))
65
+ assert x.include?(["cool".stem, "holy".stem, "animal".stem].join(" "))
66
+ x = []
67
+ simple_ngram_tokenizer("Ruby on Rails is cool.") {|w| x << w}
68
+ assert_equal 5, x.size
69
+ assert x.include?(["ruby".stem, "on".stem, "rails".stem].join(" "))
70
+ assert x.include?(["rails".stem, "is".stem, "cool".stem].join(" "))
71
+ end
72
+ end
metadata CHANGED
@@ -1,11 +1,11 @@
1
1
  --- !ruby/object:Gem::Specification
2
- rubygems_version: 0.8.11
2
+ rubygems_version: 0.9.0
3
3
  specification_version: 1
4
4
  name: clusterer
5
5
  version: !ruby/object:Gem::Version
6
- version: 0.1.0
7
- date: 2006-08-22 00:00:00 +05:30
8
- summary: A library of clustering algorithms for text data.
6
+ version: 0.1.9
7
+ date: 2007-03-22 00:00:00 +05:30
8
+ summary: A library of clustering and classification algorithms for text data.
9
9
  require_paths:
10
10
  - lib
11
11
  email: ssinghi@kreeti.com
@@ -25,18 +25,44 @@ required_ruby_version: !ruby/object:Gem::Version::Requirement
25
25
  platform: ruby
26
26
  signing_key:
27
27
  cert_chain:
28
+ post_install_message:
28
29
  authors:
29
30
  - Surendra K Singhi
30
31
  files:
31
- - tests/clusterer_test.rb
32
+ - tests/similarity_test.rb
33
+ - tests/document_test.rb
34
+ - tests/cluster_test.rb
35
+ - tests/bayes_test.rb
36
+ - tests/document_centroid_test.rb
37
+ - tests/lsi_test.rb
38
+ - tests/algorithms_test.rb
39
+ - tests/tokenizer_test.rb
40
+ - tests/inverse_document_frequency_test.rb
41
+ - tests/document_array_test.rb
42
+ - lib/clusterer
32
43
  - lib/clusterer.rb
33
- - lib/similarity.rb
34
- - lib/word_hash.rb
44
+ - lib/clusterer/lsi
45
+ - lib/clusterer/clustering.rb
46
+ - lib/clusterer/document.rb
47
+ - lib/clusterer/stop_words.rb
48
+ - lib/clusterer/cluster.rb
49
+ - lib/clusterer/bayes.rb
50
+ - lib/clusterer/document_array.rb
51
+ - lib/clusterer/similarity.rb
52
+ - lib/clusterer/document_base.rb
53
+ - lib/clusterer/algorithms.rb
54
+ - lib/clusterer/documents_centroid.rb
55
+ - lib/clusterer/tokenizer.rb
56
+ - lib/clusterer/inverse_document_frequency.rb
57
+ - lib/clusterer/lsi/document_vector.rb
58
+ - lib/clusterer/lsi/lsi.rb
59
+ - lib/clusterer/lsi/dmatrix.rb
60
+ - lib/clusterer/lsi/documents_centroid_vector.rb
35
61
  - examples/google_search_cluster.rb
36
62
  - examples/yahoo_search_cluster.rb
37
63
  - README
38
- test_files:
39
- - tests/clusterer_test.rb
64
+ test_files: []
65
+
40
66
  rdoc_options: []
41
67
 
42
68
  extra_rdoc_files:
@@ -1,27 +0,0 @@
1
- #The MIT License
2
-
3
- ###Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
4
-
5
- module Clusterer
6
- module Similarity
7
- #find similarity between two clusters, or two points
8
- def Similarity.vector_similarity(cluster1, cluster2)
9
- similarity = 0
10
- total = 0
11
- cluster1.each do |w,value|
12
- next unless w.class == String
13
- total += (value*value) unless cluster1[:total]
14
- similarity += (value * (cluster2[w] || 0))
15
- end
16
- cluster1[:total] = total unless cluster1[:total]
17
- unless cluster2[:total]
18
- total = 0
19
- cluster2.each_value {|v| total += (v*v) }
20
- total = 1 if total == 0
21
- cluster2[:total] = total
22
- end
23
- cluster1[:total] = 1 if cluster1[:total] == 0
24
- similarity /= Math.sqrt(cluster1[:total] * cluster2[:total]).to_f
25
- end
26
- end
27
- end
@@ -1,20 +0,0 @@
1
- #The MIT License
2
-
3
- ###Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
4
-
5
- $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
6
-
7
- require 'test/unit'
8
- require 'clusterer'
9
-
10
- class TestClusterer < Test::Unit::TestCase
11
- def test_simple_kmeans
12
- assert_not_equal [], Clusterer::Clustering.kmeans_clustering(["hello world","mea culpa","goodbye world"])
13
- assert_equal 2, Clusterer::Clustering.kmeans_clustering(["hello world","mea culpa","goodbye world"],2).size
14
- end
15
-
16
- def test_simple_hierarchical_clustering
17
- assert_not_equal [], Clusterer::Clustering.hierarchical_clustering(["hello world","mea culpa","goodbye world"])
18
- assert_equal 2, Clusterer::Clustering.hierarchical_clustering(["hello world","mea culpa","goodbye world"],2).size
19
- end
20
- end