sclust 1.0.0 → 2.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/sclust/kmean/cluster.rb +294 -0
- data/lib/sclust/kmean/doccluster.rb +83 -0
- data/lib/sclust/lda/lda.rb +243 -0
- data/lib/sclust/lda/lda2.rb +328 -0
- data/lib/sclust/util/doc.rb +134 -0
- data/lib/sclust/util/doccol.rb +187 -0
- data/lib/sclust/util/filters.rb +210 -0
- data/lib/sclust/util/rss.rb +96 -0
- data/lib/sclust/util/sparse_vector.rb +96 -0
- data/lib/sclust/util/stopwords.rb +1149 -0
- data/lib/sclust/util/weightedmovingaverage.rb +25 -0
- data/lib/sclust/util/word.rb +53 -0
- data/tests/clustertest.rb +56 -29
- data/tests/filters_test.rb +48 -0
- data/tests/ldatest.rb +75 -0
- data/tests/sparse_vector_test.rb +61 -0
- data/tests/test001.rb +49 -19
- metadata +74 -40
- data/lib/sclust/cluster.rb +0 -197
- data/lib/sclust/doc.rb +0 -92
- data/lib/sclust/doccluster.rb +0 -39
- data/lib/sclust/doccol.rb +0 -75
data/lib/sclust/doccol.rb
DELETED
@@ -1,75 +0,0 @@
|
|
1
|
-
require 'rubygems'
|
2
|
-
require 'log4r'
|
3
|
-
|
4
|
-
module SClust
  # A collection of documents plus, for every term seen, a running count of
  # how many times a document reported that term when it was added.
  class DocumentCollection
    # terms - a hash where the keys are the terms in the documents and the
    # values stored are the number of documents containing the term.
    # Counts are Floats (each addition adds 1.0); unknown terms read as 0.
    attr_reader :terms

    # A list of documents, in the order they were added.
    attr_reader :doclist

    # Log4r::Logger for this document collection.
    attr_reader :logger

    # Creates an empty collection with its own logger.
    def initialize
      @logger = Log4r::Logger.new("SClust::DocumentCollection")
      @terms = Hash.new(0)
      @doclist = []
    end

    # Add a document to the collection and adjust the @terms attribute to
    # store any new terms in the document. The document is also appended to
    # the @doclist attribute.
    #
    # d - an object responding to each_term, yielding one term per call.
    #     (Presumably each distinct term is yielded once per document, so the
    #     counts are document frequencies -- confirm against the document
    #     class.)
    #
    # Returns self so that additions can be chained.
    def +(d)
      d.each_term { |term| @terms[term] += 1.0 }

      @doclist << d

      @logger.info("There are #{@doclist.size} documents and #{@terms.size} terms.")

      self
    end

    # Removes terms that occur in too few or too many documents, both from
    # this collection's @terms and from every document's own terms.
    #
    # min_frequency - lower bound, as a fraction of the number of documents.
    # max_frequency - upper bound, as a fraction of the number of documents.
    #
    # A term is dropped when its count is strictly below the lower bound or
    # strictly above the upper bound.
    #
    # Returns the document list.
    def drop_terms(min_frequency=0.10, max_frequency=0.80)
      min_docs = @doclist.length * min_frequency
      max_docs = @doclist.length * max_frequency

      @logger.info("Analyzing #{@terms.length} terms for removal.")
      @logger.info("Upper/lower boundary are #{max_frequency}/#{min_frequency}% document frequency or #{max_docs}/#{min_docs} documents.")

      # Collect first, delete afterwards: the hash is never modified while it
      # is being iterated.
      remove_list = []

      @terms.each do |term, frequency|
        next unless frequency < min_docs || frequency > max_docs

        @logger.info("Removing term #{term} occuring in #{frequency} documents out of #{@doclist.length}")
        remove_list << term
      end

      remove_list.each { |term| @terms.delete(term) }

      @logger.info("Removed #{remove_list.length} of #{@terms.length + remove_list.length} terms. Updating #{doclist.length} documents.")

      # Each document must expose its terms through a #terms object that
      # supports delete(term).
      @doclist.each do |doc|
        remove_list.each { |term| doc.terms.delete(term) }
      end
    end

    # Inverse document frequency of a term: the natural log of
    # (number of documents / number of documents containing the term).
    #
    # The numerator is the document count, not the number of distinct terms,
    # so a term present in every document scores 0.0.
    #
    # Raises ZeroDivisionError for a term that is not in the collection,
    # because the count then reads as the Integer default 0.
    def inverse_document_frequency(term)
      Math.log(@doclist.length / @terms[term])
    end

    alias idf inverse_document_frequency

    # Yields every term known to the collection to the given block.
    # Returns the terms hash.
    def each_term(&c)
      @terms.each_key(&c)
    end
  end
end
|