sclust 1.0.0 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/sclust/kmean/cluster.rb +294 -0
- data/lib/sclust/kmean/doccluster.rb +83 -0
- data/lib/sclust/lda/lda.rb +243 -0
- data/lib/sclust/lda/lda2.rb +328 -0
- data/lib/sclust/util/doc.rb +134 -0
- data/lib/sclust/util/doccol.rb +187 -0
- data/lib/sclust/util/filters.rb +210 -0
- data/lib/sclust/util/rss.rb +96 -0
- data/lib/sclust/util/sparse_vector.rb +96 -0
- data/lib/sclust/util/stopwords.rb +1149 -0
- data/lib/sclust/util/weightedmovingaverage.rb +25 -0
- data/lib/sclust/util/word.rb +53 -0
- data/tests/clustertest.rb +56 -29
- data/tests/filters_test.rb +48 -0
- data/tests/ldatest.rb +75 -0
- data/tests/sparse_vector_test.rb +61 -0
- data/tests/test001.rb +49 -19
- metadata +74 -40
- data/lib/sclust/cluster.rb +0 -197
- data/lib/sclust/doc.rb +0 -92
- data/lib/sclust/doccluster.rb +0 -39
- data/lib/sclust/doccol.rb +0 -75
data/lib/sclust/doccol.rb
DELETED
@@ -1,75 +0,0 @@
|
|
1
|
-
require 'rubygems'
|
2
|
-
require 'log4r'
|
3
|
-
|
4
|
-
module SClust
  # A collection of documents together with the document frequency of every
  # term seen in them. Documents are added with #+ and are expected to respond
  # to #each_term (yielding each term of the document) and #terms (a
  # container supporting #delete(term)).
  class DocumentCollection
    # terms - a hash where the keys are the terms in the documents and the
    # values are the number of documents containing the term.
    attr_reader :terms

    # A list of documents, in the order they were added.
    attr_reader :doclist

    # Log4r::Logger for this document collection.
    attr_reader :logger

    # Create an empty collection: no documents, no terms.
    def initialize
      @logger  = Log4r::Logger.new("SClust::DocumentCollection")
      # A default of 0 lets #+ increment counts for terms not seen before.
      @terms   = Hash.new(0)
      @doclist = []
    end

    # Add a document to the collection and adjust the @terms attribute to
    # account for every term the document yields from #each_term.
    # The document is also appended to the @doclist attribute.
    #
    # d - the document to add.
    #
    # Returns self so additions can be chained.
    def +(d)
      d.each_term { |term| @terms[term] += 1.0 }

      @doclist << d

      @logger.info("There are #{@doclist.size} documents and #{@terms.size} terms.")

      self
    end

    # Remove terms that occur in too few or too many documents, both from
    # this collection's term table and from every document's #terms.
    #
    # min_frequency - lowest fraction of documents a term may occur in.
    # max_frequency - highest fraction of documents a term may occur in.
    #
    # Returns the document list.
    def drop_terms(min_frequency=0.10, max_frequency=0.80)
      min_docs = @doclist.length * min_frequency
      max_docs = @doclist.length * max_frequency

      @logger.info("Analyzing #{@terms.length} terms for removal.")
      @logger.info("Upper/lower boundary are #{max_frequency}/#{min_frequency}% document frequency or #{max_docs}/#{min_docs} documents.")

      # Collect first and delete afterwards so the hash is never modified
      # while it is being iterated.
      remove_list = []

      @terms.each do |term, frequency|
        next unless frequency < min_docs || frequency > max_docs

        @logger.info("Removing term #{term} occuring in #{frequency} documents out of #{@doclist.length}")
        remove_list << term
      end

      remove_list.each { |term| @terms.delete(term) }

      @logger.info("Removed #{remove_list.length} of #{@terms.length + remove_list.length} terms. Updating #{doclist.length} documents.")

      @doclist.each do |doc|
        remove_list.each { |term| doc.terms.delete(term) }
      end
    end

    # Inverse document frequency of a term: the natural log of the number of
    # documents in the collection divided by the number of documents that
    # contain the term.
    #
    # term - the term to score.
    #
    # Raises ZeroDivisionError when the term has never been counted (its
    # stored document frequency is the integer default 0).
    def inverse_document_frequency(term)
      # The numerator is the document count, not the size of the vocabulary.
      Math.log(@doclist.length / @terms[term])
    end

    alias idf inverse_document_frequency

    # Yield every known term to the given block.
    def each_term(&c)
      @terms.each_key(&c)
    end
  end
end
|