sclust 1.0.0 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/sclust/kmean/cluster.rb +294 -0
- data/lib/sclust/kmean/doccluster.rb +83 -0
- data/lib/sclust/lda/lda.rb +243 -0
- data/lib/sclust/lda/lda2.rb +328 -0
- data/lib/sclust/util/doc.rb +134 -0
- data/lib/sclust/util/doccol.rb +187 -0
- data/lib/sclust/util/filters.rb +210 -0
- data/lib/sclust/util/rss.rb +96 -0
- data/lib/sclust/util/sparse_vector.rb +96 -0
- data/lib/sclust/util/stopwords.rb +1149 -0
- data/lib/sclust/util/weightedmovingaverage.rb +25 -0
- data/lib/sclust/util/word.rb +53 -0
- data/tests/clustertest.rb +56 -29
- data/tests/filters_test.rb +48 -0
- data/tests/ldatest.rb +75 -0
- data/tests/sparse_vector_test.rb +61 -0
- data/tests/test001.rb +49 -19
- metadata +74 -40
- data/lib/sclust/cluster.rb +0 -197
- data/lib/sclust/doc.rb +0 -92
- data/lib/sclust/doccluster.rb +0 -39
- data/lib/sclust/doccol.rb +0 -75
@@ -0,0 +1,187 @@
|
|
1
|
+
#
|
2
|
+
# The MIT License
|
3
|
+
#
|
4
|
+
# Copyright (c) 2010 Samuel R. Baskinger
|
5
|
+
#
|
6
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
7
|
+
# of this software and associated documentation files (the "Software"), to deal
|
8
|
+
# in the Software without restriction, including without limitation the rights
|
9
|
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
10
|
+
# copies of the Software, and to permit persons to whom the Software is
|
11
|
+
# furnished to do so, subject to the following conditions:
|
12
|
+
#
|
13
|
+
# The above copyright notice and this permission notice shall be included in
|
14
|
+
# all copies or substantial portions of the Software.
|
15
|
+
#
|
16
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
17
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
18
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
19
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
20
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
21
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
22
|
+
# THE SOFTWARE.
|
23
|
+
#
|
24
|
+
|
25
|
+
require 'rubygems'
|
26
|
+
require 'log4r'
|
27
|
+
|
28
|
+
require 'sclust/util/sparse_vector'
|
29
|
+
|
30
|
+
module SClust
|
31
|
+
module Util
|
32
|
+
|
33
|
+
|
34
|
+
# A collection of documents plus, for every term seen in those documents,
# the number of documents that contain it (the term's document frequency).
class DocumentCollection

  # terms - a hash-like object where the keys are the terms in the documents
  # and the values are the number of documents containing the term.
  attr_reader :terms

  # A list of documents.
  attr_reader :doclist

  # Log4r::Logger for this document collection.
  attr_reader :logger

  # Create an empty collection with its own logger, an empty term vector
  # (constructed with 0 as its argument) and an empty document list.
  def initialize()
    @logger = Log4r::Logger.new(self.class.to_s)
    @logger.add('default')
    @terms = SClust::Util::SparseVector.new(0)
    @doclist = []
  end

  # Add a document to the collection and adjust the @terms attribute to
  # count the document once for each distinct term it contains.
  # The document is also appended to @doclist. A document that yields no
  # terms is ignored entirely. Returns self so calls can be chained.
  def <<(d)
    seen_terms = {}
    d.each_term { |term, _frequency| seen_terms[term] = 1 }

    unless seen_terms.empty?
      seen_terms.each_key { |term| @terms[term] += 1 }
      @doclist << d
    end

    self
  end

  # The sum of all document frequencies divided by the number of documents.
  # If the documents only have 1-gram terms, this number is at most the
  # number of words per document. If, however, you enable 2-grams, 3-grams,
  # etc. in a document, this value will not correlate with the word count.
  def average_terms_per_document()
    total = @terms.reduce(0.0) { |count, keyval_pair| count + keyval_pair[1] }
    total / @doclist.size
  end

  # Average number of words per document. Words are not unique like terms
  # are: two occurrences of the word "the" are a single term "the". One
  # caveat is that a term is typically a 1-gram, that is, 1 word is 1 term.
  # A term may also be built from two or more words (a 2-gram, 3-gram, ...
  # n-gram), in which case this relationship will vary widely.
  def average_words_per_document()
    total = @doclist.reduce(0.0) { |count, doc| count + doc.words.size }
    total / @doclist.size
  end

  # Return the size of the document list.
  def document_count()
    @doclist.size
  end

  # Sum of the word counts of all documents.
  def word_count()
    @doclist.reduce(0) { |count, doc| count + doc.words.size }
  end

  # Return the size of the term vector.
  def term_count()
    @terms.size
  end

  # Remove every term that occurs in fewer than min_frequency or more than
  # max_frequency of the documents (both given as fractions of the document
  # count). The terms are removed from @terms and from each document's
  # terms. Returns the document list.
  def drop_terms(min_frequency=0.10, max_frequency=0.80)
    min_docs = @doclist.length * min_frequency
    max_docs = @doclist.length * max_frequency

    @logger.info("Analyzing #{@terms.length} terms for removal.")
    @logger.info("Upper/lower boundary are #{max_frequency}/#{min_frequency}% document frequency or #{max_docs}/#{min_docs} documents.")

    # Two-phase delete: collect first, delete afterwards, so @terms is never
    # modified while it is being iterated.
    remove_list = []

    @terms.each do |term, frequency|
      if frequency < min_docs || frequency > max_docs
        @logger.info("Removing term #{term} occuring in #{frequency} documents out of #{@doclist.length}")
        remove_list << term
      end
    end

    remove_list.each { |term| @terms.delete(term) }

    @logger.info("Removed #{remove_list.length} of #{@terms.length + remove_list.length} terms. Updating #{doclist.length} documents.")

    @doclist.each do |doc|
      remove_list.each { |term| doc.terms.delete(term) }
    end
  end

  # Natural log of (number of documents / number of documents containing
  # term). The division is done in floating point; integer division would
  # truncate the ratio before the logarithm is taken.
  # NOTE(review): for a term with a stored count of 0 this now yields
  # Infinity instead of raising ZeroDivisionError - confirm callers never
  # ask for unseen terms.
  def inverse_document_frequency(term)
    Math.log(@doclist.length.to_f / @terms[term])
  end

  alias idf inverse_document_frequency

  # Call the given block once for every term in the collection.
  def each_term(&c)
    @terms.each_key(&c)
  end

  # Remove terms whose document frequency is at or outside the given bounds:
  # a term is dropped when it occurs in min or fewer documents, or in max or
  # more documents. If floats are passed, they are treated as fractions of
  # the number of documents in the collection. If integers are passed, they
  # are treated as document counts. Documents left without any terms are
  # removed from the collection afterwards.
  def filter_df(min=1, max=0.20)
    mindocs = min.is_a?(Integer) ? min : (min * @doclist.size)
    maxdocs = max.is_a?(Integer) ? max : (max * @doclist.size)

    @logger.info("Building term to delete list for range #{mindocs} - #{maxdocs}.")

    delete_list = []
    @terms.each { |term, freq| delete_list << term if freq <= mindocs || freq >= maxdocs }

    @logger.info("Identified #{delete_list.size} terms for removal.")

    # NOTE: We do a two-phase delete so we can delete from backing documents.
    delete_hash = {}

    delete_list.each do |term|
      @logger.debug { "Removing term #{term}."}
      @terms.delete(term)
      delete_hash[term] = 1
    end

    @logger.info("Updating documents.")

    # Progress counter; it only advances when the debug block is evaluated.
    i = 0

    @doclist.each do |doc|
      @logger.debug { "Processing document #{i += 1} / #{@doclist.size}" }

      doc.delete_term_if { |term| delete_hash.member?(term) }
    end

    @logger.info("Deleting documents that now have no terms left in them. #{@doclist.size} documents.")

    @doclist.delete_if { |doc| doc.terms.size == 0 }

    @logger.info("Document count now #{@doclist.size} documents.")
  end
end
|
186
|
+
end
|
187
|
+
end
|
@@ -0,0 +1,210 @@
|
|
1
|
+
#
|
2
|
+
# The MIT License
|
3
|
+
#
|
4
|
+
# Copyright (c) 2010 Samuel R. Baskinger
|
5
|
+
#
|
6
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
7
|
+
# of this software and associated documentation files (the "Software"), to deal
|
8
|
+
# in the Software without restriction, including without limitation the rights
|
9
|
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
10
|
+
# copies of the Software, and to permit persons to whom the Software is
|
11
|
+
# furnished to do so, subject to the following conditions:
|
12
|
+
#
|
13
|
+
# The above copyright notice and this permission notice shall be included in
|
14
|
+
# all copies or substantial portions of the Software.
|
15
|
+
#
|
16
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
17
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
18
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
19
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
20
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
21
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
22
|
+
# THE SOFTWARE.
|
23
|
+
#
|
24
|
+
|
25
|
+
require 'rubygems'
|
26
|
+
require 'stemmer'
|
27
|
+
require 'sclust/util/stopwords'
|
28
|
+
require 'nokogiri'
|
29
|
+
|
30
|
+
module SClust
|
31
|
+
module Util
|
32
|
+
|
33
|
+
# Base class for term and document filters. A filter transforms a term in
# #filter (returning nil to drop it) and can be chained with other filters
# that run before it (#after) or after it (#before). Subclasses override
# #filter; callers use #apply.
class Filter

  # A stemmed term that remembers the word it was stemmed from. Ordering
  # and equality are based on the stemmed form only.
  class StemmedWord

    attr_accessor :original_word, :stemmed_word

    # stemmed_word  - the stemmed form; stored as given.
    # original_word - the unstemmed form; a copy of the string is stored.
    def initialize(stemmed_word, original_word)
      @stemmed_word = stemmed_word
      @original_word = String.new(original_word)
    end

    # Called by dup/clone. Gives the copy its own original_word string, the
    # same way the constructor does, instead of sharing the source's.
    def initialize_copy(s)
      super(s)

      if s.is_a?(StemmedWord) && s.original_word
        @original_word = String.new(s.original_word)
      end
    end

    # The stemmed form.
    def to_s()
      @stemmed_word
    end

    # True when this stemmed form sorts before sw's.
    def <(sw)
      @stemmed_word < sw.stemmed_word
    end

    # True when this stemmed form sorts after sw's.
    def >(sw)
      @stemmed_word > sw.stemmed_word
    end

    # True when sw exposes a stemmed_word equal to this one. Objects that
    # have no stemmed_word (nil, plain strings, ...) are simply not equal.
    def ==(sw)
      sw.respond_to?(:stemmed_word) && @stemmed_word == sw.stemmed_word
    end

    # Three-way comparison of the stemmed forms.
    def <=>(sw)
      @stemmed_word <=> sw.stemmed_word
    end

    # Concatenate. nil returns self; a String is appended to both the
    # stemmed and the original form; anything else is treated as a
    # StemmedWord whose two forms are appended to the matching forms.
    def +(sw)
      if sw.nil?
        self
      elsif sw.is_a?(String)
        StemmedWord.new(@stemmed_word + sw, @original_word + sw)
      else
        StemmedWord.new(@stemmed_word + sw.stemmed_word, @original_word + sw.original_word)
      end
    end

  end

  # prev - optional filter whose #filter runs before this one.
  def initialize(prev=nil)
    @previous_filters = prev ? [prev] : []
    @succeeding_filters = []
  end

  # Run term through the preceding filters, this filter and the succeeding
  # filters, in that order, calling #filter on each. Returns the final
  # value, or nil as soon as any stage returns nil. A nil/false term is
  # returned untouched without running any filter.
  def apply(term)
    return term unless term

    chain = @previous_filters + [self] + @succeeding_filters

    chain.each do |f|
      term = f.filter(term)
      return nil if term.nil?
    end

    term
  end

  # Register a filter to run before this one. Returns self.
  def after(filter)
    @previous_filters << filter
    self
  end

  # Register a filter to run after this one. Returns self.
  def before(filter)
    @succeeding_filters << filter
    self
  end

  # Transform a single term. Must be overridden; the base implementation
  # always raises.
  def filter(term)
    raise NotImplementedError, "Method \"filter\" must be overridden by child classes to implement the specific filter."
  end
end
|
120
|
+
|
121
|
+
# Like StemFilter, except that the result is a Filter::StemmedWord holding
# both the stem and the term it was produced from.
class StemmedWordFilter < Filter
  # Returns a Filter::StemmedWord built from term.stem and term.
  def filter(term)
    stem = term.stem
    Filter::StemmedWord.new(stem, term)
  end
end
|
127
|
+
|
128
|
+
# Replaces a term with the result of calling #stem on it.
class StemFilter < Filter
  # Returns term.stem.
  def filter(term)
    stemmed = term.stem
    stemmed
  end
end
|
133
|
+
|
134
|
+
# Replaces a term with its lower-cased form.
class LowercaseFilter < Filter
  # Returns term.downcase.
  def filter(term)
    lowered = term.downcase
    lowered
  end
end
|
139
|
+
|
140
|
+
# Drops terms that appear in the stopword list.
class StopwordFilter < Filter

  include SClust::Util::StopwordList

  # Lookup table of stopwords, built once when the class body is evaluated.
  # Each entry of @@stopword_list (presumably supplied by the included
  # StopwordList module - verify) is lower-cased before being stored.
  lowercaser = LowercaseFilter.new()

  @@stopwords = {}

  @@stopword_list.each do |word|
    @@stopwords[lowercaser.apply(word)] = true
  end

  # Returns nil when term is a stopword, otherwise term itself. The lookup
  # uses term as given; no case folding happens here.
  def filter(term)
    return nil if @@stopwords[term]

    term
  end
end
|
154
|
+
|
155
|
+
# Removes leading and trailing whitespace from a term.
class TrimWhitespace < Filter
  # Returns a copy of term without whitespace at the very start and the
  # very end of the string. The patterns are anchored with \A and \z
  # (whole string) rather than ^ and $ (per line), so a term that contains
  # a newline is still trimmed at its real end instead of at the end of
  # its first line. Interior whitespace is left alone.
  def filter(term)
    term.sub(/\A\s+/, '').sub(/\s+\z/, '')
  end
end
|
160
|
+
|
161
|
+
|
162
|
+
# Splits a document string into an array of tokens.
class TokenizerFilter < Filter
  # Returns the pieces of document between runs of whitespace and the
  # punctuation characters listed in the pattern.
  def filter(document)
    separators = /[\s,\.\t!\?\(\)\{\}\[\]\t\r\n";':]+/m
    document.split(separators)
  end
end
|
167
|
+
|
168
|
+
# Reduces an HTML string to its text content.
class HTMLFilter < Filter
  # Parses doc as an HTML document fragment with Nokogiri and returns the
  # fragment's text.
  def filter(doc)
    fragment = Nokogiri::HTML::DocumentFragment.parse(doc)
    fragment.text
  end
end
|
173
|
+
|
174
|
+
# A tokenizer that first runs the whole document through an HTMLFilter and
# then splits the result into tokens.
class DocumentTokenizer < TokenizerFilter
  # Registers an HTMLFilter to run ahead of the tokenizer itself.
  def initialize()
    super()
    html_stripper = HTMLFilter.new()
    after(html_stripper)
  end
end
|
181
|
+
|
182
|
+
# Filters a single document term. Before this filter's own check runs, the
# term passes through a LowercaseFilter, a StopwordFilter and a
# TrimWhitespace filter, in that order.
class DocumentTermFilter < Filter

  # Registers the three preceding filters. Stemming is left disabled.
  def initialize()
    super()
    [ LowercaseFilter.new(), StopwordFilter.new(), TrimWhitespace.new() ].each do |stage|
      after(stage)
    end
    #after(StemFilter.new())
  end

  # Return nil if the term should be excluded, which is the case for terms
  # made up only of digits and dots. Otherwise the term is returned as is.
  def filter(term)
    return nil if term =~ /^[\d\.]+$/

    term
  end
end
|
203
|
+
|
204
|
+
# A filter that changes nothing.
class NullFilter < Filter
  # Returns term exactly as it was passed in.
  def filter(term)
    unchanged = term
    unchanged
  end
end
|
209
|
+
end
|
210
|
+
end
|
@@ -0,0 +1,96 @@
|
|
1
|
+
#
|
2
|
+
# The MIT License
|
3
|
+
#
|
4
|
+
# Copyright (c) 2010 Samuel R. Baskinger
|
5
|
+
#
|
6
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
7
|
+
# of this software and associated documentation files (the "Software"), to deal
|
8
|
+
# in the Software without restriction, including without limitation the rights
|
9
|
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
10
|
+
# copies of the Software, and to permit persons to whom the Software is
|
11
|
+
# furnished to do so, subject to the following conditions:
|
12
|
+
#
|
13
|
+
# The above copyright notice and this permission notice shall be included in
|
14
|
+
# all copies or substantial portions of the Software.
|
15
|
+
#
|
16
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
17
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
18
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
19
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
20
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
21
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
22
|
+
# THE SOFTWARE.
|
23
|
+
#
|
24
|
+
|
25
|
+
require 'rubygems'
|
26
|
+
require 'rss'
|
27
|
+
|
28
|
+
module SClust

  # NOTE: RSS collides with the module ::RSS, so we use the :: prefix when
  # accessing the ::RSS module that ships with Ruby. :)
  module RSS

    # Turn an RSS source into documents by calling the given block once per
    # feed item.
    #
    # rss - one of:
    #       * an already parsed feed (any ::RSS::Element, subclasses included),
    #       * a String holding an http/https URL, which is fetched through
    #         $wwwagent.get_file and then parsed,
    #       * a URI::HTTP / URI::HTTPS object, handled the same way,
    #       * a String holding the feed document itself,
    #       * a File containing the feed document.
    #
    # The block is called with (title, body, item). For RSS 2.0 channel
    # items the body is the description, and items without a description are
    # skipped. For RDF items the body is content_encoded. Any other item is
    # read through title.content and content.content.
    #
    # Raises ArgumentError when the input is none of the supported kinds or
    # the parser returns nothing. An error raised while fetching a URL is
    # logged and re-raised unchanged.
    #
    # Relies on the globals $logger (debug/warn/error) and $wwwagent
    # (get_file), which must be set up by the caller.
    def self.rss_to_documents(rss, &addNewDoc)

      $logger.debug("Operating on #{rss} of type #{rss.class}")

      # Keep the caller's input so the error message below can show it even
      # after rss has been reassigned.
      source = rss

      # This block builds an RSS::Element (document). Parsed feeds are
      # instances of subclasses of ::RSS::Element, hence is_a?.
      unless rss.is_a?(::RSS::Element)

        # Check if we have a URI string...
        if rss.instance_of?(String)
          begin
            rss = URI.parse(rss)
          rescue URI::InvalidURIError => e
            # Not a URI; rss stays a String and is parsed as a document.
            $logger.warn("Exception parsing URI: #{e.message}")
          end
        end

        $logger.debug("Rss is now of type #{rss.class}.")

        # Parse it... URI::HTTPS is a subclass of URI::HTTP, so is_a? lets
        # https URLs through as well.
        if rss.is_a?(URI::HTTP)
          begin
            #rss = RSS::Parser::parse(Net::HTTP::get(rss), false)
            rss = ::RSS::Parser::parse($wwwagent.get_file(rss), false)
          rescue Exception => e
            $logger.error("Failed to retrieve URL #{rss}: #{e.message}")
            raise
          end
        elsif rss.instance_of?(String) || rss.is_a?(File)
          rss = ::RSS::Parser::parse(rss, false)
        else
          rss = nil
        end

        raise ArgumentError, "RSS was not a URI string, a URI object, an RSS document, or an RSS document string: #{source}" unless rss
      end

      unless rss.nil? || rss.items.nil?

        $logger.debug("Adding #{rss.items.size} to document collection.")

        # Hand each item of this feed to the caller's block.
        rss.items.each do |item|

          if item.instance_of?(::RSS::Rss::Channel::Item)

            addNewDoc.call(item.title, item.description, item) if item.description

          elsif item.instance_of?(::RSS::RDF::Item)

            addNewDoc.call(item.title, item.content_encoded, item)

          else

            addNewDoc.call(item.title.content, item.content.content, item)

          end
        end
      end
    end
  end

end
|