sclust 1.0.0 → 2.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/sclust/kmean/cluster.rb +294 -0
- data/lib/sclust/kmean/doccluster.rb +83 -0
- data/lib/sclust/lda/lda.rb +243 -0
- data/lib/sclust/lda/lda2.rb +328 -0
- data/lib/sclust/util/doc.rb +134 -0
- data/lib/sclust/util/doccol.rb +187 -0
- data/lib/sclust/util/filters.rb +210 -0
- data/lib/sclust/util/rss.rb +96 -0
- data/lib/sclust/util/sparse_vector.rb +96 -0
- data/lib/sclust/util/stopwords.rb +1149 -0
- data/lib/sclust/util/weightedmovingaverage.rb +25 -0
- data/lib/sclust/util/word.rb +53 -0
- data/tests/clustertest.rb +56 -29
- data/tests/filters_test.rb +48 -0
- data/tests/ldatest.rb +75 -0
- data/tests/sparse_vector_test.rb +61 -0
- data/tests/test001.rb +49 -19
- metadata +74 -40
- data/lib/sclust/cluster.rb +0 -197
- data/lib/sclust/doc.rb +0 -92
- data/lib/sclust/doccluster.rb +0 -39
- data/lib/sclust/doccol.rb +0 -75
@@ -0,0 +1,187 @@
|
|
1
|
+
#
|
2
|
+
# The MIT License
|
3
|
+
#
|
4
|
+
# Copyright (c) 2010 Samuel R. Baskinger
|
5
|
+
#
|
6
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
7
|
+
# of this software and associated documentation files (the "Software"), to deal
|
8
|
+
# in the Software without restriction, including without limitation the rights
|
9
|
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
10
|
+
# copies of the Software, and to permit persons to whom the Software is
|
11
|
+
# furnished to do so, subject to the following conditions:
|
12
|
+
#
|
13
|
+
# The above copyright notice and this permission notice shall be included in
|
14
|
+
# all copies or substantial portions of the Software.
|
15
|
+
#
|
16
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
17
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
18
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
19
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
20
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
21
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
22
|
+
# THE SOFTWARE.
|
23
|
+
#
|
24
|
+
|
25
|
+
require 'rubygems'
|
26
|
+
require 'log4r'
|
27
|
+
|
28
|
+
require 'sclust/util/sparse_vector'
|
29
|
+
|
30
|
+
module SClust
    module Util


        # A collection of documents together with the document frequency of every
        # term appearing in them.
        class DocumentCollection

            # terms - a hash where the keys are the terms in the documents and the values stored are the number of documents containing the term.
            attr_reader :terms

            # A list of documents
            attr_reader :doclist

            # Log4r::Logger for this document collection.
            attr_reader :logger

            def initialize()
                @logger = Log4r::Logger.new(self.class.to_s)
                @logger.add('default')
                @terms = SClust::Util::SparseVector.new(0)
                @doclist = []
            end

            # Add a document to the collection and adjust the @terms attribute to store any new terms in the document.
            # The document is also added to the @doclist attribute. Documents with no terms at all are ignored.
            # Returns self so calls can be chained.
            def <<(d)

                seen_terms = {}

                d.each_term { |term, frequency| seen_terms[term] = 1 }

                if ( seen_terms.size > 0 )

                    # Each distinct term counts once per document (document frequency).
                    seen_terms.each_key { |term| @terms[term] += 1 }

                    @doclist << d

                    #@logger.info("There are #{@doclist.size} documents and #{@terms.size} terms.")
                end

                self
            end

            # The sum of the terms divided by the documents. If the document only has 1-gram terms, then this
            # number will always be less than the number of words per document. If, however, you enable
            # 2-grams, 3-grams, etc in a document, this value will not correlate perfectly with the word count.
            # Returns 0.0 for an empty collection instead of NaN.
            def average_terms_per_document()
                return 0.0 if @doclist.empty?
                @terms.reduce(0.0) { |count, keyval_pair| count + keyval_pair[1] } / @doclist.size
            end

            # Number of words that make up a document. Words are not unique like terms are.
            # Two occurrences of the word "the" are a single term "the". Get it? :) Great. One caveat is that
            # a "term" is typically a 1-gram, that is 1 word is 1 term. It is possible for a term to be constructed
            # of two or more words (a 2-gram, 3-gram, ... n-gram) in which case this relationship will vary
            # widely. Returns 0.0 for an empty collection instead of NaN.
            def average_words_per_document()
                return 0.0 if @doclist.empty?
                @doclist.reduce(0.0) { |count, doc| count + doc.words.size } / @doclist.size
            end

            # Return the size of the document list.
            def document_count()
                @doclist.size
            end

            # Sum all words
            def word_count()
                @doclist.reduce(0) { |count, doc| count + doc.words.size }
            end

            # Return the size of the term vector
            def term_count()
                @terms.size
            end

            # Remove terms whose document frequency lies outside the
            # [min_frequency, max_frequency] fraction of the collection, and remove
            # those terms from every backing document as well.
            def drop_terms(min_frequency=0.10, max_frequency=0.80)

                min_docs = @doclist.length * min_frequency
                max_docs = @doclist.length * max_frequency

                @logger.info("Analyzing #{@terms.length} terms for removal.")
                @logger.info("Upper/lower boundary are #{max_frequency}/#{min_frequency}% document frequency or #{max_docs}/#{min_docs} documents.")

                remove_list = []

                # FIX: collect first, delete afterwards. The original called
                # @terms.delete(term) inside @terms.each, mutating the collection
                # while iterating over it.
                @terms.each do |term, frequency|

                    if ( frequency < min_docs or frequency > max_docs )
                        @logger.info("Removing term #{term} occuring in #{frequency} documents out of #{@doclist.length}")
                        remove_list << term
                    end
                end

                remove_list.each { |term| @terms.delete(term) }

                @logger.info("Removed #{remove_list.length} of #{@terms.length + remove_list.length} terms. Updating #{doclist.length} documents.")

                @doclist.each do |doc|
                    remove_list.each do |term|
                        doc.terms.delete(term)
                    end
                end
            end

            # Natural log of (document count / documents containing term).
            # FIX: force floating-point division; the original integer division
            # truncated the ratio (e.g. 3 docs / 2 docs == 1, yielding an IDF of 0).
            def inverse_document_frequency(term)
                Math.log( @doclist.length.to_f / @terms[term] )
            end

            alias idf inverse_document_frequency

            # Yield each distinct term in the collection.
            def each_term(&c)
                @terms.each_key { |k| yield k }
            end

            # Filter out documents that are not in the given range
            # of document frequency as expressed as a percentage of the total
            # number of documents in the collection. If floats are passed, then they are treated as
            # percentages. If integers are passed, they are treated like document counts.
            def filter_df(min=1, max=0.20)

                delete_list = []
                delete_hash = {}

                mindocs = ( min.is_a?(Integer) ) ? min : ( min * @doclist.size )
                maxdocs = ( max.is_a?(Integer) ) ? max : ( max * @doclist.size )

                @logger.info("Building term to delete list for range #{mindocs} - #{maxdocs}.")

                @terms.each { |term, freq| delete_list << term if (freq <= mindocs or freq >= maxdocs ) }

                @logger.info("Identified #{delete_list.size} terms for removal.")

                # NOTE: We do a two-phase delete so we can delete from backing documents.

                delete_list.each do |term|
                    @logger.debug { "Removing term #{term}."}
                    @terms.delete(term)
                    delete_hash[term] = 1
                end

                @logger.info("Updating documents.")

                i = 0

                @doclist.each do |doc|
                    @logger.debug { "Processing document #{i += 1} / #{@doclist.size}" }

                    doc.delete_term_if { |term| delete_hash.member?(term) }
                end

                @logger.info("Deleting documents that now have no terms left in them. #{@doclist.size} documents.")

                @doclist.delete_if { |doc| doc.terms.size == 0 }

                @logger.info("Document count now #{@doclist.size} documents.")

            end
        end
    end
end
|
@@ -0,0 +1,210 @@
|
|
1
|
+
#
|
2
|
+
# The MIT License
|
3
|
+
#
|
4
|
+
# Copyright (c) 2010 Samuel R. Baskinger
|
5
|
+
#
|
6
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
7
|
+
# of this software and associated documentation files (the "Software"), to deal
|
8
|
+
# in the Software without restriction, including without limitation the rights
|
9
|
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
10
|
+
# copies of the Software, and to permit persons to whom the Software is
|
11
|
+
# furnished to do so, subject to the following conditions:
|
12
|
+
#
|
13
|
+
# The above copyright notice and this permission notice shall be included in
|
14
|
+
# all copies or substantial portions of the Software.
|
15
|
+
#
|
16
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
17
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
18
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
19
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
20
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
21
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
22
|
+
# THE SOFTWARE.
|
23
|
+
#
|
24
|
+
|
25
|
+
require 'rubygems'
|
26
|
+
require 'stemmer'
|
27
|
+
require 'sclust/util/stopwords'
|
28
|
+
require 'nokogiri'
|
29
|
+
|
30
|
+
module SClust
    module Util

        # Base class for a chainable term/document filter. Subclasses implement
        # #filter(term); #apply runs the previous filters, this filter, then the
        # succeeding filters, short-circuiting as soon as any filter returns nil.
        class Filter

            # A stemmed word that remembers the original, unstemmed word it came from.
            class StemmedWord

                attr_reader :original_word, :stemmed_word
                attr_writer :original_word, :stemmed_word

                def initialize(stemmed_word, original_word)
                    #super(stemmed_word)
                    @stemmed_word = stemmed_word
                    @original_word = String.new(original_word)
                end

                def initialize_copy(s)
                    super(s)

                    # FIX: the original compared a Class object to a String
                    # ("stemmed_word.class == \"SClust::Filter::StemmedWord\""),
                    # which is never true, so the copy branch was dead code.
                    if ( s.is_a?(StemmedWord) )
                        @original_word = s.original_word
                    end
                end

                def to_s()
                    @stemmed_word
                end

                def < (sw)
                    @stemmed_word < sw.stemmed_word
                end

                # FIX: this was declared as a second "def <", silently replacing the
                # correct less-than operator above with greater-than behavior and
                # leaving the class with no ">" at all. The body (">") shows the intent.
                def > (sw)
                    @stemmed_word > sw.stemmed_word
                end

                def ==(sw)
                    @stemmed_word == sw.stemmed_word
                end

                def <=>(sw)
                    @stemmed_word <=> sw.stemmed_word
                end

                # Concatenation: accepts nil (returns self), a String, or another StemmedWord.
                def +(sw)
                    if ( sw.nil? )
                        self
                    elsif ( sw.is_a?(String) )
                        StemmedWord.new(@stemmed_word + sw, @original_word + sw)
                    else
                        StemmedWord.new(@stemmed_word + sw.stemmed_word, @original_word + sw.original_word)
                    end
                end

            end

            def initialize(prev=nil)
                @previous_filters = (prev) ? [ prev ] : []
                @succeeding_filters = []
            end

            # Run the full filter chain on +term+. Returns the filtered term, or nil
            # if any filter in the chain rejected it.
            def apply(term)

                if ( term )

                    catch(:filtered_term) do
                        @previous_filters.each { |f| term = f.filter(term) ; throw :filtered_term if term.nil? }

                        term = filter(term) ; throw :filtered_term if term.nil?

                        @succeeding_filters.each { |f| term = f.filter(term) ; throw :filtered_term if term.nil? }
                    end
                end

                term
            end

            # Run this filter after +filter+ (i.e. +filter+ becomes a preceding filter).
            def after(filter)
                @previous_filters << filter
                self
            end

            # Run this filter before +filter+ (i.e. +filter+ becomes a succeeding filter).
            def before(filter)
                @succeeding_filters << filter
                self
            end

            def filter(term)
                raise Exception.new("Method \"filter\" must be overridden by child classes to implement the specific filter.")
            end
        end

        # Similar to StemFilter, but this will wrap the word in a Filter::StemmedWord object.
        class StemmedWordFilter < Filter
            def filter(term)
                Filter::StemmedWord.new(term.stem, term)
            end
        end

        class StemFilter < Filter
            def filter(term)
                term.stem
            end
        end

        class LowercaseFilter < Filter
            def filter(term)
                term.downcase
            end
        end

        # Rejects (returns nil for) terms found in the shared stopword list.
        class StopwordFilter < Filter

            include SClust::Util::StopwordList

            filter = LowercaseFilter.new()

            @@stopwords = {}

            @@stopword_list.each { |term| @@stopwords[filter.apply(term)] = true }

            def filter(term)
                ( @@stopwords[term] ) ? nil : term
            end
        end

        # Strips leading and trailing whitespace from a term.
        class TrimWhitespace < Filter
            def filter(term)
                term.chomp.sub(/^\s*/, '').sub(/\s*$/, '')
            end
        end


        # Splits a document string into tokens on whitespace and punctuation.
        class TokenizerFilter < Filter
            def filter(document)
                document.split(/[\s,\.\t!\?\(\)\{\}\[\]\t\r\n";':]+/m)
            end
        end

        # Strips HTML markup, returning only the text content.
        class HTMLFilter < Filter
            def filter(doc)
                Nokogiri::HTML::DocumentFragment.parse(doc).text
            end
        end

        # A tokenizer that applies a few overall document filters.
        class DocumentTokenizer < TokenizerFilter
            def initialize()
                super()
                after(HTMLFilter.new())
            end
        end

        # Filters a document term
        class DocumentTermFilter < Filter

            def initialize()
                super()
                after(LowercaseFilter.new())
                after(StopwordFilter.new())
                after(TrimWhitespace.new())
                #after(StemFilter.new())
            end

            # Return nil if the term should be excluded. Otherwise the version of the term
            # that should be included is returned. Purely numeric terms are excluded.
            def filter(term)
                if ( term =~ /^[\d\.]+$/ )
                    nil
                else
                    term
                end
            end
        end

        # Pass-through filter; returns every term unchanged.
        class NullFilter < Filter
            def filter(term)
                term
            end
        end
    end
end
|
@@ -0,0 +1,96 @@
|
|
1
|
+
#
|
2
|
+
# The MIT License
|
3
|
+
#
|
4
|
+
# Copyright (c) 2010 Samuel R. Baskinger
|
5
|
+
#
|
6
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
7
|
+
# of this software and associated documentation files (the "Software"), to deal
|
8
|
+
# in the Software without restriction, including without limitation the rights
|
9
|
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
10
|
+
# copies of the Software, and to permit persons to whom the Software is
|
11
|
+
# furnished to do so, subject to the following conditions:
|
12
|
+
#
|
13
|
+
# The above copyright notice and this permission notice shall be included in
|
14
|
+
# all copies or substantial portions of the Software.
|
15
|
+
#
|
16
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
17
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
18
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
19
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
20
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
21
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
22
|
+
# THE SOFTWARE.
|
23
|
+
#
|
24
|
+
|
25
|
+
require 'rubygems'
|
26
|
+
require 'rss'
|
27
|
+
|
28
|
+
module SClust

    # NOTE: RSS collides with the module ::RSS, so we use the :: prefix when accessing the ::RSS module
    # that ships with Ruby. :)
    module RSS

        # Convert +rss+ — a URI string, a URI object, an RSS document string, a File,
        # or an already-parsed ::RSS::Element — into documents, calling +addNewDoc+
        # with (title, body, item) for every feed item found.
        # Relies on the globals $logger (a logger) and, for HTTP URIs, $wwwagent.
        def self.rss_to_documents(rss, &addNewDoc)

            $logger.debug("Operating on #{rss} of type #{rss.class}")

            # This block builds an RSS::Element (document).
            unless (rss.instance_of?(::RSS::Element))

                # Check if we have a URI string...
                if ( rss.instance_of?(String) )
                    begin
                        rss = URI.parse(rss)
                    rescue URI::InvalidURIError => e
                        # FIX: loggers (Log4r and stdlib Logger) respond to #warn,
                        # not #warning — the original raised NoMethodError here.
                        $logger.warn("Exception parsing URI: #{e.message}")
                    end
                end

                $logger.debug("Rss is now of type #{rss.class}.")

                # Parse it...
                if (rss.instance_of?(URI::HTTP))
                    begin
                        #rss = RSS::Parser::parse(Net::HTTP::get(rss), false)
                        rss = ::RSS::Parser::parse($wwwagent.get_file(rss), false)
                    # FIX: rescue StandardError, not Exception, so signals and exits
                    # still propagate untouched; the error is re-raised anyway.
                    rescue StandardError => e
                        $logger.error("Failed to retrieve URL #{rss}: #{e.message}")
                        # FIX: was "throw e" — throw expects a catch tag; raise re-raises.
                        raise
                    end
                elsif(rss.instance_of?(String))
                    rss = ::RSS::Parser::parse(rss, false)
                elsif(rss.is_a?(File))
                    rss = ::RSS::Parser::parse(rss, false)
                else
                    rss = nil
                end

                # FIX: was "throw Exception.new(...)" — invalid use of throw; raise is meant.
                raise Exception.new("RSS was not a URI string, a URI object, an RSS document, or an RSS document string: #{rss}") unless rss
            end

            unless ( rss.nil? || rss.items.nil? )

                $logger.debug("Adding #{rss.items.size} to document collection.")

                # Add this documents of this item to the document collection.
                rss.items.each do |item|

                    if ( item.instance_of?(::RSS::Rss::Channel::Item))

                        addNewDoc.call(item.title, item.description, item) if ( item.description )

                    elsif ( item.instance_of?(::RSS::RDF::Item) )

                        addNewDoc.call(item.title, item.content_encoded, item)

                    else

                        # Atom-style items expose title/content objects; take their text.
                        addNewDoc.call(item.title.content, item.content.content, item)

                    end
                end
            end
        end
    end

end
|