sclust 1.0.0 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,187 @@
1
+ #
2
+ # The MIT License
3
+ #
4
+ # Copyright (c) 2010 Samuel R. Baskinger
5
+ #
6
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
7
+ # of this software and associated documentation files (the "Software"), to deal
8
+ # in the Software without restriction, including without limitation the rights
9
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ # copies of the Software, and to permit persons to whom the Software is
11
+ # furnished to do so, subject to the following conditions:
12
+ #
13
+ # The above copyright notice and this permission notice shall be included in
14
+ # all copies or substantial portions of the Software.
15
+ #
16
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22
+ # THE SOFTWARE.
23
+ #
24
+
25
+ require 'rubygems'
26
+ require 'log4r'
27
+
28
+ require 'sclust/util/sparse_vector'
29
+
30
+ module SClust
31
+ module Util
32
+
33
+
34
# A list of documents together with the document frequency of every term that
# appears in them (how many documents contain each term).
#
# Documents are expected to respond to each_term (yielding term, frequency),
# words, terms and delete_term_if, as used by the methods below.
class DocumentCollection

  # terms - a hash-like vector whose keys are the terms in the documents and whose
  # values are the number of documents containing the term.
  attr_reader :terms

  # A list of documents
  attr_reader :doclist

  # Log4r::Logger for this document collection.
  attr_reader :logger

  def initialize
    @logger = Log4r::Logger.new(self.class.to_s)
    @logger.add('default')
    @terms = SClust::Util::SparseVector.new(0)
    @doclist = []
  end

  # Add a document to the collection and adjust the @terms attribute to store any new terms in the document.
  # The document is also added to the @doclist attribute. Documents that yield no terms are ignored.
  # Returns self so calls can be chained.
  def <<(d)
    # Collect each distinct term once so a document bumps a term's document frequency by one at most.
    seen_terms = {}
    d.each_term { |term, _frequency| seen_terms[term] = 1 }

    unless seen_terms.empty?
      seen_terms.each_key { |term| @terms[term] += 1 }
      @doclist << d
    end

    self
  end

  # The sum of the term document frequencies divided by the number of documents. If the documents
  # only have 1-gram terms, then this number will always be less than the number of words per
  # document. If, however, you enable 2-grams, 3-grams, etc. in a document, this value will not
  # correlate perfectly with the word count.
  #
  # Returns 0.0 for an empty collection (instead of NaN).
  def average_terms_per_document
    return 0.0 if @doclist.empty?

    @terms.reduce(0.0) { |total, (_term, doc_frequency)| total + doc_frequency } / @doclist.size
  end

  # Average number of words that make up a document. Words are not unique like terms are.
  # Two occurrences of the word "the" are a single term "the". One caveat is that
  # a "term" is typically a 1-gram, that is 1 word is 1 term. It is possible for a term to be constructed
  # of two or more words (a 2-gram, 3-gram, ... n-gram) in which case this relationship will vary
  # widely.
  #
  # Returns 0.0 for an empty collection (instead of NaN).
  def average_words_per_document
    return 0.0 if @doclist.empty?

    @doclist.reduce(0.0) { |total, doc| total + doc.words.size } / @doclist.size
  end

  # Return the size of the document list.
  def document_count
    @doclist.size
  end

  # Sum of the word counts of all documents.
  def word_count
    @doclist.reduce(0) { |total, doc| total + doc.words.size }
  end

  # Return the size of the term vector
  def term_count
    @terms.size
  end

  # Remove every term whose document frequency, as a fraction of the number of documents,
  # is below min_frequency or above max_frequency. The terms are removed from @terms and
  # from the terms of every document. Returns the document list.
  def drop_terms(min_frequency=0.10, max_frequency=0.80)
    min_docs = @doclist.length * min_frequency
    max_docs = @doclist.length * max_frequency

    @logger.info("Analyzing #{@terms.length} terms for removal.")
    @logger.info("Upper/lower boundary are #{max_frequency}/#{min_frequency}% document frequency or #{max_docs}/#{min_docs} documents.")

    # Two-phase delete (as in filter_df): decide first, delete afterwards, so @terms is
    # never modified while it is being iterated.
    doomed = []
    @terms.each do |term, frequency|
      doomed << [term, frequency] if frequency < min_docs || frequency > max_docs
    end

    doomed.each do |term, frequency|
      @logger.info("Removing term #{term} occuring in #{frequency} documents out of #{@doclist.length}")
      @terms.delete(term)
    end

    @logger.info("Removed #{doomed.length} of #{@terms.length + doomed.length} terms. Updating #{doclist.length} documents.")

    @doclist.each do |doc|
      doomed.each { |term, _frequency| doc.terms.delete(term) }
    end
  end

  # Natural log of (number of documents / number of documents containing term).
  # The ratio is computed in floating point; integer division would truncate it
  # (e.g. 3 documents / 2 occurrences would become 1, giving an IDF of 0).
  #
  # Raises ZeroDivisionError when the term occurs in no document.
  def inverse_document_frequency(term)
    doc_frequency = @terms[term]

    if doc_frequency == 0
      raise ZeroDivisionError, "term #{term.inspect} does not occur in any document"
    end

    Math.log(@doclist.length.fdiv(doc_frequency))
  end

  alias idf inverse_document_frequency

  # Yield every term in the collection. Without a block an Enumerator is returned.
  def each_term
    return enum_for(:each_term) unless block_given?

    @terms.each_key { |term| yield term }
  end

  # Filter out terms that are not in the given range
  # of document frequency as expressed as a percentage of the total
  # number of documents in the collection. If floats are passed, then they are treated as
  # percentages. If integers are passed, they are treated like document counts.
  # Both boundaries are exclusive: a term whose frequency equals min or max is removed.
  # Documents left without any terms are removed from the collection.
  def filter_df(min=1, max=0.20)
    mindocs = min.is_a?(Integer) ? min : (min * @doclist.size)
    maxdocs = max.is_a?(Integer) ? max : (max * @doclist.size)

    @logger.info("Building term to delete list for range #{mindocs} - #{maxdocs}.")

    delete_list = []
    @terms.each do |term, freq|
      delete_list << term if freq <= mindocs || freq >= maxdocs
    end

    @logger.info("Identified #{delete_list.size} terms for removal.")

    # NOTE: We do a two-phase delete so we can delete from backing documents.
    delete_hash = {}
    delete_list.each do |term|
      @logger.debug { "Removing term #{term}." }
      @terms.delete(term)
      delete_hash[term] = 1
    end

    @logger.info("Updating documents.")

    i = 0
    @doclist.each do |doc|
      @logger.debug { "Processing document #{i += 1} / #{@doclist.size}" }

      doc.delete_term_if { |term| delete_hash.member?(term) }
    end

    @logger.info("Deleting documents that now have no terms left in them. #{@doclist.size} documents.")

    @doclist.delete_if { |doc| doc.terms.size == 0 }

    @logger.info("Document count now #{@doclist.size} documents.")
  end
end
186
+ end
187
+ end
@@ -0,0 +1,210 @@
1
+ #
2
+ # The MIT License
3
+ #
4
+ # Copyright (c) 2010 Samuel R. Baskinger
5
+ #
6
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
7
+ # of this software and associated documentation files (the "Software"), to deal
8
+ # in the Software without restriction, including without limitation the rights
9
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ # copies of the Software, and to permit persons to whom the Software is
11
+ # furnished to do so, subject to the following conditions:
12
+ #
13
+ # The above copyright notice and this permission notice shall be included in
14
+ # all copies or substantial portions of the Software.
15
+ #
16
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22
+ # THE SOFTWARE.
23
+ #
24
+
25
+ require 'rubygems'
26
+ require 'stemmer'
27
+ require 'sclust/util/stopwords'
28
+ require 'nokogiri'
29
+
30
+ module SClust
31
+ module Util
32
+
33
# Base class for term filters.
#
# A filter turns a term into another term, or returns nil to reject it. Filters can be
# chained: filters registered with #after run before this filter's own #filter, filters
# registered with #before run after it. Subclasses implement #filter.
class Filter

  # A stemmed word that also remembers the word it was stemmed from.
  # Comparison and equality are based on the stemmed form only.
  class StemmedWord

    attr_accessor :original_word, :stemmed_word

    # stemmed_word - the stemmed form.
    # original_word - the unstemmed word; a private copy of it is stored.
    def initialize(stemmed_word, original_word)
      @stemmed_word = stemmed_word
      @original_word = String.new(original_word)
    end

    # Called by dup/clone. The previous check compared a Class with a String and
    # could never be true; the intended type check is used instead.
    def initialize_copy(s)
      super(s)

      @original_word = s.original_word if s.is_a?(StemmedWord)
    end

    # The stemmed form.
    def to_s
      @stemmed_word
    end

    # True when this word's stem sorts before sw's stem.
    def <(sw)
      @stemmed_word < sw.stemmed_word
    end

    # True when this word's stem sorts after sw's stem.
    # (This method used to be declared as a second "<", which replaced the real "<".)
    def >(sw)
      @stemmed_word > sw.stemmed_word
    end

    # True when sw has the same stem. Objects without a stemmed_word are never equal.
    def ==(sw)
      sw.respond_to?(:stemmed_word) && @stemmed_word == sw.stemmed_word
    end

    def <=>(sw)
      @stemmed_word <=> sw.stemmed_word
    end

    # Concatenate. nil returns self; a String is appended to both the stemmed and the
    # original form; anything else is treated as a StemmedWord and appended part by part.
    def +(sw)
      return self if sw.nil?

      if sw.is_a?(String)
        StemmedWord.new(@stemmed_word + sw, @original_word + sw)
      else
        StemmedWord.new(@stemmed_word + sw.stemmed_word, @original_word + sw.original_word)
      end
    end

  end

  # prev - optional filter to run before this one.
  def initialize(prev=nil)
    @previous_filters = prev ? [prev] : []
    @succeeding_filters = []
  end

  # Run term through the preceding filters, this filter, and the succeeding filters, in
  # that order. Only each filter's #filter method is called (not its own chain).
  # Returns the filtered term, or nil as soon as any filter rejects the term.
  # A nil/false term is returned unchanged without running any filter.
  def apply(term)
    return term unless term

    (@previous_filters + [self] + @succeeding_filters).each do |f|
      term = f.filter(term)
      return nil if term.nil?
    end

    term
  end

  # Register a filter that runs before this one, i.e. this filter runs after it. Returns self.
  def after(filter)
    @previous_filters << filter
    self
  end

  # Register a filter that runs after this one, i.e. this filter runs before it. Returns self.
  def before(filter)
    @succeeding_filters << filter
    self
  end

  # The filter step itself. Must be overridden by subclasses.
  # NotImplementedError is still an Exception, so existing "rescue Exception" callers keep working.
  def filter(term)
    raise NotImplementedError, "Method \"filter\" must be overridden by child classes to implement the specific filter."
  end
end
120
+
121
# Like StemFilter, but instead of returning the bare stem it returns a
# Filter::StemmedWord holding both the stem and the term it came from.
class StemmedWordFilter < Filter
  def filter(term)
    stem = term.stem
    Filter::StemmedWord.new(stem, term)
  end
end
127
+
128
# Replaces a term with the result of calling #stem on it.
class StemFilter < Filter
  def filter(term)
    stem = term.stem
    stem
  end
end
133
+
134
# Replaces a term with its lowercase form (term.downcase).
class LowercaseFilter < Filter
  def filter(term)
    lowered = term.downcase
    lowered
  end
end
139
+
140
# Rejects (returns nil for) any term found in the stopword table; every other
# term passes through unchanged.
class StopwordFilter < Filter

  include SClust::Util::StopwordList

  # Used once, at class load time, to lowercase the stopword list entries.
  normalizer = LowercaseFilter.new

  # Lookup table: lowercased stopword => true.
  @@stopwords = {}

  @@stopword_list.each do |word|
    @@stopwords[normalizer.apply(word)] = true
  end

  def filter(term)
    return nil if @@stopwords[term]

    term
  end
end
154
+
155
# Removes a trailing line terminator, then leading and trailing whitespace, from a term.
class TrimWhitespace < Filter
  def filter(term)
    without_newline = term.chomp
    left_trimmed = without_newline.sub(/^\s*/, '')
    left_trimmed.sub(/\s*$/, '')
  end
end
160
+
161
+
162
# Splits a document string into an array of tokens on runs of whitespace and
# common punctuation characters.
class TokenizerFilter < Filter
  def filter(document)
    delimiters = /[\s,\.\t!\?\(\)\{\}\[\]\t\r\n";':]+/m
    document.split(delimiters)
  end
end
167
+
168
# Parses the input as an HTML fragment with Nokogiri and returns its text content.
class HTMLFilter < Filter
  def filter(doc)
    fragment = Nokogiri::HTML::DocumentFragment.parse(doc)
    fragment.text
  end
end
173
+
174
# A tokenizer for whole documents: an HTMLFilter is registered to run on the
# document before it is tokenized.
class DocumentTokenizer < TokenizerFilter
  def initialize
    super()
    html_stripper = HTMLFilter.new
    after(html_stripper)
  end
end
181
+
182
# Filters a single document term. Lowercasing, stopword removal and whitespace
# trimming are registered (in that order) to run before this filter's own check.
class DocumentTermFilter < Filter

  def initialize
    super()
    [LowercaseFilter, StopwordFilter, TrimWhitespace].each do |filter_class|
      after(filter_class.new)
    end
    # Stemming is intentionally not enabled here: after(StemFilter.new())
  end

  # Returns nil (exclude the term) when the term consists only of digits and dots.
  # Otherwise the term is returned as the version that should be included.
  def filter(term)
    return nil if term =~ /^[\d\.]+$/

    term
  end
end
203
+
204
# A filter that changes nothing: every term is returned as given.
class NullFilter < Filter
  def filter(term)
    unchanged = term
    unchanged
  end
end
209
+ end
210
+ end
@@ -0,0 +1,96 @@
1
+ #
2
+ # The MIT License
3
+ #
4
+ # Copyright (c) 2010 Samuel R. Baskinger
5
+ #
6
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
7
+ # of this software and associated documentation files (the "Software"), to deal
8
+ # in the Software without restriction, including without limitation the rights
9
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ # copies of the Software, and to permit persons to whom the Software is
11
+ # furnished to do so, subject to the following conditions:
12
+ #
13
+ # The above copyright notice and this permission notice shall be included in
14
+ # all copies or substantial portions of the Software.
15
+ #
16
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22
+ # THE SOFTWARE.
23
+ #
24
+
25
+ require 'rubygems'
26
+ require 'rss'
27
+
28
+ module SClust
29
+
30
+ # NOTE: RSS collides with the module ::RSS, so we use the :: prefix when accessing the ::RSS module
31
+ # that ships with Ruby. :)
32
+ module RSS
33
# Turn an RSS source into documents by calling the given block once per feed item.
#
# rss - one of: an ::RSS::Element (already parsed feed), a URI string, a URI::HTTP
#       (including HTTPS) object, an RSS document string, or a File.
# addNewDoc - block called as (title, body, item) for every item.
#
# HTTP sources are fetched through the global $wwwagent (get_file); progress and
# problems are reported through the global $logger.
#
# Raises ArgumentError when rss is none of the supported source kinds. Errors raised
# while fetching or parsing a URL are logged and re-raised unchanged.
def self.rss_to_documents(rss, &addNewDoc)

  # Kept for the error message below; rss itself is reassigned while it is resolved.
  source = rss

  $logger.debug("Operating on #{rss} of type #{rss.class}")

  # This block builds an RSS::Element (document).
  # is_a? (not instance_of?) so parsed feeds, which are Element subclasses, are accepted.
  unless rss.is_a?(::RSS::Element)

    # Check if we have a URI string...
    if rss.instance_of?(String)
      begin
        rss = URI.parse(rss)
      rescue URI::InvalidURIError => e
        # Not a URI; it is left as a String and parsed as an RSS document below.
        $logger.warn("Exception parsing URI: #{e.message}")
      end
    end

    $logger.debug("Rss is now of type #{rss.class}.")

    # Parse it...
    if rss.is_a?(URI::HTTP) # also matches URI::HTTPS, a subclass of URI::HTTP
      begin
        rss = ::RSS::Parser::parse($wwwagent.get_file(rss), false)
      rescue StandardError => e
        $logger.error("Failed to retrieve URL #{rss}: #{e.message}")
        raise # re-raise the original error ("throw" would raise UncaughtThrowError instead)
      end
    elsif rss.instance_of?(String)
      rss = ::RSS::Parser::parse(rss, false)
    elsif rss.is_a?(File)
      rss = ::RSS::Parser::parse(rss, false)
    else
      rss = nil
    end

    unless rss
      raise ArgumentError, "RSS was not a URI string, a URI object, an RSS document, or an RSS document string: #{source}"
    end
  end

  unless rss.nil? || rss.items.nil?

    $logger.debug("Adding #{rss.items.size} to document collection.")

    # Add the documents of this feed's items to the document collection.
    rss.items.each do |item|

      if item.instance_of?(::RSS::Rss::Channel::Item)
        # Items without a description are skipped.
        addNewDoc.call(item.title, item.description, item) if item.description
      elsif item.instance_of?(::RSS::RDF::Item)
        addNewDoc.call(item.title, item.content_encoded, item)
      else
        # Any other item type is expected to wrap title and content in objects
        # that respond to #content.
        addNewDoc.call(item.title.content, item.content.content, item)
      end
    end
  end
end
94
+ end
95
+
96
+ end