sclust 1.0.0 → 2.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,187 @@
1
+ #
2
+ # The MIT License
3
+ #
4
+ # Copyright (c) 2010 Samuel R. Baskinger
5
+ #
6
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
7
+ # of this software and associated documentation files (the "Software"), to deal
8
+ # in the Software without restriction, including without limitation the rights
9
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ # copies of the Software, and to permit persons to whom the Software is
11
+ # furnished to do so, subject to the following conditions:
12
+ #
13
+ # The above copyright notice and this permission notice shall be included in
14
+ # all copies or substantial portions of the Software.
15
+ #
16
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22
+ # THE SOFTWARE.
23
+ #
24
+
25
+ require 'rubygems'
26
+ require 'log4r'
27
+
28
+ require 'sclust/util/sparse_vector'
29
+
30
+ module SClust
31
+ module Util
32
+
33
+
34
# A collection of documents together with a collection-wide table of
# document frequencies (the number of documents each term occurs in).
class DocumentCollection

  # terms - a hash-like vector where the keys are the terms in the documents and
  # the values stored are the number of documents containing the term.
  attr_reader :terms

  # A list of documents.
  attr_reader :doclist

  # Log4r::Logger for this document collection.
  attr_reader :logger

  # Create an empty collection with its own logger.
  def initialize()
    @logger = Log4r::Logger.new(self.class.to_s)
    @logger.add('default')
    # NOTE(review): the argument is presumably the default value for missing keys - confirm in SparseVector.
    @terms = SClust::Util::SparseVector.new(0)
    @doclist = []
  end

  # Add a document to the collection and adjust the @terms attribute to store any new terms in the document.
  # The document is also added to the @doclist attribute. Documents that yield no terms are ignored.
  #
  # d - an object responding to each_term, yielding (term, frequency).
  #
  # Returns self so that calls can be chained.
  def <<(d)
    # Collapse repeated terms so each term counts once per document.
    seen_terms = {}
    d.each_term { |term, _frequency| seen_terms[term] = 1 }

    unless seen_terms.empty?
      seen_terms.each_key { |term| @terms[term] += 1 }
      @doclist << d
    end

    self
  end

  # The sum of the document frequencies of all terms divided by the number of documents.
  # If the documents only have 1-gram terms, this number will not exceed the number of
  # words per document. If, however, you enable 2-grams, 3-grams, etc in a document,
  # this value will not correlate perfectly with the word count.
  def average_terms_per_document()
    total = @terms.reduce(0.0) { |count, keyval_pair| count + keyval_pair[1] }
    total / @doclist.size
  end

  # Average number of words that make up a document. Words are not unique like terms are.
  # Two occurrences of the word "the" are a single term "the". One caveat is that
  # a "term" is typically a 1-gram, that is 1 word is 1 term. It is possible for a term
  # to be constructed of two or more words (a 2-gram, 3-gram, ... n-gram) in which case
  # this relationship will vary widely.
  def average_words_per_document()
    total = @doclist.reduce(0.0) { |count, doc| count + doc.words.size }
    total / @doclist.size
  end

  # Return the size of the document list.
  def document_count()
    @doclist.size
  end

  # Sum the word counts of all documents.
  def word_count()
    @doclist.reduce(0) { |count, doc| count + doc.words.size }
  end

  # Return the size of the term vector.
  def term_count()
    @terms.size
  end

  # Remove every term whose document frequency lies outside the given range.
  #
  # min_frequency - lower bound, as a fraction of the number of documents.
  # max_frequency - upper bound, as a fraction of the number of documents.
  #
  # Terms are removed from @terms and from the terms of every document.
  def drop_terms(min_frequency=0.10, max_frequency=0.80)

    min_docs = @doclist.length * min_frequency
    max_docs = @doclist.length * max_frequency

    @logger.info("Analyzing #{@terms.length} terms for removal.")
    @logger.info("Upper/lower boundary are #{max_frequency}/#{min_frequency}% document frequency or #{max_docs}/#{min_docs} documents.")

    # Two-phase delete: collect first, then delete, so @terms is never
    # modified while it is being iterated.
    remove_list = []

    @terms.each do |term, frequency|
      if frequency < min_docs || frequency > max_docs
        @logger.info("Removing term #{term} occuring in #{frequency} documents out of #{@doclist.length}")
        remove_list << term
      end
    end

    remove_list.each { |term| @terms.delete(term) }

    @logger.info("Removed #{remove_list.length} of #{@terms.length + remove_list.length} terms. Updating #{doclist.length} documents.")

    @doclist.each do |doc|
      remove_list.each do |term|
        doc.terms.delete(term)
      end
    end
  end

  # Natural log of (number of documents / number of documents containing term).
  #
  # Raises ZeroDivisionError when the term's stored document frequency is 0.
  def inverse_document_frequency(term)
    doc_frequency = @terms[term]

    raise ZeroDivisionError, "term #{term.inspect} occurs in no documents" if doc_frequency == 0

    # Float division: Integer / Integer would truncate the ratio before the log.
    Math.log(@doclist.length.to_f / doc_frequency)
  end

  alias idf inverse_document_frequency

  # Yield every term known to the collection.
  def each_term(&c)
    @terms.each_key { |k| yield k }
  end

  # Filter out terms that are not in the given range of document frequency.
  # If floats are passed, they are treated as a fraction of the total number of
  # documents in the collection. If integers are passed, they are treated like
  # document counts. Both bounds are exclusive: a term is removed when its
  # document frequency is <= min or >= max. Documents left without any terms are
  # removed from the collection.
  def filter_df(min=1, max=0.20)

    mindocs = min.is_a?(Integer) ? min : (min * @doclist.size)
    maxdocs = max.is_a?(Integer) ? max : (max * @doclist.size)

    @logger.info("Building term to delete list for range #{mindocs} - #{maxdocs}.")

    delete_list = []
    @terms.each { |term, freq| delete_list << term if freq <= mindocs || freq >= maxdocs }

    @logger.info("Identified #{delete_list.size} terms for removal.")

    # NOTE: We do a two-phase delete so we can delete from backing documents.
    delete_hash = {}

    delete_list.each do |term|
      @logger.debug { "Removing term #{term}." }
      @terms.delete(term)
      delete_hash[term] = 1
    end

    @logger.info("Updating documents.")

    # The position is computed outside the log block so the progress count
    # does not depend on whether the logger evaluates the block.
    @doclist.each_with_index do |doc, index|
      @logger.debug { "Processing document #{index + 1} / #{@doclist.size}" }

      doc.delete_term_if { |term| delete_hash.member?(term) }
    end

    @logger.info("Deleting documents that now have no terms left in them. #{@doclist.size} documents.")

    @doclist.delete_if { |doc| doc.terms.size == 0 }

    @logger.info("Document count now #{@doclist.size} documents.")

  end
end
186
+ end
187
+ end
@@ -0,0 +1,210 @@
1
+ #
2
+ # The MIT License
3
+ #
4
+ # Copyright (c) 2010 Samuel R. Baskinger
5
+ #
6
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
7
+ # of this software and associated documentation files (the "Software"), to deal
8
+ # in the Software without restriction, including without limitation the rights
9
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ # copies of the Software, and to permit persons to whom the Software is
11
+ # furnished to do so, subject to the following conditions:
12
+ #
13
+ # The above copyright notice and this permission notice shall be included in
14
+ # all copies or substantial portions of the Software.
15
+ #
16
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22
+ # THE SOFTWARE.
23
+ #
24
+
25
+ require 'rubygems'
26
+ require 'stemmer'
27
+ require 'sclust/util/stopwords'
28
+ require 'nokogiri'
29
+
30
+ module SClust
31
+ module Util
32
+
33
# Base class for term filters. A filter transforms a term (or rejects it by
# returning nil) and may have other filters chained before and after it.
# Subclasses must override #filter.
class Filter

  # A stemmed term that also remembers the word it was derived from.
  # Ordering and equality are based on the stemmed word only.
  class StemmedWord

    attr_accessor :original_word, :stemmed_word

    # stemmed_word  - the stemmed form; stored as given.
    # original_word - the unstemmed word; a copy of the string is stored.
    def initialize(stemmed_word, original_word)
      @stemmed_word = stemmed_word
      @original_word = String.new(original_word)
    end

    # Called by dup/clone. Gives the copy its own original_word string, as
    # #initialize does, so mutating one object's string does not leak into the other.
    def initialize_copy(s)
      super(s)

      source_word = s.original_word
      @original_word = String.new(source_word) if source_word.is_a?(String)
    end

    # The stemmed word.
    def to_s()
      @stemmed_word
    end

    # True if this stemmed word sorts before that of sw.
    def <(sw)
      @stemmed_word < sw.stemmed_word
    end

    # True if this stemmed word sorts after that of sw.
    def >(sw)
      @stemmed_word > sw.stemmed_word
    end

    # True if sw carries the same stemmed word. Objects that do not expose a
    # stemmed_word (e.g. plain strings, nil) are never equal.
    def ==(sw)
      sw.respond_to?(:stemmed_word) && @stemmed_word == sw.stemmed_word
    end

    # Compare by stemmed word. Returns nil when sw does not expose a stemmed_word.
    def <=>(sw)
      return nil unless sw.respond_to?(:stemmed_word)

      @stemmed_word <=> sw.stemmed_word
    end

    # Concatenate. nil returns self; a String is appended to both the stemmed
    # and the original word; another StemmedWord is appended field by field.
    def +(sw)
      if sw.nil?
        self
      elsif sw.is_a?(String)
        StemmedWord.new(@stemmed_word + sw, @original_word + sw)
      else
        StemmedWord.new(@stemmed_word + sw.stemmed_word, @original_word + sw.original_word)
      end
    end

  end

  # prev - optional filter to run before this one.
  def initialize(prev=nil)
    @previous_filters = prev ? [prev] : []
    @succeeding_filters = []
  end

  # Run term through the previous filters, this filter and the succeeding
  # filters, in that order. Only the #filter method of chained filters is
  # invoked; their own chains are not followed.
  #
  # Returns the filtered term, or nil as soon as any filter returns nil.
  # A nil/false term is returned unchanged without running any filter.
  def apply(term)

    if term
      catch(:filtered_term) do
        @previous_filters.each do |f|
          term = f.filter(term)
          throw :filtered_term if term.nil?
        end

        term = filter(term)
        throw :filtered_term if term.nil?

        @succeeding_filters.each do |f|
          term = f.filter(term)
          throw :filtered_term if term.nil?
        end
      end
    end

    term
  end

  # Register a filter to run before this one. Returns self.
  def after(filter)
    @previous_filters << filter
    self
  end

  # Register a filter to run after this one. Returns self.
  def before(filter)
    @succeeding_filters << filter
    self
  end

  # Transform a single term. Must be overridden by subclasses.
  #
  # Raises NotImplementedError (an Exception subclass) when not overridden.
  def filter(term)
    raise NotImplementedError, "Method \"filter\" must be overridden by child classes to implement the specific filter."
  end
end
120
+
121
# Like StemFilter, except that the result is a Filter::StemmedWord which
# carries both the stem and the term it was computed from.
class StemmedWordFilter < Filter
  # Stem term and wrap the stem together with the untouched term.
  def filter(term)
    stem = term.stem
    Filter::StemmedWord.new(stem, term)
  end
end
127
+
128
# Replaces a term by its stem, as computed by the term's own #stem method.
class StemFilter < Filter
  # Returns whatever term.stem returns.
  def filter(term)
    stemmed = term.stem
    stemmed
  end
end
133
+
134
# Converts a term to lower case.
class LowercaseFilter < Filter
  # Returns the downcased version of term.
  def filter(term)
    lowered = term.downcase
    lowered
  end
end
139
+
140
# Rejects terms that appear in the stopword list supplied by
# SClust::Util::StopwordList.
class StopwordFilter < Filter

  include SClust::Util::StopwordList

  # Lookup table of lower-cased stopwords, built once when the class is loaded.
  @@stopwords = {}

  normalizer = LowercaseFilter.new

  @@stopword_list.each do |word|
    @@stopwords[normalizer.apply(word)] = true
  end

  # Returns nil when term is a stopword, otherwise term itself.
  # The lookup is an exact match against the lower-cased list.
  def filter(term)
    return nil if @@stopwords[term]

    term
  end
end
154
+
155
# Removes leading and trailing whitespace from a term.
class TrimWhitespace < Filter
  # Returns a copy of term without leading or trailing whitespace.
  #
  # The patterns are anchored to the start (\A) and end (\z) of the whole
  # string. Line anchors (^ and $) would match at embedded newlines and leave
  # the real end of a multi-line term untrimmed.
  def filter(term)
    term.sub(/\A\s+/, '').sub(/\s+\z/, '')
  end
end
160
+
161
+
162
# Splits a document string into tokens.
class TokenizerFilter < Filter
  # Returns an array of the pieces of document found between runs of
  # whitespace and the punctuation characters listed in the pattern.
  def filter(document)
    separators = /[\s,\.\t!\?\(\)\{\}\[\]\t\r\n";':]+/m
    document.split(separators)
  end
end
167
+
168
# Reduces an HTML document to its text content using Nokogiri.
class HTMLFilter < Filter
  # Parses doc as an HTML fragment and returns the fragment's text.
  def filter(doc)
    fragment = Nokogiri::HTML::DocumentFragment.parse(doc)
    fragment.text
  end
end
173
+
174
# Tokenizer for whole documents: an HTMLFilter is registered to run before
# the tokenizing step inherited from TokenizerFilter.
class DocumentTokenizer < TokenizerFilter
  def initialize
    super()
    html_stripper = HTMLFilter.new
    after(html_stripper)
  end
end
181
+
182
# Filter applied to each individual document term. Lower-casing, stopword
# removal and whitespace trimming are registered to run before the numeric
# check implemented in #filter.
class DocumentTermFilter < Filter

  def initialize
    super()
    [LowercaseFilter, StopwordFilter, TrimWhitespace].each do |filter_class|
      after(filter_class.new)
    end
    #after(StemFilter.new())
  end

  # Terms made up solely of digits and dots are excluded (nil is returned);
  # any other term is returned as it was given.
  def filter(term)
    return nil if term =~ /^[\d\.]+$/

    term
  end
end
203
+
204
# A filter that changes nothing.
class NullFilter < Filter
  # Returns term exactly as given.
  def filter(term)
    unchanged = term
    unchanged
  end
end
209
+ end
210
+ end
@@ -0,0 +1,96 @@
1
+ #
2
+ # The MIT License
3
+ #
4
+ # Copyright (c) 2010 Samuel R. Baskinger
5
+ #
6
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
7
+ # of this software and associated documentation files (the "Software"), to deal
8
+ # in the Software without restriction, including without limitation the rights
9
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ # copies of the Software, and to permit persons to whom the Software is
11
+ # furnished to do so, subject to the following conditions:
12
+ #
13
+ # The above copyright notice and this permission notice shall be included in
14
+ # all copies or substantial portions of the Software.
15
+ #
16
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22
+ # THE SOFTWARE.
23
+ #
24
+
25
+ require 'rubygems'
26
+ require 'rss'
27
+
28
+ module SClust
29
+
30
+ # NOTE: RSS collides with the module ::RSS, so we use the :: prefix when accessing the ::RSS module
31
+ # that ships with Ruby. :)
32
+ module RSS
33
+ def self.rss_to_documents(rss, &addNewDoc)
34
+
35
+ $logger.debug("Operating on #{rss} of type #{rss.class}")
36
+
37
+ # This block builds an RSS::Element (document).
38
+ unless (rss.instance_of?(::RSS::Element))
39
+
40
+ # Check if we have a URI string...
41
+ if ( rss.instance_of?(String) )
42
+ begin
43
+ rss = URI.parse(rss)
44
+ rescue URI::InvalidURIError => e
45
+ $logger.warning("Exception parsing URI: #{e.message}")
46
+ end
47
+ end
48
+
49
+ $logger.debug("Rss is now of type #{rss.class}.")
50
+
51
+ # Parse it...
52
+ if (rss.instance_of?(URI::HTTP))
53
+ begin
54
+ #rss = RSS::Parser::parse(Net::HTTP::get(rss), false)
55
+ rss = ::RSS::Parser::parse($wwwagent.get_file(rss), false)
56
+ rescue Exception => e
57
+ $logger.error("Failed to retrieve URL #{rss}: #{e.message}")
58
+ throw e
59
+ end
60
+ elsif(rss.instance_of?(String))
61
+ rss = ::RSS::Parser::parse(rss, false)
62
+ elsif(rss.is_a?(File))
63
+ rss = ::RSS::Parser::parse(rss, false);
64
+ else
65
+ rss = nil
66
+ end
67
+
68
+ throw Exception.new("RSS was not a URI string, a URI object, an RSS document, or an RSS document string: #{rss}") unless rss
69
+ end
70
+
71
+ unless ( rss.nil? || rss.items.nil? )
72
+
73
+ $logger.debug("Adding #{rss.items.size} to document collection.")
74
+
75
+ # Add this documents of this item to the document collection.
76
+ rss.items.each do |item|
77
+
78
+ if ( item.instance_of?(::RSS::Rss::Channel::Item))
79
+
80
+ addNewDoc.call(item.title, item.description, item) if ( item.description )
81
+
82
+ elsif ( item.instance_of?(::RSS::RDF::Item) )
83
+
84
+ addNewDoc.call(item.title, item.content_encoded, item)
85
+
86
+ else
87
+
88
+ addNewDoc.call(item.title.content, item.content.content, item)
89
+
90
+ end
91
+ end
92
+ end
93
+ end
94
+ end
95
+
96
+ end