squish 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ === Squish 0.0.1
2
+ * initial release
3
+ * kinda neat
4
+ * needs work
5
+ * currently melt-your-brain slow
6
+ * may not be useful until it's rewritten entirely in C
7
+ * but it's functional and it has an easy-to-use API
data/README ADDED
@@ -0,0 +1,4 @@
1
+ Squish is a simple classification library that uses a modified Huffman
2
+ compression algorithm to classify resources into buckets. While it is
3
+ orders of magnitude slower than a naive Bayes classifier, it is potentially
4
+ more effective for certain types of data.
@@ -0,0 +1,444 @@
1
+ #--
2
+ # Squish, Copyright (c) 2006 Robert Aman
3
+ #
4
+ # Permission is hereby granted, free of charge, to any person obtaining
5
+ # a copy of this software and associated documentation files (the
6
+ # "Software"), to deal in the Software without restriction, including
7
+ # without limitation the rights to use, copy, modify, merge, publish,
8
+ # distribute, sublicense, and/or sell copies of the Software, and to
9
+ # permit persons to whom the Software is furnished to do so, subject to
10
+ # the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be
13
+ # included in all copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
19
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
20
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
21
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22
+ #++
23
+
24
class Array
  # Narrows a list of sets by one step, in place.
  #
  # Every element of the receiver is expected to be an Array. Each element is
  # replaced by its intersection (Array#&) with the element that originally
  # followed it; the final element has no successor and becomes an empty
  # array.
  #
  #   sets = [[1, 2, 3], [2, 3], [3]]
  #   sets.evaporate
  #   sets  # => [[2, 3], [3], []]
  #
  # Returns the empty array stored in the last slot, or nil when the receiver
  # is empty. An empty receiver is left untouched; it used to raise an
  # IndexError from the <tt>self[-1] = []</tt> assignment.
  def evaporate #:nodoc:
    return nil if empty?

    # Walk left to right so that at(position + 1) is still the original
    # neighbour when it is read.
    0.upto(size - 2) do |position|
      self[position] = at(position) & at(position + 1)
    end
    self[size - 1] = []
  end
end
34
+
35
module Squish
  # Ordered [pattern, replacement] pairs used by Squish.filter_value to
  # collapse look-alike ("1337") spellings. The order is significant: later
  # patterns rely on the output of earlier ones (for example "|" becomes "I"
  # before "I)" is turned into "D").
  VISUAL_REDUCTIONS = [
    [/~/, "-"],
    [/\|/, "I"],
    [/!/, "I"],
    [/1/, "I"],
    [/l/, "I"],
    [/\+/, "t"],
    [/3/, "e"],
    [/7/, "T"],
    [/@/, "a"],
    [/4/, "A"],
    [/8/, "B"],
    [/6/, "G"],
    [/\$/, "S"],
    [/0/, "O"],
    [/\(\)/, "O"],
    [/I\)/, "D"],
    [/\]\)/, "D"],
    [/\[\)/, "D"],
    [/I\*/, "P"],
    [/\]\*/, "P"],
    [/\*/, "a"],
    [/I2/, "R"],
    [/I=/, "F"],
    [/I\\I/, "N"],
    [/\`\//, "Y"],
    [/\/\\\/\\/, "M"],
    [/\\\/\\\//, "W"],
    [/I\\\/I/, "M"],
    [/IVI/i, "M"],
    [/VV/, "W"],
    [/\\X\//, "W"],
    [/\/\\\//, "N"],
    [/\\\/\\/, "N"],
    [/\/V\\/i, "M"],
    [/\/V/i, "N"],
    [/\\N/, "W"],
    [/\\\//, "V"],
    [/\>\</, "X"],
    [/I-I/, "H"],
    [/\]-\[/, "H"],
    [/\}\{/, "H"],
    [/I_I/, "U"],
    [/I\</, "K"],
    [/\]\</, "K"],
    [/\(/, "C"],
    [/\//, "I"],
    [/\\/, "I"]
  ].freeze

  # Classifies a document, based on an array of supplied buckets.
  #
  # +document+ is a Hash of fields or a String (treated as
  # <tt>{:body => string}</tt>); +buckets+ is an Array of Squish::Bucket
  # objects.
  #
  # Returns the name of the bucket that compresses the document best (lowest
  # ratio). The earliest bucket wins a tie. Returns nil when +buckets+ is
  # empty. The document itself is never modified.
  def self.classify(document, buckets)
    best_result = nil
    best_score = nil
    buckets.each do |bucket|
      score = bucket.compress(document)
      if best_score.nil? || score < best_score
        best_score = score
        best_result = bucket.name
      end
    end
    best_result
  end

  # Classifies a document, based on an array of supplied buckets.
  # The document is automatically added to every bucket whose name equals the
  # winning name after classification, which invalidates that bucket's cached
  # compression data.
  #
  # Returns the winning bucket name, exactly as Squish.classify does.
  def self.classify!(document, buckets)
    result = classify(document, buckets)
    buckets.each do |bucket|
      bucket << document if bucket.name == result
    end
    result
  end

  class Bucket
    # Longest byte sequence that is considered as a single symbol when the
    # bucket's code table is built.
    MAX_SYMBOL_LENGTH = 10

    # Returns the name of the bucket.
    attr_reader :name

    # Creates a new bucket with a given name.
    def initialize(name)
      @name = name
    end

    # Returns the list of documents contained within the bucket. Each
    # document is simply a Hash object.
    def documents
      @documents ||= []
    end

    # Magnets are Strings or Regexps which can be attached to a
    # bucket. They cause any incoming document that matches them to be very,
    # very strongly attracted to the bucket that they are attached to.
    # In essence, it makes the string that the magnet matches infinitely
    # compressible by that bucket.
    def magnets
      @magnets ||= []
    end

    # Adds a document to the bucket. The supplied document must be a Hash.
    # Suggested convention is to use a Hash such as this:
    #
    #   {
    #     :name => "Bob Aman",
    #     :email => "bob@sporkmonger.com",
    #     :body => <<-TEXT
    #       This is some example text from a hypothetical comment I left on
    #       someone's blog.
    #     TEXT
    #   }
    #
    # Supplying a String will convert the string to the form:
    #
    #   {
    #     :body => string
    #   }
    #
    # Automatically invalidates the previously calculated bucket data.
    # Returns the bucket's document list.
    def <<(document)
      invalidate
      document = {:body => document} if document.kind_of?(String)
      documents << document
      documents
    end

    # Returns the compression ratio for the given document with this bucket.
    # The lower this number is, the better the fit.
    #
    # Supplying a String will convert the string to the form:
    #
    #   {
    #     :body => string
    #   }
    #
    # The ratio is the size, in whole bytes, of the magnet-stripped and
    # filtered document once encoded with this bucket's code table, divided
    # by the Marshal size of the document as it was supplied. Text removed by
    # a magnet therefore costs nothing while still counting towards the
    # original size.
    #
    # The supplied document and its strings are never modified. Field values
    # that are not Strings (nil, numbers, ...) are ignored by the magnets and
    # stringified by the filter.
    #
    # Raises RuntimeError if the code table has no symbol for the next byte.
    # This cannot happen with the default table, which always covers all 256
    # byte values.
    def compress(document)
      document = {:body => document} if document.kind_of?(String)

      table = symbol_table
      longest_symbol = table.keys.map { |symbol| symbol.bytesize }.max || 0
      document_bytes =
        Marshal.dump(Squish.filter_document(magnetize(document)))

      # Greedy longest-match encoding: at each position take the longest
      # symbol the table knows about and account for the bits of its code.
      compressed_bits = 0
      position = 0
      total = document_bytes.bytesize
      while position < total
        length = [longest_symbol, total - position].min
        coding = nil
        while length > 0
          coding = table[document_bytes.byteslice(position, length)]
          break unless coding.nil?
          length -= 1
        end
        if coding.nil?
          raise "Bucket #{name.inspect} has no symbol for the byte at " +
            "offset #{position}"
        end
        compressed_bits += coding.size
        position += length
      end

      # The bit stream is packed eight bits to a byte; a partial trailing
      # byte still counts as a whole byte.
      compressed_bytes = (compressed_bits + 7) / 8
      compressed_bytes.to_f / Marshal.dump(document).bytesize.to_f
    end

    # Invalidates the bucket compression data. This method should be called
    # any time the bucket's list of documents changes. The << method calls
    # this method automatically.
    def invalidate
      @tree = nil
      @symbol_table = nil
    end

    protected

    # Returns a copy of the document in which everything matched by one of
    # the bucket's magnets has been removed from the String values. String
    # magnets match literally; Regexp magnets are used as they are; any other
    # magnet is ignored.
    def magnetize(document) #:nodoc:
      patterns = []
      magnets.each do |magnet|
        if magnet.kind_of?(String)
          patterns << Regexp.new(Regexp.escape(magnet))
        elsif magnet.kind_of?(Regexp)
          patterns << magnet
        end
      end

      magnetized = {}
      document.each do |key, value|
        if value.kind_of?(String)
          # Non-destructive gsub: the caller's strings must stay intact so
          # that other buckets see the same document.
          patterns.each { |pattern| value = value.gsub(pattern, "") }
        end
        magnetized[key] = value
      end
      magnetized
    end

    # Returns a list of documents that have been processed by the filter.
    def processed_documents #:nodoc:
      documents.map { |document| Squish.filter_document(document) }
    end

    # Returns the raw document list used for compression.
    def raw #:nodoc:
      Marshal.dump(processed_documents)
    end

    # Returns a hash table of symbols and their huffman codings. Each coding
    # is a String of "0" and "1" characters. The table is cached until
    # invalidate is called.
    def symbol_table #:nodoc:
      @symbol_table ||= begin
        table = {}
        build_table(tree, table)
        table
      end
    end

    # Tree traversal helper method.
    # Originally written by Aggelos Orfanakos
    #
    # Walks the code tree depth first, appending "0" for a left branch and
    # "1" for a right branch, and stores the accumulated bit string for every
    # leaf in +table+.
    def build_table(root, table, bitstream=[]) #:nodoc:
      return table if root.nil?

      if root.kind_of?(Squish::Internal)
        bitstream.push '0'
        build_table(root.left, table, bitstream)
        bitstream[-1] = '1'
        build_table(root.right, table, bitstream)
        bitstream.pop
      else
        table[root.data] = bitstream.join
      end
    end

    # Returns the huffman code tree for the bucket. This is used to test
    # whether a document is a good match for a bucket or not.
    def tree #:nodoc:
      @tree ||= begin
        # Adapted from code written by Aggelos Orfanakos
        forest = symbol_weights.map do |symbol, weight|
          Leaf.new(weight, symbol)
        end

        # Node#<=> is reversed, so this sorts heaviest first and the two
        # lightest nodes are always at the tail. Sort once; the insertion
        # below keeps the array ordered.
        forest.sort!
        while forest.length > 1
          a = forest.pop
          b = forest.pop
          new_node = Internal.new(a.weight + b.weight, a, b)
          # Insert right after the last node that is strictly heavier, or at
          # the front when there is none.
          index = forest.rindex { |node| node.weight > new_node.weight }
          forest.insert(index.nil? ? 0 : index + 1, new_node)
        end
        forest.first
      end
    end

    # Returns a hash of keys and values, where each key is a byte string that
    # occurs in the source, and each value is the number of positions at
    # which it occurs (overlapping occurrences are counted separately).
    #
    # Every single byte of the source is included. Sequences of 2 to
    # MAX_SYMBOL_LENGTH bytes are included only when they occur at least
    # twice. Keys are binary strings, ordered by length and then by first
    # occurrence. An empty source yields an empty hash.
    def symbol_occurrences(source=(self.raw + Squish.all_bytes)) #:nodoc:
      # Work on bytes regardless of the encoding the caller used, so the keys
      # can be looked up against Marshal output in compress.
      bytes = source.dup.force_encoding(Encoding::BINARY)
      total = bytes.bytesize

      occurrences = {}
      (1..MAX_SYMBOL_LENGTH).each do |length|
        counts = Hash.new(0)
        0.upto(total - length) do |position|
          counts[bytes.byteslice(position, length)] += 1
        end
        counts.each do |symbol, count|
          occurrences[symbol] = count if length == 1 || count > 1
        end
      end
      occurrences
    end

    # Returns a hash of keys and values, where each key is a string that has
    # occurred at least once in the source, and each value is a weighting
    # of occurrences multiplied by the length of the key.
    def symbol_weights(symbol_occurrences=self.symbol_occurrences) #:nodoc:
      weights = {}
      symbol_occurrences.each do |symbol, count|
        weights[symbol] = symbol.size * count
      end
      weights
    end
  end

  # Originally written by Aggelos Orfanakos
  #
  # Base class for code tree nodes. Comparison is deliberately reversed: a
  # heavier node sorts before a lighter one.
  class Node # :nodoc:
    include Comparable

    attr_reader :weight

    def initialize(weight)
      @weight = weight
    end

    def <=>(other)
      other.weight <=> @weight
    end
  end

  # Originally written by Aggelos Orfanakos
  #
  # A branch of the code tree with a left ("0") and a right ("1") child.
  class Internal < Node # :nodoc:
    attr_reader :left, :right

    def initialize(weight, left, right)
      super(weight)
      @left = left
      @right = right
    end
  end

  # Originally written by Aggelos Orfanakos
  #
  # A leaf of the code tree; +data+ is the symbol it encodes.
  class Leaf < Node # :nodoc:
    attr_reader :data

    def initialize(weight, data)
      super(weight)
      @data = data
    end
  end

  # Returns a string containing all possible bytes. This is appended to the
  # raw bucket dump to ensure that all bytes can be handled by the tree,
  # since incoming documents may contain bytes not previously encountered
  # within training data.
  def self.all_bytes #:nodoc:
    @all_bytes ||= (0...256).to_a.pack("C*")
  end

  # Filters an entire document (Hash). Returns a new Hash with the same keys
  # whose values have been passed through Squish.filter_value.
  def self.filter_document(document) #:nodoc:
    filtered_document = {}
    document.each_key do |key|
      filtered_document[key] = filter_value(document[key])
    end
    filtered_document
  end

  # Does a visual reduction of the characters contained within the value.
  # This prevents "1337" speak from degrading the effectiveness of the
  # algorithm in any way. This is intentionally a VERY lossy algorithm, and
  # isn't particularly efficient, but it works. The main advantage of this
  # algorithm is that while some information may be lost from legitimate
  # documents, more patterns will be revealed in illegitimate documents,
  # with ultimately more critical information revealed than is lost.
  #
  # The value is converted with to_s first, so nil becomes "". The argument
  # itself is not modified; a new lower-cased String is returned.
  def self.filter_value(value) #:nodoc:
    filtered_value = value.to_s.dup

    # Remove whitespace because spammers sometimes insert extraneous
    # whitespace, and the main algorithm shouldn't give false positives due
    # to a lack of whitespace, but it may give false positives due to extra
    # whitespace.
    filtered_value.gsub!(/\s/, "")

    VISUAL_REDUCTIONS.each do |pattern, replacement|
      filtered_value.gsub!(pattern, replacement)
    end
    filtered_value.downcase!

    filtered_value
  end
end
@@ -0,0 +1,9 @@
1
module Squish
  # Version number of the library, split into its components.
  module SQUISH_VERSION #:nodoc:
    MAJOR = 0
    MINOR = 0
    TINY  = 1

    # Dotted version string, e.g. "0.0.1". Frozen so that code holding a
    # reference to the constant cannot modify it in place.
    STRING = [MAJOR, MINOR, TINY].join('.').freeze
  end
end
@@ -0,0 +1,252 @@
1
+ require 'rubygems'
2
+ require 'rake'
3
+ require 'rake/testtask'
4
+ require 'rake/rdoctask'
5
+ require 'rake/packagetask'
6
+ require 'rake/gempackagetask'
7
+ require 'rake/contrib/rubyforgepublisher'
8
+ require 'spec/rake/spectask'
9
+
10
+ require File.join(File.dirname(__FILE__), 'lib/squish', 'version')
11
+
12
# Package identity. The gem name is the lower-cased display name and the
# version comes from lib/squish/version.rb.
PKG_DISPLAY_NAME = 'Squish'
PKG_NAME = PKG_DISPLAY_NAME.downcase
PKG_VERSION = Squish::SQUISH_VERSION::STRING
PKG_FILE_NAME = [PKG_NAME, PKG_VERSION].join('-')

RELEASE_NAME = "REL " + PKG_VERSION

# RubyForge publishing details.
RUBY_FORGE_PROJECT = PKG_NAME
RUBY_FORGE_USER = "sporkmonger"

# Texts that end up in the gem specification.
PKG_SUMMARY = "Resource classification library."
PKG_DESCRIPTION = <<-TEXT
Squish is a simple classification library that uses a modified Huffman
compression algorithm to classify resources into buckets. While it is
orders of magnitude slower than a naive Bayes classifier, it is potentially
more effective for certain types of data.
TEXT

# Everything that ships with the gem, minus version-control leftovers,
# editor backups and database configuration.
PKG_FILES = FileList[
  "lib/**/*", "spec/**/*", "doc/**/*", "vendor/**/*", "[A-Z]*", "rakefile"
].exclude(/\bCVS\b|~$/, /database\.yml/, /[_\.]svn$/)
33
+
34
require 'shellwords'

module Rake
  # Reports whether +command+ can be located with `which`.
  #
  # A command counts as available only when `which` exits successfully,
  # prints something, and does not print the "no <command> in ..." message
  # that some implementations emit while still exiting with status 0.
  def self.command_available?(command) #:nodoc:
    output = `which #{Shellwords.escape(command)} 2>&1`
    $?.success? && !output.strip.empty? &&
      output !~ /no #{Regexp.escape(command)}/
  end

  # Opens +filepath+ in a browser.
  #
  # * On Windows (mswin) the path is handed to the shell as-is.
  # * On Mac OS X (darwin) `open` is used.
  # * Elsewhere `gnome-open` is tried first; when it is missing or reports
  #   that it has no default action, the first installed browser out of
  #   firefox, mozilla, netscape, links and lynx is launched.
  #
  # The path is always passed as a single argument (or shell-escaped), so
  # paths containing spaces or shell metacharacters are opened correctly
  # instead of being split or interpreted by the shell.
  #
  # Prints a message when no way of opening the file could be found.
  def self.browse(filepath)
    if RUBY_PLATFORM =~ /mswin/
      system(filepath)
    else
      # Launches the first available browser; true when one was found.
      try_browsers = lambda do
        browser = %w(firefox mozilla netscape links lynx).find do |candidate|
          command_available?(candidate)
        end
        system(browser, filepath) unless browser.nil?
        !browser.nil?
      end

      opened = false
      if RUBY_PLATFORM =~ /darwin/
        opened = true
        system("open", filepath)
      elsif command_available?("gnome-open")
        output = `gnome-open #{Shellwords.escape(filepath)} 2>&1`
        if output =~ /There is no default action/
          opened = try_browsers.call
        else
          opened = true
        end
      else
        opened = try_browsers.call
      end
      puts "Don't know how to browse to location." if !opened
    end
  end
end
77
+
78
# Running plain `rake` runs the specs (with coverage).
task :default => ["spec:run"]

# Gem specification assembled from the package constants defined in this
# rakefile.
gem_spec = Gem::Specification.new do |spec|
  spec.name = PKG_NAME
  spec.version = PKG_VERSION
  spec.summary = PKG_SUMMARY
  spec.description = PKG_DESCRIPTION

  spec.files = PKG_FILES.to_a

  spec.has_rdoc = true
  spec.extra_rdoc_files = ['README']
  spec.rdoc_options.concat(['--main', 'README'])

  spec.add_dependency('rake', '>= 0.7.2')
  spec.add_dependency('rspec', '>= 0.7.1')

  spec.require_path = 'lib'

  spec.author = "Bob Aman"
  spec.email = "bob@sporkmonger.com"
  spec.homepage = "http://sporkmonger.com/"
  spec.rubyforge_project = "squish"
end

# Builds the .gem together with tar and zip archives.
Rake::GemPackageTask.new(gem_spec) do |package|
  package.gem_spec = gem_spec
  package.need_tar = true
  package.need_zip = true
end

# Generates the API documentation into doc/.
Rake::RDocTask.new do |rdoc|
  rdoc.rdoc_dir = 'doc'
  rdoc.title = "Squish -- simple resource classification"
  rdoc.options.push(
    '--line-numbers', '--inline-source',
    '--accessor', 'cattr_accessor=object'
  )
  rdoc.template = "#{ENV['template']}.rb" if ENV['template']
  rdoc.rdoc_files.include('README', 'CHANGELOG', 'TODO', 'LICENSE')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
118
+
119
namespace :rcov do
  desc 'Open the RCov code coverage report in a browser.'
  task :browse do
    report = File.expand_path(
      File.dirname(__FILE__) + '/coverage/index.html')
    # The report is a by-product of spec:run; generate it when it is missing.
    # File.exist? is used because File.exists? was removed in Ruby 3.2.
    Rake::Task["spec:run"].invoke unless File.exist?(report)
    Rake.browse(report)
  end
end
130
+
131
namespace :spec do
  desc "Run all the specs"
  Spec::Rake::SpecTask.new(:run) do |t|
    t.spec_files = FileList['spec/**/*_spec.rb']
    t.spec_opts = ['--color']
    t.rcov = true
    t.rcov_opts = [
      # Don't include the actual spec files in the coverage report
      '--exclude', '"spec\/.*"'
    ]
  end

  desc "Run all the specs without generating a coverage report"
  Spec::Rake::SpecTask.new(:run_without_rcov) do |t|
    t.spec_files = FileList['spec/**/*_spec.rb']
    t.spec_opts = ['--color']
  end

  # desc "Start up autotest for RSpec"
  # task :autospec do
  #   require 'autotest'
  #   require 'autotest/growl'
  #   require 'autotest/redgreen'
  #   require 'vendor/autospec/lib/autospec'
  #   Autospec.run
  # end

  desc "Print Specdoc for all specs"
  Spec::Rake::SpecTask.new(:doc) do |t|
    t.spec_files = FileList[
      'spec/**/*_spec.rb'
    ]
    t.spec_opts = ["--format", "specdoc"]
  end

  desc "Generate HTML Specdocs for all specs"
  Spec::Rake::SpecTask.new(:html) do |t|
    doc_dir = File.expand_path(File.dirname(__FILE__) + '/doc/')
    specs_dir = File.expand_path(File.dirname(__FILE__) + '/doc/specs/')

    # Make sure the output folders exist before the report is written.
    # File.exist? is used because File.exists? was removed in Ruby 3.2.
    unless File.exist?(doc_dir)
      puts "Creating doc folder..."
      Dir.mkdir(doc_dir)
    end
    unless File.exist?(specs_dir)
      puts "Creating specs folder..."
      Dir.mkdir(specs_dir)
    end

    t.spec_files = FileList['spec/**/*_spec.rb']
    t.spec_opts = ["--format", "html"]
    t.out = File.expand_path(
      File.dirname(__FILE__) + '/doc/specs/index.html')
  end

  desc 'Open the RSpec HTML specifications in a browser.'
  task :browse => [ "spec:html" ] do
    Rake.browse(File.expand_path(
      File.dirname(__FILE__) + '/doc/specs/index.html'))
  end
end
191
+
192
namespace :publish do
  desc "Publish the coverage report"
  task :coverage => [ "spec:run" ] do
    Rake::SshDirPublisher.new(
      "sporkmonger@sporkmonger.com",
      "projects/squish/coverage/",
      "coverage/"
    ).upload
  end

  desc "Publish the specifications"
  task :specs => [ "spec:html" ] do
    Rake::SshDirPublisher.new(
      "sporkmonger@sporkmonger.com",
      "projects/squish/specs/",
      "doc/specs/"
    ).upload
  end

  desc "Publish the API documentation"
  task :api => [ "rdoc" ] do
    specs_dir = File.expand_path(File.dirname(__FILE__) + '/doc/specs/')
    # File.exist? is used because File.exists? was removed in Ruby 3.2.
    unless File.exist?(specs_dir)
      puts "Creating specs folder..."
      Dir.mkdir(specs_dir)
    end

    Rake::SshDirPublisher.new(
      "sporkmonger@sporkmonger.com",
      "projects/squish/api/",
      "doc/"
    ).upload
  end

  desc "Runs all of the publishing tasks"
  task :all => ["publish:coverage", "publish:api", "publish:specs"] do
  end
end
230
+
231
# Prints, for every Ruby file under lib/, the total number of lines and the
# number of code lines (lines that are neither blank nor comment-only),
# followed by the grand totals.
task :lines do
  total_lines, total_codelines = 0, 0

  FileList["lib/**/*.rb"].each do |file_name|
    lines, codelines = 0, 0

    # File.foreach closes the file when it is done; the previous File.open
    # call never closed its handle.
    File.foreach(file_name) do |line|
      lines += 1
      next if line =~ /^\s*$/
      next if line =~ /^\s*#/
      codelines += 1
    end
    puts "L: #{sprintf("%4d", lines)}, LOC #{sprintf("%4d", codelines)} | #{file_name}"

    total_lines += lines
    total_codelines += codelines
  end

  puts "Total: Lines #{total_lines}, LOC #{total_codelines}"
end
@@ -0,0 +1,241 @@
1
+ #--
2
+ # Squish, Copyright (c) 2006 Robert Aman
3
+ #
4
+ # Permission is hereby granted, free of charge, to any person obtaining
5
+ # a copy of this software and associated documentation files (the
6
+ # "Software"), to deal in the Software without restriction, including
7
+ # without limitation the rights to use, copy, modify, merge, publish,
8
+ # distribute, sublicense, and/or sell copies of the Software, and to
9
+ # permit persons to whom the Software is furnished to do so, subject to
10
+ # the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be
13
+ # included in all copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
19
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
20
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
21
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22
+ #++
23
+
24
+ $:.unshift(File.expand_path(File.dirname(__FILE__) + '/../../lib'))
25
+ $:.uniq!
26
+
27
+ require 'squish'
28
+
29
# Training fixtures. The buckets are shared by every context below and are
# extended by the classification specs as they run.

lorem_text = <<-TEXT
Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod
tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,
quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo
consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse
cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non
proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
TEXT
lorem_bucket = Squish::Bucket.new("lorem")
lorem_bucket << { :body => lorem_text }

artprom_text = <<-TEXT
I much have powerfully interested your resource. As you see on that be
changed reference http://www.artpromcompany.com
TEXT
casino_text = <<-TEXT
The eye has some latin bingo. Some interest has one new issue. Select craps
is some rolling Revolution. One confident Revolution poked thanks to a
middle-class side. Lesser keno is one socialist table. It’s social to be
overhung! It’s violent to be sat! The eldest poker misread some keno
conductively.
TEXT
forex_text = <<-TEXT
It’s valid to be lent! The exchange is untactfully legal. One war has a
capitalist forex investment. Some currency has this wonderful forex.
Goodness, one integral forex rates sociably input excepting that religious
forex investment. I mean, one face is less unemployed than a medical girl.
TEXT
spam_bucket = Squish::Bucket.new("spam")
spam_bucket << {
  :name => "Disabled",
  :url => "http://www.artpromcompany.com/",
  :body => artprom_text
}
spam_bucket << {
  :name => "gamble in poker casinos",
  :url => "http://www.see-the-dealer.com/",
  :body => casino_text
}
spam_bucket << {
  :name => "currency rates",
  :url => "http://www.allinforex.com/",
  :body => forex_text
}
spam_bucket.magnets << /viagra/i

ninja_text = <<-TEXT
Hey Bob,

This article sounds alot like you’re encouraging rather than discouraging
strict OOP. Not what I heard from you last time we spoke...

Dont bother constructing an elaborate counter-argument... I’m too ignorant
to understand what’s going on on this site anyway, and probabaly wouldnt
understand what you’re talking about :)

– Reda

p.s – how’s it going?
TEXT
valid_bucket = Squish::Bucket.new("valid")
valid_bucket << {
  :name => "ninja",
  :url => nil,
  :body => ninja_text
}
valid_bucket.magnets << "Bob"

context "An empty bucket" do
  setup do
    @empty = Squish::Bucket.new("empty")
  end

  specify "should have the correct name" do
    @empty.name.should == "empty"
  end

  specify "should not have any documents" do
    @empty.documents.should.be.empty
  end

  specify "should not have any magnets" do
    @empty.magnets.should.be.empty
  end
end

context "A bucket containing 'lorem ipsum' text" do
  setup do
    @lorem = lorem_bucket
  end

  specify "should have the correct name" do
    @lorem.name.should == "lorem"
  end

  specify "should have at least one 'lorem ipsum' document" do
    @lorem.documents.should.not.be.empty
    @lorem.documents.size.should >= 1
  end

  specify "should not have any magnets" do
    @lorem.magnets.should.be.empty
  end
end

context "A bucket containing several spammy documents and a viagra magnet" do
  setup do
    @spam = spam_bucket
  end

  specify "should have the correct name" do
    @spam.name.should == "spam"
  end

  specify "should have multiple documents" do
    @spam.documents.should.not.be.empty
    @spam.documents.size.should >= 3
  end

  specify "should only have one magnet" do
    @spam.magnets.should.not.be.empty
    @spam.magnets.size.should == 1
  end
end

context "With an array of several buckets, Squish" do
  setup do
    @buckets = [lorem_bucket, spam_bucket, valid_bucket]
  end

  specify "should correctly classify valid documents" do
    [
      "Hi Bob, what have you been up to lately? Anything interesting?",
      "Bob, I figured out what was wrong with your ruby program."
    ].each do |text|
      Squish.classify!(text, @buckets).should == "valid"
    end
  end

  specify "should correctly classify spam documents" do
    # Give the bucket a little help here, since it doesn't have enough
    # training data.
    [
      "Check currancy rates online!",
      "Online poker casino!",
      "Penis enlargement!",
      "Cheap online pharmacy sells viagra and cialis!",
      "Amazing mortgage rates! Buy your home for less!",
      "Viagra! Cialis!"
    ].each do |hint|
      spam_bucket << hint
    end

    [
      "Invest money on the foreign exchange!",
      "Play bingo and poker online! Make money!",
      "Enlarge your penis for cheap! She will fall in love with you again!",
      "\\/|agr@!",
      "V / a G r A",
      "Buy viagra and cialis!"
    ].each do |text|
      Squish.classify!(text, @buckets).should == "spam"
    end
  end

  specify "should correctly classify 'lorem ipsum' documents" do
    long_lorem = %{
Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Fusce
tincidunt augue a augue. Ut nunc. Fusce porta, sem a luctus mattis,
dolor dui gravida diam, a eleifend augue nibh eget nibh. Duis eu
justo. In viverra enim a turpis. Nullam eros. Nullam vestibulum
nunc vel nisi. Vestibulum ante ipsum primis in faucibus orci luctus
et ultrices posuere cubilia Curae; Integer feugiat lorem ut dolor.
Cras eget nulla. Donec velit pede, posuere vel, iaculis quis, commodo
sit amet, diam. Praesent pharetra velit ac enim. Donec porta tortor
congue nunc. Duis eu enim sit amet nulla tincidunt bibendum.
Donec mollis.
}

    [
      long_lorem,
      "Lorem ipsum dolor sit amet.",
      "Lorem ipsum, you scallywag! Vestibulum ante ipsum and such!",
      "Lorem ipsum, you scallywag!"
    ].each do |text|
      Squish.classify!(text, @buckets).should == "lorem"
    end
  end
end
metadata ADDED
@@ -0,0 +1,72 @@
1
+ --- !ruby/object:Gem::Specification
2
+ rubygems_version: 0.9.0
3
+ specification_version: 1
4
+ name: squish
5
+ version: !ruby/object:Gem::Version
6
+ version: 0.0.1
7
+ date: 2007-03-30 00:00:00 -04:00
8
+ summary: Resource classification library.
9
+ require_paths:
10
+ - lib
11
+ email: bob@sporkmonger.com
12
+ homepage: http://sporkmonger.com/
13
+ rubyforge_project: squish
14
+ description: Squish is a simple classification library that uses a modified Huffman compression algorithm to classify resources into buckets. While it is orders of magnitude slower than a naive Bayes classifier, it is potentially more effective for certain types of data.
15
+ autorequire:
16
+ default_executable:
17
+ bindir: bin
18
+ has_rdoc: true
19
+ required_ruby_version: !ruby/object:Gem::Version::Requirement
20
+ requirements:
21
+ - - ">"
22
+ - !ruby/object:Gem::Version
23
+ version: 0.0.0
24
+ version:
25
+ platform: ruby
26
+ signing_key:
27
+ cert_chain:
28
+ post_install_message:
29
+ authors:
30
+ - Bob Aman
31
+ files:
32
+ - lib/squish
33
+ - lib/squish.rb
34
+ - lib/squish/version.rb
35
+ - spec/squish
36
+ - spec/squish/squish_spec.rb
37
+ - doc/specs
38
+ - CHANGELOG
39
+ - README
40
+ - rakefile
41
+ test_files: []
42
+
43
+ rdoc_options:
44
+ - --main
45
+ - README
46
+ extra_rdoc_files:
47
+ - README
48
+ executables: []
49
+
50
+ extensions: []
51
+
52
+ requirements: []
53
+
54
+ dependencies:
55
+ - !ruby/object:Gem::Dependency
56
+ name: rake
57
+ version_requirement:
58
+ version_requirements: !ruby/object:Gem::Version::Requirement
59
+ requirements:
60
+ - - ">="
61
+ - !ruby/object:Gem::Version
62
+ version: 0.7.2
63
+ version:
64
+ - !ruby/object:Gem::Dependency
65
+ name: rspec
66
+ version_requirement:
67
+ version_requirements: !ruby/object:Gem::Version::Requirement
68
+ requirements:
69
+ - - ">="
70
+ - !ruby/object:Gem::Version
71
+ version: 0.7.1
72
+ version: