squish 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ === Squish 0.0.1
2
+ * initial release
3
+ * kinda neat
4
+ * needs work
5
+ * currently melt-your-brain slow
6
+ * may not be useful until it's rewritten entirely in C
7
+ * but it's functional and it has an easy-to-use API
data/README ADDED
@@ -0,0 +1,4 @@
1
+ Squish is a simple classification library that uses a modified Huffman
2
+ compression algorithm to classify resources into buckets. While it is
3
+ orders of magnitude slower than a naive Bayes classifier, it is potentially
4
+ more effective for certain types of data.
@@ -0,0 +1,444 @@
1
+ #--
2
+ # Squish, Copyright (c) 2006 Robert Aman
3
+ #
4
+ # Permission is hereby granted, free of charge, to any person obtaining
5
+ # a copy of this software and associated documentation files (the
6
+ # "Software"), to deal in the Software without restriction, including
7
+ # without limitation the rights to use, copy, modify, merge, publish,
8
+ # distribute, sublicense, and/or sell copies of the Software, and to
9
+ # permit persons to whom the Software is furnished to do so, subject to
10
+ # the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be
13
+ # included in all copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
19
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
20
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
21
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22
+ #++
23
+
24
class Array
  # Replaces every element, in place, with the intersection (Array#&) of
  # itself and its successor, then replaces the last element with an empty
  # array. Every element is expected to be an Array.
  #
  # Returns the empty array stored in the final slot. An empty receiver is
  # left untouched and a new empty array is returned (it used to raise
  # IndexError).
  def evaporate #:nodoc:
    return [] if empty?

    # Ascending order matters: slot i is rewritten from the still-original
    # slot i + 1.
    0.upto(size - 2) do |position|
      self[position] = at(position) & at(position + 1)
    end
    self[size - 1] = []
  end
end
34
+
35
+ module Squish
36
# Classifies a document, based on an array of supplied buckets.
#
# Every bucket is asked to compress the document; the name of the bucket
# with the lowest score is returned. When several buckets share the lowest
# score the earliest one wins. Returns nil when +buckets+ is empty.
def self.classify(document, buckets)
  best_result = nil
  best_score = nil
  buckets.each do |bucket|
    score = bucket.compress(document)
    if best_score.nil? || score < best_score
      best_score = score
      best_result = bucket.name
    end
  end
  best_result
end
49
+
50
# Classifies a document, based on an array of supplied buckets.
# The document is automatically added to the bucket after classification.
#
# Every bucket whose name equals the classification result receives the
# document. Returns the winning bucket name (nil when +buckets+ is empty,
# in which case nothing is added).
def self.classify!(document, buckets)
  result = self.classify(document, buckets)
  buckets.each do |bucket|
    bucket << document if bucket.name == result
  end
  result
end
59
+
60
class Bucket
  # Returns the name of the bucket.
  attr_reader :name

  # Creates a new bucket with a given name.
  def initialize(name)
    @name = name
  end

  # Returns the list of documents contained within the bucket. Each
  # document is simply a Hash object.
  def documents
    @documents ||= []
  end

  # Magnets are Strings or Regexps which can be attached to a
  # bucket. They cause any incoming document that matches them to be very,
  # very strongly attracted to the bucket that they are attached to.
  # In essence, it makes the string that the magnet matches infinitely
  # compressible by that bucket.
  def magnets
    @magnets ||= []
  end

  # Adds a document to the bucket. The supplied document must be a Hash.
  # Suggested convention is to use a Hash such as this:
  #
  #   {
  #     :name => "Bob Aman",
  #     :email => "bob@sporkmonger.com",
  #     :body => <<-TEXT
  #       This is some example text from a hypothetical comment I left on
  #       someone's blog.
  #     TEXT
  #   }
  #
  # Supplying a String will convert the string to the form:
  #
  #   {
  #     :body => string
  #   }
  #
  # Automatically invalidates the previously calculated bucket data.
  # Returns the bucket's document list.
  def <<(document)
    invalidate
    document = {:body => document} if document.kind_of?(String)
    documents << document
    documents
  end

  # Returns the compression ratio for the given document with this bucket.
  # The lower this number is, the better the fit.
  #
  # Supplying a String will convert the string to the form:
  #
  #   {
  #     :body => string
  #   }
  #
  # The ratio is the number of bytes needed for the encoded, magnetized and
  # filtered document divided by the size of the Marshal dump of the
  # document as supplied. The supplied document is never modified.
  #
  # Raises RuntimeError if no symbol of the bucket matches the remaining
  # document bytes (instead of looping forever).
  def compress(document)
    document = {:body => document} if document.kind_of?(String)

    document_bytes = Marshal.dump(
      Squish.filter_document(magnetize(document)))

    # Longest symbols first, so the encoder is greedy.
    sorted_symbol_table =
      symbol_table.sort { |a, b| b[0].size <=> a[0].size }

    compressed_bit_count = 0
    until document_bytes.empty?
      # Only a symbol at the very start of the remaining bytes may be
      # consumed; exactly one occurrence is removed per step.
      symbol, coding = sorted_symbol_table.find { |candidate, _coding|
        document_bytes.start_with?(candidate)
      }
      if symbol.nil?
        raise "no symbol matches the remaining document bytes"
      end
      document_bytes = document_bytes[symbol.size..-1]
      compressed_bit_count += coding.size
    end

    # The bit string is packed eight bits per byte, the last byte possibly
    # being partial.
    compressed_byte_count = (compressed_bit_count + 7) / 8
    compressed_byte_count.to_f / Marshal.dump(document).size.to_f
  end

  # Invalidates the bucket compression data. This method should be called
  # any time the bucket's list of documents changes. The << method calls
  # this method automatically.
  def invalidate
    @tree = nil
    @symbol_table = nil
  end

  protected

  # Returns a copy of the document in which everything matched by the
  # bucket's magnets has been removed. Values are converted with to_s
  # (the filter applied afterwards does the same), so non-String values
  # are tolerated. Magnets that are neither String nor Regexp are ignored.
  def magnetize(document) #:nodoc:
    patterns = magnets.map do |magnet|
      case magnet
      when String then Regexp.new(Regexp.escape(magnet))
      when Regexp then magnet
      end
    end.compact

    magnetized_document = {}
    document.each do |key, value|
      text = value.to_s
      # gsub (not gsub!) so the caller's strings stay intact.
      patterns.each { |pattern| text = text.gsub(pattern, "") }
      magnetized_document[key] = text
    end
    magnetized_document
  end

  # Returns a list of documents that have been processed by the filter.
  def processed_documents #:nodoc:
    documents.map { |document| Squish.filter_document(document) }
  end

  # Returns the raw document list used for compression.
  def raw #:nodoc:
    Marshal.dump(processed_documents)
  end

  # Returns a hash table of symbols and their huffman codings. The table
  # is cached until invalidate is called.
  def symbol_table #:nodoc:
    @symbol_table ||= begin
      table = {}
      build_table(tree, table)
      table
    end
  end

  # Tree traversal helper method. Fills +table+ with one entry per leaf,
  # mapping the leaf's data to its bit string ('0' for each left branch,
  # '1' for each right branch taken from the root).
  # Originally written by Aggelos Orfanakos
  def build_table(root, table, bitstream=[]) #:nodoc:
    if root.kind_of?(Squish::Internal)
      bitstream.push '0'
      build_table(root.left, table, bitstream)
      bitstream[-1] = '1'
      build_table(root.right, table, bitstream)
      bitstream.pop
    else
      table[root.data] = bitstream.join
    end
  end

  # Returns the huffman code tree for the bucket. This is used to test
  # whether a document is a good match for a bucket or not. The tree is
  # cached until invalidate is called.
  def tree #:nodoc:
    @tree ||= begin
      # Adapted from code written by Aggelos Orfanakos
      forest = symbol_weights.map do |symbol, weight|
        Squish::Leaf.new(weight, symbol)
      end

      # Sort once (nodes compare by descending weight, so the lightest
      # nodes sit at the tail); the array is kept sorted after this.
      forest.sort!
      while forest.length > 1
        a = forest.pop
        b = forest.pop
        new_node = Squish::Internal.new(a.weight + b.weight, a, b)
        # Insert right behind the last node that is strictly heavier, or
        # at the front when there is none.
        index = forest.rindex { |node| node.weight > new_node.weight }
        if index
          forest.insert(index + 1, new_node)
        else
          forest.unshift(new_node)
        end
      end
      forest.first
    end
  end

  # Returns a hash of keys and values, where each key is a string that has
  # occurred at least once in the source, and each value is the number of
  # times its corresponding key has appeared.
  #
  # Every single byte is recorded; substrings of 2 to 10 bytes are only
  # recorded when the same substring starts again at a later position.
  def symbol_occurrences(source=(self.raw + Squish.all_bytes)) #:nodoc:
    # This method can't help being a performance bottleneck. Anything
    # that can be done to improve it performance-wise will be much
    # appreciated.
    symbol_occurrences = {}

    # Byte value => every position at which that byte occurs.
    char_positions = Hash.new { |h, k| h[k] = [] }
    source.unpack('C*').each_with_index do |ch, pos|
      char_positions[ch] << pos
    end

    # offsets[position] lists the distances to every later position that
    # holds the same byte.
    offsets = Array.new(source.length)
    char_positions.each do |char, positions|
      positions.each do |position|
        later_positions = positions.select do |other_position|
          other_position > position
        end
        offsets[position] =
          later_positions.map { |other_position| other_position - position }
      end
    end

    (1..10).each do |length|
      offsets.each_with_index do |offset_list, position|
        if length == 1 || !offset_list.empty?
          symbol = source[position, length]
          symbol_occurrences[symbol] ||= []
          symbol_occurrences[symbol] << position
          symbol_occurrences[symbol].concat(
            offset_list.map { |offset| position + offset }
          )
        end
      end
      # Keep only the distances at which the next byte repeats as well,
      # i.e. at which the substring one byte longer repeats.
      offsets.evaporate
    end

    # Collapse each list of start positions into a count of distinct ones.
    symbol_occurrences.keys.each do |key|
      symbol_occurrences[key] = symbol_occurrences[key].uniq.size
    end

    symbol_occurrences
  end

  # Returns a hash of keys and values, where each key is a string that has
  # occurred at least once in the source, and each value is a weighting
  # of occurrences multiplied by the length of the key.
  def symbol_weights(symbol_occurrences=self.symbol_occurrences) #:nodoc:
    symbol_weights = {}
    symbol_occurrences.each do |key, count|
      symbol_weights[key] = key.size * count
    end
    symbol_weights
  end
end
315
+
316
# Originally written by Aggelos Orfanakos
#
# Base class of the code tree nodes; carries the node's weight.
class Node # :nodoc:
  include Comparable

  attr_reader :weight

  def initialize(weight)
    @weight = weight
  end

  # Compares by weight in reverse, so that sorting puts the heaviest node
  # first. Returns nil when +other+ has no weight, which lets comparisons
  # such as <tt>node == nil</tt> answer false instead of raising
  # NoMethodError.
  def <=>(other)
    return nil unless other.respond_to?(:weight)
    other.weight <=> @weight
  end
end
330
+
331
# Originally written by Aggelos Orfanakos
#
# A branch node of the code tree: a weight plus a left and a right child.
class Internal < Node # :nodoc:
  attr_reader :left
  attr_reader :right

  def initialize(weight, left, right)
    super(weight)
    @left, @right = left, right
  end
end
341
+
342
# Originally written by Aggelos Orfanakos
#
# A terminal node of the code tree: a weight plus the data it stands for.
class Leaf < Node # :nodoc:
  attr_reader :data

  def initialize(weight, data)
    super weight
    @data = data
  end
end
351
+
352
# Returns a string containing all possible bytes. This is appended to the
# raw bucket dump to ensure that all bytes can be handled by the tree,
# since incoming documents may contain bytes not previously encountered
# within training data.
#
# The 256-byte string (byte values 0 through 255, ascending) is built
# once and cached.
def self.all_bytes #:nodoc:
  @all_bytes ||= (0...256).to_a.pack("C*")
end
366
+
367
# Filters an entire document (Hash).
#
# Returns a new Hash with the same keys in which every value has been
# passed through filter_value. The supplied document is left untouched.
def self.filter_document(document) #:nodoc:
  filtered_document = {}
  document.keys.each do |key|
    filtered_document[key] = filter_value(document[key])
  end
  filtered_document
end
375
+
376
# Ordered list of [pattern, replacement] pairs applied by filter_value.
# The order is significant: later patterns rely on earlier replacements
# (for instance "|" is turned into "I" before /I-I/ is folded into "H").
VALUE_FILTER_SUBSTITUTIONS = [
  # Remove whitespace because spammers sometimes insert extraneous
  # whitespace, and the main algorithm shouldn't give false positives due
  # to a lack of whitespace, but it may give false positives due to extra
  # whitespace.
  [/\s/, ""],

  # Single look-alike characters.
  [/~/, "-"],
  [/\|/, "I"],
  [/!/, "I"],
  [/1/, "I"],
  [/l/, "I"],
  [/\+/, "t"],
  [/3/, "e"],
  [/7/, "T"],
  [/@/, "a"],
  [/4/, "A"],
  [/8/, "B"],
  [/6/, "G"],
  [/\$/, "S"],
  [/0/, "O"],

  # Letters drawn with several characters.
  [/\(\)/, "O"],
  [/I\)/, "D"],
  [/\]\)/, "D"],
  [/\[\)/, "D"],
  [/I\*/, "P"],
  [/\]\*/, "P"],
  [/\*/, "a"],
  [/I2/, "R"],
  [/I=/, "F"],
  [/I\\I/, "N"],
  [/\`\//, "Y"],
  [/\/\\\/\\/, "M"],
  [/\\\/\\\//, "W"],
  [/I\\\/I/, "M"],
  [/IVI/i, "M"],
  [/VV/, "W"],
  [/\\X\//, "W"],
  [/\/\\\//, "N"],
  [/\\\/\\/, "N"],
  [/\/V\\/i, "M"],
  [/\/V/i, "N"],
  [/\\N/, "W"],
  [/\\\//, "V"],
  [/\>\</, "X"],
  [/I-I/, "H"],
  [/\]-\[/, "H"],
  [/\}\{/, "H"],
  [/I_I/, "U"],
  [/I\</, "K"],
  [/\]\</, "K"],

  # Whatever punctuation is left over.
  [/\(/, "C"],
  [/\//, "I"],
  [/\\/, "I"]
].freeze

# Does a visual reduction of the characters contained within the value.
# This prevents "1337" speak from degrading the effectiveness of the
# algorithm in any way. This is intentionally a VERY lossy algorithm, and
# isn't particularly efficient, but it works. The main advantage of this
# algorithm is that while some information may be lost from legitimate
# documents, more patterns will be revealed in illegitimate documents,
# with ultimately more critical information revealed than is lost.
#
# The value is converted with to_s and copied first, so the argument is
# never modified. Returns the reduced, lower-cased String.
def self.filter_value(value) #:nodoc:
  filtered_value = value.to_s.dup
  VALUE_FILTER_SUBSTITUTIONS.each do |pattern, replacement|
    filtered_value.gsub!(pattern, replacement)
  end
  filtered_value.downcase!
  filtered_value
end
444
+ end
@@ -0,0 +1,9 @@
1
module Squish
  # Version of the library, as separate components and as a dotted string.
  module SQUISH_VERSION #:nodoc:
    MAJOR = 0
    MINOR = 0
    TINY  = 1

    # Frozen so the shared constant cannot be modified in place.
    STRING = [MAJOR, MINOR, TINY].join('.').freeze
  end
end
@@ -0,0 +1,252 @@
1
+ require 'rubygems'
2
+ require 'rake'
3
+ require 'rake/testtask'
4
+ require 'rake/rdoctask'
5
+ require 'rake/packagetask'
6
+ require 'rake/gempackagetask'
7
+ require 'rake/contrib/rubyforgepublisher'
8
+ require 'spec/rake/spectask'
9
+
10
+ require File.join(File.dirname(__FILE__), 'lib/squish', 'version')
11
+
12
+ PKG_DISPLAY_NAME = 'Squish'
13
+ PKG_NAME = PKG_DISPLAY_NAME.downcase
14
+ PKG_VERSION = Squish::SQUISH_VERSION::STRING
15
+ PKG_FILE_NAME = "#{PKG_NAME}-#{PKG_VERSION}"
16
+
17
+ RELEASE_NAME = "REL #{PKG_VERSION}"
18
+
19
+ RUBY_FORGE_PROJECT = PKG_NAME
20
+ RUBY_FORGE_USER = "sporkmonger"
21
+
22
+ PKG_SUMMARY = "Resource classification library."
23
+ PKG_DESCRIPTION = <<-TEXT
24
+ Squish is a simple classification library that uses a modified Huffman
25
+ compression algorithm to classify resources into buckets. While it is
26
+ orders of magnitude slower than a naive Bayes classifier, it is potentially
27
+ more effective for certain types of data.
28
+ TEXT
29
+
30
+ PKG_FILES = FileList[
31
+ "lib/**/*", "spec/**/*", "doc/**/*", "vendor/**/*", "[A-Z]*", "rakefile"
32
+ ].exclude(/\bCVS\b|~$/).exclude(/database\.yml/).exclude(/[_\.]svn$/)
33
+
34
require 'shellwords'

module Rake
  # Opens +filepath+ in a browser.
  #
  # On Windows the path is handed to the shell directly, on Mac OS X to
  # +open+. Elsewhere +gnome-open+ is tried first, then the first of
  # firefox, mozilla, netscape, links and lynx that +which+ can find.
  # Prints a message when nothing suitable was found.
  #
  # Commands are looked up through the exit status of +which+ rather than
  # its output, and the path is passed as a separate argument (or shell
  # escaped) so that paths containing spaces or shell metacharacters work.
  def self.browse(filepath)
    if RUBY_PLATFORM =~ /mswin/
      system(filepath)
    else
      command_available = lambda do |command|
        system("which #{command} > /dev/null 2>&1")
      end
      try_browsers = lambda do
        browser = %w(firefox mozilla netscape links lynx).find do |candidate|
          command_available.call(candidate)
        end
        system(browser, filepath) if browser
        !browser.nil?
      end
      opened = false
      if RUBY_PLATFORM =~ /darwin/
        opened = true
        system("open", filepath)
      elsif command_available.call("gnome-open")
        output = `gnome-open #{Shellwords.escape(filepath)} 2>&1`
        # gnome-open reports files it has no handler for on its output.
        if output =~ /There is no default action/
          opened = try_browsers.call
        else
          opened = true
        end
      else
        opened = try_browsers.call
      end
      puts "Don't know how to browse to location." unless opened
    end
  end
end
77
+
78
+ task :default => [ "spec:run" ]
79
+
80
+ gem_spec = Gem::Specification.new do |s|
81
+ s.name = PKG_NAME
82
+ s.version = PKG_VERSION
83
+ s.summary = PKG_SUMMARY
84
+ s.description = PKG_DESCRIPTION
85
+
86
+ s.files = PKG_FILES.to_a
87
+
88
+ s.has_rdoc = true
89
+ s.extra_rdoc_files = %w( README )
90
+ s.rdoc_options.concat ['--main', 'README']
91
+
92
+ s.add_dependency('rake', '>= 0.7.2')
93
+ s.add_dependency('rspec', '>= 0.7.1')
94
+
95
+ s.require_path = 'lib'
96
+
97
+ s.author = "Bob Aman"
98
+ s.email = "bob@sporkmonger.com"
99
+ s.homepage = "http://sporkmonger.com/"
100
+ s.rubyforge_project = "squish"
101
+ end
102
+
103
+ Rake::GemPackageTask.new(gem_spec) do |p|
104
+ p.gem_spec = gem_spec
105
+ p.need_tar = true
106
+ p.need_zip = true
107
+ end
108
+
109
+ Rake::RDocTask.new do |rdoc|
110
+ rdoc.rdoc_dir = 'doc'
111
+ rdoc.title = "Squish -- simple resource classification"
112
+ rdoc.options << '--line-numbers' << '--inline-source' <<
113
+ '--accessor' << 'cattr_accessor=object'
114
+ rdoc.template = "#{ENV['template']}.rb" if ENV['template']
115
+ rdoc.rdoc_files.include('README', 'CHANGELOG', 'TODO', 'LICENSE')
116
+ rdoc.rdoc_files.include('lib/**/*.rb')
117
+ end
118
+
119
namespace :rcov do
  desc 'Open the RCov code coverage report in a browser.'
  task :browse do
    report = File.expand_path(
      File.dirname(__FILE__) + '/coverage/index.html')
    # Generate the report first when it does not exist yet.
    Rake::Task["spec:run"].invoke unless File.exist?(report)
    Rake.browse(report)
  end
end
130
+
131
namespace :spec do
  desc "Run all the specs"
  Spec::Rake::SpecTask.new(:run) do |t|
    t.spec_files = FileList['spec/**/*_spec.rb']
    t.spec_opts = ['--color']
    t.rcov = true
    t.rcov_opts = [
      # Don't include the actual spec files in the coverage report
      '--exclude', '"spec\/.*"'
    ]
  end

  desc "Run all the specs without RCov"
  Spec::Rake::SpecTask.new(:run_without_rcov) do |t|
    t.spec_files = FileList['spec/**/*_spec.rb']
    t.spec_opts = ['--color']
  end

  # desc "Start up autotest for RSpec"
  # task :autospec do
  #   require 'autotest'
  #   require 'autotest/growl'
  #   require 'autotest/redgreen'
  #   require 'vendor/autospec/lib/autospec'
  #   Autospec.run
  # end

  desc "Print Specdoc for all specs"
  Spec::Rake::SpecTask.new(:doc) do |t|
    t.spec_files = FileList[
      'spec/**/*_spec.rb'
    ]
    t.spec_opts = ["--format", "specdoc"]
  end

  desc "Generate HTML Specdocs for all specs"
  Spec::Rake::SpecTask.new(:html) do |t|
    # Make sure the output folders exist, parent first.
    [
      ['/doc/', "Creating doc folder..."],
      ['/doc/specs/', "Creating specs folder..."]
    ].each do |relative_path, message|
      folder = File.expand_path(File.dirname(__FILE__) + relative_path)
      unless File.exist?(folder)
        puts message
        Dir.mkdir(folder)
      end
    end

    t.spec_files = FileList['spec/**/*_spec.rb']
    t.spec_opts = ["--format", "html"]
    t.out = File.expand_path(
      File.dirname(__FILE__) + '/doc/specs/index.html')
  end

  desc 'Open the RSpec HTML specifications in a browser.'
  task :browse => [ "spec:html" ] do
    Rake.browse(File.expand_path(
      File.dirname(__FILE__) + '/doc/specs/index.html'))
  end
end
191
+
192
namespace :publish do
  desc "Publish the coverage report"
  task :coverage => [ "spec:run" ] do
    Rake::SshDirPublisher.new(
      "sporkmonger@sporkmonger.com",
      "projects/squish/coverage/",
      "coverage/"
    ).upload
  end

  desc "Publish the specifications"
  task :specs => [ "spec:html" ] do
    Rake::SshDirPublisher.new(
      "sporkmonger@sporkmonger.com",
      "projects/squish/specs/",
      "doc/specs/"
    ).upload
  end

  desc "Publish the API documentation"
  task :api => [ "rdoc" ] do
    # Create the specs folder inside doc/ when it is missing.
    specs_dir = File.expand_path(File.dirname(__FILE__) + '/doc/specs/')
    unless File.exist?(specs_dir)
      puts "Creating specs folder..."
      Dir.mkdir(specs_dir)
    end

    Rake::SshDirPublisher.new(
      "sporkmonger@sporkmonger.com",
      "projects/squish/api/",
      "doc/"
    ).upload
  end

  desc "Runs all of the publishing tasks"
  task :all => ["publish:coverage", "publish:api", "publish:specs"] do
  end
end
230
+
231
# Prints, for every Ruby file under lib/, its number of lines and its
# number of code lines (lines that are neither blank nor comments),
# followed by the totals.
task :lines do
  total_lines = 0
  total_codelines = 0

  FileList["lib/**/*.rb"].each do |file_name|
    lines = 0
    codelines = 0

    # Block form closes the file once it has been read.
    File.open(file_name) do |f|
      while line = f.gets
        lines += 1
        next if line =~ /^\s*$/
        next if line =~ /^\s*#/
        codelines += 1
      end
    end
    puts "L: #{sprintf("%4d", lines)}, LOC #{sprintf("%4d", codelines)} | #{file_name}"

    total_lines += lines
    total_codelines += codelines
  end

  puts "Total: Lines #{total_lines}, LOC #{total_codelines}"
end
@@ -0,0 +1,241 @@
1
+ #--
2
+ # Squish, Copyright (c) 2006 Robert Aman
3
+ #
4
+ # Permission is hereby granted, free of charge, to any person obtaining
5
+ # a copy of this software and associated documentation files (the
6
+ # "Software"), to deal in the Software without restriction, including
7
+ # without limitation the rights to use, copy, modify, merge, publish,
8
+ # distribute, sublicense, and/or sell copies of the Software, and to
9
+ # permit persons to whom the Software is furnished to do so, subject to
10
+ # the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be
13
+ # included in all copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
19
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
20
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
21
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22
+ #++
23
+
24
+ $:.unshift(File.expand_path(File.dirname(__FILE__) + '/../../lib'))
25
+ $:.uniq!
26
+
27
+ require 'squish'
28
+
29
+ lorem_bucket = Squish::Bucket.new("lorem")
30
+ lorem_bucket << {
31
+ :body => <<-TEXT
32
+ Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod
33
+ tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,
34
+ quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo
35
+ consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse
36
+ cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non
37
+ proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
38
+ TEXT
39
+ }
40
+
41
+ spam_bucket = Squish::Bucket.new("spam")
42
+ spam_bucket << {
43
+ :name => "Disabled",
44
+ :url => "http://www.artpromcompany.com/",
45
+ :body => <<-TEXT
46
+ I much have powerfully interested your resource. As you see on that be
47
+ changed reference http://www.artpromcompany.com
48
+ TEXT
49
+ }
50
+ spam_bucket << {
51
+ :name => "gamble in poker casinos",
52
+ :url => "http://www.see-the-dealer.com/",
53
+ :body => <<-TEXT
54
+ The eye has some latin bingo. Some interest has one new issue. Select craps
55
+ is some rolling Revolution. One confident Revolution poked thanks to a
56
+ middle-class side. Lesser keno is one socialist table. It’s social to be
57
+ overhung! It’s violent to be sat! The eldest poker misread some keno
58
+ conductively.
59
+ TEXT
60
+ }
61
+ spam_bucket << {
62
+ :name => "currency rates",
63
+ :url => "http://www.allinforex.com/",
64
+ :body => <<-TEXT
65
+ It’s valid to be lent! The exchange is untactfully legal. One war has a
66
+ capitalist forex investment. Some currency has this wonderful forex.
67
+ Goodness, one integral forex rates sociably input excepting that religious
68
+ forex investment. I mean, one face is less unemployed than a medical girl.
69
+ TEXT
70
+ }
71
+ spam_bucket.magnets << /viagra/i
72
+
73
+ valid_bucket = Squish::Bucket.new("valid")
74
+ valid_bucket << {
75
+ :name => "ninja",
76
+ :url => nil,
77
+ :body => <<-TEXT
78
+ Hey Bob,
79
+
80
+ This article sounds alot like you’re encouraging rather than discouraging
81
+ strict OOP. Not what I heard from you last time we spoke...
82
+
83
+ Dont bother constructing an elaborate counter-argument... I’m too ignorant
84
+ to understand what’s going on on this site anyway, and probabaly wouldnt
85
+ understand what you’re talking about :)
86
+
87
+ – Reda
88
+
89
+ p.s – how’s it going?
90
+ TEXT
91
+ }
92
+ valid_bucket.magnets << "Bob"
93
+
94
+ context "An empty bucket" do
95
+ setup do
96
+ @empty = Squish::Bucket.new("empty")
97
+ end
98
+
99
+ specify "should have the correct name" do
100
+ @empty.name.should == "empty"
101
+ end
102
+
103
+ specify "should not have any documents" do
104
+ @empty.documents.should.be.empty
105
+ end
106
+
107
+ specify "should not have any magnets" do
108
+ @empty.magnets.should.be.empty
109
+ end
110
+ end
111
+
112
+ context "A bucket containing 'lorem ipsum' text" do
113
+ setup do
114
+ @lorem = lorem_bucket
115
+ end
116
+
117
+ specify "should have the correct name" do
118
+ @lorem.name.should == "lorem"
119
+ end
120
+
121
+ specify "should have at least one 'lorem ipsum' document" do
122
+ @lorem.documents.should.not.be.empty
123
+ @lorem.documents.size.should >= 1
124
+ end
125
+
126
+ specify "should not have any magnets" do
127
+ @lorem.magnets.should.be.empty
128
+ end
129
+ end
130
+
131
+ context "A bucket containing several spammy documents and a viagra magnet" do
132
+ setup do
133
+ @spam = spam_bucket
134
+ end
135
+
136
+ specify "should have the correct name" do
137
+ @spam.name.should == "spam"
138
+ end
139
+
140
+ specify "should have multiple documents" do
141
+ @spam.documents.should.not.be.empty
142
+ @spam.documents.size.should >= 3
143
+ end
144
+
145
+ specify "should only have one magnet" do
146
+ @spam.magnets.should.not.be.empty
147
+ @spam.magnets.size.should == 1
148
+ end
149
+ end
150
+
151
+ context "With an array of several buckets, Squish" do
152
+ setup do
153
+ @buckets = [lorem_bucket, spam_bucket, valid_bucket]
154
+ end
155
+
156
+ specify "should correctly classify valid documents" do
157
+ Squish.classify!(
158
+ "Hi Bob, what have you been up to lately? Anything interesting?",
159
+ @buckets
160
+ ).should == "valid"
161
+
162
+ Squish.classify!(
163
+ "Bob, I figured out what was wrong with your ruby program.",
164
+ @buckets
165
+ ).should == "valid"
166
+ end
167
+
168
+ specify "should correctly classify spam documents" do
169
+ # Give the bucket a little help here, since it doesn't have enough
170
+ # training data.
171
+ spam_bucket << "Check currancy rates online!"
172
+ spam_bucket << "Online poker casino!"
173
+ spam_bucket << "Penis enlargement!"
174
+ spam_bucket << "Cheap online pharmacy sells viagra and cialis!"
175
+ spam_bucket << "Amazing mortgage rates! Buy your home for less!"
176
+ spam_bucket << "Viagra! Cialis!"
177
+
178
+ Squish.classify!(
179
+ "Invest money on the foreign exchange!",
180
+ @buckets
181
+ ).should == "spam"
182
+
183
+ Squish.classify!(
184
+ "Play bingo and poker online! Make money!",
185
+ @buckets
186
+ ).should == "spam"
187
+
188
+ Squish.classify!(
189
+ "Enlarge your penis for cheap! She will fall in love with you again!",
190
+ @buckets
191
+ ).should == "spam"
192
+
193
+ Squish.classify!(
194
+ "\\/|agr@!",
195
+ @buckets
196
+ ).should == "spam"
197
+
198
+ Squish.classify!(
199
+ "V / a G r A",
200
+ @buckets
201
+ ).should == "spam"
202
+
203
+ Squish.classify!(
204
+ "Buy viagra and cialis!",
205
+ @buckets
206
+ ).should == "spam"
207
+ end
208
+
209
+ specify "should correctly classify 'lorem ipsum' documents" do
210
+ Squish.classify!(
211
+ %{
212
+ Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Fusce
213
+ tincidunt augue a augue. Ut nunc. Fusce porta, sem a luctus mattis,
214
+ dolor dui gravida diam, a eleifend augue nibh eget nibh. Duis eu
215
+ justo. In viverra enim a turpis. Nullam eros. Nullam vestibulum
216
+ nunc vel nisi. Vestibulum ante ipsum primis in faucibus orci luctus
217
+ et ultrices posuere cubilia Curae; Integer feugiat lorem ut dolor.
218
+ Cras eget nulla. Donec velit pede, posuere vel, iaculis quis, commodo
219
+ sit amet, diam. Praesent pharetra velit ac enim. Donec porta tortor
220
+ congue nunc. Duis eu enim sit amet nulla tincidunt bibendum.
221
+ Donec mollis.
222
+ },
223
+ @buckets
224
+ ).should == "lorem"
225
+
226
+ Squish.classify!(
227
+ "Lorem ipsum dolor sit amet.",
228
+ @buckets
229
+ ).should == "lorem"
230
+
231
+ Squish.classify!(
232
+ "Lorem ipsum, you scallywag! Vestibulum ante ipsum and such!",
233
+ @buckets
234
+ ).should == "lorem"
235
+
236
+ Squish.classify!(
237
+ "Lorem ipsum, you scallywag!",
238
+ @buckets
239
+ ).should == "lorem"
240
+ end
241
+ end
metadata ADDED
@@ -0,0 +1,72 @@
1
+ --- !ruby/object:Gem::Specification
2
+ rubygems_version: 0.9.0
3
+ specification_version: 1
4
+ name: squish
5
+ version: !ruby/object:Gem::Version
6
+ version: 0.0.1
7
+ date: 2007-03-30 00:00:00 -04:00
8
+ summary: Resource classification library.
9
+ require_paths:
10
+ - lib
11
+ email: bob@sporkmonger.com
12
+ homepage: http://sporkmonger.com/
13
+ rubyforge_project: squish
14
+ description: Squish is a simple classification library that uses a modified Huffman compression algorithm to classify resources into buckets. While it is orders of magnitude slower than a naive Bayes classifier, it is potentially more effective for certain types of data.
15
+ autorequire:
16
+ default_executable:
17
+ bindir: bin
18
+ has_rdoc: true
19
+ required_ruby_version: !ruby/object:Gem::Version::Requirement
20
+ requirements:
21
+ - - ">"
22
+ - !ruby/object:Gem::Version
23
+ version: 0.0.0
24
+ version:
25
+ platform: ruby
26
+ signing_key:
27
+ cert_chain:
28
+ post_install_message:
29
+ authors:
30
+ - Bob Aman
31
+ files:
32
+ - lib/squish
33
+ - lib/squish.rb
34
+ - lib/squish/version.rb
35
+ - spec/squish
36
+ - spec/squish/squish_spec.rb
37
+ - doc/specs
38
+ - CHANGELOG
39
+ - README
40
+ - rakefile
41
+ test_files: []
42
+
43
+ rdoc_options:
44
+ - --main
45
+ - README
46
+ extra_rdoc_files:
47
+ - README
48
+ executables: []
49
+
50
+ extensions: []
51
+
52
+ requirements: []
53
+
54
+ dependencies:
55
+ - !ruby/object:Gem::Dependency
56
+ name: rake
57
+ version_requirement:
58
+ version_requirements: !ruby/object:Gem::Version::Requirement
59
+ requirements:
60
+ - - ">="
61
+ - !ruby/object:Gem::Version
62
+ version: 0.7.2
63
+ version:
64
+ - !ruby/object:Gem::Dependency
65
+ name: rspec
66
+ version_requirement:
67
+ version_requirements: !ruby/object:Gem::Version::Requirement
68
+ requirements:
69
+ - - ">="
70
+ - !ruby/object:Gem::Version
71
+ version: 0.7.1
72
+ version: