simple_classifier 1.0.0 → 1.3.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Rakefile +52 -0
- data/VERSION +1 -0
- data/bin/bayes.rb +2 -2
- data/bin/summarize.rb +3 -3
- data/install.rb +49 -0
- data/lib/{classifier.rb → simple_classifier.rb} +2 -2
- data/lib/{classifier → simple_classifier}/bayes.rb +1 -1
- data/lib/simple_classifier/extensions/string.rb +16 -0
- data/lib/simple_classifier/extensions/word_hash.rb +125 -0
- data/test/bayes/bayesian_test.rb +2 -2
- data/test/test_helper.rb +1 -1
- metadata +30 -22
data/Rakefile
ADDED
@@ -0,0 +1,52 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'rake'
|
3
|
+
require 'rake/testtask'
|
4
|
+
require 'rake/rdoctask'
|
5
|
+
|
6
|
+
PKG_VERSION = "1.3.1"
|
7
|
+
|
8
|
+
PKG_FILES = FileList[
|
9
|
+
"lib/**/*", "bin/*", "test/**/*", "[A-Z]*", "Rakefile", "html/**/*"
|
10
|
+
]
|
11
|
+
|
12
|
+
desc "Default Task"
|
13
|
+
task :default => [ :test ]
|
14
|
+
|
15
|
+
# Run the unit tests
|
16
|
+
desc "Run all unit tests"
|
17
|
+
Rake::TestTask.new("test") { |t|
|
18
|
+
t.libs << "lib"
|
19
|
+
t.pattern = 'test/*/*_test.rb'
|
20
|
+
t.verbose = true
|
21
|
+
}
|
22
|
+
|
23
|
+
# Make a console, useful when working on tests
|
24
|
+
desc "Generate a test console"
|
25
|
+
task :console do
|
26
|
+
# Load the renamed library (lib/simple_classifier.rb) into the irb session.
verbose(false) { sh "irb -I lib/ -r 'simple_classifier'" }
|
27
|
+
end
|
28
|
+
|
29
|
+
# Generate the RDoc documentation
|
30
|
+
desc "Create documentation"
|
31
|
+
Rake::RDocTask.new("doc") { |rdoc|
|
32
|
+
rdoc.title = "simple_classifier - A simple Bayesian classifier"
|
33
|
+
rdoc.rdoc_dir = 'html'
|
34
|
+
rdoc.rdoc_files.include('README')
|
35
|
+
rdoc.rdoc_files.include('lib/**/*.rb')
|
36
|
+
}
|
37
|
+
|
38
|
+
desc "Generate gemspec with jeweler"
|
39
|
+
begin
|
40
|
+
require 'jeweler'
|
41
|
+
Jeweler::Tasks.new do |gemspec|
|
42
|
+
gemspec.name = "simple_classifier"
|
43
|
+
gemspec.summary = "A simple bayesian classifier"
|
44
|
+
gemspec.email = "ben.orenstein@gmail.com"
|
45
|
+
gemspec.homepage = "http://github.com/r00k/simple_classifier"
|
46
|
+
gemspec.authors = ["Ben Orenstein", "Lucas Carlson", "David Fayram II"]
|
47
|
+
gemspec.version = '1.3.4'
|
48
|
+
end
|
49
|
+
Jeweler::GemcutterTasks.new
|
50
|
+
rescue LoadError
|
51
|
+
puts "Jeweler not available. Install it with: gem install jeweler"
|
52
|
+
end
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
2.0.0
|
data/bin/bayes.rb
CHANGED
data/bin/summarize.rb
CHANGED
data/install.rb
ADDED
@@ -0,0 +1,49 @@
|
|
1
|
+
require 'rbconfig'
|
2
|
+
require 'find'
|
3
|
+
require 'ftools'
|
4
|
+
|
5
|
+
include Config
|
6
|
+
|
7
|
+
# This was adapted from rdoc's install.rb by way of Log4r
|
8
|
+
|
9
|
+
$sitedir = CONFIG["sitelibdir"]
|
10
|
+
unless $sitedir
|
11
|
+
version = CONFIG["MAJOR"] + "." + CONFIG["MINOR"]
|
12
|
+
$libdir = File.join(CONFIG["libdir"], "ruby", version)
|
13
|
+
$sitedir = $:.find {|x| x =~ /site_ruby/ }
|
14
|
+
if !$sitedir
|
15
|
+
$sitedir = File.join($libdir, "site_ruby")
|
16
|
+
elsif $sitedir !~ Regexp.quote(version)
|
17
|
+
$sitedir = File.join($sitedir, version)
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
makedirs = %w{ simple_classifier }
|
22
|
+
# Append rather than reassign so the simple_classifier entry is not discarded.
makedirs << "simple_classifier/extensions"
|
23
|
+
makedirs.each {|f| File::makedirs(File.join($sitedir, *f.split(/\//)))}
|
24
|
+
|
25
|
+
Dir.chdir("lib")
|
26
|
+
begin
|
27
|
+
require 'rubygems'
|
28
|
+
require 'rake'
|
29
|
+
rescue LoadError
|
30
|
+
puts
|
31
|
+
puts "Please install Gem and Rake from http://rubyforge.org/projects/rubygems and http://rubyforge.org/projects/rake"
|
32
|
+
puts
|
33
|
+
exit(-1)
|
34
|
+
end
|
35
|
+
|
36
|
+
files = FileList["**/*"]
|
37
|
+
|
38
|
+
# File::safe_unlink *deprecated.collect{|f| File.join($sitedir, f.split(/\//))}
|
39
|
+
files.each {|f|
|
40
|
+
File::install(f, File.join($sitedir, *f.split(/\//)), 0644, true)
|
41
|
+
}
|
42
|
+
|
43
|
+
begin
|
44
|
+
require 'stemmer'
|
45
|
+
rescue LoadError
|
46
|
+
puts
|
47
|
+
puts "Please install Stemmer from http://rubyforge.org/projects/stemmer or via 'gem install stemmer'"
|
48
|
+
puts
|
49
|
+
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
# Author:: Lucas Carlson (mailto:lucas@rufy.com)
|
2
|
+
# Copyright:: Copyright (c) 2005 Lucas Carlson
|
3
|
+
# License:: LGPL
|
4
|
+
|
5
|
+
begin
|
6
|
+
require 'stemmer'
|
7
|
+
rescue LoadError
|
8
|
+
puts "Please install stemmer from http://rubyforge.org/projects/stemmer or 'gem install stemmer'"
|
9
|
+
exit(-1)
|
10
|
+
end
|
11
|
+
|
12
|
+
require 'simple_classifier/extensions/word_hash'
|
13
|
+
|
14
|
+
class Object
|
15
|
+
def prepare_category_name; to_s.gsub("_"," ").capitalize.intern end
|
16
|
+
end
|
@@ -0,0 +1,125 @@
|
|
1
|
+
# Author:: Lucas Carlson (mailto:lucas@rufy.com)
|
2
|
+
# Copyright:: Copyright (c) 2005 Lucas Carlson
|
3
|
+
# License:: LGPL
|
4
|
+
|
5
|
+
# These are extensions to the String class to provide convenience
|
6
|
+
# methods for the Classifier package.
|
7
|
+
class String
|
8
|
+
|
9
|
+
# Removes common punctuation symbols, returning a new string.
|
10
|
+
# E.g.,
|
11
|
+
# "Hello (greeting's), with {braces} < >...?".without_punctuation
|
12
|
+
# => "Hello greetings with braces "
|
13
|
+
def without_punctuation
|
14
|
+
tr( ',?.!;:"@#$%^&*()_=+[]{}\|<>/`~', " " ) .tr( "'\-", "")
|
15
|
+
end
|
16
|
+
|
17
|
+
# Return a Hash of symbols => ints. Each word in the string is stemmed,
|
18
|
+
# interned, and mapped to its frequency in the document.
|
19
|
+
def word_hash
|
20
|
+
word_hash_for_words(gsub(/[^\w\s]/,"").split + gsub(/[\w]/," ").split)
|
21
|
+
end
|
22
|
+
|
23
|
+
# Return a word hash without extra punctuation or short symbols, just stemmed words
|
24
|
+
def clean_word_hash
|
25
|
+
word_hash_for_words gsub(/[^\w\s]/,"").split
|
26
|
+
end
|
27
|
+
|
28
|
+
private
|
29
|
+
|
30
|
+
def word_hash_for_words(words)
|
31
|
+
d = Hash.new
|
32
|
+
words.each do |word|
|
33
|
+
word.downcase! if word =~ /[\w]+/
|
34
|
+
key = word.stem.intern
|
35
|
+
if word =~ /[^\w]/ || ! CORPUS_SKIP_WORDS.include?(word) && word.length > 2
|
36
|
+
d[key] ||= 0
|
37
|
+
d[key] += 1
|
38
|
+
end
|
39
|
+
end
|
40
|
+
return d
|
41
|
+
end
|
42
|
+
|
43
|
+
CORPUS_SKIP_WORDS = [
|
44
|
+
"a",
|
45
|
+
"again",
|
46
|
+
"all",
|
47
|
+
"along",
|
48
|
+
"are",
|
49
|
+
"also",
|
50
|
+
"an",
|
51
|
+
"and",
|
52
|
+
"as",
|
53
|
+
"at",
|
54
|
+
"but",
|
55
|
+
"by",
|
56
|
+
"came",
|
57
|
+
"can",
|
58
|
+
"cant",
|
59
|
+
"couldnt",
|
60
|
+
"did",
|
61
|
+
"didn",
|
62
|
+
"didnt",
|
63
|
+
"do",
|
64
|
+
"doesnt",
|
65
|
+
"dont",
|
66
|
+
"ever",
|
67
|
+
"first",
|
68
|
+
"from",
|
69
|
+
"have",
|
70
|
+
"her",
|
71
|
+
"here",
|
72
|
+
"him",
|
73
|
+
"how",
|
74
|
+
"i",
|
75
|
+
"if",
|
76
|
+
"in",
|
77
|
+
"into",
|
78
|
+
"is",
|
79
|
+
"isnt",
|
80
|
+
"it",
|
81
|
+
"itll",
|
82
|
+
"just",
|
83
|
+
"last",
|
84
|
+
"least",
|
85
|
+
"like",
|
86
|
+
"most",
|
87
|
+
"my",
|
88
|
+
"new",
|
89
|
+
"no",
|
90
|
+
"not",
|
91
|
+
"now",
|
92
|
+
"of",
|
93
|
+
"on",
|
94
|
+
"or",
|
95
|
+
"should",
|
96
|
+
"sinc",
|
97
|
+
"so",
|
98
|
+
"some",
|
99
|
+
"th",
|
100
|
+
"than",
|
101
|
+
"this",
|
102
|
+
"that",
|
103
|
+
"the",
|
104
|
+
"their",
|
105
|
+
"then",
|
106
|
+
"those",
|
107
|
+
"to",
|
108
|
+
"told",
|
109
|
+
"too",
|
110
|
+
"true",
|
111
|
+
"try",
|
112
|
+
"until",
|
113
|
+
"url",
|
114
|
+
"us",
|
115
|
+
"were",
|
116
|
+
"when",
|
117
|
+
"whether",
|
118
|
+
"while",
|
119
|
+
"with",
|
120
|
+
"within",
|
121
|
+
"yes",
|
122
|
+
"you",
|
123
|
+
"youll",
|
124
|
+
]
|
125
|
+
end
|
data/test/bayes/bayesian_test.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
require File.dirname(__FILE__) + '/../test_helper'
|
2
2
|
class BayesianTest < Test::Unit::TestCase
|
3
3
|
def setup
|
4
|
-
@classifier =
|
4
|
+
@classifier = DfhccClassifier::Bayes.new 'Interesting', 'Uninteresting'
|
5
5
|
end
|
6
6
|
|
7
7
|
def test_good_training
|
@@ -12,7 +12,7 @@ class BayesianTest < Test::Unit::TestCase
|
|
12
12
|
assert_raise(StandardError) { @classifier.train_no_category "words" }
|
13
13
|
end
|
14
14
|
|
15
|
-
def
|
15
|
+
def test_method_missing_calls_super
|
16
16
|
assert_raise(NoMethodError) { @classifier.forget_everything_you_know "" }
|
17
17
|
end
|
18
18
|
|
data/test/test_helper.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,12 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: simple_classifier
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
|
4
|
+
prerelease: false
|
5
|
+
segments:
|
6
|
+
- 1
|
7
|
+
- 3
|
8
|
+
- 4
|
9
|
+
version: 1.3.4
|
5
10
|
platform: ruby
|
6
11
|
authors:
|
7
12
|
- Ben Orenstein
|
@@ -11,20 +16,11 @@ autorequire:
|
|
11
16
|
bindir: bin
|
12
17
|
cert_chain: []
|
13
18
|
|
14
|
-
date:
|
19
|
+
date: 2010-02-26 00:00:00 -05:00
|
15
20
|
default_executable:
|
16
|
-
dependencies:
|
17
|
-
|
18
|
-
|
19
|
-
type: :runtime
|
20
|
-
version_requirement:
|
21
|
-
version_requirements: !ruby/object:Gem::Requirement
|
22
|
-
requirements:
|
23
|
-
- - ">="
|
24
|
-
- !ruby/object:Gem::Version
|
25
|
-
version: "0"
|
26
|
-
version:
|
27
|
-
description: Simple bayesian filtering, without a lot of fuss.
|
21
|
+
dependencies: []
|
22
|
+
|
23
|
+
description:
|
28
24
|
email: ben.orenstein@gmail.com
|
29
25
|
executables:
|
30
26
|
- bayes.rb
|
@@ -35,12 +31,22 @@ extra_rdoc_files:
|
|
35
31
|
- LICENSE
|
36
32
|
- README
|
37
33
|
files:
|
38
|
-
- lib/classifier.rb
|
39
|
-
- lib/classifier/bayes.rb
|
40
34
|
- LICENSE
|
41
35
|
- README
|
36
|
+
- Rakefile
|
37
|
+
- VERSION
|
38
|
+
- bin/bayes.rb
|
39
|
+
- bin/summarize.rb
|
40
|
+
- install.rb
|
41
|
+
- lib/simple_classifier.rb
|
42
|
+
- lib/simple_classifier/bayes.rb
|
43
|
+
- lib/simple_classifier/extensions/string.rb
|
44
|
+
- lib/simple_classifier/extensions/word_hash.rb
|
45
|
+
- test/bayes/bayesian_test.rb
|
46
|
+
- test/extensions/word_hash_test.rb
|
47
|
+
- test/test_helper.rb
|
42
48
|
has_rdoc: true
|
43
|
-
homepage:
|
49
|
+
homepage: http://github.com/r00k/simple_classifier
|
44
50
|
licenses: []
|
45
51
|
|
46
52
|
post_install_message:
|
@@ -52,21 +58,23 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
52
58
|
requirements:
|
53
59
|
- - ">="
|
54
60
|
- !ruby/object:Gem::Version
|
61
|
+
segments:
|
62
|
+
- 0
|
55
63
|
version: "0"
|
56
|
-
version:
|
57
64
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
58
65
|
requirements:
|
59
66
|
- - ">="
|
60
67
|
- !ruby/object:Gem::Version
|
61
|
-
|
62
|
-
|
68
|
+
segments:
|
69
|
+
- 0
|
70
|
+
version: "0"
|
63
71
|
requirements: []
|
64
72
|
|
65
73
|
rubyforge_project:
|
66
|
-
rubygems_version: 1.3.
|
74
|
+
rubygems_version: 1.3.6
|
67
75
|
signing_key:
|
68
76
|
specification_version: 3
|
69
|
-
summary: A simple bayesian classifier
|
77
|
+
summary: A simple bayesian classifier
|
70
78
|
test_files:
|
71
79
|
- test/test_helper.rb
|
72
80
|
- test/bayes/bayesian_test.rb
|