thomas 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README +4 -0
- data/lib/thomas.rb +198 -0
- data/test/thomas_test.rb +8 -0
- metadata +49 -0
data/README
ADDED
data/lib/thomas.rb
ADDED
@@ -0,0 +1,198 @@
|
|
1
|
+
# Thomas
|
2
|
+
#
|
3
|
+
# A naive bayesian filter that works with any model that has word, corpus and amount attributes.
|
4
|
+
#
|
5
|
+
# You use it like this (with the example model BayesWord):
|
6
|
+
#
|
7
|
+
# b = Adocca::Thomas::Bayes.new(BayesWord)
|
8
|
+
# b.train("spam", "i am a nigerian president and i sell viagra")
|
9
|
+
# b.train("ham", "i am a developer and i like ruby and beer")
|
10
|
+
# b.train("ham", "i want to sell you my fancy stock")
|
11
|
+
# b.guess(["ham","spam"], "i am a nigerian developer and i like ruby and fancy beer")
|
12
|
+
#
|
13
|
+
# You can also do (with the example corpora "ham" and "spam"):
|
14
|
+
# b.ham_or_spam("i am a nigerian developer and i like ruby and fancy beer")
|
15
|
+
#
|
16
|
+
|
17
|
+
require 'pp'
|
18
|
+
|
19
|
+
module Adocca
|
20
|
+
module Thomas
|
21
|
+
|
22
|
+
#
# One pending change to a single word counter: add +amount+ (which may be
# negative) to the row of +model+ identified by +corpus+ and +word+.
#
class Update
  # model::  object responding to +find_or_create_by_corpus_and_word+
  # corpus:: corpus name the counter belongs to
  # word::   token whose counter is changed
  # amount:: number added to the stored counter when #execute runs
  def initialize(model, corpus, word, amount)
    @model, @corpus, @word, @amount = model, corpus, word, amount
  end

  # Looks up (or creates) the counter row and stores the adjusted amount.
  # A row whose amount is still nil is treated as zero.
  # Returns whatever the row's +update_attribute+ returns.
  def execute
    record = @model.find_or_create_by_corpus_and_word(@corpus, @word)
    current = record.amount || 0
    record.update_attribute(:amount, current + @amount)
  end
end
|
34
|
+
|
35
|
+
#
# Naive Bayesian classifier. Word counts are read from and written to
# +model+, which must respond to +find_by_corpus_and_word+,
# +find_or_create_by_corpus_and_word+ and +transaction+, and whose rows
# expose an +amount+.
#
# Besides #train, #untrain and #guess, any call whose name contains
# "_or_" (for example +ham_or_spam+) is answered by #method_missing.
#
class Bayes

  NAMESPACE = 'adocca:thomas:bayes'

  # Characters that separate words inside a text node.
  WORD_SEPARATORS = /[\s!?:;,.\/\\=+-]+/

  attr_accessor :model

  # Queue of pending Update objects. This is a class variable, so the
  # queue is shared by every Bayes instance regardless of its model.
  @@updates = []

  # model:: the storage class used for all lookups and updates.
  def initialize(model)
    @model = model
  end

  #
  # Answers calls named like <tt>a_or_b</tt>: the name is split on
  # "_or_" into corpus names, the arguments are handed to #guess, and
  # the result is <tt>[best_corpus, best_score / second_best_score]</tt>.
  #
  # When fewer than two corpora produce a score the missing slot keeps
  # the sentinel "none" / -1, exactly as before.
  # Any other unknown method is passed on to +super+.
  #
  def method_missing(meth, *args)
    name = meth.to_s
    return super unless name.include?("_or_")

    best_corpus = "none"
    best_prob = -1
    next_prob = -1
    guess(name.split("_or_"), *args).each do |corpus, prob|
      if prob > best_prob
        # The previous leader becomes the runner-up.
        next_prob = best_prob
        best_corpus = corpus
        best_prob = prob
      elsif prob > next_prob
        next_prob = prob
      end
    end
    [best_corpus, best_prob / next_prob]
  end

  # Keeps +respond_to?+ in agreement with #method_missing.
  def respond_to_missing?(meth, include_private = false)
    meth.to_s.include?("_or_") || super
  end

  # Queues a +1 update for every token of +text+ in +corpus+.
  # +options+ are forwarded to #tokenize. Returns the token Array.
  def train(corpus, text, options = {})
    enqueue_updates(corpus, text, options, 1)
  end

  # Queues a -1 update for every token of +text+ in +corpus+.
  # +options+ are forwarded to #tokenize. Returns the token Array.
  def untrain(corpus, text, options = {})
    enqueue_updates(corpus, text, options, -1)
  end

  #
  # Scores +text+ against each corpus in +corpi+.
  #
  # Returns a Hash of corpus => Float. Every corpus starts at 1.0 and is
  # multiplied by the per-word probability of each distinct token that
  # has a positive count in at least one of the corpora. Tokens unknown
  # to all corpora are ignored.
  #
  def guess(corpi, text, options = {})
    scores = {}
    corpi.each { |corpus| scores[corpus] = 1.0 }

    # Each distinct word contributes once, so look it up once.
    tokenize(text, options).uniq.each do |word|
      prob_by_corpus, any_occ = get_prob_by_corpus(corpi, word)
      next unless any_occ
      prob_by_corpus.each { |corpus, prob| scores[corpus] *= prob }
    end
    scores
  end

  # Returns the amount stored for +word+ in +corpus+, or 0 when there is
  # no row or the row's amount is nil. The stored value is returned as
  # is, so it can be negative after #untrain.
  def get_occ(corpus, word)
    instance = @model.find_by_corpus_and_word(corpus, word)
    (instance && instance.amount) || 0
  end

  protected

  # Shared body of #train and #untrain: queues one Update of +amount+
  # per token and gives #flush_updates a chance to run after each one.
  def enqueue_updates(corpus, text, options, amount)
    tokenize(text, options).each do |word|
      @@updates << Update.new(@model, corpus, word, amount)
      flush_updates
    end
  end

  # Executes all queued updates inside one <tt>@model.transaction</tt>,
  # but only when more than 100 are queued or the global $FLUSH_THOMAS
  # is set to a true value. Updates are taken from the end of the queue.
  def flush_updates
    if @@updates.size > 100 || (defined?($FLUSH_THOMAS) && $FLUSH_THOMAS)
      @model.transaction do
        until @@updates.empty?
          @@updates.pop.execute
        end
      end
    end
  end

  #
  # Returns <tt>[probabilities, any_occ]</tt> for +word+.
  #
  # +probabilities+ maps each corpus to its share of the word's total
  # count, limited to the range 0.01..0.99. When the word has no
  # positive count in any corpus the result is <tt>[{}, false]</tt>.
  #
  def get_prob_by_corpus(corpi, word)
    occurence_by_corpus, any_occ = get_occurence_by_corpus(corpi, word)
    return [{}, false] unless any_occ

    total_occurences = occurence_by_corpus.values.inject(0) { |sum, occ| sum + occ }.to_f
    total_occurences = 1.0 if total_occurences == 0.0

    rval = {}
    corpi.each do |corpus|
      share = occurence_by_corpus[corpus] / total_occurences
      rval[corpus] = max(0.01, min(0.99, share))
    end
    [rval, true]
  end

  # Returns <tt>[counts, any_occ]</tt>, where +counts+ maps each corpus
  # to its count for +word+ and +any_occ+ tells whether any count is
  # positive. Negative stored amounts are counted as 0 here: a negative
  # total would otherwise flip the sign of every share.
  def get_occurence_by_corpus(corpi, word)
    rval = {}
    any_occ = false
    corpi.each do |corpus|
      occ = max(get_occ(corpus, word), 0)
      rval[corpus] = occ
      any_occ = true if occ > 0
    end
    [rval, any_occ]
  end

  # Smaller of +a+ and +b+ (+a+ when they compare equal).
  def min(a, b)
    a > b ? b : a
  end

  # Absolute value of +a+.
  def abs(a)
    a > 0 ? a : -a
  end

  # Larger of +a+ and +b+ (+b+ when they compare equal).
  def max(a, b)
    a > b ? a : b
  end

  #
  # Tokenizes the +text+ given and returns an Array of the words it
  # contains, split on whitespace and common punctuation. Empty strings
  # are never returned as words.
  #
  # Will accept HTML, and tokenize only the text nodes within, except
  # that every tag carrying a +src+ attribute adds a 'src:SOURCE' token.
  #
  # An optional Hash can be given and will be added to the token Array
  # at the end as 'KEY:VALUE'.
  #
  # Relies on HTML::Tokenizer and HTML::Node, which this file does not
  # require itself; they must already be loaded by the application.
  #
  def tokenize(text, options = {})
    rval = []
    tokenizer = HTML::Tokenizer.new(text)
    while (token = tokenizer.next)
      node = HTML::Node.parse(nil, 0, 0, token, false)
      case node
      when HTML::Tag
        rval << "src:#{node.attributes["src"]}" if node.attributes && node.attributes.include?("src")
      when HTML::Text
        # split keeps a leading "" when the node starts with a separator.
        rval.concat(node.content.split(WORD_SEPARATORS).reject { |word| word.empty? })
      end
    end
    rval + options.keys.collect { |key| "#{key}:#{options[key]}" }
  end

end
|
197
|
+
end
|
198
|
+
end
|
data/test/thomas_test.rb
ADDED
metadata
ADDED
@@ -0,0 +1,49 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
rubygems_version: 0.9.0
|
3
|
+
specification_version: 1
|
4
|
+
name: thomas
|
5
|
+
version: !ruby/object:Gem::Version
|
6
|
+
version: 0.1.0
|
7
|
+
date: 2006-11-13 00:00:00 +01:00
|
8
|
+
summary: Bayes classifier using memcached
|
9
|
+
require_paths:
|
10
|
+
- lib
|
11
|
+
email:
|
12
|
+
homepage:
|
13
|
+
rubyforge_project:
|
14
|
+
description:
|
15
|
+
autorequire: thomas
|
16
|
+
default_executable:
|
17
|
+
bindir: bin
|
18
|
+
has_rdoc: true
|
19
|
+
required_ruby_version: !ruby/object:Gem::Version::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ">"
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: 0.0.0
|
24
|
+
version:
|
25
|
+
platform: ruby
|
26
|
+
signing_key:
|
27
|
+
cert_chain:
|
28
|
+
post_install_message:
|
29
|
+
authors:
|
30
|
+
- Adocca Entertainment AB
|
31
|
+
files:
|
32
|
+
- lib/thomas.rb
|
33
|
+
- test/thomas_test.rb
|
34
|
+
- README
|
35
|
+
test_files: []
|
36
|
+
|
37
|
+
rdoc_options:
|
38
|
+
- --line-numbers
|
39
|
+
- --inline-source
|
40
|
+
extra_rdoc_files:
|
41
|
+
- README
|
42
|
+
executables: []
|
43
|
+
|
44
|
+
extensions: []
|
45
|
+
|
46
|
+
requirements: []
|
47
|
+
|
48
|
+
dependencies: []
|
49
|
+
|