thomas 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +4 -0
- data/lib/thomas.rb +198 -0
- data/test/thomas_test.rb +8 -0
- metadata +49 -0
data/README
ADDED
data/lib/thomas.rb
ADDED
@@ -0,0 +1,198 @@
|
|
1
|
+
# Thomas
|
2
|
+
#
|
3
|
+
# A naive Bayesian filter that works with any model that has word, corpus and amount attributes.
|
4
|
+
#
|
5
|
+
# You use it like this (with the example model BayesWord):
|
6
|
+
#
|
7
|
+
# b = Adocca::Thomas::Bayes.new(BayesWord)
|
8
|
+
# b.train("spam", "i am a nigerian president and i sell viagra")
|
9
|
+
# b.train("ham", "i am a developer and i like ruby and beer")
|
10
|
+
# b.train("ham", "i want to sell you my fancy stock")
|
11
|
+
# b.guess(["ham","spam"], "i am a nigerian developer and i like ruby and fancy beer")
|
12
|
+
#
|
13
|
+
# You can also do (with the example corpora "ham" and "spam"):
|
14
|
+
# b.ham_or_spam("i am a nigerian developer and i like ruby and fancy beer")
|
15
|
+
#
|
16
|
+
|
17
|
+
require 'pp'
|
18
|
+
|
19
|
+
module Adocca
|
20
|
+
module Thomas
|
21
|
+
|
22
|
+
# One deferred change to a word counter: "add +amount+ to the row of +model+
# identified by +corpus+ and +word+". Instances are built when the change is
# requested and applied later by calling #execute.
class Update
  # model::  object answering +find_or_create_by_corpus_and_word+
  # corpus:: corpus value of the row to change
  # word::   word value of the row to change
  # amount:: number added to the row's amount (callers in this file pass 1 or -1)
  def initialize(model, corpus, word, amount)
    @model, @corpus, @word, @amount = model, corpus, word, amount
  end

  # Looks the row up (creating it when absent) and stores the adjusted
  # amount through +update_attribute+. A row whose amount is still nil is
  # counted as 0. Returns whatever +update_attribute+ returns.
  def execute
    record = @model.find_or_create_by_corpus_and_word(@corpus, @word)
    current = record.amount || 0
    record.update_attribute(:amount, current + @amount)
  end
end
|
34
|
+
|
35
|
+
# Naive Bayes text classifier whose per-corpus word counts are stored through
# +model+. The model must answer +find_by_corpus_and_word+,
# +find_or_create_by_corpus_and_word+ and +transaction+, and its rows must
# expose +amount+ and +update_attribute+.
#
# Besides #train, #untrain and #guess, any method whose name contains "_or_"
# (for example +ham_or_spam+) is answered dynamically, see #method_missing.
class Bayes

  NAMESPACE = 'adocca:thomas:bayes'

  attr_accessor :model

  # Queue of pending Update objects. This is a class variable, so the queue
  # is shared by every Bayes instance regardless of its model.
  @@updates = []

  # model:: the storage class/object described in the class comment.
  def initialize(model)
    @model = model
  end

  # Answers +<corpus>_or_<corpus>[_or_...](text, options = {})+.
  #
  # The method name is split on "_or_" to obtain the corpus names, the
  # arguments are handed to #guess, and the result is
  # <tt>[best_corpus, best_probability / runner_up_probability]</tt>.
  # On ties the corpus seen first wins. When fewer than two corpora are
  # scored the runner-up is the sentinel -1, so the ratio is not meaningful.
  # Any other unknown method is passed on to +super+.
  def method_missing(meth, *args)
    if meth.to_s =~ /_or_/
      corpi = meth.to_s.split("_or_")
      prob_by_corpus = guess(corpi, *args)

      best_corpus = "none"
      best_prob = -1
      next_prob = -1
      prob_by_corpus.each do |corpus, prob|
        if prob > best_prob
          # The previous leader becomes the runner-up.
          next_prob = best_prob
          best_corpus = corpus
          best_prob = prob
        elsif prob > next_prob
          next_prob = prob
        end
      end
      [best_corpus, best_prob / next_prob]
    else
      super
    end
  end

  # Keeps +respond_to?+ in agreement with #method_missing: every name
  # containing "_or_" is answered.
  def respond_to_missing?(meth, include_private = false)
    (meth.to_s =~ /_or_/) ? true : super
  end

  # Records one more occurrence in +corpus+ of every token of +text+.
  # +options+ is forwarded to #tokenize. The change is queued and only
  # written when #flush_updates decides to. Returns the token Array.
  def train(corpus, text, options = {})
    queue_updates(corpus, text, options, 1)
  end

  # Inverse of #train: records one occurrence less for every token.
  def untrain(corpus, text, options = {})
    queue_updates(corpus, text, options, -1)
  end

  # Scores +text+ against each corpus in +corpi+.
  #
  # Returns a Hash mapping every corpus to the product, over the distinct
  # tokens of +text+ that occur in at least one of the corpora, of that
  # token's per-corpus probability (see #get_prob_by_corpus). A corpus keeps
  # the initial 1.0 when no token is known.
  #
  # Tokens are de-duplicated before the lookup: a repeated word only ever
  # contributed once, so querying the model again for it is wasted work.
  #
  # NOTE(review): the score is a plain Float product of factors <= 0.99 and
  # may underflow to 0.0 for very long texts.
  def guess(corpi, text, options = {})
    rval = {}
    corpi.each do |corpus|
      rval[corpus] = 1.0
    end
    tokenize(text, options).uniq.each do |word|
      prob_by_corpus, any_occ = get_prob_by_corpus(corpi, word)
      next unless any_occ
      prob_by_corpus.each do |corpus, prob|
        rval[corpus] *= prob
      end
    end
    rval
  end

  # Stored occurrence count of +word+ in +corpus+. Returns 0 when there is
  # no row, and also when the row exists but its amount is still nil (the
  # same nil guard Update#execute applies), so callers always get a number.
  def get_occ(corpus, word)
    instance = @model.find_by_corpus_and_word(corpus, word)
    return 0 if instance.nil?
    instance.amount || 0
  end

  protected

  # Shared body of #train and #untrain: queues an Update of +amount+ for
  # every token and gives #flush_updates a chance to run after each one.
  def queue_updates(corpus, text, options, amount)
    tokenize(text, options).each do |word|
      @@updates << Update.new(@model, corpus, word, amount)
      flush_updates
    end
  end

  # Executes all queued updates inside one +@model.transaction+ once more
  # than 100 are pending, or immediately when the global $FLUSH_THOMAS is
  # set to a true value. Otherwise does nothing.
  def flush_updates
    if @@updates.size > 100 || (defined?($FLUSH_THOMAS) && $FLUSH_THOMAS)
      @model.transaction do
        until @@updates.empty?
          @@updates.pop.execute
        end
      end
    end
  end

  # Returns <tt>[probabilities, any_occ]</tt> for +word+.
  #
  # +probabilities+ maps each corpus to the share of the word's occurrences
  # found in that corpus, limited to the range 0.01..0.99 so that a single
  # word can never force a score to 0. Returns <tt>[{}, false]</tt> when the
  # word occurs in none of the corpora.
  #
  # Counts below zero (possible after untraining more often than training)
  # are treated as 0; using them as-is could flip the sign of the total and
  # hand the high probability to the wrong corpus.
  def get_prob_by_corpus(corpi, word)
    occurence_by_corpus, any_occ = get_occurence_by_corpus(corpi, word)

    return [{}, false] unless any_occ

    counts = {}
    corpi.each do |corpus|
      counts[corpus] = max(0, occurence_by_corpus[corpus])
    end

    total_occurences = counts.values.inject(0) do |sum, occ| sum + occ end.to_f
    # Defensive only: any_occ guarantees at least one positive count.
    total_occurences = 1.0 if total_occurences == 0.0

    rval = {}
    corpi.each do |corpus|
      rval[corpus] = max(0.01, min(0.99, counts[corpus] / total_occurences))
    end

    [rval, true]
  end

  # Returns <tt>[counts, any_occ]</tt>: +counts+ maps each corpus to
  # #get_occ for +word+, +any_occ+ is true when at least one count is
  # greater than zero.
  def get_occurence_by_corpus(corpi, word)
    rval = {}
    any_occ = false
    corpi.each do |corpus|
      occ = get_occ(corpus, word)
      rval[corpus] = occ
      any_occ = true if occ > 0
    end
    [rval, any_occ]
  end

  # Smaller of +a+ and +b+ (+a+ when they compare equal).
  def min(a,b)
    a > b ? b : a
  end

  # Absolute value of +a+.
  def abs(a)
    a > 0 ? a : -a
  end

  # Larger of +a+ and +b+ (+b+ when they compare equal).
  def max(a,b)
    a > b ? a : b
  end

  #
  # Tokenizes the +text+ given and returns
  # an Array of the words it contains, split on whitespace and the
  # punctuation characters listed in the pattern below.
  #
  # Will accept HTML, and tokenize only the text nodes within, except
  # it will add the source of any element having a src attribute as 'src:SOURCE'.
  #
  # An optional Hash can be given and will be added to the token Array
  # at the end as 'KEY:VALUE'.
  #
  # Empty strings are dropped: String#split yields a leading "" whenever a
  # text node starts with a separator, and "" is not a word.
  #
  # NOTE(review): HTML::Tokenizer and HTML::Node are not required by this
  # file; presumably they come from Rails' html-scanner - confirm.
  #
  def tokenize(text, options = {})
    rval = []
    tokenizer = HTML::Tokenizer.new(text)
    while (token = tokenizer.next)
      node = HTML::Node.parse(nil, 0, 0, token, false)
      case node
      when HTML::Tag
        # attributes may be nil, hence the guard before the lookup.
        rval << "src:#{node.attributes["src"]}" if node.attributes && node.attributes.include?("src")
      when HTML::Text
        words = node.content.split(/[\s!?:;,.\/\\=+-]+/)
        rval.concat(words.reject do |w| w.empty? end)
      end
    end
    rval + options.map do |key, value| "#{key}:#{value}" end
  end

end
|
197
|
+
end
|
198
|
+
end
|
data/test/thomas_test.rb
ADDED
metadata
ADDED
@@ -0,0 +1,49 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
rubygems_version: 0.9.0
|
3
|
+
specification_version: 1
|
4
|
+
name: thomas
|
5
|
+
version: !ruby/object:Gem::Version
|
6
|
+
version: 0.1.0
|
7
|
+
date: 2006-11-13 00:00:00 +01:00
|
8
|
+
summary: Bayes classifier using memcached
|
9
|
+
require_paths:
|
10
|
+
- lib
|
11
|
+
email:
|
12
|
+
homepage:
|
13
|
+
rubyforge_project:
|
14
|
+
description:
|
15
|
+
autorequire: thomas
|
16
|
+
default_executable:
|
17
|
+
bindir: bin
|
18
|
+
has_rdoc: true
|
19
|
+
required_ruby_version: !ruby/object:Gem::Version::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ">"
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: 0.0.0
|
24
|
+
version:
|
25
|
+
platform: ruby
|
26
|
+
signing_key:
|
27
|
+
cert_chain:
|
28
|
+
post_install_message:
|
29
|
+
authors:
|
30
|
+
- Adocca Entertainment AB
|
31
|
+
files:
|
32
|
+
- lib/thomas.rb
|
33
|
+
- test/thomas_test.rb
|
34
|
+
- README
|
35
|
+
test_files: []
|
36
|
+
|
37
|
+
rdoc_options:
|
38
|
+
- --line-numbers
|
39
|
+
- --inline-source
|
40
|
+
extra_rdoc_files:
|
41
|
+
- README
|
42
|
+
executables: []
|
43
|
+
|
44
|
+
extensions: []
|
45
|
+
|
46
|
+
requirements: []
|
47
|
+
|
48
|
+
dependencies: []
|
49
|
+
|