thomas 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (4) hide show
  1. data/README +4 -0
  2. data/lib/thomas.rb +198 -0
  3. data/test/thomas_test.rb +8 -0
  4. metadata +49 -0
data/README ADDED
@@ -0,0 +1,4 @@
1
+ Thomas
2
+ ======
3
+
4
+ A naive Bayesian text classifier that keeps its word counts in any model with word, corpus and amount attributes.
data/lib/thomas.rb ADDED
@@ -0,0 +1,198 @@
1
+ # Thomas
2
+ #
3
+ # A naive Bayesian filter that works with any model that has word, corpus and amount attributes.
4
+ #
5
+ # You use it like this (with the example model BayesWord):
6
+ #
7
+ # b = Adocca::Thomas::Bayes.new(BayesWord)
8
+ # b.train("spam", "i am a nigerian president and i sell viagra")
9
+ # b.train("ham", "i am a developer and i like ruby and beer")
10
+ # b.train("ham", "i want to sell you my fancy stock")
11
+ # b.guess(["ham","spam"], "i am a nigerian developer and i like ruby and fancy beer")
12
+ #
13
+ # You can also call a dynamic "<corpus>_or_<corpus>" method (here with the example corpora "ham" and "spam"):
14
+ # b.ham_or_spam("i am a nigerian developer and i like ruby and fancy beer")
15
+ #
16
+
17
+ require 'pp'
18
+
19
+ module Adocca
20
+ module Thomas
21
+
22
# One pending change to a single word counter.
#
# Captures the model, corpus, word and signed amount at construction time and
# applies the change to the backing record when #execute is called.
class Update
  # model  - object responding to find_or_create_by_corpus_and_word
  # corpus - corpus the word belongs to
  # word   - the token whose counter changes
  # amount - value added to the counter (negative to decrement)
  def initialize(model, corpus, word, amount)
    @model, @corpus, @word, @amount = model, corpus, word, amount
  end

  # Finds (or creates) the record for the corpus/word pair and adds the
  # stored amount to its counter. A counter that is still nil counts as 0.
  # Returns whatever the record's update_attribute returns.
  def execute
    record = @model.find_or_create_by_corpus_and_word(@corpus, @word)
    current = record.amount || 0
    record.update_attribute(:amount, current + @amount)
  end
end
34
+
35
# Naive Bayesian text classifier backed by a word-count model.
#
# The model must respond to find_by_corpus_and_word,
# find_or_create_by_corpus_and_word and transaction; the records it returns
# must expose an +amount+ counter and update_attribute.
#
# Besides #train, #untrain and #guess, any call whose name joins two or more
# corpus names with "_or_" (for example +ham_or_spam+) classifies the given
# text between those corpora.
class Bayes

  NAMESPACE = 'adocca:thomas:bayes'

  attr_accessor :model

  # Queue of pending Update objects. It is a class variable, so every Bayes
  # instance (whatever its model) shares the same queue.
  @@updates = []

  def initialize(model)
    @model = model
  end

  # Handles dynamic "<corpus>_or_<corpus>[_or_...]" calls.
  #
  # Returns [best_corpus, ratio], where ratio is the best corpus' score
  # divided by the runner-up's score. Ties keep the corpus named first.
  # Names that do not mention at least two distinct corpora fall through to
  # the default NoMethodError instead of dividing by a sentinel value.
  #
  # NOTE(review): #guess multiplies raw probabilities, so very long texts can
  # underflow to 0.0 and make the ratio NaN/Infinity - confirm whether callers
  # need protection against that.
  def method_missing(meth, *args)
    corpi = classifier_corpora(meth)
    return super if corpi.size < 2

    best = nil
    runner_up = nil
    guess(corpi, *args).each do |corpus, prob|
      if best.nil? || prob > best[1]
        runner_up = best
        best = [corpus, prob]
      elsif runner_up.nil? || prob > runner_up[1]
        runner_up = [corpus, prob]
      end
    end
    [best[0], best[1] / runner_up[1]]
  end

  # Keeps respond_to? in agreement with the dynamic methods above.
  def respond_to_missing?(meth, include_private = false)
    classifier_corpora(meth).size >= 2 || super
  end

  # Adds one occurrence of every token of +text+ to +corpus+.
  # +options+ is forwarded to the tokenizer. Returns the token Array.
  def train(corpus, text, options = {})
    queue_updates(corpus, text, options, 1)
  end

  # Removes one occurrence of every token of +text+ from +corpus+.
  # +options+ is forwarded to the tokenizer. Returns the token Array.
  def untrain(corpus, text, options = {})
    queue_updates(corpus, text, options, -1)
  end

  # Scores +text+ against every corpus in +corpi+.
  #
  # Returns a Hash of corpus => score. Each score starts at 1.0 and is
  # multiplied by the per-corpus probability of every distinct token that
  # occurs in at least one of the corpora; unknown tokens are ignored.
  def guess(corpi, text, options = {})
    rval = {}
    corpi.each { |corpus| rval[corpus] = 1.0 }
    # Each distinct word contributes once, so de-duplicate before the lookups.
    tokenize(text, options).uniq.each do |word|
      prob_by_corpus, any_occ = get_prob_by_corpus(corpi, word)
      next unless any_occ
      prob_by_corpus.each { |corpus, prob| rval[corpus] *= prob }
    end
    rval
  end

  # Returns the stored count of +word+ in +corpus+, or 0 when there is no
  # record or the record's amount is still nil.
  def get_occ(corpus, word)
    instance = @model.find_by_corpus_and_word(corpus, word)
    (instance && instance.amount) || 0
  end

  protected

  # Splits a dynamic method name into its distinct, non-empty corpus names.
  def classifier_corpora(meth)
    meth.to_s.split("_or_").reject { |name| name.empty? }.uniq
  end

  # Queues one Update of +amount+ per token of +text+ and gives the queue a
  # chance to flush after each one. Returns the token Array.
  def queue_updates(corpus, text, options, amount)
    tokenize(text, options).each do |word|
      @@updates << Update.new(@model, corpus, word, amount)
      flush_updates
    end
  end

  # Executes all queued updates inside one model transaction, but only once
  # more than 100 are pending or the global $FLUSH_THOMAS is set.
  def flush_updates
    return unless @@updates.size > 100 || (defined?($FLUSH_THOMAS) && $FLUSH_THOMAS)

    @model.transaction do
      @@updates.pop.execute until @@updates.empty?
    end
  end

  # Returns [probabilities, true], where probabilities maps each corpus to
  # the share of the word's occurrences found in it, clamped to 0.01..0.99.
  # Returns [{}, false] when no corpus has seen the word.
  def get_prob_by_corpus(corpi, word)
    occurence_by_corpus, any_occ = get_occurence_by_corpus(corpi, word)
    return [{}, false] unless any_occ

    total_occurences = occurence_by_corpus.values.inject(0) { |sum, occ| sum + occ }.to_f
    total_occurences = 1.0 if total_occurences == 0.0

    rval = {}
    corpi.each do |corpus|
      rval[corpus] = max(0.01, min(0.99, occurence_by_corpus[corpus] / total_occurences))
    end
    [rval, true]
  end

  # Returns [counts, any_occ]: the count of +word+ per corpus and whether any
  # of them is positive. Counts pushed below zero by untraining are treated
  # as zero so they cannot turn the total negative and invert the shares.
  def get_occurence_by_corpus(corpi, word)
    rval = {}
    any_occ = false
    corpi.each do |corpus|
      occ = max(0, get_occ(corpus, word))
      rval[corpus] = occ
      any_occ = true if occ > 0
    end
    [rval, any_occ]
  end

  # Smaller of +a+ and +b+ (+a+ when they compare equal).
  def min(a, b)
    a > b ? b : a
  end

  # Absolute value of +a+.
  def abs(a)
    a > 0 ? a : -a
  end

  # Larger of +a+ and +b+ (+b+ when they compare equal).
  def max(a, b)
    a > b ? a : b
  end

  #
  # Tokenizes the +text+ given and returns
  # an Array of the words it contains, split on whitespace and punctuation.
  #
  # Will accept HTML, and tokenize only the text nodes within, except
  # it will add the source of any element containing source as 'src:SOURCE'.
  #
  # An optional Hash can be given and will be added to the token Array
  # at the end as 'KEY:VALUE'.
  #
  # Relies on HTML::Tokenizer and HTML::Node being loaded by the host
  # application; this file does not require them itself.
  #
  def tokenize(text, options = {})
    rval = []
    tokenizer = HTML::Tokenizer.new(text)
    while (token = tokenizer.next)
      node = HTML::Node.parse(nil, 0, 0, token, false)
      case node
      when HTML::Tag
        rval << "src:#{node.attributes["src"]}" if node.attributes && node.attributes.include?("src")
      when HTML::Text
        # split yields "" for text that starts with a separator; drop those
        # so the empty string is never trained or scored as a word.
        words = node.content.split(/[\s!?:;,.\/\\=+-]+/)
        rval.concat(words.reject { |word| word.empty? })
      end
    end
    rval + options.keys.map { |key| "#{key}:#{options[key]}" }
  end

end
197
+ end
198
+ end
@@ -0,0 +1,8 @@
1
+ require 'test/unit'
2
+
3
+ class ThomasTest < Test::Unit::TestCase
4
+ # Placeholder only: flunk fails unconditionally, so this suite stays red until real tests replace it.
5
+ def test_this_plugin
6
+ flunk
7
+ end
8
+ end
metadata ADDED
@@ -0,0 +1,49 @@
1
+ --- !ruby/object:Gem::Specification
2
+ rubygems_version: 0.9.0
3
+ specification_version: 1
4
+ name: thomas
5
+ version: !ruby/object:Gem::Version
6
+ version: 0.1.0
7
+ date: 2006-11-13 00:00:00 +01:00
8
+ summary: Bayes classifier using memcached
9
+ require_paths:
10
+ - lib
11
+ email:
12
+ homepage:
13
+ rubyforge_project:
14
+ description:
15
+ autorequire: thomas
16
+ default_executable:
17
+ bindir: bin
18
+ has_rdoc: true
19
+ required_ruby_version: !ruby/object:Gem::Version::Requirement
20
+ requirements:
21
+ - - ">"
22
+ - !ruby/object:Gem::Version
23
+ version: 0.0.0
24
+ version:
25
+ platform: ruby
26
+ signing_key:
27
+ cert_chain:
28
+ post_install_message:
29
+ authors:
30
+ - Adocca Entertainment AB
31
+ files:
32
+ - lib/thomas.rb
33
+ - test/thomas_test.rb
34
+ - README
35
+ test_files: []
36
+
37
+ rdoc_options:
38
+ - --line-numbers
39
+ - --inline-source
40
+ extra_rdoc_files:
41
+ - README
42
+ executables: []
43
+
44
+ extensions: []
45
+
46
+ requirements: []
47
+
48
+ dependencies: []
49
+