thomas 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (4) hide show
  1. data/README +4 -0
  2. data/lib/thomas.rb +198 -0
  3. data/test/thomas_test.rb +8 -0
  4. metadata +49 -0
data/README ADDED
@@ -0,0 +1,4 @@
1
+ Thomas
2
+ ======
3
+
4
+ A naive Bayesian filter that works with any model that has word, corpus and amount attributes.
data/lib/thomas.rb ADDED
@@ -0,0 +1,198 @@
1
+ # Thomas
2
+ #
3
+ # A naive Bayesian filter that works with any model that has word, corpus and amount attributes.
4
+ #
5
+ # You use it like this (with the example model BayesWord):
6
+ #
7
+ # b = Adocca::Thomas::Bayes.new(BayesWord)
8
+ # b.train("spam", "i am a nigerian president and i sell viagra")
9
+ # b.train("ham", "i am a developer and i like ruby and beer")
10
+ # b.train("ham", "i want to sell you my fancy stock")
11
+ # b.guess(["ham","spam"], "i am a nigerian developer and i like ruby and fancy beer")
12
+ #
13
+ # You can also do (with the example corpora "ham" and "spam"):
14
+ # b.ham_or_spam("i am a nigerian developer and i like ruby and fancy beer")
15
+ #
16
+
17
+ require 'pp'
18
+
19
+ module Adocca
20
+ module Thomas
21
+
22
# A deferred change to a single word counter.
#
# Remembers which model class, corpus and word to touch and by how much,
# and only talks to the model when #execute is called.
class Update
  # model  - object responding to find_or_create_by_corpus_and_word
  # corpus - name of the corpus the word is counted under
  # word   - the token whose counter is adjusted
  # amount - signed delta added to the stored amount
  def initialize(model, corpus, word, amount)
    @model, @corpus, @word, @amount = model, corpus, word, amount
  end

  # Finds (or creates) the record for the corpus/word pair and writes back
  # its amount plus the delta. A record without an amount counts as zero.
  # Returns whatever the record's update_attribute returns.
  def execute
    record = @model.find_or_create_by_corpus_and_word(@corpus, @word)
    current = record.amount || 0
    record.update_attribute(:amount, current + @amount)
  end
end
34
+
35
# Naive Bayesian classifier whose word counts are stored through +model+.
#
# +model+ must respond to find_by_corpus_and_word,
# find_or_create_by_corpus_and_word and transaction; the records it returns
# must expose +amount+ and +update_attribute+.
#
# Tokenizing relies on HTML::Tokenizer and HTML::Node, which this file does
# not require itself; they must already be loaded by the host application.
class Bayes

  NAMESPACE = 'adocca:thomas:bayes'

  attr_accessor :model

  # Pending Update objects. This is a class variable, so the queue is shared
  # by every Bayes instance regardless of the model each one wraps.
  @@updates = []

  # model - the storage class used for all lookups and updates.
  def initialize(model)
    @model = model
  end

  # Supports calls such as +ham_or_spam(text)+: the method name is split on
  # "_or_" into corpus names, the text is scored with #guess, and the result
  # is +[best_corpus, best_probability / runner_up_probability]+.
  #
  # Names that do not yield at least two distinct corpora are passed to
  # +super+ (NoMethodError); with a single corpus there is no runner-up to
  # compare against.
  def method_missing(meth, *args)
    corpi = corpora_from(meth)
    return super if corpi.nil?

    best_corpus = nil
    best_prob = -1
    next_prob = -1
    guess(corpi, *args).each do |corpus, prob|
      if prob > best_prob
        next_prob = best_prob
        best_corpus = corpus
        best_prob = prob
      elsif prob > next_prob
        next_prob = prob
      end
    end
    [best_corpus, best_prob / next_prob]
  end

  # Keeps respond_to? in step with the names #method_missing accepts.
  def respond_to_missing?(meth, include_private = false)
    !corpora_from(meth).nil? || super
  end

  # Queues a +1 count for every token of +text+ in +corpus+.
  # +options+ is forwarded to the tokenizer. Returns the token Array.
  def train(corpus, text, options = {})
    queue_updates(corpus, text, options, 1)
  end

  # Queues a -1 count for every token of +text+ in +corpus+.
  # +options+ is forwarded to the tokenizer. Returns the token Array.
  def untrain(corpus, text, options = {})
    queue_updates(corpus, text, options, -1)
  end

  # Scores +text+ against each corpus in +corpi+.
  #
  # Returns a Hash of corpus => product of the per-word probabilities,
  # starting from 1.0. Each distinct word is counted once, and words with no
  # positive count in any of the corpora are ignored.
  def guess(corpi, text, options = {})
    rval = {}
    corpi.each { |corpus| rval[corpus] = 1.0 }

    tokenize(text, options).uniq.each do |word|
      prob_by_corpus, any_occ = get_prob_by_corpus(corpi, word)
      next unless any_occ
      prob_by_corpus.each { |corpus, prob| rval[corpus] *= prob }
    end
    rval
  end

  # Returns the stored amount for +word+ in +corpus+, or 0 when there is no
  # record or the record has no amount yet.
  def get_occ(corpus, word)
    instance = @model.find_by_corpus_and_word(corpus, word)
    instance.nil? ? 0 : (instance.amount || 0)
  end

  protected

  # Splits a method name such as :ham_or_spam into its corpus names.
  # Returns nil unless the name contains "_or_" and yields at least two
  # distinct corpora.
  def corpora_from(meth)
    name = meth.to_s
    return nil unless name =~ /_or_/
    corpi = name.split("_or_").uniq
    corpi.size < 2 ? nil : corpi
  end

  # Shared body of #train and #untrain: queues one Update of +amount+ per
  # token and gives the queue a chance to flush after each one.
  def queue_updates(corpus, text, options, amount)
    tokenize(text, options).each do |word|
      @@updates << Update.new(@model, corpus, word, amount)
      flush_updates
    end
  end

  # Executes every queued update inside one model transaction, but only when
  # more than 100 are pending or the global $FLUSH_THOMAS is truthy.
  def flush_updates
    if @@updates.size > 100 || (defined?($FLUSH_THOMAS) && $FLUSH_THOMAS)
      @model.transaction do
        until @@updates.empty?
          @@updates.pop.execute
        end
      end
    end
  end

  # Returns +[probabilities, true]+ where probabilities maps each corpus to
  # the word's share of the total count, clamped to 0.01..0.99, or
  # +[{}, false]+ when no corpus has a positive count for the word.
  def get_prob_by_corpus(corpi, word)
    occurence_by_corpus, any_occ = get_occurence_by_corpus(corpi, word)
    return [{}, false] unless any_occ

    total_occurences = occurence_by_corpus.values.inject(0) { |sum, occ| sum + occ }.to_f
    # Counts can sum to zero when some are negative; avoid dividing by zero.
    total_occurences = 1.0 if total_occurences == 0.0

    rval = {}
    corpi.each do |corpus|
      share = occurence_by_corpus[corpus] / total_occurences
      rval[corpus] = max(0.01, min(0.99, share))
    end
    [rval, true]
  end

  # Returns +[counts, any_occ]+: counts maps each corpus to the stored
  # amount for +word+, any_occ is true when at least one count is positive.
  def get_occurence_by_corpus(corpi, word)
    rval = {}
    any_occ = false
    corpi.each do |corpus|
      occ = get_occ(corpus, word)
      rval[corpus] = occ
      any_occ = true if occ > 0
    end
    [rval, any_occ]
  end

  # Smaller of +a+ and +b+ (+a+ when they compare equal).
  def min(a, b)
    a > b ? b : a
  end

  # Absolute value of +a+.
  def abs(a)
    a > 0 ? a : -a
  end

  # Larger of +a+ and +b+ (+b+ when they compare equal).
  def max(a, b)
    a > b ? a : b
  end

  #
  # Tokenizes the +text+ given and returns
  # an Array of the included words separated by a bunch of stuff.
  #
  # Will accept HTML, and tokenize only the text nodes within, except
  # it will add the source of any element containing source as 'src:SOURCE'.
  #
  # An optional Hash can be given and will be added to the token Array
  # at the end as 'KEY:VALUE'.
  #
  # Empty strings are never returned as tokens.
  #
  def tokenize(text, options = {})
    rval = []
    tokenizer = HTML::Tokenizer.new(text)
    while (token = tokenizer.next)
      node = HTML::Node.parse(nil, 0, 0, token, false)
      case node
      when HTML::Tag
        rval << "src:#{node.attributes["src"]}" if node.attributes && node.attributes.include?("src")
      when HTML::Text
        words = node.content.split(/[\s!?:;,.\/\\=+-]+/)
        # split yields a leading "" when the text starts with a separator.
        rval.concat(words.reject { |word| word.empty? })
      end
    end
    rval + options.keys.collect { |key| "#{key}:#{options[key]}" }
  end

end
197
+ end
198
+ end
data/test/thomas_test.rb ADDED
@@ -0,0 +1,8 @@
1
+ require 'test/unit'
2
+
3
+ class ThomasTest < Test::Unit::TestCase
4
+ # Placeholder: replace with real tests. Until then, flunk makes this test fail unconditionally.
5
+ def test_this_plugin
6
+ flunk
7
+ end
8
+ end
metadata ADDED
@@ -0,0 +1,49 @@
1
+ --- !ruby/object:Gem::Specification
2
+ rubygems_version: 0.9.0
3
+ specification_version: 1
4
+ name: thomas
5
+ version: !ruby/object:Gem::Version
6
+ version: 0.1.0
7
+ date: 2006-11-13 00:00:00 +01:00
8
+ summary: Bayes classifier using memcached
9
+ require_paths:
10
+ - lib
11
+ email:
12
+ homepage:
13
+ rubyforge_project:
14
+ description:
15
+ autorequire: thomas
16
+ default_executable:
17
+ bindir: bin
18
+ has_rdoc: true
19
+ required_ruby_version: !ruby/object:Gem::Version::Requirement
20
+ requirements:
21
+ - - ">"
22
+ - !ruby/object:Gem::Version
23
+ version: 0.0.0
24
+ version:
25
+ platform: ruby
26
+ signing_key:
27
+ cert_chain:
28
+ post_install_message:
29
+ authors:
30
+ - Adocca Entertainment AB
31
+ files:
32
+ - lib/thomas.rb
33
+ - test/thomas_test.rb
34
+ - README
35
+ test_files: []
36
+
37
+ rdoc_options:
38
+ - --line-numbers
39
+ - --inline-source
40
+ extra_rdoc_files:
41
+ - README
42
+ executables: []
43
+
44
+ extensions: []
45
+
46
+ requirements: []
47
+
48
+ dependencies: []
49
+