bayes_on_redis_internal 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +4 -0
- data/Gemfile +5 -0
- data/Rakefile +1 -0
- data/bayes_on_redis_internal.gemspec +24 -0
- data/lib/bayes_on_redis_internal.rb +120 -0
- data/lib/bayes_on_redis_internal/version.rb +3 -0
- metadata +51 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
data/Rakefile
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require "bundler/gem_tasks"
|
@@ -0,0 +1,24 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
$:.push File.expand_path("../lib", __FILE__)
|
3
|
+
require "bayes_on_redis_internal/version"
|
4
|
+
|
5
|
+
# Gem specification for bayes_on_redis_internal.
Gem::Specification.new do |s|
  s.name        = "bayes_on_redis_internal"
  s.version     = BayesOnRedisInternal::VERSION
  s.authors     = ["Dermot Haughey"]
  s.email       = ["hderms@gmail.com"]
  s.homepage    = ""
  s.summary     = "fork of bayes_on_redis that includes more redis integration"
  s.description = "library for performing bayesian analysis by way of redis hashes"

  s.rubyforge_project = "bayes_on_redis_internal"

  # The packaged file lists are taken from what git tracks, so building the
  # gem requires running inside a git checkout with `git` on the PATH.
  s.files         = `git ls-files`.split("\n")
  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map { |f| File.basename(f) }
  s.require_paths = ["lib"]

  # The library builds its connection with Redis.new, so the Redis client gem
  # has to be installed together with this gem.
  s.add_runtime_dependency "redis"
end
|
@@ -0,0 +1,120 @@
|
|
1
|
+
require "rubygems"
|
2
|
+
|
3
|
+
# Naive Bayes text classifier that keeps its word counts in Redis.
#
# Category names are members of the Redis set stored at CATEGORIES_KEY. Each
# category has its own Redis hash (see #redis_category_key) mapping a word to
# the number of times it was seen while training that category.
class BayesOnRedis
  CATEGORIES_KEY = "BayesOnRedis:categories"
  ONE_OR_TWO_WORDS_RE = /\b\w{1,2}\b/mi
  NON_ALPHANUMERIC_AND_NON_DOT_RE = /[^\w\.]/mi

  attr_reader :redis, :stopwords

  # options - either a Hash with :redis_host, :redis_port and :redis_db, from
  #           which a new Redis connection is built, or an already constructed
  #           client object, which is used as is.
  #
  # NOTE(review): this file does not require the redis library itself; the
  # Hash form relies on the caller having loaded it.
  def initialize(options)
    if options.is_a?(Hash)
      @redis = Redis.new(:host => options[:redis_host], :port => options[:redis_port], :db => options[:redis_db])
    else
      @redis = options
    end

    @stopwords = Stopword.new
  end

  # Flushes the whole Redis database the client is connected to, not only the
  # keys written by this class.
  def flushdb
    @redis.flushdb
  end

  # Training for a category: registers the (downcased) category and adds the
  # word occurrences found in text to its counts.
  def train(category, text)
    category = category.downcase
    @redis.sadd(CATEGORIES_KEY, category)

    key = redis_category_key(category)
    count_occurance(text).each do |word, count|
      @redis.hincrby(key, word, count)
    end
  end
  alias_method :learn, :train

  # Deletes a category's word counts and unregisters the category.
  def remove(category)
    category = category.downcase
    @redis.del(redis_category_key(category))
    @redis.srem(CATEGORIES_KEY, category)
  end

  # Returns the names of all registered categories.
  def list_members
    @redis.smembers(CATEGORIES_KEY)
  end

  # Reverses a previous #train call: subtracts the word occurrences found in
  # text from the category's counts, never going below zero.
  def untrain(category, text)
    key = redis_category_key(category.downcase)

    count_occurance(text).each do |word, count|
      # hget yields the stored value as a String, or nil for an unknown word,
      # so it has to be coerced before it can be compared with an Integer.
      current = @redis.hget(key, word).to_i
      remaining = current >= count ? current - count : 0
      @redis.hset(key, word, remaining)
    end
  end
  alias_method :unlearn, :untrain

  # Scores text against every category that has training data.
  #
  # Returns a Hash of category name => sum over the distinct words of text of
  # log(word count in category / total word count of category). Words the
  # category has never seen count as 0.1 so the logarithm stays finite.
  #
  # Categories whose total word count is zero are removed from the category
  # set and left out of the result: dividing by their zero total would give
  # them a score of +Infinity and make them win every classification.
  def score(text)
    # Tokenise once; the result does not depend on the category.
    occurrences = count_occurance(text)
    scores = {}

    @redis.smembers(CATEGORIES_KEY).each do |category|
      key = redis_category_key(category)
      total = @redis.hvals(key).inject(0) { |sum, value| sum + value.to_i }

      if total <= 0
        @redis.srem(CATEGORIES_KEY, category)
        next
      end

      scores[category] = 0

      occurrences.each do |word, _count|
        seen = @redis.hget(key, word).to_i
        seen = 0.1 if seen <= 0

        scores[category] += Math.log(seen / total.to_f)
      end
    end

    scores
  end

  # Returns the name of the best scoring category for text, or nil when no
  # category has any training data.
  def classify(text)
    best = score(text).max_by { |_category, value| value }
    best && best.first
  end

  private

  # Name of the Redis hash holding the word counts of a category.
  def redis_category_key(category)
    "BayesOnRedis:cat:#{category}"
  end

  # Splits text into words and returns a Hash of word => occurrences.
  #
  # Incoming text is always downcased. One and two character words are
  # dropped, every character that is neither a word character nor a dot
  # becomes a space, stopwords are removed, and finally dots are deleted.
  #
  # Raises RuntimeError when text is not a String.
  def count_occurance(text='')
    raise "input must be instance of String" unless text.is_a?(String)

    text_chunks = text.downcase.
      gsub(ONE_OR_TWO_WORDS_RE, '').
      gsub(NON_ALPHANUMERIC_AND_NON_DOT_RE, ' ').
      gsub(@stopwords.to_re, '').
      gsub(/\./, '').
      split

    text_chunks.inject(Hash.new(0)) do |container, word|
      container[word] += 1
      container
    end
  end

  # Deletes every stopword from the word counts of every category.
  def remove_stopwords
    @redis.smembers(CATEGORIES_KEY).each do |category|
      key = redis_category_key(category)
      @stopwords.to_a.each do |stopword|
        @redis.hdel(key, stopword)
      end
    end
  end
end
|
106
|
+
|
107
|
+
|
108
|
+
# List of stopwords loaded from a whitespace separated text file.
class Stopword
  # Default word list: datasets/stopwords.txt in the parent of the directory
  # that contains this source file.
  # NOTE(review): confirm that this file is actually shipped with the gem.
  DEFAULT_PATH = File.expand_path(File.join(__FILE__, "..", "..", "datasets", "stopwords.txt"))

  # path - file to read the stopwords from; defaults to DEFAULT_PATH.
  #
  # Raises the usual File.read errors (e.g. Errno::ENOENT) when the file
  # cannot be read.
  def initialize(path = DEFAULT_PATH)
    @stopwords = File.read(path).split
  end

  # Returns the stopwords as an Array of Strings.
  def to_a
    @stopwords
  end

  # Returns a memoized, case-insensitive Regexp matching any stopword as a
  # whole word.
  #
  # The words are escaped through Regexp.union, so an entry containing
  # regexp metacharacters is matched literally instead of changing (or
  # breaking) the pattern.
  def to_re
    @to_re ||= /\b(#{Regexp.union(@stopwords).source})\b/mi
  end
end
|
metadata
ADDED
@@ -0,0 +1,51 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: bayes_on_redis_internal
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Dermot Haughey
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2012-02-11 00:00:00.000000000Z
|
13
|
+
dependencies: []
|
14
|
+
description: library for performing bayesian analysis by way of redis hashes
|
15
|
+
email:
|
16
|
+
- hderms@gmail.com
|
17
|
+
executables: []
|
18
|
+
extensions: []
|
19
|
+
extra_rdoc_files: []
|
20
|
+
files:
|
21
|
+
- .gitignore
|
22
|
+
- Gemfile
|
23
|
+
- Rakefile
|
24
|
+
- bayes_on_redis_internal.gemspec
|
25
|
+
- lib/bayes_on_redis_internal.rb
|
26
|
+
- lib/bayes_on_redis_internal/version.rb
|
27
|
+
homepage: ''
|
28
|
+
licenses: []
|
29
|
+
post_install_message:
|
30
|
+
rdoc_options: []
|
31
|
+
require_paths:
|
32
|
+
- lib
|
33
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
34
|
+
none: false
|
35
|
+
requirements:
|
36
|
+
- - ! '>='
|
37
|
+
- !ruby/object:Gem::Version
|
38
|
+
version: '0'
|
39
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
40
|
+
none: false
|
41
|
+
requirements:
|
42
|
+
- - ! '>='
|
43
|
+
- !ruby/object:Gem::Version
|
44
|
+
version: '0'
|
45
|
+
requirements: []
|
46
|
+
rubyforge_project: bayes_on_redis_internal
|
47
|
+
rubygems_version: 1.8.10
|
48
|
+
signing_key:
|
49
|
+
specification_version: 3
|
50
|
+
summary: fork of bayes_on_redis that includes more redis integration
|
51
|
+
test_files: []
|