classifier-reborn 2.0.1 → 2.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.markdown +3 -3
- data/lib/classifier-reborn.rb +1 -1
- data/lib/classifier-reborn/bayes.rb +23 -13
- data/lib/classifier-reborn/{extensions/string.rb → category_namer.rb} +7 -2
- data/lib/classifier-reborn/extensions/vector.rb +0 -12
- data/lib/classifier-reborn/lsi/content_node.rb +1 -1
- data/lib/classifier-reborn/version.rb +1 -1
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 1219c6a6bbe3f8c2a0820d953da2ef4c85bb5e52
|
4
|
+
data.tar.gz: 66575a30d03763da10f35ee669942bc111838e6e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7d8012f760b929a521a03aa43914def24bd2ddcf688c6816c3699d549d584cccf91e5b8499f1c988b8eceb8d4cf767ccad2585cb2cacabcd9092ce040d64b02e
|
7
|
+
data.tar.gz: 6e6d7e9a50f9ab05d8548b7032c208729c852fa32452a2c411f7ebd50b9ccf53750f0155907a4cfefd697ec12a0dd24fc25e8fd6d9131e7318fbf04350f6b93e
|
data/README.markdown
CHANGED
@@ -29,7 +29,7 @@ This should install automatically with RubyGems.
|
|
29
29
|
If you would like to speed up LSI classification by at least 10x, please install the following libraries:
|
30
30
|
|
31
31
|
* [GNU GSL](http://www.gnu.org/software/gsl)
|
32
|
-
* [rb-gsl](
|
32
|
+
* [rb-gsl](https://rubygems.org/gems/rb-gsl)
|
33
33
|
|
34
34
|
Notice that LSI will work without these libraries, but as soon as they are installed, Classifier will make use of them. No configuration changes are needed, we like to keep things ridiculously easy for you.
|
35
35
|
|
@@ -40,7 +40,7 @@ A Bayesian classifier by Lucas Carlson. Bayesian Classifiers are accurate, fast,
|
|
40
40
|
### Usage
|
41
41
|
|
42
42
|
```ruby
|
43
|
-
require 'classifier'
|
43
|
+
require 'classifier-reborn'
|
44
44
|
b = ClassifierReborn::Bayes.new 'Interesting', 'Uninteresting'
|
45
45
|
b.train_interesting "here are some good words. I hope you love them"
|
46
46
|
b.train_uninteresting "here are some bad words, I hate you"
|
@@ -74,7 +74,7 @@ theoretically simulates human learning.
|
|
74
74
|
### Usage
|
75
75
|
|
76
76
|
```ruby
|
77
|
-
require 'classifier'
|
77
|
+
require 'classifier-reborn'
|
78
78
|
lsi = ClassifierReborn::LSI.new
|
79
79
|
strings = [ ["This text deals with dogs. Dogs.", :dog],
|
80
80
|
["This text involves dogs too. Dogs! ", :dog],
|
data/lib/classifier-reborn/bayes.rb
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
# Copyright:: Copyright (c) 2005 Lucas Carlson
|
3
3
|
# License:: LGPL
|
4
4
|
|
5
|
-
require_relative '
|
5
|
+
require_relative 'category_namer'
|
6
6
|
|
7
7
|
module ClassifierReborn
|
8
8
|
class Bayes
|
@@ -11,9 +11,10 @@ module ClassifierReborn
|
|
11
11
|
# b = ClassifierReborn::Bayes.new 'Interesting', 'Uninteresting', 'Spam'
|
12
12
|
def initialize(*categories)
|
13
13
|
@categories = Hash.new
|
14
|
-
categories.each { |category| @categories[category
|
14
|
+
categories.each { |category| @categories[CategoryNamer.prepare_name(category)] = Hash.new }
|
15
15
|
@total_words = 0
|
16
16
|
@category_counts = Hash.new(0)
|
17
|
+
@category_word_count = Hash.new
|
17
18
|
end
|
18
19
|
|
19
20
|
# Provides a general training method for all categories specified in Bayes#new
|
@@ -23,11 +24,13 @@ module ClassifierReborn
|
|
23
24
|
# b.train "that", "That text"
|
24
25
|
# b.train "The other", "The other text"
|
25
26
|
def train(category, text)
|
26
|
-
category = category
|
27
|
-
|
27
|
+
category = CategoryNamer.prepare_name(category)
|
28
|
+
@category_word_count[category] ||= 0
|
29
|
+
@category_counts[category] += 1
|
28
30
|
Hasher.word_hash(text).each do |word, count|
|
29
31
|
@categories[category][word] ||= 0
|
30
32
|
@categories[category][word] += count
|
33
|
+
@category_word_count[category] += count
|
31
34
|
@total_words += count
|
32
35
|
end
|
33
36
|
end
|
@@ -40,17 +43,22 @@ module ClassifierReborn
|
|
40
43
|
# b.train :this, "This text"
|
41
44
|
# b.untrain :this, "This text"
|
42
45
|
def untrain(category, text)
|
43
|
-
category = category
|
46
|
+
category = CategoryNamer.prepare_name(category)
|
47
|
+
@category_word_count[category] ||= 0
|
44
48
|
@category_counts[category] -= 1
|
45
49
|
Hasher.word_hash(text).each do |word, count|
|
46
50
|
if @total_words >= 0
|
47
51
|
orig = @categories[category][word] || 0
|
48
|
-
@categories[category][word]
|
49
|
-
@categories[category][word]
|
52
|
+
@categories[category][word] ||= 0
|
53
|
+
@categories[category][word] -= count
|
50
54
|
if @categories[category][word] <= 0
|
51
55
|
@categories[category].delete(word)
|
52
56
|
count = orig
|
53
57
|
end
|
58
|
+
|
59
|
+
if @category_word_count[category] >= count
|
60
|
+
@category_word_count[category] -= count
|
61
|
+
end
|
54
62
|
@total_words -= count
|
55
63
|
end
|
56
64
|
end
|
@@ -62,13 +70,14 @@ module ClassifierReborn
|
|
62
70
|
# The largest of these scores (the one closest to 0) is the one picked out by #classify
|
63
71
|
def classifications(text)
|
64
72
|
score = Hash.new
|
65
|
-
|
73
|
+
word_hash = Hasher.word_hash(text)
|
74
|
+
training_count = @category_counts.values.reduce(:+).to_f
|
66
75
|
@categories.each do |category, category_words|
|
67
76
|
score[category.to_s] = 0
|
68
|
-
total =
|
69
|
-
|
77
|
+
total = (@category_word_count[category] || 1).to_f
|
78
|
+
word_hash.each do |word, count|
|
70
79
|
s = category_words.has_key?(word) ? category_words[word] : 0.1
|
71
|
-
score[category.to_s] += Math.log(s/total
|
80
|
+
score[category.to_s] += Math.log(s/total)
|
72
81
|
end
|
73
82
|
# now add prior probability for the category
|
74
83
|
s = @category_counts.has_key?(category) ? @category_counts[category] : 0.1
|
@@ -93,7 +102,8 @@ module ClassifierReborn
|
|
93
102
|
# b.untrain_that "That text"
|
94
103
|
# b.train_the_other "The other text"
|
95
104
|
def method_missing(name, *args)
|
96
|
-
|
105
|
+
cleaned_name = name.to_s.gsub(/(un)?train_([\w]+)/, '\2')
|
106
|
+
category = CategoryNamer.prepare_name(cleaned_name)
|
97
107
|
if @categories.has_key? category
|
98
108
|
args.each { |text| eval("#{$1}train(category, text)") }
|
99
109
|
elsif name.to_s =~ /(un)?train_([\w]+)/
|
@@ -120,7 +130,7 @@ module ClassifierReborn
|
|
120
130
|
# more criteria than the trained selective categories. In short,
|
121
131
|
# try to initialize your categories at initialization.
|
122
132
|
def add_category(category)
|
123
|
-
@categories[category
|
133
|
+
@categories[CategoryNamer.prepare_name(category)] = Hash.new
|
124
134
|
end
|
125
135
|
|
126
136
|
alias append_category add_category
|
data/lib/classifier-reborn/{extensions/string.rb → category_namer.rb}
CHANGED
@@ -5,6 +5,11 @@
|
|
5
5
|
require 'fast_stemmer'
|
6
6
|
require 'classifier-reborn/extensions/hasher'
|
7
7
|
|
8
|
-
|
9
|
-
|
8
|
+
module ClassifierReborn
|
9
|
+
module CategoryNamer
|
10
|
+
extend self
|
11
|
+
def prepare_name(name)
|
12
|
+
name.to_s.gsub("_"," ").capitalize.intern
|
13
|
+
end
|
14
|
+
end
|
10
15
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: classifier-reborn
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.0.
|
4
|
+
version: 2.0.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Lucas Carlson
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2014-
|
12
|
+
date: 2014-11-09 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: fast-stemmer
|
@@ -71,8 +71,8 @@ files:
|
|
71
71
|
- bin/summarize.rb
|
72
72
|
- lib/classifier-reborn.rb
|
73
73
|
- lib/classifier-reborn/bayes.rb
|
74
|
+
- lib/classifier-reborn/category_namer.rb
|
74
75
|
- lib/classifier-reborn/extensions/hasher.rb
|
75
|
-
- lib/classifier-reborn/extensions/string.rb
|
76
76
|
- lib/classifier-reborn/extensions/vector.rb
|
77
77
|
- lib/classifier-reborn/extensions/vector_serialize.rb
|
78
78
|
- lib/classifier-reborn/lsi.rb
|