luisparravicini-classifier 1.3.8 → 1.3.9
Sign up to get free protection for your applications and to get access to all the features.
- data/{README → README.rdoc} +15 -4
- data/Rakefile +2 -2
- data/VERSION.yml +1 -1
- data/lib/classifier.rb +0 -1
- data/lib/classifier/bayes.rb +9 -0
- data/lib/classifier/extensions/vector.rb +0 -9
- data/lib/classifier/lsi.rb +11 -0
- data/luisparravicini-classifier.gemspec +6 -6
- data/test/bayes/bayesian_test.rb +10 -0
- data/test/stopwords_test.rb +1 -0
- metadata +6 -6
data/{README → README.rdoc}
RENAMED
@@ -9,8 +9,9 @@ rb-gsl:: http://rb-gsl.rubyforge.org
|
|
9
9
|
Notice that LSI will work without these libraries, but as soon as they are installed, Classifier will make use of them. No configuration changes are needed, we like to keep things ridiculously easy for you.
|
10
10
|
|
11
11
|
== Changes in this branch
|
12
|
-
I made this branch to fix a TypeError on untrain (classifier-1.3.1), then
|
12
|
+
I made this branch to fix a TypeError on untrain (classifier-1.3.1), then merged in the francois[http://github.com/francois/classifier/] branch for jeweler and all the changes yuri[http://github.com/yury/classifier/] made on his branch (especially the use of ruby-stemmer, and the incompatibility fix on Array#sum, which I needed).
|
13
13
|
After that I added support for loading the stopwords for a given language from a file (previously the list was embedded in the source code) and a stopword list for Spanish.
|
14
|
+
This branch only works with Ruby 1.9
|
14
15
|
|
15
16
|
== Bayes
|
16
17
|
A Bayesian classifier by Lucas Carlson. Bayesian Classifiers are accurate, fast, and have modest memory requirements.
|
@@ -33,8 +34,17 @@ A Bayesian classifier by Lucas Carlson. Bayesian Classifiers are accurate, fast,
|
|
33
34
|
|
34
35
|
Using Madeleine, your application can persist the learned data over time.
|
35
36
|
|
36
|
-
|
37
|
-
|
37
|
+
=== Stemmer configuration
|
38
|
+
|
39
|
+
You can specify the language and encoding for the internal stemmer:
|
40
|
+
|
41
|
+
b = Classifier::Bayes.new :categories => ['Interesting', 'Uninteresting'],
|
42
|
+
:language => 'ro', :encoding => 'ISO_8859_2'
|
43
|
+
|
44
|
+
The default values are 'en' for language and 'UTF-8' for the encoding.
|
45
|
+
|
46
|
+
Each language uses a word list to exclude certain words (stopwords). classifier comes with three included stopword lists, for English, Russian and Spanish.
|
47
|
+
The English list is the one that comes with the original gem (its origin is unknown), and the Russian and Spanish lists are from snowball[http://snowball.tartarus.org/algorithms/].
|
38
48
|
|
39
49
|
=== Bayesian Classification
|
40
50
|
|
@@ -71,7 +81,8 @@ theoretically simulates human learning.
|
|
71
81
|
Please see the Classifier::LSI documentation for more information. It is possible to index, search and classify
|
72
82
|
with more than just simple strings.
|
73
83
|
|
74
|
-
|
84
|
+
The stemmer configuration is the same as the one used for Bayes:
|
85
|
+
|
75
86
|
lsi = Classifier::LSI.new :language => 'ro', :encoding => 'ISO_8859_2'
|
76
87
|
|
77
88
|
|
data/Rakefile
CHANGED
@@ -8,8 +8,8 @@ begin
|
|
8
8
|
s.summary = "A general classifier module to allow Bayesian and other types of classifications."
|
9
9
|
s.description = "Bayesian classifier and others."
|
10
10
|
s.homepage = "http://github.com/luisparravicini/classifier"
|
11
|
-
s.author = "
|
12
|
-
s.email = "
|
11
|
+
s.author = "Luis Parravicini"
|
12
|
+
s.email = "lparravi@gmail.com"
|
13
13
|
|
14
14
|
s.add_dependency "activesupport", ">= 2.2.2"
|
15
15
|
s.add_dependency "ruby-stemmer", ">= 0.5.1"
|
data/VERSION.yml
CHANGED
data/lib/classifier.rb
CHANGED
data/lib/classifier/bayes.rb
CHANGED
@@ -131,6 +131,15 @@ class Bayes < Classifier::Base
|
|
131
131
|
end
|
132
132
|
|
133
133
|
alias append_category add_category
|
134
|
+
|
135
|
+
def marshal_dump
|
136
|
+
[@categories, @total_words, @options ]
|
137
|
+
end
|
138
|
+
|
139
|
+
def marshal_load(data)
|
140
|
+
@categories, @total_words, @options = data
|
141
|
+
end
|
142
|
+
|
134
143
|
end
|
135
144
|
|
136
145
|
end
|
@@ -6,15 +6,6 @@
|
|
6
6
|
require 'matrix'
|
7
7
|
require 'mathn'
|
8
8
|
|
9
|
-
# Conflicts with ActiveSupport
|
10
|
-
unless Array.new.respond_to?(:sum)
|
11
|
-
class Array
|
12
|
-
def sum
|
13
|
-
inject(0) { |sum,term| sum += term }.to_f
|
14
|
-
end
|
15
|
-
end
|
16
|
-
end
|
17
|
-
|
18
9
|
class Vector
|
19
10
|
def magnitude
|
20
11
|
sumsqs = 0.0
|
data/lib/classifier/lsi.rb
CHANGED
@@ -295,6 +295,17 @@ module Classifier
|
|
295
295
|
return top_n.collect { |x| @word_list.word_for_index(arr.index(x))}
|
296
296
|
end
|
297
297
|
|
298
|
+
def marshal_dump
|
299
|
+
[ @auto_rebuild, @word_list, @items, @version, @built_at_version,
|
300
|
+
@options,
|
301
|
+
]
|
302
|
+
end
|
303
|
+
|
304
|
+
def marshal_load(data)
|
305
|
+
@auto_rebuild, @word_list, @items, @version, @built_at_version,
|
306
|
+
@options = data
|
307
|
+
end
|
308
|
+
|
298
309
|
private
|
299
310
|
def build_reduced_matrix( matrix, cutoff=0.75 )
|
300
311
|
# TODO: Check that M>=N on these dimensions! Transpose helps assure this
|
@@ -5,21 +5,21 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{luisparravicini-classifier}
|
8
|
-
s.version = "1.3.
|
8
|
+
s.version = "1.3.9"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
|
-
s.authors = ["
|
12
|
-
s.date = %q{2009-12-
|
11
|
+
s.authors = ["Luis Parravicini"]
|
12
|
+
s.date = %q{2009-12-26}
|
13
13
|
s.description = %q{Bayesian classifier and others.}
|
14
|
-
s.email = %q{
|
14
|
+
s.email = %q{lparravi@gmail.com}
|
15
15
|
s.extra_rdoc_files = [
|
16
16
|
"LICENSE",
|
17
|
-
"README"
|
17
|
+
"README.rdoc"
|
18
18
|
]
|
19
19
|
s.files = [
|
20
20
|
"LICENSE",
|
21
21
|
"Manifest",
|
22
|
-
"README",
|
22
|
+
"README.rdoc",
|
23
23
|
"Rakefile",
|
24
24
|
"VERSION.yml",
|
25
25
|
"lib/classifier.rb",
|
data/test/bayes/bayesian_test.rb
CHANGED
@@ -55,4 +55,14 @@ class BayesianTest < Test::Unit::TestCase
|
|
55
55
|
assert_equal c.classifications("ХОРОШО"), c.classifications("хорошо")
|
56
56
|
assert_equal c.classifications("плОХО"), c.classifications("плохо")
|
57
57
|
end
|
58
|
+
|
59
|
+
def test_serialize
|
60
|
+
txt = "this can be serialized"
|
61
|
+
b = Classifier::Bayes.new(:categories => ['Interesting', 'Uninteresting'])
|
62
|
+
b.train_interesting(txt)
|
63
|
+
b.train_uninteresting("really uninteresting")
|
64
|
+
|
65
|
+
b2 = Marshal::load(Marshal::dump(b))
|
66
|
+
assert_equal b.classify(txt), b2.classify(txt)
|
67
|
+
end
|
58
68
|
end
|
data/test/stopwords_test.rb
CHANGED
metadata
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: luisparravicini-classifier
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.3.
|
4
|
+
version: 1.3.9
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
|
-
-
|
7
|
+
- Luis Parravicini
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2009-12-
|
12
|
+
date: 2009-12-26 00:00:00 -02:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
@@ -33,18 +33,18 @@ dependencies:
|
|
33
33
|
version: 0.5.1
|
34
34
|
version:
|
35
35
|
description: Bayesian classifier and others.
|
36
|
-
email:
|
36
|
+
email: lparravi@gmail.com
|
37
37
|
executables: []
|
38
38
|
|
39
39
|
extensions: []
|
40
40
|
|
41
41
|
extra_rdoc_files:
|
42
42
|
- LICENSE
|
43
|
-
- README
|
43
|
+
- README.rdoc
|
44
44
|
files:
|
45
45
|
- LICENSE
|
46
46
|
- Manifest
|
47
|
-
- README
|
47
|
+
- README.rdoc
|
48
48
|
- Rakefile
|
49
49
|
- VERSION.yml
|
50
50
|
- lib/classifier.rb
|