luisparravicini-classifier 1.4.0 → 1.4.1
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +15 -2
- data/VERSION.yml +2 -2
- data/lib/classifier.rb +1 -1
- data/lib/classifier/base.rb +9 -0
- data/luisparravicini-classifier.gemspec +3 -2
- data/pkg/luisparravicini-classifier-1.4.0.gem +0 -0
- data/test/bayes/bayesian_test.rb +6 -6
- metadata +3 -2
data/README.rdoc
CHANGED
@@ -41,10 +41,10 @@ You can specify language and encoding for internal stemmer:
|
|
41
41
|
b = Classifier::Bayes.new :categories => ['Interesting', 'Uninteresting'],
|
42
42
|
:language => 'ro', :encoding => 'ISO_8859_2'
|
43
43
|
|
44
|
-
The default values are 'en' for language and '
|
44
|
+
The default values are 'en' for language and 'UTF_8' for the encoding. The encoding name must use underscores instead of hyphens (e.g. UTF_8 instead of UTF-8).
|
45
45
|
|
46
46
|
Each language uses a word list to exclude certain words (stopwords). classifier comes with three included stopword lists, for English, Russian and Spanish.
|
47
|
-
The English list is the
|
47
|
+
The English list is the one that comes with the original gem (don't know where it was taken from) and the Russian and Spanish are from snowball[http://snowball.tartarus.org/algorithms/].
|
48
48
|
|
49
49
|
You can override the default stopword list, or add lists for new languages, by passing a value for :lang_dir when initializing Bayes:
|
50
50
|
|
@@ -55,6 +55,19 @@ This directory is used when loading the list from disk and takes precedence over
|
|
55
55
|
|
56
56
|
The stopwords file can have comments (indicated with '#'); blank lines are ignored, and the encoding must be UTF-8.
|
57
57
|
|
58
|
+
=== A warning about classifier serialization
|
59
|
+
If you serialize a classifier and then deserialize it in another process, you must make sure that the stemmer is reinitialized
|
60
|
+
the next time you want to use it. Here is an example using ActiveRecord:
|
61
|
+
|
62
|
+
class User < ActiveRecord::Base
|
63
|
+
serialize :classifier, Classifier::Bayes
|
64
|
+
before_save :remove_stemmer
|
65
|
+
|
66
|
+
def remove_stemmer
|
67
|
+
self.classifier.remove_stemmer
|
68
|
+
end
|
69
|
+
end
|
70
|
+
|
58
71
|
=== Bayesian Classification
|
59
72
|
|
60
73
|
* http://www.process.com/precisemail/bayesian_filtering.htm
|
data/VERSION.yml
CHANGED
data/lib/classifier.rb
CHANGED
data/lib/classifier/base.rb
CHANGED
@@ -31,6 +31,15 @@ module Classifier
|
|
31
31
|
def clean_word_hash str
|
32
32
|
word_hash_for_words str.gsub(/[^\w\s]/,"").split
|
33
33
|
end
|
34
|
+
|
35
|
+
# When a Classifier instance is serialized, it is saved with an instance
|
36
|
+
# of Lingua::Stemmer that may not be initialized when deserialized later,
|
37
|
+
# raising a "RuntimeError: Stemmer is not initialized".
|
38
|
+
#
|
39
|
+
# You can run remove_stemmer to force a new Stemmer to be initialized.
|
40
|
+
def remove_stemmer
|
41
|
+
@stemmer = nil
|
42
|
+
end
|
34
43
|
|
35
44
|
private
|
36
45
|
|
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{luisparravicini-classifier}
|
8
|
-
s.version = "1.4.
|
8
|
+
s.version = "1.4.1"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Luis Parravicini"]
|
12
|
-
s.date = %q{
|
12
|
+
s.date = %q{2010-01-29}
|
13
13
|
s.description = %q{Bayesian classifier and others.}
|
14
14
|
s.email = %q{lparravi@gmail.com}
|
15
15
|
s.extra_rdoc_files = [
|
@@ -37,6 +37,7 @@ Gem::Specification.new do |s|
|
|
37
37
|
"lib/classifier/stopwords/ru",
|
38
38
|
"lib/init.rb",
|
39
39
|
"luisparravicini-classifier.gemspec",
|
40
|
+
"pkg/luisparravicini-classifier-1.4.0.gem",
|
40
41
|
"tasks/test.rake",
|
41
42
|
"test/base_test.rb",
|
42
43
|
"test/bayes/bayesian_test.rb",
|
Binary file
|
data/test/bayes/bayesian_test.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
# coding:utf-8
|
2
|
-
$KCODE = 'utf8'
|
3
2
|
|
4
3
|
require File.dirname(__FILE__) + '/../test_helper'
|
4
|
+
|
5
5
|
class BayesianTest < Test::Unit::TestCase
|
6
6
|
def setup
|
7
7
|
@classifier = Classifier::Bayes.new :categories => ['Interesting', 'Uninteresting']
|
@@ -58,11 +58,11 @@ class BayesianTest < Test::Unit::TestCase
|
|
58
58
|
|
59
59
|
def test_serialize
|
60
60
|
txt = "this can be serialized"
|
61
|
-
|
62
|
-
|
63
|
-
b.train_uninteresting("really uninteresting")
|
61
|
+
@classifier.train_interesting(txt)
|
62
|
+
@classifier.train_uninteresting("really uninteresting")
|
64
63
|
|
65
|
-
b2 = Marshal::load(Marshal::dump(
|
66
|
-
assert_equal
|
64
|
+
b2 = Marshal::load(Marshal::dump(@classifier))
|
65
|
+
assert_equal @classifier.classify(txt), b2.classify(txt)
|
67
66
|
end
|
67
|
+
|
68
68
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: luisparravicini-classifier
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.4.
|
4
|
+
version: 1.4.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Luis Parravicini
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date:
|
12
|
+
date: 2010-01-29 00:00:00 -02:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
@@ -62,6 +62,7 @@ files:
|
|
62
62
|
- lib/classifier/stopwords/ru
|
63
63
|
- lib/init.rb
|
64
64
|
- luisparravicini-classifier.gemspec
|
65
|
+
- pkg/luisparravicini-classifier-1.4.0.gem
|
65
66
|
- tasks/test.rake
|
66
67
|
- test/base_test.rb
|
67
68
|
- test/bayes/bayesian_test.rb
|