luisparravicini-classifier 1.4.0 → 1.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +15 -2
- data/VERSION.yml +2 -2
- data/lib/classifier.rb +1 -1
- data/lib/classifier/base.rb +9 -0
- data/luisparravicini-classifier.gemspec +3 -2
- data/pkg/luisparravicini-classifier-1.4.0.gem +0 -0
- data/test/bayes/bayesian_test.rb +6 -6
- metadata +3 -2
data/README.rdoc
CHANGED
@@ -41,10 +41,10 @@ You can specify language and encoding for internal stemmer:
|
|
41
41
|
b = Classifier::Bayes.new :categories => ['Interesting', 'Uninteresting'],
|
42
42
|
:language => 'ro', :encoding => 'ISO_8859_2'
|
43
43
|
|
44
|
-
The default values are 'en' for language and '
|
44
|
+
The default values are 'en' for language and 'UTF_8' for the encoding. The encoding name must use underscores instead of hyphens (e.g., UTF_8 instead of UTF-8).
|
45
45
|
|
46
46
|
Each language uses a word list to exclude certain words (stopwords). classifier comes with three included stopword lists, for English, Russian and Spanish.
|
47
|
-
The English list is the
|
47
|
+
The English list is the one that ships with the original gem (its origin is unknown), and the Russian and Spanish lists are from snowball[http://snowball.tartarus.org/algorithms/].
|
48
48
|
|
49
49
|
You can override the default stopword list, or add lists for new languages sending a value for :lang_dir when initializing Bayes:
|
50
50
|
|
@@ -55,6 +55,19 @@ This directory is used when loading the list from disk and takes precedence over
|
|
55
55
|
|
56
56
|
The stopwords file can have comments (indicated with '#'); blank lines are ignored, and the encoding must be UTF-8.
|
57
57
|
|
58
|
+
=== A warning about classifier serialization
|
59
|
+
If you serialize a classifier and then deserialize it in another process, you must make sure that the stemmer is reinitialized
|
60
|
+
the next time you want to use it. Here is an example using ActiveRecord:
|
61
|
+
|
62
|
+
class User < ActiveRecord::Base
|
63
|
+
serialize :classifier, Classifier::Bayes
|
64
|
+
before_save :remove_stemmer
|
65
|
+
|
66
|
+
def remove_stemmer
|
67
|
+
self.classifier.remove_stemmer
|
68
|
+
end
|
69
|
+
end
|
70
|
+
|
58
71
|
=== Bayesian Classification
|
59
72
|
|
60
73
|
* http://www.process.com/precisemail/bayesian_filtering.htm
|
data/VERSION.yml
CHANGED
data/lib/classifier.rb
CHANGED
data/lib/classifier/base.rb
CHANGED
@@ -31,6 +31,15 @@ module Classifier
|
|
31
31
|
def clean_word_hash str
|
32
32
|
word_hash_for_words str.gsub(/[^\w\s]/,"").split
|
33
33
|
end
|
34
|
+
|
35
|
+
# When a Classifier instance is serialized, it is saved with an instance
|
36
|
+
# of Lingua::Stemmer that may not be initialized when deserialized later,
|
37
|
+
# raising a "RuntimeError: Stemmer is not initialized".
|
38
|
+
#
|
39
|
+
# You can run remove_stemmer to force a new Stemmer to be initialized.
|
40
|
+
def remove_stemmer
|
41
|
+
@stemmer = nil
|
42
|
+
end
|
34
43
|
|
35
44
|
private
|
36
45
|
|
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{luisparravicini-classifier}
|
8
|
-
s.version = "1.4.
|
8
|
+
s.version = "1.4.1"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Luis Parravicini"]
|
12
|
-
s.date = %q{
|
12
|
+
s.date = %q{2010-01-29}
|
13
13
|
s.description = %q{Bayesian classifier and others.}
|
14
14
|
s.email = %q{lparravi@gmail.com}
|
15
15
|
s.extra_rdoc_files = [
|
@@ -37,6 +37,7 @@ Gem::Specification.new do |s|
|
|
37
37
|
"lib/classifier/stopwords/ru",
|
38
38
|
"lib/init.rb",
|
39
39
|
"luisparravicini-classifier.gemspec",
|
40
|
+
"pkg/luisparravicini-classifier-1.4.0.gem",
|
40
41
|
"tasks/test.rake",
|
41
42
|
"test/base_test.rb",
|
42
43
|
"test/bayes/bayesian_test.rb",
|
Binary file
|
data/test/bayes/bayesian_test.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
# coding:utf-8
|
2
|
-
$KCODE = 'utf8'
|
3
2
|
|
4
3
|
require File.dirname(__FILE__) + '/../test_helper'
|
4
|
+
|
5
5
|
class BayesianTest < Test::Unit::TestCase
|
6
6
|
def setup
|
7
7
|
@classifier = Classifier::Bayes.new :categories => ['Interesting', 'Uninteresting']
|
@@ -58,11 +58,11 @@ class BayesianTest < Test::Unit::TestCase
|
|
58
58
|
|
59
59
|
def test_serialize
|
60
60
|
txt = "this can be serialized"
|
61
|
-
|
62
|
-
|
63
|
-
b.train_uninteresting("really uninteresting")
|
61
|
+
@classifier.train_interesting(txt)
|
62
|
+
@classifier.train_uninteresting("really uninteresting")
|
64
63
|
|
65
|
-
b2 = Marshal::load(Marshal::dump(
|
66
|
-
assert_equal
|
64
|
+
b2 = Marshal::load(Marshal::dump(@classifier))
|
65
|
+
assert_equal @classifier.classify(txt), b2.classify(txt)
|
67
66
|
end
|
67
|
+
|
68
68
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: luisparravicini-classifier
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.4.
|
4
|
+
version: 1.4.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Luis Parravicini
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date:
|
12
|
+
date: 2010-01-29 00:00:00 -02:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
@@ -62,6 +62,7 @@ files:
|
|
62
62
|
- lib/classifier/stopwords/ru
|
63
63
|
- lib/init.rb
|
64
64
|
- luisparravicini-classifier.gemspec
|
65
|
+
- pkg/luisparravicini-classifier-1.4.0.gem
|
65
66
|
- tasks/test.rake
|
66
67
|
- test/base_test.rb
|
67
68
|
- test/bayes/bayesian_test.rb
|