luisparravicini-classifier 1.4.0 → 1.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.rdoc CHANGED
@@ -41,10 +41,10 @@ You can specify language and encoding for internal stemmer:
41
41
  b = Classifier::Bayes.new :categories => ['Interesting', 'Uninteresting'],
42
42
  :language => 'ro', :encoding => 'ISO_8859_2'
43
43
 
44
- The default values are 'en' for language and 'UTF-8' for the encoding.
44
+ The default values are 'en' for language and 'UTF_8' for the encoding. The encoding name must use underscores instead of hyphens (e.g., UTF_8 instead of UTF-8).
45
45
 
46
46
  Each language uses a word list to exclude certain words (stopwords). classifier comes with three included stopword lists, for English, Russian and Spanish.
47
- The English list is the list that comes with the original gem (don't know where it was taken from) and the Russian and Spanish are from snowball[http://snowball.tartarus.org/algorithms/].
47
+ The English list is the one that comes with the original gem (its origin is unknown), and the Russian and Spanish lists are from snowball[http://snowball.tartarus.org/algorithms/].
48
48
 
49
49
  You can override the default stopword list, or add lists for new languages, by passing a value for :lang_dir when initializing Bayes:
50
50
 
@@ -55,6 +55,19 @@ This directory is used when loading the list from disk and takes precedence over
55
55
 
56
56
  The stopwords file can have comments (indicated with '#'); blank lines are ignored, and the encoding must be UTF-8.
57
57
 
58
+ === A warning about classifier serialization
59
+ If you serialize a classifier and then deserialize it in another process, you must make sure the stemmer is reinitialized
60
+ before the classifier is used again. Here is an example using ActiveRecord:
61
+
62
+ class User < ActiveRecord::Base
63
+ serialize :classifier, Classifier::Bayes
64
+ before_save :remove_stemmer
65
+
66
+ def remove_stemmer
67
+ self.classifier.remove_stemmer
68
+ end
69
+ end
70
+
58
71
  === Bayesian Classification
59
72
 
60
73
  * http://www.process.com/precisemail/bayesian_filtering.htm
data/VERSION.yml CHANGED
@@ -1,5 +1,5 @@
1
1
  ---
2
+ :build:
3
+ :patch: 1
2
4
  :major: 1
3
5
  :minor: 4
4
- :patch: 0
5
- :build:
data/lib/classifier.rb CHANGED
@@ -24,7 +24,7 @@
24
24
  # Copyright:: Copyright (c) 2005 Lucas Carlson
25
25
  # License:: LGPL
26
26
 
27
- require 'activesupport'
27
+ require 'active_support'
28
28
  require 'lingua/stemmer'
29
29
  require 'classifier/base'
30
30
  require 'classifier/bayes'
@@ -31,6 +31,15 @@ module Classifier
31
31
  def clean_word_hash str
32
32
  word_hash_for_words str.gsub(/[^\w\s]/,"").split
33
33
  end
34
+
35
+ # When a Classifier instance is serialized, it is saved with an instance
36
+ # of Lingua::Stemmer that may not be initialized when deserialized later,
37
+ # raising a "RuntimeError: Stemmer is not initialized".
38
+ #
39
+ # You can run remove_stemmer to force a new Stemmer to be initialized.
40
+ def remove_stemmer
41
+ @stemmer = nil
42
+ end
34
43
 
35
44
  private
36
45
 
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{luisparravicini-classifier}
8
- s.version = "1.4.0"
8
+ s.version = "1.4.1"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Luis Parravicini"]
12
- s.date = %q{2009-12-26}
12
+ s.date = %q{2010-01-29}
13
13
  s.description = %q{Bayesian classifier and others.}
14
14
  s.email = %q{lparravi@gmail.com}
15
15
  s.extra_rdoc_files = [
@@ -37,6 +37,7 @@ Gem::Specification.new do |s|
37
37
  "lib/classifier/stopwords/ru",
38
38
  "lib/init.rb",
39
39
  "luisparravicini-classifier.gemspec",
40
+ "pkg/luisparravicini-classifier-1.4.0.gem",
40
41
  "tasks/test.rake",
41
42
  "test/base_test.rb",
42
43
  "test/bayes/bayesian_test.rb",
@@ -1,7 +1,7 @@
1
1
  # coding:utf-8
2
- $KCODE = 'utf8'
3
2
 
4
3
  require File.dirname(__FILE__) + '/../test_helper'
4
+
5
5
  class BayesianTest < Test::Unit::TestCase
6
6
  def setup
7
7
  @classifier = Classifier::Bayes.new :categories => ['Interesting', 'Uninteresting']
@@ -58,11 +58,11 @@ class BayesianTest < Test::Unit::TestCase
58
58
 
59
59
  def test_serialize
60
60
  txt = "this can be serialized"
61
- b = Classifier::Bayes.new(:categories => ['Interesting', 'Uninteresting'])
62
- b.train_interesting(txt)
63
- b.train_uninteresting("really uninteresting")
61
+ @classifier.train_interesting(txt)
62
+ @classifier.train_uninteresting("really uninteresting")
64
63
 
65
- b2 = Marshal::load(Marshal::dump(b))
66
- assert_equal b.classify(txt), b2.classify(txt)
64
+ b2 = Marshal::load(Marshal::dump(@classifier))
65
+ assert_equal @classifier.classify(txt), b2.classify(txt)
67
66
  end
67
+
68
68
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: luisparravicini-classifier
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.4.0
4
+ version: 1.4.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Luis Parravicini
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2009-12-26 00:00:00 -02:00
12
+ date: 2010-01-29 00:00:00 -02:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
@@ -62,6 +62,7 @@ files:
62
62
  - lib/classifier/stopwords/ru
63
63
  - lib/init.rb
64
64
  - luisparravicini-classifier.gemspec
65
+ - pkg/luisparravicini-classifier-1.4.0.gem
65
66
  - tasks/test.rake
66
67
  - test/base_test.rb
67
68
  - test/bayes/bayesian_test.rb