linnaeus 1.0.2 → 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Gemfile CHANGED
@@ -1,4 +1,4 @@
1
- source "http://rubygems.org"
1
+ source 'http://rubygems.org'
2
2
 
3
3
  gem 'redis', '~> 3.0.0'
4
4
  gem 'stemmer', '~> 1.0.0'
@@ -6,10 +6,11 @@ gem 'stemmer', '~> 1.0.0'
6
6
  # Add dependencies to develop your gem here.
7
7
  # Include everything needed to run rake, tests, features, etc.
8
8
  group :development do
9
- gem "rspec", "~> 2.11.0"
10
- gem "yard", "~> 0.7"
11
- gem "rdoc", "~> 3.12"
12
- gem "bundler"
13
- gem "jeweler"
14
- gem "simplecov"
9
+ gem 'rspec', '~> 2.11.0'
10
+ gem 'yard', '~> 0.7'
11
+ gem 'rdoc', '~> 3.12'
12
+ gem 'bundler'
13
+ gem 'jeweler'
14
+ gem 'simplecov'
15
+ gem 'redcarpet'
15
16
  end
data/Gemfile.lock CHANGED
@@ -13,6 +13,7 @@ GEM
13
13
  rake (0.9.2.2)
14
14
  rdoc (3.12)
15
15
  json (~> 1.4)
16
+ redcarpet (2.2.2)
16
17
  redis (3.0.2)
17
18
  rspec (2.11.0)
18
19
  rspec-core (~> 2.11.0)
@@ -36,6 +37,7 @@ DEPENDENCIES
36
37
  bundler
37
38
  jeweler
38
39
  rdoc (~> 3.12)
40
+ redcarpet
39
41
  redis (~> 3.0.0)
40
42
  rspec (~> 2.11.0)
41
43
  simplecov
data/README.md ADDED
@@ -0,0 +1,40 @@
1
+ # Linnaeus [![Build Status](https://secure.travis-ci.org/djcp/linnaeus.png?branch=master)](http://travis-ci.org/djcp/linnaeus)
2
+
3
+ ![Carl Linnaeus](https://raw.github.com/djcp/linnaeus/master/images/linnaeus.jpg)
4
+
5
+ Linnaeus is a redis-backed naive Bayesian classification system. Please see the [rdoc](http://rubydoc.info/gems/linnaeus/) for more information. Ruby 1.9 is required.
6
+
7
+ Examples
8
+ --------
9
+
10
+ lt = Linnaeus::Trainer.new # Used to train documents
11
+ lc = Linnaeus::Classifier.new # Used to classify documents
12
+
13
+ lt.train 'language', 'Ruby is a dynamic, reflective, general-purpose object-oriented programming language that combines syntax inspired by Perl with Smalltalk-like features.'
14
+ lt.train 'database', 'PostgreSQL, often simply Postgres, is an object-relational database management system (ORDBMS) available for many platforms including Linux, FreeBSD, Solaris, Microsoft Windows and Mac OS X.'
15
+
16
+ lc.classify 'Perl is a high-level, general-purpose, interpreted, dynamic programming language.' # returns "language"
17
+
18
+
19
+ Contributing to linnaeus
20
+ ------------------------
21
+
22
+ * Submit bugs to the github issue tracker: https://github.com/djcp/linnaeus/issues
23
+ * If you'd like to add a feature, please submit a description of it to the issue tracker so we can discuss.
24
+ * If the feature makes sense, fork the github repository. Write rspec tests and issue a pull request when your change is done.
25
+
26
+ The Future
27
+ ----------
28
+
29
+ * Create additional storage backends - sqlite, postgresql, mongodb, etc.
30
+ * Allow for weighting tweaks.
31
+
32
+ Copyright
33
+ ---------
34
+
35
+ Copyright (c) 2012 Dan Collis-Puro. See LICENSE.txt for further details.
36
+
37
+ Credits
38
+ -------
39
+
40
+ * Image courtesy wikipedia. About Carl Linnaeus: http://en.wikipedia.org/wiki/Linnaeus
data/VERSION CHANGED
@@ -1 +1 @@
1
- 1.0.2
1
+ 1.0.3
@@ -57,7 +57,12 @@ class Linnaeus::Classifier < Linnaeus
57
57
  # == Returns
58
58
  # A string representing the most likely category.
59
59
  def classify(text)
60
- (classification_scores(text).sort_by { |a| -a[1] })[0][0]
60
+ scores = classification_scores(text)
61
+ if scores.any?
62
+ (scores.sort_by { |a| -a[1] })[0][0]
63
+ else
64
+ ''
65
+ end
61
66
  end
62
67
 
63
68
  end
@@ -116,7 +116,9 @@ class Linnaeus::Persistence < Linnaeus
116
116
  if empty_words == word_counts
117
117
  @redis.del BASE_CATEGORY_KEY + category
118
118
  else
119
- @redis.hdel BASE_CATEGORY_KEY + category, empty_words.keys
119
+ if empty_words.any?
120
+ @redis.hdel BASE_CATEGORY_KEY + category, empty_words.keys
121
+ end
120
122
  end
121
123
  end
122
124
 
data/lib/linnaeus.rb CHANGED
@@ -44,7 +44,7 @@ class Linnaeus
44
44
  # A string or array of categories
45
45
  def normalize_categories(categories = [])
46
46
  [categories].flatten.collect do |cat|
47
- cat.to_s.downcase.gsub(/[^a-z\d\.\-_]/,'')
47
+ cat.to_s.encode(@encoding).downcase.gsub(/[^a-z\d\.\-_ ]/,'')
48
48
  end.reject{|cat| cat == ''}.compact
49
49
  end
50
50
 
data/linnaeus.gemspec CHANGED
@@ -5,16 +5,16 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = "linnaeus"
8
- s.version = "1.0.2"
8
+ s.version = "1.0.3"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["djcp"]
12
- s.date = "2012-10-31"
12
+ s.date = "2012-11-02"
13
13
  s.description = "Linnaeus provides a redis-backed Bayesian classifier. Words are stemmed, stopwords are stopped, and redis is used to allow for persistent and concurrent training and classification."
14
14
  s.email = "dan@collispuro.net"
15
15
  s.extra_rdoc_files = [
16
16
  "LICENSE.txt",
17
- "README.rdoc"
17
+ "README.md"
18
18
  ]
19
19
  s.files = [
20
20
  ".document",
@@ -23,7 +23,7 @@ Gem::Specification.new do |s|
23
23
  "Gemfile",
24
24
  "Gemfile.lock",
25
25
  "LICENSE.txt",
26
- "README.rdoc",
26
+ "README.md",
27
27
  "Rakefile",
28
28
  "VERSION",
29
29
  "images/linnaeus.jpg",
@@ -58,6 +58,7 @@ Gem::Specification.new do |s|
58
58
  s.add_development_dependency(%q<bundler>, [">= 0"])
59
59
  s.add_development_dependency(%q<jeweler>, [">= 0"])
60
60
  s.add_development_dependency(%q<simplecov>, [">= 0"])
61
+ s.add_development_dependency(%q<redcarpet>, [">= 0"])
61
62
  else
62
63
  s.add_dependency(%q<redis>, ["~> 3.0.0"])
63
64
  s.add_dependency(%q<stemmer>, ["~> 1.0.0"])
@@ -67,6 +68,7 @@ Gem::Specification.new do |s|
67
68
  s.add_dependency(%q<bundler>, [">= 0"])
68
69
  s.add_dependency(%q<jeweler>, [">= 0"])
69
70
  s.add_dependency(%q<simplecov>, [">= 0"])
71
+ s.add_dependency(%q<redcarpet>, [">= 0"])
70
72
  end
71
73
  else
72
74
  s.add_dependency(%q<redis>, ["~> 3.0.0"])
@@ -77,6 +79,7 @@ Gem::Specification.new do |s|
77
79
  s.add_dependency(%q<bundler>, [">= 0"])
78
80
  s.add_dependency(%q<jeweler>, [">= 0"])
79
81
  s.add_dependency(%q<simplecov>, [">= 0"])
82
+ s.add_dependency(%q<redcarpet>, [">= 0"])
80
83
  end
81
84
  end
82
85
 
@@ -1,12 +1,32 @@
1
1
  require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
2
2
 
3
3
  describe Linnaeus::Classifier do
4
+ context 'with no training data' do
5
+ it 'should return empty values when attempting to classify' do
6
+ Linnaeus::Persistence.new.clear_all_training_data
7
+ subject.classify("foo bar baz").should be_empty
8
+ subject.classification_scores("foo bar baz").should be_empty
9
+ end
10
+ end
11
+
4
12
  context 'with a very small dataset' do
5
- it 'should classify easy things well' do
13
+ before do
6
14
  create_small_dataset
15
+ end
16
+
17
+ it 'should classify easy things well' do
7
18
  subject.classify('A bird that migrates').should eq('bird')
8
19
  subject.classify('This was directed by Gus Van Sant').should eq('movie')
9
20
  end
21
+
22
+ it 'should return correct classification scores' do
23
+ subject.classification_scores('a bird').should eq(
24
+ { "movie"=>-6.272877006546167, "bird"=>-4.2626798770413155 }
25
+ )
26
+ subject.classification_scores('a directorial bird').should eq(
27
+ { "movie"=>-10.24316892009829, "bird"=>-10.827944847076676 }
28
+ )
29
+ end
10
30
  end
11
31
 
12
32
  def create_small_dataset
@@ -1,22 +1,88 @@
1
1
  require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
2
2
 
3
3
  describe Linnaeus::Persistence do
4
- it 'stores categories successfully' do
5
- lp = Linnaeus::Persistence.new
4
+ before do
5
+ lp = get_linnaeus_persistence
6
+ lp.clear_all_training_data
7
+ end
8
+
9
+ it '#clear_all_training_data' do
10
+ lp = get_linnaeus_persistence
11
+ train_a_document_in 'testcategory'
12
+ lp.get_words_with_count_for_category('testcategory').should_not be_empty
6
13
  lp.clear_all_training_data
14
+ lp.get_words_with_count_for_category('testcategory').should be_empty
15
+ end
16
+
17
+ it 'stores categories successfully' do
18
+ lp = get_linnaeus_persistence
7
19
  add_categories lp
8
20
  lp.get_categories.sort.should eq ['bar','baz','foo']
9
21
  end
10
22
 
11
23
  it 'can remove categories' do
12
- lp = Linnaeus::Persistence.new
13
- lp.clear_all_training_data
24
+ lp = get_linnaeus_persistence
14
25
  add_categories lp
15
26
  lp.remove_category 'bar'
16
27
  lp.get_categories.sort.should eq ['baz','foo']
17
28
  end
18
29
 
30
+ it '#get_words_with_count_for_category' do
31
+ lp = get_linnaeus_persistence
32
+ train_a_document_in 'testcategory'
33
+ lp.get_words_with_count_for_category('testcategory').should eq({
34
+ "test"=>"1", "document"=>"1", "stuff"=>"1",
35
+ "bayesian"=>"1", "corpu"=>"1"
36
+ })
37
+ end
38
+
39
+ it '#increment_word_counts_for_category' do
40
+ lp = get_linnaeus_persistence
41
+ train_a_document_in 'testcategory'
42
+ train_a_document_in 'testcategory'
43
+ lp.get_words_with_count_for_category('testcategory').should eq({
44
+ "test"=>"2", "document"=>"2", "stuff"=>"2",
45
+ "bayesian"=>"2", "corpu"=>"2"
46
+ })
47
+ end
48
+
49
+ it '#decrement_word_counts_for_category' do
50
+ lp = get_linnaeus_persistence
51
+ train_a_document_in 'testcategory'
52
+ train_a_document_in 'testcategory'
53
+ untrain_a_document_in 'testcategory'
54
+ lp.get_words_with_count_for_category('testcategory').should eq({
55
+ "test"=>"1", "document"=>"1", "stuff"=>"1",
56
+ "bayesian"=>"1", "corpu"=>"1"
57
+ })
58
+ end
59
+
60
+ it '#cleanup_empty_words_in_category' do
61
+ lp = get_linnaeus_persistence
62
+ train_a_document_in 'testcategory'
63
+ untrain_a_document_in 'testcategory'
64
+ lp.get_words_with_count_for_category('testcategory').should eq ({})
65
+ end
66
+
19
67
  def add_categories(lp)
20
68
  lp.add_categories(['foo','bar','baz','foo', 'bar'])
21
69
  end
70
+
71
+ def get_linnaeus_persistence
72
+ @lp ||= Linnaeus::Persistence.new
73
+ end
74
+
75
+ def train_a_document_in(category)
76
+ lt = Linnaeus::Trainer.new
77
+ lt.train category, document
78
+ end
79
+
80
+ def untrain_a_document_in(category)
81
+ lt = Linnaeus::Trainer.new
82
+ lt.untrain category, document
83
+ end
84
+
85
+ def document
86
+ 'I am a test document and I will have stuff in the bayesian corpus'
87
+ end
22
88
  end
@@ -52,7 +52,7 @@ describe Linnaeus::Trainer do
52
52
 
53
53
  context 'with non-default stopwords' do
54
54
  subject { Linnaeus::Trainer.new(stopwords_class: FooStop) }
55
- it 'should count word occurrencs properly' do
55
+ it 'should count word occurrences properly' do
56
56
  subject.count_word_occurrences('foo bar foo baz').should == { 'baz' => 1 }
57
57
  end
58
58
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: linnaeus
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.2
4
+ version: 1.0.3
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-10-31 00:00:00.000000000 Z
12
+ date: 2012-11-02 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: redis
@@ -139,6 +139,22 @@ dependencies:
139
139
  - - ! '>='
140
140
  - !ruby/object:Gem::Version
141
141
  version: '0'
142
+ - !ruby/object:Gem::Dependency
143
+ name: redcarpet
144
+ requirement: !ruby/object:Gem::Requirement
145
+ none: false
146
+ requirements:
147
+ - - ! '>='
148
+ - !ruby/object:Gem::Version
149
+ version: '0'
150
+ type: :development
151
+ prerelease: false
152
+ version_requirements: !ruby/object:Gem::Requirement
153
+ none: false
154
+ requirements:
155
+ - - ! '>='
156
+ - !ruby/object:Gem::Version
157
+ version: '0'
142
158
  description: Linnaeus provides a redis-backed Bayesian classifier. Words are stemmed,
143
159
  stopwords are stopped, and redis is used to allow for persistent and concurrent
144
160
  training and classification.
@@ -147,7 +163,7 @@ executables: []
147
163
  extensions: []
148
164
  extra_rdoc_files:
149
165
  - LICENSE.txt
150
- - README.rdoc
166
+ - README.md
151
167
  files:
152
168
  - .document
153
169
  - .rspec
@@ -155,7 +171,7 @@ files:
155
171
  - Gemfile
156
172
  - Gemfile.lock
157
173
  - LICENSE.txt
158
- - README.rdoc
174
+ - README.md
159
175
  - Rakefile
160
176
  - VERSION
161
177
  - images/linnaeus.jpg
@@ -186,7 +202,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
186
202
  version: '0'
187
203
  segments:
188
204
  - 0
189
- hash: 2735128872899197737
205
+ hash: 2790078718663664512
190
206
  required_rubygems_version: !ruby/object:Gem::Requirement
191
207
  none: false
192
208
  requirements:
data/README.rdoc DELETED
@@ -1,35 +0,0 @@
1
- = Linnaeus
2
-
3
- https://raw.github.com/djcp/linnaeus/master/images/linnaeus.jpg
4
-
5
- Linnaeus is a redis-backed Bayesian classification system. Please see the generated rdocs for more information. Ruby 1.9 is required.
6
-
7
- == Examples
8
-
9
- lt = Linnaeus::Trainer.new # Used to train documents
10
- lc = Linnaeus::Classifier.new # Used to classify documents
11
-
12
- lt.train 'language', 'Ruby is a dynamic, reflective, general-purpose object-oriented programming language that combines syntax inspired by Perl with Smalltalk-like features.'
13
- lt.train 'database', 'PostgreSQL, often simply Postgres, is an object-relational database management system (ORDBMS) available for many platforms including Linux, FreeBSD, Solaris, Microsoft Windows and Mac OS X.'
14
-
15
- lc.classify 'Perl is a high-level, general-purpose, interpreted, dynamic programming language.' # returns "language"
16
-
17
-
18
- == Contributing to linnaeus
19
-
20
- * Submit bugs to the github issue tracker: https://github.com/djcp/linnaeus/issues
21
- * If you'd like to add a feature, please submit a description of it to the issue tracker so we can discuss.
22
- * If the feature makes sense, fork the github repository. Write rspec tests and issue a pull request when your change is done.
23
-
24
- == The Future
25
-
26
- * Create additional storage backends - sqlite, postgresql, mongodb, etc.
27
- * Allow for weighting tweaks.
28
-
29
- == Copyright
30
-
31
- Copyright (c) 2012 Dan Collis-Puro. See LICENSE.txt for further details.
32
-
33
- == Credits
34
-
35
- * Image courtesy wikipedia. About Carl Linnaeus: http://en.wikipedia.org/wiki/Linnaeus