linnaeus 1.0.2 → 1.0.3

Sign up to get free protection for your applications and to get access to all the features.
data/Gemfile CHANGED
@@ -1,4 +1,4 @@
1
- source "http://rubygems.org"
1
+ source 'http://rubygems.org'
2
2
 
3
3
  gem 'redis', '~> 3.0.0'
4
4
  gem 'stemmer', '~> 1.0.0'
@@ -6,10 +6,11 @@ gem 'stemmer', '~> 1.0.0'
6
6
  # Add dependencies to develop your gem here.
7
7
  # Include everything needed to run rake, tests, features, etc.
8
8
  group :development do
9
- gem "rspec", "~> 2.11.0"
10
- gem "yard", "~> 0.7"
11
- gem "rdoc", "~> 3.12"
12
- gem "bundler"
13
- gem "jeweler"
14
- gem "simplecov"
9
+ gem 'rspec', '~> 2.11.0'
10
+ gem 'yard', '~> 0.7'
11
+ gem 'rdoc', '~> 3.12'
12
+ gem 'bundler'
13
+ gem 'jeweler'
14
+ gem 'simplecov'
15
+ gem 'redcarpet'
15
16
  end
data/Gemfile.lock CHANGED
@@ -13,6 +13,7 @@ GEM
13
13
  rake (0.9.2.2)
14
14
  rdoc (3.12)
15
15
  json (~> 1.4)
16
+ redcarpet (2.2.2)
16
17
  redis (3.0.2)
17
18
  rspec (2.11.0)
18
19
  rspec-core (~> 2.11.0)
@@ -36,6 +37,7 @@ DEPENDENCIES
36
37
  bundler
37
38
  jeweler
38
39
  rdoc (~> 3.12)
40
+ redcarpet
39
41
  redis (~> 3.0.0)
40
42
  rspec (~> 2.11.0)
41
43
  simplecov
data/README.md ADDED
@@ -0,0 +1,40 @@
1
+ # Linnaeus [![Build Status](https://secure.travis-ci.org/djcp/linnaeus.png?branch=master)](http://travis-ci.org/djcp/linnaeus)
2
+
3
+ ![Carl Linnaeus](https://raw.github.com/djcp/linnaeus/master/images/linnaeus.jpg)
4
+
5
+ Linnaeus is a redis-backed naive Bayesian classification system. Please see the [rdoc](http://rubydoc.info/gems/linnaeus/) for more information. Ruby 1.9 is required.
6
+
7
+ Examples
8
+ --------
9
+
10
+ lt = Linnaeus::Trainer.new # Used to train documents
11
+ lc = Linnaeus::Classifier.new # Used to classify documents
12
+
13
+ lt.train 'language', 'Ruby is a dynamic, reflective, general-purpose object-oriented programming language that combines syntax inspired by Perl with Smalltalk-like features.'
14
+ lt.train 'database', 'PostgreSQL, often simply Postgres, is an object-relational database management system (ORDBMS) available for many platforms including Linux, FreeBSD, Solaris, Microsoft Windows and Mac OS X.'
15
+
16
+ lc.classify 'Perl is a high-level, general-purpose, interpreted, dynamic programming language.' # returns "language"
17
+
18
+
19
+ Contributing to linnaeus
20
+ ------------------------
21
+
22
+ * Submit bugs to the GitHub issue tracker: https://github.com/djcp/linnaeus/issues
23
+ * If you'd like to add a feature, please submit a description of it to the issue tracker so we can discuss it.
24
+ * If the feature makes sense, fork the GitHub repository. Write RSpec tests and open a pull request when your change is done.
25
+
26
+ The Future
27
+ ----------
28
+
29
+ * Create additional storage backends - sqlite, postgresql, mongodb, etc.
30
+ * Allow for weighting tweaks.
31
+
32
+ Copyright
33
+ ---------
34
+
35
+ Copyright (c) 2012 Dan Collis-Puro. See LICENSE.txt for further details.
36
+
37
+ Credits
38
+ -------
39
+
40
+ * Image courtesy of Wikipedia. About Carl Linnaeus: http://en.wikipedia.org/wiki/Linnaeus
data/VERSION CHANGED
@@ -1 +1 @@
1
- 1.0.2
1
+ 1.0.3
@@ -57,7 +57,12 @@ class Linnaeus::Classifier < Linnaeus
57
57
  # == Returns
58
58
  # A string representing the most likely category.
59
59
  def classify(text)
60
- (classification_scores(text).sort_by { |a| -a[1] })[0][0]
60
+ scores = classification_scores(text)
61
+ if scores.any?
62
+ (scores.sort_by { |a| -a[1] })[0][0]
63
+ else
64
+ ''
65
+ end
61
66
  end
62
67
 
63
68
  end
@@ -116,7 +116,9 @@ class Linnaeus::Persistence < Linnaeus
116
116
  if empty_words == word_counts
117
117
  @redis.del BASE_CATEGORY_KEY + category
118
118
  else
119
- @redis.hdel BASE_CATEGORY_KEY + category, empty_words.keys
119
+ if empty_words.any?
120
+ @redis.hdel BASE_CATEGORY_KEY + category, empty_words.keys
121
+ end
120
122
  end
121
123
  end
122
124
 
data/lib/linnaeus.rb CHANGED
@@ -44,7 +44,7 @@ class Linnaeus
44
44
  # A string or array of categories
45
45
  def normalize_categories(categories = [])
46
46
  [categories].flatten.collect do |cat|
47
- cat.to_s.downcase.gsub(/[^a-z\d\.\-_]/,'')
47
+ cat.to_s.encode(@encoding).downcase.gsub(/[^a-z\d\.\-_ ]/,'')
48
48
  end.reject{|cat| cat == ''}.compact
49
49
  end
50
50
 
data/linnaeus.gemspec CHANGED
@@ -5,16 +5,16 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = "linnaeus"
8
- s.version = "1.0.2"
8
+ s.version = "1.0.3"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["djcp"]
12
- s.date = "2012-10-31"
12
+ s.date = "2012-11-02"
13
13
  s.description = "Linnaeus provides a redis-backed Bayesian classifier. Words are stemmed, stopwords are stopped, and redis is used to allow for persistent and concurrent training and classification."
14
14
  s.email = "dan@collispuro.net"
15
15
  s.extra_rdoc_files = [
16
16
  "LICENSE.txt",
17
- "README.rdoc"
17
+ "README.md"
18
18
  ]
19
19
  s.files = [
20
20
  ".document",
@@ -23,7 +23,7 @@ Gem::Specification.new do |s|
23
23
  "Gemfile",
24
24
  "Gemfile.lock",
25
25
  "LICENSE.txt",
26
- "README.rdoc",
26
+ "README.md",
27
27
  "Rakefile",
28
28
  "VERSION",
29
29
  "images/linnaeus.jpg",
@@ -58,6 +58,7 @@ Gem::Specification.new do |s|
58
58
  s.add_development_dependency(%q<bundler>, [">= 0"])
59
59
  s.add_development_dependency(%q<jeweler>, [">= 0"])
60
60
  s.add_development_dependency(%q<simplecov>, [">= 0"])
61
+ s.add_development_dependency(%q<redcarpet>, [">= 0"])
61
62
  else
62
63
  s.add_dependency(%q<redis>, ["~> 3.0.0"])
63
64
  s.add_dependency(%q<stemmer>, ["~> 1.0.0"])
@@ -67,6 +68,7 @@ Gem::Specification.new do |s|
67
68
  s.add_dependency(%q<bundler>, [">= 0"])
68
69
  s.add_dependency(%q<jeweler>, [">= 0"])
69
70
  s.add_dependency(%q<simplecov>, [">= 0"])
71
+ s.add_dependency(%q<redcarpet>, [">= 0"])
70
72
  end
71
73
  else
72
74
  s.add_dependency(%q<redis>, ["~> 3.0.0"])
@@ -77,6 +79,7 @@ Gem::Specification.new do |s|
77
79
  s.add_dependency(%q<bundler>, [">= 0"])
78
80
  s.add_dependency(%q<jeweler>, [">= 0"])
79
81
  s.add_dependency(%q<simplecov>, [">= 0"])
82
+ s.add_dependency(%q<redcarpet>, [">= 0"])
80
83
  end
81
84
  end
82
85
 
@@ -1,12 +1,32 @@
1
1
  require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
2
2
 
3
3
  describe Linnaeus::Classifier do
4
+ context 'with no training data' do
5
+ it 'should return empty values when attempting to classify' do
6
+ Linnaeus::Persistence.new.clear_all_training_data
7
+ subject.classify("foo bar baz").should be_empty
8
+ subject.classification_scores("foo bar baz").should be_empty
9
+ end
10
+ end
11
+
4
12
  context 'with a very small dataset' do
5
- it 'should classify easy things well' do
13
+ before do
6
14
  create_small_dataset
15
+ end
16
+
17
+ it 'should classify easy things well' do
7
18
  subject.classify('A bird that migrates').should eq('bird')
8
19
  subject.classify('This was directed by Gus Van Sant').should eq('movie')
9
20
  end
21
+
22
+ it 'should return correct classification scores' do
23
+ subject.classification_scores('a bird').should eq(
24
+ { "movie"=>-6.272877006546167, "bird"=>-4.2626798770413155 }
25
+ )
26
+ subject.classification_scores('a directorial bird').should eq(
27
+ { "movie"=>-10.24316892009829, "bird"=>-10.827944847076676 }
28
+ )
29
+ end
10
30
  end
11
31
 
12
32
  def create_small_dataset
@@ -1,22 +1,88 @@
1
1
  require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
2
2
 
3
3
  describe Linnaeus::Persistence do
4
- it 'stores categories successfully' do
5
- lp = Linnaeus::Persistence.new
4
+ before do
5
+ lp = get_linnaeus_persistence
6
+ lp.clear_all_training_data
7
+ end
8
+
9
+ it '#clear_all_training_data' do
10
+ lp = get_linnaeus_persistence
11
+ train_a_document_in 'testcategory'
12
+ lp.get_words_with_count_for_category('testcategory').should_not be_empty
6
13
  lp.clear_all_training_data
14
+ lp.get_words_with_count_for_category('testcategory').should be_empty
15
+ end
16
+
17
+ it 'stores categories successfully' do
18
+ lp = get_linnaeus_persistence
7
19
  add_categories lp
8
20
  lp.get_categories.sort.should eq ['bar','baz','foo']
9
21
  end
10
22
 
11
23
  it 'can remove categories' do
12
- lp = Linnaeus::Persistence.new
13
- lp.clear_all_training_data
24
+ lp = get_linnaeus_persistence
14
25
  add_categories lp
15
26
  lp.remove_category 'bar'
16
27
  lp.get_categories.sort.should eq ['baz','foo']
17
28
  end
18
29
 
30
+ it '#get_words_with_count_for_category' do
31
+ lp = get_linnaeus_persistence
32
+ train_a_document_in 'testcategory'
33
+ lp.get_words_with_count_for_category('testcategory').should eq({
34
+ "test"=>"1", "document"=>"1", "stuff"=>"1",
35
+ "bayesian"=>"1", "corpu"=>"1"
36
+ })
37
+ end
38
+
39
+ it '#increment_word_counts_for_category' do
40
+ lp = get_linnaeus_persistence
41
+ train_a_document_in 'testcategory'
42
+ train_a_document_in 'testcategory'
43
+ lp.get_words_with_count_for_category('testcategory').should eq({
44
+ "test"=>"2", "document"=>"2", "stuff"=>"2",
45
+ "bayesian"=>"2", "corpu"=>"2"
46
+ })
47
+ end
48
+
49
+ it '#decrement_word_counts_for_category' do
50
+ lp = get_linnaeus_persistence
51
+ train_a_document_in 'testcategory'
52
+ train_a_document_in 'testcategory'
53
+ untrain_a_document_in 'testcategory'
54
+ lp.get_words_with_count_for_category('testcategory').should eq({
55
+ "test"=>"1", "document"=>"1", "stuff"=>"1",
56
+ "bayesian"=>"1", "corpu"=>"1"
57
+ })
58
+ end
59
+
60
+ it '#cleanup_empty_words_in_category' do
61
+ lp = get_linnaeus_persistence
62
+ train_a_document_in 'testcategory'
63
+ untrain_a_document_in 'testcategory'
64
+ lp.get_words_with_count_for_category('testcategory').should eq ({})
65
+ end
66
+
19
67
  def add_categories(lp)
20
68
  lp.add_categories(['foo','bar','baz','foo', 'bar'])
21
69
  end
70
+
71
+ def get_linnaeus_persistence
72
+ @lp ||= Linnaeus::Persistence.new
73
+ end
74
+
75
+ def train_a_document_in(category)
76
+ lt = Linnaeus::Trainer.new
77
+ lt.train category, document
78
+ end
79
+
80
+ def untrain_a_document_in(category)
81
+ lt = Linnaeus::Trainer.new
82
+ lt.untrain category, document
83
+ end
84
+
85
+ def document
86
+ 'I am a test document and I will have stuff in the bayesian corpus'
87
+ end
22
88
  end
@@ -52,7 +52,7 @@ describe Linnaeus::Trainer do
52
52
 
53
53
  context 'with non-default stopwords' do
54
54
  subject { Linnaeus::Trainer.new(stopwords_class: FooStop) }
55
- it 'should count word occurrencs properly' do
55
+ it 'should count word occurrences properly' do
56
56
  subject.count_word_occurrences('foo bar foo baz').should == { 'baz' => 1 }
57
57
  end
58
58
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: linnaeus
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.2
4
+ version: 1.0.3
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-10-31 00:00:00.000000000 Z
12
+ date: 2012-11-02 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: redis
@@ -139,6 +139,22 @@ dependencies:
139
139
  - - ! '>='
140
140
  - !ruby/object:Gem::Version
141
141
  version: '0'
142
+ - !ruby/object:Gem::Dependency
143
+ name: redcarpet
144
+ requirement: !ruby/object:Gem::Requirement
145
+ none: false
146
+ requirements:
147
+ - - ! '>='
148
+ - !ruby/object:Gem::Version
149
+ version: '0'
150
+ type: :development
151
+ prerelease: false
152
+ version_requirements: !ruby/object:Gem::Requirement
153
+ none: false
154
+ requirements:
155
+ - - ! '>='
156
+ - !ruby/object:Gem::Version
157
+ version: '0'
142
158
  description: Linnaeus provides a redis-backed Bayesian classifier. Words are stemmed,
143
159
  stopwords are stopped, and redis is used to allow for persistent and concurrent
144
160
  training and classification.
@@ -147,7 +163,7 @@ executables: []
147
163
  extensions: []
148
164
  extra_rdoc_files:
149
165
  - LICENSE.txt
150
- - README.rdoc
166
+ - README.md
151
167
  files:
152
168
  - .document
153
169
  - .rspec
@@ -155,7 +171,7 @@ files:
155
171
  - Gemfile
156
172
  - Gemfile.lock
157
173
  - LICENSE.txt
158
- - README.rdoc
174
+ - README.md
159
175
  - Rakefile
160
176
  - VERSION
161
177
  - images/linnaeus.jpg
@@ -186,7 +202,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
186
202
  version: '0'
187
203
  segments:
188
204
  - 0
189
- hash: 2735128872899197737
205
+ hash: 2790078718663664512
190
206
  required_rubygems_version: !ruby/object:Gem::Requirement
191
207
  none: false
192
208
  requirements:
data/README.rdoc DELETED
@@ -1,35 +0,0 @@
1
- = Linnaeus
2
-
3
- https://raw.github.com/djcp/linnaeus/master/images/linnaeus.jpg
4
-
5
- Linnaeus is a redis-backed Bayesian classification system. Please see the generated rdocs for more information. Ruby 1.9 is required.
6
-
7
- == Examples
8
-
9
- lt = Linnaeus::Trainer.new # Used to train documents
10
- lc = Linnaeus::Classifier.new # Used to classify documents
11
-
12
- lt.train 'language', 'Ruby is a dynamic, reflective, general-purpose object-oriented programming language that combines syntax inspired by Perl with Smalltalk-like features.'
13
- lt.train 'database', 'PostgreSQL, often simply Postgres, is an object-relational database management system (ORDBMS) available for many platforms including Linux, FreeBSD, Solaris, Microsoft Windows and Mac OS X.'
14
-
15
- lc.classify 'Perl is a high-level, general-purpose, interpreted, dynamic programming language.' # returns "language"
16
-
17
-
18
- == Contributing to linnaeus
19
-
20
- * Submit bugs to the github issue tracker: https://github.com/djcp/linnaeus/issues
21
- * If you'd like to add a feature, please submit a description of it to the issue tracker so we can discuss.
22
- * If the feature makes sense, fork the github repository. Write rspec tests and issue a pull request when your change is done.
23
-
24
- == The Future
25
-
26
- * Create additional storage backends - sqlite, postgresql, mongodb, etc.
27
- * Allow for weighting tweaks.
28
-
29
- == Copyright
30
-
31
- Copyright (c) 2012 Dan Collis-Puro. See LICENSE.txt for further details.
32
-
33
- == Credits
34
-
35
- * Image courtesy wikipedia. About Carl Linnaeus: http://en.wikipedia.org/wiki/Linnaeus