linnaeus 1.0.2 → 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +8 -7
- data/Gemfile.lock +2 -0
- data/README.md +40 -0
- data/VERSION +1 -1
- data/lib/linnaeus/classifier.rb +6 -1
- data/lib/linnaeus/persistence.rb +3 -1
- data/lib/linnaeus.rb +1 -1
- data/linnaeus.gemspec +7 -4
- data/spec/linnaeus_classifier_spec.rb +21 -1
- data/spec/linnaeus_persistence_spec.rb +70 -4
- data/spec/linnaeus_trainer_spec.rb +1 -1
- metadata +21 -5
- data/README.rdoc +0 -35
data/Gemfile
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
source
|
1
|
+
source 'http://rubygems.org'
|
2
2
|
|
3
3
|
gem 'redis', '~> 3.0.0'
|
4
4
|
gem 'stemmer', '~> 1.0.0'
|
@@ -6,10 +6,11 @@ gem 'stemmer', '~> 1.0.0'
|
|
6
6
|
# Add dependencies to develop your gem here.
|
7
7
|
# Include everything needed to run rake, tests, features, etc.
|
8
8
|
group :development do
|
9
|
-
gem
|
10
|
-
gem
|
11
|
-
gem
|
12
|
-
gem
|
13
|
-
gem
|
14
|
-
gem
|
9
|
+
gem 'rspec', '~> 2.11.0'
|
10
|
+
gem 'yard', '~> 0.7'
|
11
|
+
gem 'rdoc', '~> 3.12'
|
12
|
+
gem 'bundler'
|
13
|
+
gem 'jeweler'
|
14
|
+
gem 'simplecov'
|
15
|
+
gem 'redcarpet'
|
15
16
|
end
|
data/Gemfile.lock
CHANGED
@@ -13,6 +13,7 @@ GEM
|
|
13
13
|
rake (0.9.2.2)
|
14
14
|
rdoc (3.12)
|
15
15
|
json (~> 1.4)
|
16
|
+
redcarpet (2.2.2)
|
16
17
|
redis (3.0.2)
|
17
18
|
rspec (2.11.0)
|
18
19
|
rspec-core (~> 2.11.0)
|
@@ -36,6 +37,7 @@ DEPENDENCIES
|
|
36
37
|
bundler
|
37
38
|
jeweler
|
38
39
|
rdoc (~> 3.12)
|
40
|
+
redcarpet
|
39
41
|
redis (~> 3.0.0)
|
40
42
|
rspec (~> 2.11.0)
|
41
43
|
simplecov
|
data/README.md
ADDED
@@ -0,0 +1,40 @@
|
|
1
|
+
# Linnaeus [Build Status](http://travis-ci.org/djcp/linnaeus)
|
2
|
+
|
3
|
+

|
4
|
+
|
5
|
+
Linnaeus is a redis-backed naive Bayesian classification system. Please see the [rdoc](http://rubydoc.info/gems/linnaeus/) for more information. Ruby 1.9 is required.
|
6
|
+
|
7
|
+
Examples
|
8
|
+
--------
|
9
|
+
|
10
|
+
lt = Linnaeus::Trainer.new # Used to train documents
|
11
|
+
lc = Linnaeus::Classifier.new # Used to classify documents
|
12
|
+
|
13
|
+
lt.train 'language', 'Ruby is a dynamic, reflective, general-purpose object-oriented programming language that combines syntax inspired by Perl with Smalltalk-like features.'
|
14
|
+
lt.train 'database', 'PostgreSQL, often simply Postgres, is an object-relational database management system (ORDBMS) available for many platforms including Linux, FreeBSD, Solaris, Microsoft Windows and Mac OS X.'
|
15
|
+
|
16
|
+
lc.classify 'Perl is a high-level, general-purpose, interpreted, dynamic programming language.' # returns "language"
|
17
|
+
|
18
|
+
|
19
|
+
Contributing to linnaeus
|
20
|
+
------------------------
|
21
|
+
|
22
|
+
* Submit bugs to the github issue tracker: https://github.com/djcp/linnaeus/issues
|
23
|
+
* If you'd like to add a feature, please submit a description of it to the issue tracker so we can discuss.
|
24
|
+
* If the feature makes sense, fork the github repository. Write rspec tests and issue a pull request when your change is done.
|
25
|
+
|
26
|
+
The Future
|
27
|
+
----------
|
28
|
+
|
29
|
+
* Create additional storage backends - sqlite, postgresql, mongodb, etc.
|
30
|
+
* Allow for weighting tweaks.
|
31
|
+
|
32
|
+
Copyright
|
33
|
+
---------
|
34
|
+
|
35
|
+
Copyright (c) 2012 Dan Collis-Puro. See LICENSE.txt for further details.
|
36
|
+
|
37
|
+
Credits
|
38
|
+
-------
|
39
|
+
|
40
|
+
* Image courtesy wikipedia. About Carl Linnaeus: http://en.wikipedia.org/wiki/Linnaeus
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
1.0.2
|
1
|
+
1.0.3
|
data/lib/linnaeus/classifier.rb
CHANGED
@@ -57,7 +57,12 @@ class Linnaeus::Classifier < Linnaeus
|
|
57
57
|
# == Returns
|
58
58
|
# A string representing the most likely category.
|
59
59
|
def classify(text)
|
60
|
-
|
60
|
+
scores = classification_scores(text)
|
61
|
+
if scores.any?
|
62
|
+
(scores.sort_by { |a| -a[1] })[0][0]
|
63
|
+
else
|
64
|
+
''
|
65
|
+
end
|
61
66
|
end
|
62
67
|
|
63
68
|
end
|
data/lib/linnaeus/persistence.rb
CHANGED
@@ -116,7 +116,9 @@ class Linnaeus::Persistence < Linnaeus
|
|
116
116
|
if empty_words == word_counts
|
117
117
|
@redis.del BASE_CATEGORY_KEY + category
|
118
118
|
else
|
119
|
-
|
119
|
+
if empty_words.any?
|
120
|
+
@redis.hdel BASE_CATEGORY_KEY + category, empty_words.keys
|
121
|
+
end
|
120
122
|
end
|
121
123
|
end
|
122
124
|
|
data/lib/linnaeus.rb
CHANGED
@@ -44,7 +44,7 @@ class Linnaeus
|
|
44
44
|
# A string or array of categories
|
45
45
|
def normalize_categories(categories = [])
|
46
46
|
[categories].flatten.collect do |cat|
|
47
|
-
cat.to_s.downcase.gsub(/[^a-z\d\.\-_]/,'')
|
47
|
+
cat.to_s.encode(@encoding).downcase.gsub(/[^a-z\d\.\-_ ]/,'')
|
48
48
|
end.reject{|cat| cat == ''}.compact
|
49
49
|
end
|
50
50
|
|
data/linnaeus.gemspec
CHANGED
@@ -5,16 +5,16 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = "linnaeus"
|
8
|
-
s.version = "1.0.2"
|
8
|
+
s.version = "1.0.3"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["djcp"]
|
12
|
-
s.date = "2012-
|
12
|
+
s.date = "2012-11-02"
|
13
13
|
s.description = "Linnaeus provides a redis-backed Bayesian classifier. Words are stemmed, stopwords are stopped, and redis is used to allow for persistent and concurrent training and classification."
|
14
14
|
s.email = "dan@collispuro.net"
|
15
15
|
s.extra_rdoc_files = [
|
16
16
|
"LICENSE.txt",
|
17
|
-
"README.rdoc"
|
17
|
+
"README.md"
|
18
18
|
]
|
19
19
|
s.files = [
|
20
20
|
".document",
|
@@ -23,7 +23,7 @@ Gem::Specification.new do |s|
|
|
23
23
|
"Gemfile",
|
24
24
|
"Gemfile.lock",
|
25
25
|
"LICENSE.txt",
|
26
|
-
"README.rdoc",
|
26
|
+
"README.md",
|
27
27
|
"Rakefile",
|
28
28
|
"VERSION",
|
29
29
|
"images/linnaeus.jpg",
|
@@ -58,6 +58,7 @@ Gem::Specification.new do |s|
|
|
58
58
|
s.add_development_dependency(%q<bundler>, [">= 0"])
|
59
59
|
s.add_development_dependency(%q<jeweler>, [">= 0"])
|
60
60
|
s.add_development_dependency(%q<simplecov>, [">= 0"])
|
61
|
+
s.add_development_dependency(%q<redcarpet>, [">= 0"])
|
61
62
|
else
|
62
63
|
s.add_dependency(%q<redis>, ["~> 3.0.0"])
|
63
64
|
s.add_dependency(%q<stemmer>, ["~> 1.0.0"])
|
@@ -67,6 +68,7 @@ Gem::Specification.new do |s|
|
|
67
68
|
s.add_dependency(%q<bundler>, [">= 0"])
|
68
69
|
s.add_dependency(%q<jeweler>, [">= 0"])
|
69
70
|
s.add_dependency(%q<simplecov>, [">= 0"])
|
71
|
+
s.add_dependency(%q<redcarpet>, [">= 0"])
|
70
72
|
end
|
71
73
|
else
|
72
74
|
s.add_dependency(%q<redis>, ["~> 3.0.0"])
|
@@ -77,6 +79,7 @@ Gem::Specification.new do |s|
|
|
77
79
|
s.add_dependency(%q<bundler>, [">= 0"])
|
78
80
|
s.add_dependency(%q<jeweler>, [">= 0"])
|
79
81
|
s.add_dependency(%q<simplecov>, [">= 0"])
|
82
|
+
s.add_dependency(%q<redcarpet>, [">= 0"])
|
80
83
|
end
|
81
84
|
end
|
82
85
|
|
data/spec/linnaeus_classifier_spec.rb
CHANGED
@@ -1,12 +1,32 @@
|
|
1
1
|
require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
|
2
2
|
|
3
3
|
describe Linnaeus::Classifier do
|
4
|
+
context 'with no training data' do
|
5
|
+
it 'should return empty values when attempting to classify' do
|
6
|
+
Linnaeus::Persistence.new.clear_all_training_data
|
7
|
+
subject.classify("foo bar baz").should be_empty
|
8
|
+
subject.classification_scores("foo bar baz").should be_empty
|
9
|
+
end
|
10
|
+
end
|
11
|
+
|
4
12
|
context 'with a very small dataset' do
|
5
|
-
|
13
|
+
before do
|
6
14
|
create_small_dataset
|
15
|
+
end
|
16
|
+
|
17
|
+
it 'should classify easy things well' do
|
7
18
|
subject.classify('A bird that migrates').should eq('bird')
|
8
19
|
subject.classify('This was directed by Gus Van Sant').should eq('movie')
|
9
20
|
end
|
21
|
+
|
22
|
+
it 'should return correct classification scores' do
|
23
|
+
subject.classification_scores('a bird').should eq(
|
24
|
+
{ "movie"=>-6.272877006546167, "bird"=>-4.2626798770413155 }
|
25
|
+
)
|
26
|
+
subject.classification_scores('a directorial bird').should eq(
|
27
|
+
{ "movie"=>-10.24316892009829, "bird"=>-10.827944847076676 }
|
28
|
+
)
|
29
|
+
end
|
10
30
|
end
|
11
31
|
|
12
32
|
def create_small_dataset
|
data/spec/linnaeus_persistence_spec.rb
CHANGED
@@ -1,22 +1,88 @@
|
|
1
1
|
require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
|
2
2
|
|
3
3
|
describe Linnaeus::Persistence do
|
4
|
-
|
5
|
-
lp =
|
4
|
+
before do
|
5
|
+
lp = get_linnaeus_persistence
|
6
|
+
lp.clear_all_training_data
|
7
|
+
end
|
8
|
+
|
9
|
+
it '#clear_all_training_data' do
|
10
|
+
lp = get_linnaeus_persistence
|
11
|
+
train_a_document_in 'testcategory'
|
12
|
+
lp.get_words_with_count_for_category('testcategory').should_not be_empty
|
6
13
|
lp.clear_all_training_data
|
14
|
+
lp.get_words_with_count_for_category('testcategory').should be_empty
|
15
|
+
end
|
16
|
+
|
17
|
+
it 'stores categories successfully' do
|
18
|
+
lp = get_linnaeus_persistence
|
7
19
|
add_categories lp
|
8
20
|
lp.get_categories.sort.should eq ['bar','baz','foo']
|
9
21
|
end
|
10
22
|
|
11
23
|
it 'can remove categories' do
|
12
|
-
lp =
|
13
|
-
lp.clear_all_training_data
|
24
|
+
lp = get_linnaeus_persistence
|
14
25
|
add_categories lp
|
15
26
|
lp.remove_category 'bar'
|
16
27
|
lp.get_categories.sort.should eq ['baz','foo']
|
17
28
|
end
|
18
29
|
|
30
|
+
it '#get_words_with_count_for_category' do
|
31
|
+
lp = get_linnaeus_persistence
|
32
|
+
train_a_document_in 'testcategory'
|
33
|
+
lp.get_words_with_count_for_category('testcategory').should eq({
|
34
|
+
"test"=>"1", "document"=>"1", "stuff"=>"1",
|
35
|
+
"bayesian"=>"1", "corpu"=>"1"
|
36
|
+
})
|
37
|
+
end
|
38
|
+
|
39
|
+
it '#increment_word_counts_for_category' do
|
40
|
+
lp = get_linnaeus_persistence
|
41
|
+
train_a_document_in 'testcategory'
|
42
|
+
train_a_document_in 'testcategory'
|
43
|
+
lp.get_words_with_count_for_category('testcategory').should eq({
|
44
|
+
"test"=>"2", "document"=>"2", "stuff"=>"2",
|
45
|
+
"bayesian"=>"2", "corpu"=>"2"
|
46
|
+
})
|
47
|
+
end
|
48
|
+
|
49
|
+
it '#decrement_word_counts_for_category' do
|
50
|
+
lp = get_linnaeus_persistence
|
51
|
+
train_a_document_in 'testcategory'
|
52
|
+
train_a_document_in 'testcategory'
|
53
|
+
untrain_a_document_in 'testcategory'
|
54
|
+
lp.get_words_with_count_for_category('testcategory').should eq({
|
55
|
+
"test"=>"1", "document"=>"1", "stuff"=>"1",
|
56
|
+
"bayesian"=>"1", "corpu"=>"1"
|
57
|
+
})
|
58
|
+
end
|
59
|
+
|
60
|
+
it '#cleanup_empty_words_in_category' do
|
61
|
+
lp = get_linnaeus_persistence
|
62
|
+
train_a_document_in 'testcategory'
|
63
|
+
untrain_a_document_in 'testcategory'
|
64
|
+
lp.get_words_with_count_for_category('testcategory').should eq ({})
|
65
|
+
end
|
66
|
+
|
19
67
|
def add_categories(lp)
|
20
68
|
lp.add_categories(['foo','bar','baz','foo', 'bar'])
|
21
69
|
end
|
70
|
+
|
71
|
+
def get_linnaeus_persistence
|
72
|
+
@lp ||= Linnaeus::Persistence.new
|
73
|
+
end
|
74
|
+
|
75
|
+
def train_a_document_in(category)
|
76
|
+
lt = Linnaeus::Trainer.new
|
77
|
+
lt.train category, document
|
78
|
+
end
|
79
|
+
|
80
|
+
def untrain_a_document_in(category)
|
81
|
+
lt = Linnaeus::Trainer.new
|
82
|
+
lt.untrain category, document
|
83
|
+
end
|
84
|
+
|
85
|
+
def document
|
86
|
+
'I am a test document and I will have stuff in the bayesian corpus'
|
87
|
+
end
|
22
88
|
end
|
data/spec/linnaeus_trainer_spec.rb
CHANGED
@@ -52,7 +52,7 @@ describe Linnaeus::Trainer do
|
|
52
52
|
|
53
53
|
context 'with non-default stopwords' do
|
54
54
|
subject { Linnaeus::Trainer.new(stopwords_class: FooStop) }
|
55
|
-
it 'should count word
|
55
|
+
it 'should count word occurrences properly' do
|
56
56
|
subject.count_word_occurrences('foo bar foo baz').should == { 'baz' => 1 }
|
57
57
|
end
|
58
58
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: linnaeus
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.2
|
4
|
+
version: 1.0.3
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
12
|
+
date: 2012-11-02 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: redis
|
@@ -139,6 +139,22 @@ dependencies:
|
|
139
139
|
- - ! '>='
|
140
140
|
- !ruby/object:Gem::Version
|
141
141
|
version: '0'
|
142
|
+
- !ruby/object:Gem::Dependency
|
143
|
+
name: redcarpet
|
144
|
+
requirement: !ruby/object:Gem::Requirement
|
145
|
+
none: false
|
146
|
+
requirements:
|
147
|
+
- - ! '>='
|
148
|
+
- !ruby/object:Gem::Version
|
149
|
+
version: '0'
|
150
|
+
type: :development
|
151
|
+
prerelease: false
|
152
|
+
version_requirements: !ruby/object:Gem::Requirement
|
153
|
+
none: false
|
154
|
+
requirements:
|
155
|
+
- - ! '>='
|
156
|
+
- !ruby/object:Gem::Version
|
157
|
+
version: '0'
|
142
158
|
description: Linnaeus provides a redis-backed Bayesian classifier. Words are stemmed,
|
143
159
|
stopwords are stopped, and redis is used to allow for persistent and concurrent
|
144
160
|
training and classification.
|
@@ -147,7 +163,7 @@ executables: []
|
|
147
163
|
extensions: []
|
148
164
|
extra_rdoc_files:
|
149
165
|
- LICENSE.txt
|
150
|
-
- README.rdoc
|
166
|
+
- README.md
|
151
167
|
files:
|
152
168
|
- .document
|
153
169
|
- .rspec
|
@@ -155,7 +171,7 @@ files:
|
|
155
171
|
- Gemfile
|
156
172
|
- Gemfile.lock
|
157
173
|
- LICENSE.txt
|
158
|
-
- README.rdoc
|
174
|
+
- README.md
|
159
175
|
- Rakefile
|
160
176
|
- VERSION
|
161
177
|
- images/linnaeus.jpg
|
@@ -186,7 +202,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
186
202
|
version: '0'
|
187
203
|
segments:
|
188
204
|
- 0
|
189
|
-
hash:
|
205
|
+
hash: 2790078718663664512
|
190
206
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
191
207
|
none: false
|
192
208
|
requirements:
|
data/README.rdoc
DELETED
@@ -1,35 +0,0 @@
|
|
1
|
-
= Linnaeus
|
2
|
-
|
3
|
-
https://raw.github.com/djcp/linnaeus/master/images/linnaeus.jpg
|
4
|
-
|
5
|
-
Linnaeus is a redis-backed Bayesian classification system. Please see the generated rdocs for more information. Ruby 1.9 is required.
|
6
|
-
|
7
|
-
== Examples
|
8
|
-
|
9
|
-
lt = Linnaeus::Trainer.new # Used to train documents
|
10
|
-
lc = Linnaeus::Classifier.new # Used to classify documents
|
11
|
-
|
12
|
-
lt.train 'language', 'Ruby is a dynamic, reflective, general-purpose object-oriented programming language that combines syntax inspired by Perl with Smalltalk-like features.'
|
13
|
-
lt.train 'database', 'PostgreSQL, often simply Postgres, is an object-relational database management system (ORDBMS) available for many platforms including Linux, FreeBSD, Solaris, Microsoft Windows and Mac OS X.'
|
14
|
-
|
15
|
-
lc.classify 'Perl is a high-level, general-purpose, interpreted, dynamic programming language.' # returns "language"
|
16
|
-
|
17
|
-
|
18
|
-
== Contributing to linnaeus
|
19
|
-
|
20
|
-
* Submit bugs to the github issue tracker: https://github.com/djcp/linnaeus/issues
|
21
|
-
* If you'd like to add a feature, please submit a description of it to the issue tracker so we can discuss.
|
22
|
-
* If the feature makes sense, fork the github repository. Write rspec tests and issue a pull request when your change is done.
|
23
|
-
|
24
|
-
== The Future
|
25
|
-
|
26
|
-
* Create additional storage backends - sqlite, postgresql, mongodb, etc.
|
27
|
-
* Allow for weighting tweaks.
|
28
|
-
|
29
|
-
== Copyright
|
30
|
-
|
31
|
-
Copyright (c) 2012 Dan Collis-Puro. See LICENSE.txt for further details.
|
32
|
-
|
33
|
-
== Credits
|
34
|
-
|
35
|
-
* Image courtesy wikipedia. About Carl Linnaeus: http://en.wikipedia.org/wiki/Linnaeus
|