linnaeus 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/.rspec +1 -0
- data/.travis.yml +6 -0
- data/Gemfile +15 -0
- data/Gemfile.lock +43 -0
- data/LICENSE.txt +20 -0
- data/README.rdoc +36 -0
- data/Rakefile +37 -0
- data/VERSION +1 -0
- data/images/linnaeus.jpg +0 -0
- data/lib/linnaeus.rb +60 -0
- data/lib/linnaeus/classifier.rb +41 -0
- data/lib/linnaeus/persistence.rb +114 -0
- data/lib/linnaeus/stopwords.rb +17 -0
- data/lib/linnaeus/trainer.rb +38 -0
- data/spec/linnaeus_classifier_spec.rb +23 -0
- data/spec/linnaeus_persistence_spec.rb +22 -0
- data/spec/linnaeus_spec.rb +4 -0
- data/spec/linnaeus_stopwords_spec.rb +20 -0
- data/spec/linnaeus_trainer_spec.rb +73 -0
- data/spec/spec_helper.rb +14 -0
- metadata +201 -0
data/.document
ADDED
data/.rspec
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
--color
|
data/.travis.yml
ADDED
data/Gemfile
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
# Fetch gems over TLS so downloads cannot be tampered with in transit.
source "https://rubygems.org"

# Runtime dependencies.
gem 'redis', '~> 3.0.0'
gem 'stemmer', '~> 1.0.0'

# Add dependencies to develop your gem here.
# Include everything needed to run rake, tests, features, etc.
group :development do
  gem "rspec", "~> 2.11.0"
  gem "yard", "~> 0.7"
  gem "rdoc", "~> 3.12"
  gem "bundler"
  gem "jeweler"
  gem "simplecov"
end
|
data/Gemfile.lock
ADDED
@@ -0,0 +1,43 @@
|
|
1
|
+
GEM
|
2
|
+
remote: http://rubygems.org/
|
3
|
+
specs:
|
4
|
+
diff-lcs (1.1.3)
|
5
|
+
git (1.2.5)
|
6
|
+
jeweler (1.8.4)
|
7
|
+
bundler (~> 1.0)
|
8
|
+
git (>= 1.2.5)
|
9
|
+
rake
|
10
|
+
rdoc
|
11
|
+
json (1.7.5)
|
12
|
+
multi_json (1.3.6)
|
13
|
+
rake (0.9.2.2)
|
14
|
+
rdoc (3.12)
|
15
|
+
json (~> 1.4)
|
16
|
+
redis (3.0.2)
|
17
|
+
rspec (2.11.0)
|
18
|
+
rspec-core (~> 2.11.0)
|
19
|
+
rspec-expectations (~> 2.11.0)
|
20
|
+
rspec-mocks (~> 2.11.0)
|
21
|
+
rspec-core (2.11.1)
|
22
|
+
rspec-expectations (2.11.3)
|
23
|
+
diff-lcs (~> 1.1.3)
|
24
|
+
rspec-mocks (2.11.3)
|
25
|
+
simplecov (0.7.1)
|
26
|
+
multi_json (~> 1.0)
|
27
|
+
simplecov-html (~> 0.7.1)
|
28
|
+
simplecov-html (0.7.1)
|
29
|
+
stemmer (1.0.1)
|
30
|
+
yard (0.8.3)
|
31
|
+
|
32
|
+
PLATFORMS
|
33
|
+
ruby
|
34
|
+
|
35
|
+
DEPENDENCIES
|
36
|
+
bundler
|
37
|
+
jeweler
|
38
|
+
rdoc (~> 3.12)
|
39
|
+
redis (~> 3.0.0)
|
40
|
+
rspec (~> 2.11.0)
|
41
|
+
simplecov
|
42
|
+
stemmer (~> 1.0.0)
|
43
|
+
yard (~> 0.7)
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2012 Dan Collis-Puro
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.rdoc
ADDED
@@ -0,0 +1,36 @@
|
|
1
|
+
= Linnaeus
|
2
|
+
|
3
|
+
https://raw.github.com/djcp/linnaeus/master/images/linnaeus.jpg
|
4
|
+
|
5
|
+
Linnaeus is a redis-backed Bayesian classification system. Please see the generated rdocs for more information.
|
6
|
+
|
7
|
+
== Examples
|
8
|
+
|
9
|
+
lt = Linnaeus::Trainer.new # Used to train documents
|
10
|
+
lc = Linnaeus::Classifier.new # Used to classify documents
|
11
|
+
|
12
|
+
lt.train 'language', 'Ruby is a dynamic, reflective, general-purpose object-oriented programming language that combines syntax inspired by Perl with Smalltalk-like features.'
|
13
|
+
lt.train 'database', 'PostgreSQL, often simply Postgres, is an object-relational database management system (ORDBMS) available for many platforms including Linux, FreeBSD, Solaris, Microsoft Windows and Mac OS X.'
|
14
|
+
|
15
|
+
lc.classify 'Perl is a high-level, general-purpose, interpreted, dynamic programming language.' # returns "language"
|
16
|
+
|
17
|
+
|
18
|
+
== Contributing to linnaeus
|
19
|
+
|
20
|
+
* Submit bugs to the github issue tracker: https://github.com/djcp/linnaeus/issues
|
21
|
+
* If you'd like to add a feature, please submit a description of it to the issue tracker so we can discuss.
|
22
|
+
* If the feature makes sense, fork the github repository. Write rspec tests and issue a pull request when your change is done.
|
23
|
+
|
24
|
+
== The Future
|
25
|
+
|
26
|
+
* Make sure Unicode text is handled correctly
|
27
|
+
* Create additional storage backends - sqlite, postgresql, mongodb, etc.
|
28
|
+
* Allow for weighting tweaks.
|
29
|
+
|
30
|
+
== Copyright
|
31
|
+
|
32
|
+
Copyright (c) 2012 Dan Collis-Puro. See LICENSE.txt for further details.
|
33
|
+
|
34
|
+
== Credits
|
35
|
+
|
36
|
+
* Image courtesy of Wikipedia. About Carl Linnaeus: http://en.wikipedia.org/wiki/Linnaeus
|
data/Rakefile
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require 'rubygems'
require 'bundler'

# Activate the default and development gem groups; bail out with Bundler's
# own exit status when gems are missing.
begin
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => error
  $stderr.puts error.message, "Run `bundle install` to install missing gems"
  exit error.status_code
end

require 'rake'
require 'jeweler'

# Gem packaging tasks. The block receives a Gem::Specification; see
# http://docs.rubygems.org/read/chapter/20 for more options.
# Dependencies are defined in the Gemfile.
Jeweler::Tasks.new do |gemspec|
  gemspec.name        = "linnaeus"
  gemspec.homepage    = "http://github.com/djcp/linnaeus"
  gemspec.license     = "MIT"
  gemspec.summary     = "Another redis-backed Bayesian classifier"
  gemspec.description = "Linnaeus provides a redis-backed Bayesian classifier. Words are stemmed, stopwords are stopped, and redis is used to allow for persistent and concurrent training and classification."
  gemspec.email       = "dan@collispuro.net"
  gemspec.authors     = ["djcp"]
end
Jeweler::RubygemsDotOrgTasks.new

require 'rspec/core'
require 'rspec/core/rake_task'

# `rake spec` runs every *_spec.rb file below spec/.
RSpec::Core::RakeTask.new(:spec) do |spec_task|
  spec_task.pattern = FileList['spec/**/*_spec.rb']
end

# Plain `rake` runs the specs.
task default: :spec

require 'yard'
YARD::Rake::YardocTask.new
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
1.0.0
|
data/images/linnaeus.jpg
ADDED
Binary file
|
data/lib/linnaeus.rb
ADDED
@@ -0,0 +1,60 @@
|
|
1
|
+
$LOAD_PATH.unshift(File.dirname(__FILE__)) unless $LOAD_PATH.include?(File.dirname(__FILE__))
|
2
|
+
|
3
|
+
require 'redis'
|
4
|
+
require 'stemmer'
|
5
|
+
|
6
|
+
# The base class. You won't use this directly - use one of the subclasses.
|
7
|
+
class Linnaeus
|
8
|
+
|
9
|
+
# Wire up the storage backend and the stopword source.
#
# == Parameters
# opts::
#   An optional hash. Recognised keys are :persistence_class (defaults to
#   Persistence), :stopwords_class (defaults to Stopwords) and :skip_stemming
#   (defaults to false). The merged hash, including any other keys the caller
#   supplied, is handed to the persistence class constructor.
def initialize(opts = {})
  defaults = {
    persistence_class: Persistence,
    stopwords_class: Stopwords,
    skip_stemming: false
  }
  settings = defaults.merge(opts)
  persistence_class, stopwords_class = settings.values_at(:persistence_class, :stopwords_class)

  @db = persistence_class.new(settings)
  @stopword_generator = stopwords_class.new
  @skip_stemming = settings[:skip_stemming]
end
|
20
|
+
|
21
|
+
# Count occurrences of words in a text corpus.
#
# The text is lower-cased and split on whitespace. Stopwords are removed and
# the remaining words are stemmed using the "Stemmer" gem (unless stemming was
# disabled in the constructor) and tallied under their stem.
#
# == Parameters
# text::
#   A string representing a document. nil is treated as an empty document.
#
# == Returns
# A hash mapping each counted word to the number of times it appears.
def count_word_occurrences(text = '')
  text.to_s.downcase.split.each_with_object({}) do |word, counts|
    # Test the raw token first: Porter stemming rewrites several stopwords
    # ("was" -> "wa", "this" -> "thi"), which would otherwise slip past the list.
    next if stopwords.include?(word)

    token = @skip_stemming ? word : word.stem_porter
    # Still test the stem so stopword lists written in stemmed form keep working.
    next if stopwords.include?(token)

    counts[token] = counts.fetch(token, 0) + 1
  end
end
|
36
|
+
|
37
|
+
private
|
38
|
+
# Format categories for training or untraining.
#
# Each category is converted to a lower-case string and every character other
# than a-z, digits, ".", "-" and "_" is removed. Names that end up empty are
# dropped, and names that collapse to the same value are returned only once so
# a document is never counted twice for one category.
#
# == Parameters
# categories::
#   A string or array of categories
#
# == Returns
# An array of unique, normalized category names in first-seen order.
def normalize_categories(categories = [])
  cleaned = [categories].flatten.map do |category|
    category.to_s.downcase.gsub(/[^a-z\d\.\-_]/, '')
  end
  cleaned.reject(&:empty?).uniq
end
|
48
|
+
|
49
|
+
# The Set of stopwords to strip from documents for training / classifying.
# It is requested from the stopword generator on first use and cached.
def stopwords
  return @stopwords if @stopwords

  @stopwords = @stopword_generator.to_set
end
|
53
|
+
|
54
|
+
end
|
55
|
+
|
56
|
+
require 'set'
|
57
|
+
require 'linnaeus/stopwords'
|
58
|
+
require 'linnaeus/persistence'
|
59
|
+
require 'linnaeus/trainer'
|
60
|
+
require 'linnaeus/classifier'
|
@@ -0,0 +1,41 @@
|
|
1
|
+
# Classify documents against the Bayesian corpus.
|
2
|
+
class Linnaeus::Classifier < Linnaeus
|
3
|
+
|
4
|
+
# Returns a hash of scores for each category in the Bayesian corpus.
# Scores are sums of logarithms of per-word ratios, so the closer a score is
# to 0, the more likely a match it is.
#
# == Parameters
# text::
#   a string of text to classify.
#
# == Returns
# a hash of categories with a score as the values. A category with no stored
# word counts scores -Infinity so it can never outrank a trained category.
def classification_scores(text)
  # The document's words do not depend on the category: compute them once.
  words = count_word_occurrences(text).keys

  @db.get_categories.each_with_object({}) do |category, scores|
    counts = @db.get_words_with_count_for_category(category)
    total = counts.values.reduce(0) { |sum, count| sum + count.to_i }.to_f

    # Dividing by a zero total gives log(Infinity) = +Infinity, which would
    # make an empty category win every classification.
    unless total > 0
      scores[category] = -Float::INFINITY
      next
    end

    scores[category] = words.reduce(0) do |score, word|
      # Words never seen in this category get a small pseudo-count of 0.1.
      weight = counts[word].nil? ? 0.1 : counts[word].to_i
      score + Math.log(weight / total)
    end
  end
end
|
28
|
+
|
29
|
+
# The most likely category for a document.
#
# == Parameters
# text::
#   a string of text to classify.
#
# == Returns
# A string representing the most likely category (the one with the highest
# score), or nil when the corpus contains no categories.
def classify(text)
  # max_by finds the winner in a single pass and yields nil for an empty
  # corpus, where indexing into a sorted list would raise NoMethodError.
  best = classification_scores(text).max_by { |_category, score| score }
  best && best.first
end
|
40
|
+
|
41
|
+
end
|
@@ -0,0 +1,114 @@
|
|
1
|
+
# The redis persistence layer.
|
2
|
+
class Linnaeus::Persistence < Linnaeus
|
3
|
+
# The Set (in the redis sense) of categories is stored in this key.
|
4
|
+
CATEGORIES_KEY = 'Linnaeus:category'
|
5
|
+
# The base key for a category in the redis corpus. Word occurrence counts for a category appear under here.
|
6
|
+
BASE_CATEGORY_KEY = 'Linnaeus:cat:'
|
7
|
+
|
8
|
+
attr_accessor :redis
|
9
|
+
|
10
|
+
# Create the redis client used to store the corpus.
#
# == Parameters
# opts::
#   An optional hash; :redis_host, :redis_port and :redis_db override the
#   defaults of '127.0.0.1', '6379' and 0. Other keys are ignored.
def initialize(opts = {})
  defaults = { redis_host: '127.0.0.1', redis_port: '6379', redis_db: 0 }
  host, port, db = defaults.merge(opts).values_at(:redis_host, :redis_port, :redis_db)

  @redis = Redis.new(host: host, port: port, db: db)
  self
end
|
24
|
+
|
25
|
+
# Add categories to the bayesian corpus.
#
# == Parameters
# categories::
#   A string or array of categories.
#
# == Returns
# The reply of the redis SADD call, or 0 when there was nothing to add.
def add_categories(categories)
  # SADD without any member is rejected by redis, so an empty list is a no-op.
  return 0 if Array(categories).empty?

  @redis.sadd CATEGORIES_KEY, categories
end
|
33
|
+
|
34
|
+
# Remove a category from the set of categories in the bayesian corpus.
#
# == Parameters
# category::
#   A string or array of categories to remove.
def remove_category(category)
  @redis.srem(CATEGORIES_KEY, category)
end
|
42
|
+
|
43
|
+
# Get categories from the bayesian corpus.
#
# == Returns
# The members of the redis Set stored under CATEGORIES_KEY.
def get_categories
  @redis.smembers(CATEGORIES_KEY)
end
|
51
|
+
|
52
|
+
# Get the words recorded for a category together with their occurrence counts.
#
# == Parameters
# category::
#   A string representing a category.
#
# == Returns
# A hash with the word counts for this category, as returned by redis HGETALL
# on the category's key (BASE_CATEGORY_KEY followed by the category name).
def get_words_with_count_for_category(category)
  category_key = BASE_CATEGORY_KEY + category
  @redis.hgetall(category_key)
end
|
63
|
+
|
64
|
+
# Clear all training data from the backend.
#
# Only keys owned by Linnaeus are deleted: the category set and every
# per-category word count hash. Anything else stored in the same redis
# database is left alone.
#
# == Returns
# The string "OK" (the same reply FLUSHDB gives).
def clear_all_training_data
  # FLUSHDB would also wipe unrelated keys sharing this redis database.
  # Matching on the key prefix also catches word hashes whose category has
  # already been removed from the category set.
  owned_keys = @redis.keys("#{BASE_CATEGORY_KEY}*") + [CATEGORIES_KEY]
  @redis.del(*owned_keys)
  'OK'
end
|
68
|
+
|
69
|
+
# Increment word counts within a category. All increments are issued inside
# one redis MULTI block.
#
# == Parameters
# category::
#   A string representing a category.
# word_occurrences::
#   A hash containing a count of the number of word occurrences in a document
def increment_word_counts_for_category(category, word_occurrences)
  @redis.multi do |transaction|
    word_occurrences.each do |word, occurrences|
      transaction.hincrby(BASE_CATEGORY_KEY + category, word, occurrences)
    end
  end
end
|
83
|
+
|
84
|
+
# Decrement word counts within a category. This is used when removing a
# document from the corpus. All decrements are issued inside one redis MULTI
# block.
#
# == Parameters
# category::
#   A string representing a category.
# word_occurrences::
#   A hash containing a count of the number of word occurrences in a document
def decrement_word_counts_for_category(category, word_occurrences)
  @redis.multi do |transaction|
    word_occurrences.each do |word, occurrences|
      transaction.hincrby(BASE_CATEGORY_KEY + category, word, -occurrences)
    end
  end
end
|
98
|
+
|
99
|
+
# Clean out words with a count of zero (or less) in a category. Used during
# untraining.
#
# When every word in the category is empty the whole hash is deleted;
# otherwise only the empty words are removed from it.
#
# == Parameters
# category::
#   A string representing a category.
#
# == Returns
# The reply of the redis DEL / HDEL call, or 0 when nothing had to be removed.
def cleanup_empty_words_in_category(category)
  category_key = BASE_CATEGORY_KEY + category
  word_counts = @redis.hgetall(category_key)
  empty_words = word_counts.select { |_word, count| count.to_i <= 0 }

  # HDEL without any field is rejected by redis, so stop when nothing is stale.
  return 0 if empty_words.empty?

  if empty_words == word_counts
    @redis.del category_key
  else
    @redis.hdel category_key, empty_words.keys
  end
end
|
113
|
+
|
114
|
+
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
# The stopword list - you can override this list by creating a stopword generator and registering it in the Linnaeus::Trainer or Linnaeus::Classifier constructors.
|
2
|
+
class Linnaeus::Stopwords
|
3
|
+
# The default stopword list.
|
4
|
+
DEFAULT_STOPWORDS = %w(a able about across after all almost also am among an and any are as at be because been but by can cannot could dear did do does either else ever every for from get got had has have he her hers him his how however i if in into is it its just least let like likely may me might most must my neither no nor not of off often on only or other our own rather said say says she should since so some than that the their them then there these they this tis to too twas us wants was we were what when where which while who whom why will with would yet you your)
|
5
|
+
|
6
|
+
attr_accessor :stopwords
|
7
|
+
|
8
|
+
# The list of stopwords as an array: the list assigned through +stopwords=+
# when there is one, otherwise DEFAULT_STOPWORDS.
def to_a
  return @stopwords if @stopwords

  DEFAULT_STOPWORDS
end
|
12
|
+
|
13
|
+
# The list of stopwords as a ruby Set, built from #to_a.
def to_set
  Set.new(to_a)
end
|
17
|
+
end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
# Train or untrain documents from the Bayesian corpus.
|
2
|
+
class Linnaeus::Trainer < Linnaeus
|
3
|
+
|
4
|
+
# Add a document to the training corpus.
#
# Nothing is written when no valid category remains after normalization or
# when the text contains no countable words.
#
# == Parameters
# categories::
#   A string or array of categories
# text::
#   A string of text in this document.
#
# == Returns
# The array of normalized categories the document was recorded under; empty
# when nothing was recorded.
def train(categories, text)
  categories = normalize_categories(categories)
  word_occurrences = count_word_occurrences(text)

  # An empty category list must not reach the backend, and a document without
  # words would only register categories that have no data behind them.
  return [] if categories.empty? || word_occurrences.empty?

  @db.add_categories(categories)
  categories.each do |category|
    @db.increment_word_counts_for_category(category, word_occurrences)
  end
end
|
20
|
+
|
21
|
+
# Remove a document from the training corpus.
#
# The document's word counts are subtracted from every given category and
# words that drop to zero are cleaned out. A category left without any words
# is removed from the set of categories as well.
#
# == Parameters
# categories::
#   A string or array of categories
# text::
#   A string of text in this document.
#
# == Returns
# The array of normalized categories that were processed.
def untrain(categories, text)
  categories = normalize_categories(categories)
  word_occurrences = count_word_occurrences(text)

  categories.each do |category|
    @db.decrement_word_counts_for_category(category, word_occurrences)
    @db.cleanup_empty_words_in_category(category)
    # Without this the emptied category would stay registered and keep being
    # scored during classification although it has no words left.
    @db.remove_category(category) if @db.get_words_with_count_for_category(category).empty?
  end
end
|
37
|
+
|
38
|
+
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
|
2
|
+
|
3
|
+
describe Linnaeus::Classifier do
  context 'with a very small dataset' do
    it 'should classify easy things well' do
      create_small_dataset
      subject.classify('A bird that migrates').should eq('bird')
      subject.classify('This was directed by Gus Van Sant').should eq('movie')
    end
  end

  # Wipe the training data, then train three movie and three bird documents.
  def create_small_dataset
    Linnaeus::Persistence.new.clear_all_training_data
    trainer = Linnaeus::Trainer.new

    small_dataset.each do |category, documents|
      documents.each { |document| trainer.train category, document }
    end
  end

  # Training documents grouped by category, in the order they are trained.
  def small_dataset
    {
      'movie' => [
        "Gone with the Wind is a 1939 American historical epic film adapted from Margaret Mitchell's Pulitzer-winning 1936 novel of the same name.",
        "THX 1138 is a 1971 science fiction film directed by George Lucas in his feature directorial debut. The film was written by Lucas and Walter Murch.",
        "Top Gun is a 1986 American action drama film directed by Tony Scott, and produced by Don Simpson and Jerry Bruckheimer, in association with the Paramount Pictures company."
      ],
      'bird' => [
        "The Yellow-throated Warbler (Setophaga dominica) is a small migratory songbird species breeding in temperate North America. It belongs to the New World warbler family (Parulidae).",
        "The Blue Jay (Cyanocitta cristata) is a passerine bird in the family Corvidae, native to North America. It is resident through most of eastern and central United States and southern Canada, although western populations may be migratory.",
        "The Mallard or Wild Duck (Anas platyrhynchos) is a dabbling duck which breeds throughout the temperate and subtropical Americas, Europe, Asia, and North Africa, and has been introduced to New Zealand and Australia. This duck belongs to the subfamily Anatinae of the waterfowl family Anatidae"
      ]
    }
  end
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
|
2
|
+
|
3
|
+
describe Linnaeus::Persistence do
  let(:lp) { Linnaeus::Persistence.new }

  # Every example starts from an emptied backend holding foo, bar and baz;
  # the repeated names check that categories are stored as a set.
  before do
    lp.clear_all_training_data
    lp.add_categories(['foo','bar','baz','foo', 'bar'])
  end

  it 'stores categories successfully' do
    lp.get_categories.sort.should eq ['bar','baz','foo']
  end

  it 'can remove categories' do
    lp.remove_category 'bar'
    lp.get_categories.sort.should eq ['baz','foo']
  end
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
|
2
|
+
|
3
|
+
describe Linnaeus::Stopwords do
  let(:generator) { Linnaeus::Stopwords.new }

  it '.to_a' do
    generator.should respond_to :to_a
    list = generator.to_a
    list.should be_an_instance_of Array
    list.should include 'the'
  end

  it '.to_set' do
    generator.should respond_to :to_set
    set = generator.to_set
    set.should be_an_instance_of Set
    set.should include 'the'
  end

  it 'can have stopwords overridden' do
    custom = ['foo','bar']
    generator.stopwords = custom
    generator.to_a.should eq ['foo','bar']
    generator.to_set.should eq ['foo','bar'].to_set
  end
end
|
@@ -0,0 +1,73 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
|
2
|
+
|
3
|
+
describe Linnaeus::Trainer do
  context 'with default options' do
    subject { Linnaeus::Trainer.new }

    it 'should count word occurrencs properly' do
      counts = subject.count_word_occurrences('foo bar foo baz')
      counts.should == { 'foo' => 2, 'bar' => 1, 'baz' => 1 }
    end

    it 'should not count stopwords' do
      counts = subject.count_word_occurrences('foo the you')
      counts.should == { 'foo' => 1 }
    end

    it 'returns an empty hash when given an empty string' do
      counts = subject.count_word_occurrences
      counts.should == { }
    end

    it 'should train on documents properly' do
      lp = cleared_persistence
      [grape, orange].each { |document| subject.train 'fruit', document }

      # Words are stored stemmed and redis returns the counts as strings.
      stored = lp.get_words_with_count_for_category('fruit')
      stored.should eq(
        "grape"=>"1", "purpl"=>"1", "blue"=>"1", "green"=>"1",
        "fruit"=>"2", "sweet"=>"2", "wine"=>"1", "oval"=>"1",
        "orang"=>"1", "round"=>"1", "citru"=>"1"
      )
    end

    it 'should partially untrain properly' do
      lp = cleared_persistence
      [grape, orange].each { |document| subject.train 'fruit', document }

      subject.untrain 'fruit', grape
      stored = lp.get_words_with_count_for_category('fruit')
      stored.should eq({"fruit"=>"1", "sweet"=>"1", "orang"=>"1", "round"=>"1", "citru"=>"1"})
    end

    it 'should fully untrain properly' do
      lp = cleared_persistence
      subject.train 'fruit', grape
      subject.untrain 'fruit', grape
      lp.get_words_with_count_for_category('fruit').should eq({})
    end
  end

  context 'with non-default stopwords' do
    subject { Linnaeus::Trainer.new(stopwords_class: FooStop) }

    it 'should count word occurrencs properly' do
      counts = subject.count_word_occurrences('foo bar foo baz')
      counts.should == { 'baz' => 1 }
    end
  end

  # A persistence object whose training data has just been cleared.
  def cleared_persistence
    persistence = Linnaeus::Persistence.new
    persistence.clear_all_training_data
    persistence
  end

  # Sample document describing a grape.
  def grape
    'grape purple blue green fruit sweet wine oval'
  end

  # Sample document describing an orange.
  def orange
    'orange round citrus fruit sweet'
  end
end
|
68
|
+
|
69
|
+
# Minimal stopword generator for the specs: it only answers +to_set+,
# treating "foo" and "bar" as the stopwords.
class FooStop
  def to_set
    %w[foo bar].to_set
  end
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
spec_dir = File.dirname(__FILE__)

# Make lib/ and spec/ requirable; spec/ ends up first on the load path.
$LOAD_PATH.unshift(File.join(spec_dir, '..', 'lib'))
$LOAD_PATH.unshift(spec_dir)

# Coverage is started before the library is loaded.
require 'simplecov'
SimpleCov.start
require 'rspec'
require 'linnaeus'

# Requires supporting files with custom matchers and macros, etc,
# in ./support/ and its subdirectories.
Dir["#{spec_dir}/support/**/*.rb"].each { |support_file| require support_file }

RSpec.configure do |config|
end
|
metadata
ADDED
@@ -0,0 +1,201 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: linnaeus
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.0.0
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- djcp
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2012-10-30 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: redis
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ~>
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: 3.0.0
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ~>
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: 3.0.0
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: stemmer
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ~>
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: 1.0.0
|
38
|
+
type: :runtime
|
39
|
+
prerelease: false
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ~>
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: 1.0.0
|
46
|
+
- !ruby/object:Gem::Dependency
|
47
|
+
name: rspec
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
49
|
+
none: false
|
50
|
+
requirements:
|
51
|
+
- - ~>
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: 2.11.0
|
54
|
+
type: :development
|
55
|
+
prerelease: false
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ~>
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: 2.11.0
|
62
|
+
- !ruby/object:Gem::Dependency
|
63
|
+
name: yard
|
64
|
+
requirement: !ruby/object:Gem::Requirement
|
65
|
+
none: false
|
66
|
+
requirements:
|
67
|
+
- - ~>
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: '0.7'
|
70
|
+
type: :development
|
71
|
+
prerelease: false
|
72
|
+
version_requirements: !ruby/object:Gem::Requirement
|
73
|
+
none: false
|
74
|
+
requirements:
|
75
|
+
- - ~>
|
76
|
+
- !ruby/object:Gem::Version
|
77
|
+
version: '0.7'
|
78
|
+
- !ruby/object:Gem::Dependency
|
79
|
+
name: rdoc
|
80
|
+
requirement: !ruby/object:Gem::Requirement
|
81
|
+
none: false
|
82
|
+
requirements:
|
83
|
+
- - ~>
|
84
|
+
- !ruby/object:Gem::Version
|
85
|
+
version: '3.12'
|
86
|
+
type: :development
|
87
|
+
prerelease: false
|
88
|
+
version_requirements: !ruby/object:Gem::Requirement
|
89
|
+
none: false
|
90
|
+
requirements:
|
91
|
+
- - ~>
|
92
|
+
- !ruby/object:Gem::Version
|
93
|
+
version: '3.12'
|
94
|
+
- !ruby/object:Gem::Dependency
|
95
|
+
name: bundler
|
96
|
+
requirement: !ruby/object:Gem::Requirement
|
97
|
+
none: false
|
98
|
+
requirements:
|
99
|
+
- - ! '>='
|
100
|
+
- !ruby/object:Gem::Version
|
101
|
+
version: '0'
|
102
|
+
type: :development
|
103
|
+
prerelease: false
|
104
|
+
version_requirements: !ruby/object:Gem::Requirement
|
105
|
+
none: false
|
106
|
+
requirements:
|
107
|
+
- - ! '>='
|
108
|
+
- !ruby/object:Gem::Version
|
109
|
+
version: '0'
|
110
|
+
- !ruby/object:Gem::Dependency
|
111
|
+
name: jeweler
|
112
|
+
requirement: !ruby/object:Gem::Requirement
|
113
|
+
none: false
|
114
|
+
requirements:
|
115
|
+
- - ! '>='
|
116
|
+
- !ruby/object:Gem::Version
|
117
|
+
version: '0'
|
118
|
+
type: :development
|
119
|
+
prerelease: false
|
120
|
+
version_requirements: !ruby/object:Gem::Requirement
|
121
|
+
none: false
|
122
|
+
requirements:
|
123
|
+
- - ! '>='
|
124
|
+
- !ruby/object:Gem::Version
|
125
|
+
version: '0'
|
126
|
+
- !ruby/object:Gem::Dependency
|
127
|
+
name: simplecov
|
128
|
+
requirement: !ruby/object:Gem::Requirement
|
129
|
+
none: false
|
130
|
+
requirements:
|
131
|
+
- - ! '>='
|
132
|
+
- !ruby/object:Gem::Version
|
133
|
+
version: '0'
|
134
|
+
type: :development
|
135
|
+
prerelease: false
|
136
|
+
version_requirements: !ruby/object:Gem::Requirement
|
137
|
+
none: false
|
138
|
+
requirements:
|
139
|
+
- - ! '>='
|
140
|
+
- !ruby/object:Gem::Version
|
141
|
+
version: '0'
|
142
|
+
description: Linnaeus provides a redis-backed Bayesian classifier. Words are stemmed,
|
143
|
+
stopwords are stopped, and redis is used to allow for persistent and concurrent
|
144
|
+
training and classification.
|
145
|
+
email: dan@collispuro.net
|
146
|
+
executables: []
|
147
|
+
extensions: []
|
148
|
+
extra_rdoc_files:
|
149
|
+
- LICENSE.txt
|
150
|
+
- README.rdoc
|
151
|
+
files:
|
152
|
+
- .document
|
153
|
+
- .rspec
|
154
|
+
- .travis.yml
|
155
|
+
- Gemfile
|
156
|
+
- Gemfile.lock
|
157
|
+
- LICENSE.txt
|
158
|
+
- README.rdoc
|
159
|
+
- Rakefile
|
160
|
+
- VERSION
|
161
|
+
- images/linnaeus.jpg
|
162
|
+
- lib/linnaeus.rb
|
163
|
+
- lib/linnaeus/classifier.rb
|
164
|
+
- lib/linnaeus/persistence.rb
|
165
|
+
- lib/linnaeus/stopwords.rb
|
166
|
+
- lib/linnaeus/trainer.rb
|
167
|
+
- spec/linnaeus_classifier_spec.rb
|
168
|
+
- spec/linnaeus_persistence_spec.rb
|
169
|
+
- spec/linnaeus_spec.rb
|
170
|
+
- spec/linnaeus_stopwords_spec.rb
|
171
|
+
- spec/linnaeus_trainer_spec.rb
|
172
|
+
- spec/spec_helper.rb
|
173
|
+
homepage: http://github.com/djcp/linnaeus
|
174
|
+
licenses:
|
175
|
+
- MIT
|
176
|
+
post_install_message:
|
177
|
+
rdoc_options: []
|
178
|
+
require_paths:
|
179
|
+
- lib
|
180
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
181
|
+
none: false
|
182
|
+
requirements:
|
183
|
+
- - ! '>='
|
184
|
+
- !ruby/object:Gem::Version
|
185
|
+
version: '0'
|
186
|
+
segments:
|
187
|
+
- 0
|
188
|
+
hash: 494428062127756217
|
189
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
190
|
+
none: false
|
191
|
+
requirements:
|
192
|
+
- - ! '>='
|
193
|
+
- !ruby/object:Gem::Version
|
194
|
+
version: '0'
|
195
|
+
requirements: []
|
196
|
+
rubyforge_project:
|
197
|
+
rubygems_version: 1.8.24
|
198
|
+
signing_key:
|
199
|
+
specification_version: 3
|
200
|
+
summary: Another redis-backed Bayesian classifier
|
201
|
+
test_files: []
|