linnaeus 1.0.0 → 1.0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +1 -2
- data/VERSION +1 -1
- data/lib/linnaeus.rb +4 -2
- data/lib/linnaeus/classifier.rb +22 -0
- data/lib/linnaeus/persistence.rb +19 -10
- data/lib/linnaeus/trainer.rb +23 -0
- data/linnaeus.gemspec +82 -0
- metadata +4 -3
data/README.rdoc
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
|
3
3
|
https://raw.github.com/djcp/linnaeus/master/images/linnaeus.jpg
|
4
4
|
|
5
|
-
Linnaeus is a redis-backed Bayesian classification system. Please see the generated rdocs for more information.
|
5
|
+
Linnaeus is a redis-backed Bayesian classification system. Please see the generated rdocs for more information. Ruby 1.9 is required.
|
6
6
|
|
7
7
|
== Examples
|
8
8
|
|
@@ -23,7 +23,6 @@ Linnaeus is a redis-backed Bayesian classification system. Please see the genera
|
|
23
23
|
|
24
24
|
== The Future
|
25
25
|
|
26
|
-
* Make sure we're unicode OK
|
27
26
|
* Create additional storage backends - sqlite, postgresql, mongodb, etc.
|
28
27
|
* Allow for weighting tweaks.
|
29
28
|
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
1.0.0
|
1
|
+
1.0.2
|
data/lib/linnaeus.rb
CHANGED
@@ -10,12 +10,14 @@ class Linnaeus
|
|
10
10
|
options = {
|
11
11
|
persistence_class: Persistence,
|
12
12
|
stopwords_class: Stopwords,
|
13
|
-
skip_stemming: false
|
13
|
+
skip_stemming: false,
|
14
|
+
encoding: 'UTF-8'
|
14
15
|
}.merge(opts)
|
15
16
|
|
16
17
|
@db = options[:persistence_class].new(options)
|
17
18
|
@stopword_generator = options[:stopwords_class].new
|
18
19
|
@skip_stemming = options[:skip_stemming]
|
20
|
+
@encoding = options[:encoding]
|
19
21
|
end
|
20
22
|
|
21
23
|
# Count occurences of words in a text corpus.
|
@@ -25,7 +27,7 @@ class Linnaeus
|
|
25
27
|
# A string representing a document. Stopwords are removed and words are stemmed using the "Stemmer" gem.
|
26
28
|
def count_word_occurrences(text = '')
|
27
29
|
count = {}
|
28
|
-
text.downcase.split.each do |word|
|
30
|
+
text.encode(@encoding).downcase.split.each do |word|
|
29
31
|
stemmed_word = (@skip_stemming) ? word : word.stem_porter
|
30
32
|
unless stopwords.include? stemmed_word
|
31
33
|
count[stemmed_word] = count[stemmed_word] ? count[stemmed_word] + 1 : 1
|
data/lib/linnaeus/classifier.rb
CHANGED
@@ -1,4 +1,26 @@
|
|
1
1
|
# Classify documents against the Bayesian corpus.
|
2
|
+
#
|
3
|
+
# lc = Linnaeus::Classifier.new(<options hash>)
|
4
|
+
# lc.classify 'a string of text' #a wild category appears
|
5
|
+
# lc.classification_scores 'a different string of text' #a hash of categories and scores
|
6
|
+
#
|
7
|
+
# == Constructor Options
|
8
|
+
# persistence_class::
|
9
|
+
# A class implementing persistence - the default (Linnaeus::Persistence) uses redis.
|
10
|
+
# stopwords_class::
|
11
|
+
# A class that emits a set of stopwords. The default is Linnaeus::Stopwords
|
12
|
+
# skip_stemming::
|
13
|
+
# Set to true to skip porter stemming.
|
14
|
+
# encoding::
|
15
|
+
# Force text to use this character set. UTF-8 by default.
|
16
|
+
# redis_host::
|
17
|
+
# Passed to persistence class constructor. Defaults to "127.0.0.1"
|
18
|
+
# redis_port::
|
19
|
+
# Passed to persistence class constructor. Defaults to "6379".
|
20
|
+
# redis_db::
|
21
|
+
# Passed to persistence class constructor. Defaults to "0".
|
22
|
+
# redis_*::
|
23
|
+
# Please see Linnaeus::Persistence for the rest of the options that're passed through directly to the Redis client connection.
|
2
24
|
class Linnaeus::Classifier < Linnaeus
|
3
25
|
|
4
26
|
# Returns a hash of scores for each category in the Bayesian corpus.
|
data/lib/linnaeus/persistence.rb
CHANGED
@@ -11,14 +11,27 @@ class Linnaeus::Persistence < Linnaeus
|
|
11
11
|
options = {
|
12
12
|
redis_host: '127.0.0.1',
|
13
13
|
redis_port: '6379',
|
14
|
-
redis_db: 0
|
14
|
+
redis_db: 0,
|
15
|
+
redis_scheme: "redis",
|
16
|
+
redis_path: nil,
|
17
|
+
redis_timeout: 5.0,
|
18
|
+
redis_password: nil,
|
19
|
+
redis_id: nil,
|
20
|
+
redis_tcp_keepalive: 0
|
15
21
|
}.merge(opts)
|
16
22
|
|
17
23
|
@redis = Redis.new(
|
18
24
|
host: options[:redis_host],
|
19
25
|
port: options[:redis_port],
|
20
|
-
db: options[:redis_db]
|
26
|
+
db: options[:redis_db],
|
27
|
+
scheme: options[:redis_scheme],
|
28
|
+
path: options[:redis_path],
|
29
|
+
timeout: options[:redis_timeout],
|
30
|
+
password: options[:redis_password],
|
31
|
+
id: options[:redis_id],
|
32
|
+
tcp_keepalive: options[:redis_tcp_keepalive]
|
21
33
|
)
|
34
|
+
|
22
35
|
self
|
23
36
|
end
|
24
37
|
|
@@ -74,10 +87,8 @@ class Linnaeus::Persistence < Linnaeus
|
|
74
87
|
# word_occurrences::
|
75
88
|
# A hash containing a count of the number of word occurences in a document
|
76
89
|
def increment_word_counts_for_category(category, word_occurrences)
|
77
|
-
|
78
|
-
|
79
|
-
multi.hincrby BASE_CATEGORY_KEY + category, word, count
|
80
|
-
end
|
90
|
+
word_occurrences.each do|word,count|
|
91
|
+
@redis.hincrby BASE_CATEGORY_KEY + category, word, count
|
81
92
|
end
|
82
93
|
end
|
83
94
|
|
@@ -89,10 +100,8 @@ class Linnaeus::Persistence < Linnaeus
|
|
89
100
|
# word_occurrences::
|
90
101
|
# A hash containing a count of the number of word occurences in a document
|
91
102
|
def decrement_word_counts_for_category(category, word_occurrences)
|
92
|
-
|
93
|
-
|
94
|
-
multi.hincrby BASE_CATEGORY_KEY + category, word, - count
|
95
|
-
end
|
103
|
+
word_occurrences.each do|word,count|
|
104
|
+
@redis.hincrby BASE_CATEGORY_KEY + category, word, - count
|
96
105
|
end
|
97
106
|
end
|
98
107
|
|
data/lib/linnaeus/trainer.rb
CHANGED
@@ -1,4 +1,27 @@
|
|
1
1
|
# Train or untrain documents from the Bayesian corpus.
|
2
|
+
#
|
3
|
+
# lt = Linnaeus::Trainer.new(<options hash>)
|
4
|
+
# lt.train 'category', 'a string of text'
|
5
|
+
# lt.train 'differentcategory', 'another string of text'
|
6
|
+
# lt.untrain 'category', 'a document we just removed'
|
7
|
+
#
|
8
|
+
# == Constructor Options
|
9
|
+
# persistence_class::
|
10
|
+
# A class implementing persistence - the default (Linnaeus::Persistence) uses redis.
|
11
|
+
# stopwords_class::
|
12
|
+
# A class that emits a set of stopwords. The default is Linnaeus::Stopwords
|
13
|
+
# skip_stemming::
|
14
|
+
# Set to true to skip porter stemming.
|
15
|
+
# encoding::
|
16
|
+
# Force text to use this character set. UTF-8 by default.
|
17
|
+
# redis_host::
|
18
|
+
# Passed to persistence class constructor. Defaults to "127.0.0.1"
|
19
|
+
# redis_port::
|
20
|
+
# Passed to persistence class constructor. Defaults to "6379".
|
21
|
+
# redis_db::
|
22
|
+
# Passed to persistence class constructor. Defaults to "0".
|
23
|
+
# redis_*::
|
24
|
+
# Please see Linnaeus::Persistence for the rest of the options that're passed through directly to the Redis client connection.
|
2
25
|
class Linnaeus::Trainer < Linnaeus
|
3
26
|
|
4
27
|
# Add a document to the training corpus.
|
data/linnaeus.gemspec
ADDED
@@ -0,0 +1,82 @@
|
|
1
|
+
# Generated by jeweler
|
2
|
+
# DO NOT EDIT THIS FILE DIRECTLY
|
3
|
+
# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
|
4
|
+
# -*- encoding: utf-8 -*-
|
5
|
+
|
6
|
+
Gem::Specification.new do |s|
|
7
|
+
s.name = "linnaeus"
|
8
|
+
s.version = "1.0.2"
|
9
|
+
|
10
|
+
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
|
+
s.authors = ["djcp"]
|
12
|
+
s.date = "2012-10-31"
|
13
|
+
s.description = "Linnaeus provides a redis-backed Bayesian classifier. Words are stemmed, stopwords are stopped, and redis is used to allow for persistent and concurrent training and classification."
|
14
|
+
s.email = "dan@collispuro.net"
|
15
|
+
s.extra_rdoc_files = [
|
16
|
+
"LICENSE.txt",
|
17
|
+
"README.rdoc"
|
18
|
+
]
|
19
|
+
s.files = [
|
20
|
+
".document",
|
21
|
+
".rspec",
|
22
|
+
".travis.yml",
|
23
|
+
"Gemfile",
|
24
|
+
"Gemfile.lock",
|
25
|
+
"LICENSE.txt",
|
26
|
+
"README.rdoc",
|
27
|
+
"Rakefile",
|
28
|
+
"VERSION",
|
29
|
+
"images/linnaeus.jpg",
|
30
|
+
"lib/linnaeus.rb",
|
31
|
+
"lib/linnaeus/classifier.rb",
|
32
|
+
"lib/linnaeus/persistence.rb",
|
33
|
+
"lib/linnaeus/stopwords.rb",
|
34
|
+
"lib/linnaeus/trainer.rb",
|
35
|
+
"linnaeus.gemspec",
|
36
|
+
"spec/linnaeus_classifier_spec.rb",
|
37
|
+
"spec/linnaeus_persistence_spec.rb",
|
38
|
+
"spec/linnaeus_spec.rb",
|
39
|
+
"spec/linnaeus_stopwords_spec.rb",
|
40
|
+
"spec/linnaeus_trainer_spec.rb",
|
41
|
+
"spec/spec_helper.rb"
|
42
|
+
]
|
43
|
+
s.homepage = "http://github.com/djcp/linnaeus"
|
44
|
+
s.licenses = ["MIT"]
|
45
|
+
s.require_paths = ["lib"]
|
46
|
+
s.rubygems_version = "1.8.24"
|
47
|
+
s.summary = "Another redis-backed Bayesian classifier"
|
48
|
+
|
49
|
+
if s.respond_to? :specification_version then
|
50
|
+
s.specification_version = 3
|
51
|
+
|
52
|
+
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
53
|
+
s.add_runtime_dependency(%q<redis>, ["~> 3.0.0"])
|
54
|
+
s.add_runtime_dependency(%q<stemmer>, ["~> 1.0.0"])
|
55
|
+
s.add_development_dependency(%q<rspec>, ["~> 2.11.0"])
|
56
|
+
s.add_development_dependency(%q<yard>, ["~> 0.7"])
|
57
|
+
s.add_development_dependency(%q<rdoc>, ["~> 3.12"])
|
58
|
+
s.add_development_dependency(%q<bundler>, [">= 0"])
|
59
|
+
s.add_development_dependency(%q<jeweler>, [">= 0"])
|
60
|
+
s.add_development_dependency(%q<simplecov>, [">= 0"])
|
61
|
+
else
|
62
|
+
s.add_dependency(%q<redis>, ["~> 3.0.0"])
|
63
|
+
s.add_dependency(%q<stemmer>, ["~> 1.0.0"])
|
64
|
+
s.add_dependency(%q<rspec>, ["~> 2.11.0"])
|
65
|
+
s.add_dependency(%q<yard>, ["~> 0.7"])
|
66
|
+
s.add_dependency(%q<rdoc>, ["~> 3.12"])
|
67
|
+
s.add_dependency(%q<bundler>, [">= 0"])
|
68
|
+
s.add_dependency(%q<jeweler>, [">= 0"])
|
69
|
+
s.add_dependency(%q<simplecov>, [">= 0"])
|
70
|
+
end
|
71
|
+
else
|
72
|
+
s.add_dependency(%q<redis>, ["~> 3.0.0"])
|
73
|
+
s.add_dependency(%q<stemmer>, ["~> 1.0.0"])
|
74
|
+
s.add_dependency(%q<rspec>, ["~> 2.11.0"])
|
75
|
+
s.add_dependency(%q<yard>, ["~> 0.7"])
|
76
|
+
s.add_dependency(%q<rdoc>, ["~> 3.12"])
|
77
|
+
s.add_dependency(%q<bundler>, [">= 0"])
|
78
|
+
s.add_dependency(%q<jeweler>, [">= 0"])
|
79
|
+
s.add_dependency(%q<simplecov>, [">= 0"])
|
80
|
+
end
|
81
|
+
end
|
82
|
+
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: linnaeus
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.0
|
4
|
+
version: 1.0.2
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-10-
|
12
|
+
date: 2012-10-31 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: redis
|
@@ -164,6 +164,7 @@ files:
|
|
164
164
|
- lib/linnaeus/persistence.rb
|
165
165
|
- lib/linnaeus/stopwords.rb
|
166
166
|
- lib/linnaeus/trainer.rb
|
167
|
+
- linnaeus.gemspec
|
167
168
|
- spec/linnaeus_classifier_spec.rb
|
168
169
|
- spec/linnaeus_persistence_spec.rb
|
169
170
|
- spec/linnaeus_spec.rb
|
@@ -185,7 +186,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
185
186
|
version: '0'
|
186
187
|
segments:
|
187
188
|
- 0
|
188
|
-
hash:
|
189
|
+
hash: 2735128872899197737
|
189
190
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
190
191
|
none: false
|
191
192
|
requirements:
|