linnaeus 1.0.0 → 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +1 -2
- data/VERSION +1 -1
- data/lib/linnaeus.rb +4 -2
- data/lib/linnaeus/classifier.rb +22 -0
- data/lib/linnaeus/persistence.rb +19 -10
- data/lib/linnaeus/trainer.rb +23 -0
- data/linnaeus.gemspec +82 -0
- metadata +4 -3
data/README.rdoc
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
|
3
3
|
https://raw.github.com/djcp/linnaeus/master/images/linnaeus.jpg
|
4
4
|
|
5
|
-
Linnaeus is a redis-backed Bayesian classification system. Please see the generated rdocs for more information.
|
5
|
+
Linnaeus is a redis-backed Bayesian classification system. Please see the generated rdocs for more information. Ruby 1.9 is required.
|
6
6
|
|
7
7
|
== Examples
|
8
8
|
|
@@ -23,7 +23,6 @@ Linnaeus is a redis-backed Bayesian classification system. Please see the genera
|
|
23
23
|
|
24
24
|
== The Future
|
25
25
|
|
26
|
-
* Make sure we're unicode OK
|
27
26
|
* Create additional storage backends - sqlite, postgresql, mongodb, etc.
|
28
27
|
* Allow for weighting tweaks.
|
29
28
|
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
1.0.0
|
1
|
+
1.0.2
|
data/lib/linnaeus.rb
CHANGED
@@ -10,12 +10,14 @@ class Linnaeus
|
|
10
10
|
options = {
|
11
11
|
persistence_class: Persistence,
|
12
12
|
stopwords_class: Stopwords,
|
13
|
-
skip_stemming: false
|
13
|
+
skip_stemming: false,
|
14
|
+
encoding: 'UTF-8'
|
14
15
|
}.merge(opts)
|
15
16
|
|
16
17
|
@db = options[:persistence_class].new(options)
|
17
18
|
@stopword_generator = options[:stopwords_class].new
|
18
19
|
@skip_stemming = options[:skip_stemming]
|
20
|
+
@encoding = options[:encoding]
|
19
21
|
end
|
20
22
|
|
21
23
|
# Count occurences of words in a text corpus.
|
@@ -25,7 +27,7 @@ class Linnaeus
|
|
25
27
|
# A string representing a document. Stopwords are removed and words are stemmed using the "Stemmer" gem.
|
26
28
|
def count_word_occurrences(text = '')
|
27
29
|
count = {}
|
28
|
-
text.downcase.split.each do |word|
|
30
|
+
text.encode(@encoding).downcase.split.each do |word|
|
29
31
|
stemmed_word = (@skip_stemming) ? word : word.stem_porter
|
30
32
|
unless stopwords.include? stemmed_word
|
31
33
|
count[stemmed_word] = count[stemmed_word] ? count[stemmed_word] + 1 : 1
|
data/lib/linnaeus/classifier.rb
CHANGED
@@ -1,4 +1,26 @@
|
|
1
1
|
# Classify documents against the Bayesian corpus.
|
2
|
+
#
|
3
|
+
# lc = Linnaeus::Classifier.new(<options hash>)
|
4
|
+
# lc.classify 'a string of text' #a wild category appears
|
5
|
+
# lc.classification_scores 'a different string of text' #a hash of categories and scores
|
6
|
+
#
|
7
|
+
# == Constructor Options
|
8
|
+
# persistence_class::
|
9
|
+
# A class implementing persistence - the default (Linnaeus::Persistence) uses redis.
|
10
|
+
# stopwords_class::
|
11
|
+
# A class that emits a set of stopwords. The default is Linnaeus::Stopwords
|
12
|
+
# skip_stemming::
|
13
|
+
# Set to true to skip porter stemming.
|
14
|
+
# encoding::
|
15
|
+
# Force text to use this character set. UTF-8 by default.
|
16
|
+
# redis_host::
|
17
|
+
# Passed to persistence class constructor. Defaults to "127.0.0.1"
|
18
|
+
# redis_port::
|
19
|
+
# Passed to persistence class constructor. Defaults to "6379".
|
20
|
+
# redis_db::
|
21
|
+
# Passed to persistence class constructor. Defaults to "0".
|
22
|
+
# redis_*::
|
23
|
+
# Please see Linnaeus::Persistence for the rest of the options that're passed through directly to the Redis client connection.
|
2
24
|
class Linnaeus::Classifier < Linnaeus
|
3
25
|
|
4
26
|
# Returns a hash of scores for each category in the Bayesian corpus.
|
data/lib/linnaeus/persistence.rb
CHANGED
@@ -11,14 +11,27 @@ class Linnaeus::Persistence < Linnaeus
|
|
11
11
|
options = {
|
12
12
|
redis_host: '127.0.0.1',
|
13
13
|
redis_port: '6379',
|
14
|
-
redis_db: 0
|
14
|
+
redis_db: 0,
|
15
|
+
redis_scheme: "redis",
|
16
|
+
redis_path: nil,
|
17
|
+
redis_timeout: 5.0,
|
18
|
+
redis_password: nil,
|
19
|
+
redis_id: nil,
|
20
|
+
redis_tcp_keepalive: 0
|
15
21
|
}.merge(opts)
|
16
22
|
|
17
23
|
@redis = Redis.new(
|
18
24
|
host: options[:redis_host],
|
19
25
|
port: options[:redis_port],
|
20
|
-
db: options[:redis_db]
|
26
|
+
db: options[:redis_db],
|
27
|
+
scheme: options[:redis_scheme],
|
28
|
+
path: options[:redis_path],
|
29
|
+
timeout: options[:redis_timeout],
|
30
|
+
password: options[:redis_password],
|
31
|
+
id: options[:redis_id],
|
32
|
+
tcp_keepalive: options[:redis_tcp_keepalive]
|
21
33
|
)
|
34
|
+
|
22
35
|
self
|
23
36
|
end
|
24
37
|
|
@@ -74,10 +87,8 @@ class Linnaeus::Persistence < Linnaeus
|
|
74
87
|
# word_occurrences::
|
75
88
|
# A hash containing a count of the number of word occurences in a document
|
76
89
|
def increment_word_counts_for_category(category, word_occurrences)
|
77
|
-
|
78
|
-
|
79
|
-
multi.hincrby BASE_CATEGORY_KEY + category, word, count
|
80
|
-
end
|
90
|
+
word_occurrences.each do|word,count|
|
91
|
+
@redis.hincrby BASE_CATEGORY_KEY + category, word, count
|
81
92
|
end
|
82
93
|
end
|
83
94
|
|
@@ -89,10 +100,8 @@ class Linnaeus::Persistence < Linnaeus
|
|
89
100
|
# word_occurrences::
|
90
101
|
# A hash containing a count of the number of word occurences in a document
|
91
102
|
def decrement_word_counts_for_category(category, word_occurrences)
|
92
|
-
|
93
|
-
|
94
|
-
multi.hincrby BASE_CATEGORY_KEY + category, word, - count
|
95
|
-
end
|
103
|
+
word_occurrences.each do|word,count|
|
104
|
+
@redis.hincrby BASE_CATEGORY_KEY + category, word, - count
|
96
105
|
end
|
97
106
|
end
|
98
107
|
|
data/lib/linnaeus/trainer.rb
CHANGED
@@ -1,4 +1,27 @@
|
|
1
1
|
# Train or untrain documents from the Bayesian corpus.
|
2
|
+
#
|
3
|
+
# lt = Linnaeus::Trainer.new(<options hash>)
|
4
|
+
# lt.train 'category', 'a string of text'
|
5
|
+
# lt.train 'differentcategory', 'another string of text'
|
6
|
+
# lt.untrain 'category', 'a document we just removed'
|
7
|
+
#
|
8
|
+
# == Constructor Options
|
9
|
+
# persistence_class::
|
10
|
+
# A class implementing persistence - the default (Linnaeus::Persistence) uses redis.
|
11
|
+
# stopwords_class::
|
12
|
+
# A class that emits a set of stopwords. The default is Linnaeus::Stopwords
|
13
|
+
# skip_stemming::
|
14
|
+
# Set to true to skip porter stemming.
|
15
|
+
# encoding::
|
16
|
+
# Force text to use this character set. UTF-8 by default.
|
17
|
+
# redis_host::
|
18
|
+
# Passed to persistence class constructor. Defaults to "127.0.0.1"
|
19
|
+
# redis_port::
|
20
|
+
# Passed to persistence class constructor. Defaults to "6379".
|
21
|
+
# redis_db::
|
22
|
+
# Passed to persistence class constructor. Defaults to "0".
|
23
|
+
# redis_*::
|
24
|
+
# Please see Linnaeus::Persistence for the rest of the options that're passed through directly to the Redis client connection.
|
2
25
|
class Linnaeus::Trainer < Linnaeus
|
3
26
|
|
4
27
|
# Add a document to the training corpus.
|
data/linnaeus.gemspec
ADDED
@@ -0,0 +1,82 @@
|
|
1
|
+
# Generated by jeweler
|
2
|
+
# DO NOT EDIT THIS FILE DIRECTLY
|
3
|
+
# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
|
4
|
+
# -*- encoding: utf-8 -*-
|
5
|
+
|
6
|
+
Gem::Specification.new do |s|
|
7
|
+
s.name = "linnaeus"
|
8
|
+
s.version = "1.0.2"
|
9
|
+
|
10
|
+
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
|
+
s.authors = ["djcp"]
|
12
|
+
s.date = "2012-10-31"
|
13
|
+
s.description = "Linnaeus provides a redis-backed Bayesian classifier. Words are stemmed, stopwords are stopped, and redis is used to allow for persistent and concurrent training and classification."
|
14
|
+
s.email = "dan@collispuro.net"
|
15
|
+
s.extra_rdoc_files = [
|
16
|
+
"LICENSE.txt",
|
17
|
+
"README.rdoc"
|
18
|
+
]
|
19
|
+
s.files = [
|
20
|
+
".document",
|
21
|
+
".rspec",
|
22
|
+
".travis.yml",
|
23
|
+
"Gemfile",
|
24
|
+
"Gemfile.lock",
|
25
|
+
"LICENSE.txt",
|
26
|
+
"README.rdoc",
|
27
|
+
"Rakefile",
|
28
|
+
"VERSION",
|
29
|
+
"images/linnaeus.jpg",
|
30
|
+
"lib/linnaeus.rb",
|
31
|
+
"lib/linnaeus/classifier.rb",
|
32
|
+
"lib/linnaeus/persistence.rb",
|
33
|
+
"lib/linnaeus/stopwords.rb",
|
34
|
+
"lib/linnaeus/trainer.rb",
|
35
|
+
"linnaeus.gemspec",
|
36
|
+
"spec/linnaeus_classifier_spec.rb",
|
37
|
+
"spec/linnaeus_persistence_spec.rb",
|
38
|
+
"spec/linnaeus_spec.rb",
|
39
|
+
"spec/linnaeus_stopwords_spec.rb",
|
40
|
+
"spec/linnaeus_trainer_spec.rb",
|
41
|
+
"spec/spec_helper.rb"
|
42
|
+
]
|
43
|
+
s.homepage = "http://github.com/djcp/linnaeus"
|
44
|
+
s.licenses = ["MIT"]
|
45
|
+
s.require_paths = ["lib"]
|
46
|
+
s.rubygems_version = "1.8.24"
|
47
|
+
s.summary = "Another redis-backed Bayesian classifier"
|
48
|
+
|
49
|
+
if s.respond_to? :specification_version then
|
50
|
+
s.specification_version = 3
|
51
|
+
|
52
|
+
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
53
|
+
s.add_runtime_dependency(%q<redis>, ["~> 3.0.0"])
|
54
|
+
s.add_runtime_dependency(%q<stemmer>, ["~> 1.0.0"])
|
55
|
+
s.add_development_dependency(%q<rspec>, ["~> 2.11.0"])
|
56
|
+
s.add_development_dependency(%q<yard>, ["~> 0.7"])
|
57
|
+
s.add_development_dependency(%q<rdoc>, ["~> 3.12"])
|
58
|
+
s.add_development_dependency(%q<bundler>, [">= 0"])
|
59
|
+
s.add_development_dependency(%q<jeweler>, [">= 0"])
|
60
|
+
s.add_development_dependency(%q<simplecov>, [">= 0"])
|
61
|
+
else
|
62
|
+
s.add_dependency(%q<redis>, ["~> 3.0.0"])
|
63
|
+
s.add_dependency(%q<stemmer>, ["~> 1.0.0"])
|
64
|
+
s.add_dependency(%q<rspec>, ["~> 2.11.0"])
|
65
|
+
s.add_dependency(%q<yard>, ["~> 0.7"])
|
66
|
+
s.add_dependency(%q<rdoc>, ["~> 3.12"])
|
67
|
+
s.add_dependency(%q<bundler>, [">= 0"])
|
68
|
+
s.add_dependency(%q<jeweler>, [">= 0"])
|
69
|
+
s.add_dependency(%q<simplecov>, [">= 0"])
|
70
|
+
end
|
71
|
+
else
|
72
|
+
s.add_dependency(%q<redis>, ["~> 3.0.0"])
|
73
|
+
s.add_dependency(%q<stemmer>, ["~> 1.0.0"])
|
74
|
+
s.add_dependency(%q<rspec>, ["~> 2.11.0"])
|
75
|
+
s.add_dependency(%q<yard>, ["~> 0.7"])
|
76
|
+
s.add_dependency(%q<rdoc>, ["~> 3.12"])
|
77
|
+
s.add_dependency(%q<bundler>, [">= 0"])
|
78
|
+
s.add_dependency(%q<jeweler>, [">= 0"])
|
79
|
+
s.add_dependency(%q<simplecov>, [">= 0"])
|
80
|
+
end
|
81
|
+
end
|
82
|
+
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: linnaeus
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.0
|
4
|
+
version: 1.0.2
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-10-
|
12
|
+
date: 2012-10-31 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: redis
|
@@ -164,6 +164,7 @@ files:
|
|
164
164
|
- lib/linnaeus/persistence.rb
|
165
165
|
- lib/linnaeus/stopwords.rb
|
166
166
|
- lib/linnaeus/trainer.rb
|
167
|
+
- linnaeus.gemspec
|
167
168
|
- spec/linnaeus_classifier_spec.rb
|
168
169
|
- spec/linnaeus_persistence_spec.rb
|
169
170
|
- spec/linnaeus_spec.rb
|
@@ -185,7 +186,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
185
186
|
version: '0'
|
186
187
|
segments:
|
187
188
|
- 0
|
188
|
-
hash:
|
189
|
+
hash: 2735128872899197737
|
189
190
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
190
191
|
none: false
|
191
192
|
requirements:
|