linnaeus 1.0.0 → 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,7 +2,7 @@
2
2
 
3
3
  https://raw.github.com/djcp/linnaeus/master/images/linnaeus.jpg
4
4
 
5
- Linnaeus is a redis-backed Bayesian classification system. Please see the generated rdocs for more information.
5
+ Linnaeus is a redis-backed Bayesian classification system. Please see the generated rdocs for more information. Ruby 1.9 is required.
6
6
 
7
7
  == Examples
8
8
 
@@ -23,7 +23,6 @@ Linnaeus is a redis-backed Bayesian classification system. Please see the genera
23
23
 
24
24
  == The Future
25
25
 
26
- * Make sure we're unicode OK
27
26
  * Create additional storage backends - sqlite, postgresql, mongodb, etc.
28
27
  * Allow for weighting tweaks.
29
28
 
data/VERSION CHANGED
@@ -1 +1 @@
1
- 1.0.0
1
+ 1.0.2
@@ -10,12 +10,14 @@ class Linnaeus
10
10
  options = {
11
11
  persistence_class: Persistence,
12
12
  stopwords_class: Stopwords,
13
- skip_stemming: false
13
+ skip_stemming: false,
14
+ encoding: 'UTF-8'
14
15
  }.merge(opts)
15
16
 
16
17
  @db = options[:persistence_class].new(options)
17
18
  @stopword_generator = options[:stopwords_class].new
18
19
  @skip_stemming = options[:skip_stemming]
20
+ @encoding = options[:encoding]
19
21
  end
20
22
 
21
23
  # Count occurrences of words in a text corpus.
@@ -25,7 +27,7 @@ class Linnaeus
25
27
  # A string representing a document. Stopwords are removed and words are stemmed using the "Stemmer" gem.
26
28
  def count_word_occurrences(text = '')
27
29
  count = {}
28
- text.downcase.split.each do |word|
30
+ text.encode(@encoding).downcase.split.each do |word|
29
31
  stemmed_word = (@skip_stemming) ? word : word.stem_porter
30
32
  unless stopwords.include? stemmed_word
31
33
  count[stemmed_word] = count[stemmed_word] ? count[stemmed_word] + 1 : 1
@@ -1,4 +1,26 @@
1
1
  # Classify documents against the Bayesian corpus.
2
+ #
3
+ # lc = Linnaeus::Classifier.new(<options hash>)
4
+ # lc.classify 'a string of text' #a wild category appears
5
+ # lc.classification_scores 'a different string of text' #a hash of categories and scores
6
+ #
7
+ # == Constructor Options
8
+ # persistence_class::
9
+ # A class implementing persistence - the default (Linnaeus::Persistence) uses redis.
10
+ # stopwords_class::
11
+ # A class that emits a set of stopwords. The default is Linnaeus::Stopwords
12
+ # skip_stemming::
13
+ # Set to true to skip porter stemming.
14
+ # encoding::
15
+ # Force text to use this character set. UTF-8 by default.
16
+ # redis_host::
17
+ # Passed to persistence class constructor. Defaults to "127.0.0.1"
18
+ # redis_port::
19
+ # Passed to persistence class constructor. Defaults to "6379".
20
+ # redis_db::
21
+ # Passed to persistence class constructor. Defaults to "0".
22
+ # redis_*::
23
+ # Please see Linnaeus::Persistence for the rest of the options that are passed through directly to the Redis client connection.
2
24
  class Linnaeus::Classifier < Linnaeus
3
25
 
4
26
  # Returns a hash of scores for each category in the Bayesian corpus.
@@ -11,14 +11,27 @@ class Linnaeus::Persistence < Linnaeus
11
11
  options = {
12
12
  redis_host: '127.0.0.1',
13
13
  redis_port: '6379',
14
- redis_db: 0
14
+ redis_db: 0,
15
+ redis_scheme: "redis",
16
+ redis_path: nil,
17
+ redis_timeout: 5.0,
18
+ redis_password: nil,
19
+ redis_id: nil,
20
+ redis_tcp_keepalive: 0
15
21
  }.merge(opts)
16
22
 
17
23
  @redis = Redis.new(
18
24
  host: options[:redis_host],
19
25
  port: options[:redis_port],
20
- db: options[:redis_db]
26
+ db: options[:redis_db],
27
+ scheme: options[:redis_scheme],
28
+ path: options[:redis_path],
29
+ timeout: options[:redis_timeout],
30
+ password: options[:redis_password],
31
+ id: options[:redis_id],
32
+ tcp_keepalive: options[:redis_tcp_keepalive]
21
33
  )
34
+
22
35
  self
23
36
  end
24
37
 
@@ -74,10 +87,8 @@ class Linnaeus::Persistence < Linnaeus
74
87
  # word_occurrences::
75
88
  # A hash containing a count of the number of word occurrences in a document
76
89
  def increment_word_counts_for_category(category, word_occurrences)
77
- @redis.multi do |multi|
78
- word_occurrences.each do|word,count|
79
- multi.hincrby BASE_CATEGORY_KEY + category, word, count
80
- end
90
+ word_occurrences.each do|word,count|
91
+ @redis.hincrby BASE_CATEGORY_KEY + category, word, count
81
92
  end
82
93
  end
83
94
 
@@ -89,10 +100,8 @@ class Linnaeus::Persistence < Linnaeus
89
100
  # word_occurrences::
90
101
  # A hash containing a count of the number of word occurrences in a document
91
102
  def decrement_word_counts_for_category(category, word_occurrences)
92
- @redis.multi do |multi|
93
- word_occurrences.each do|word,count|
94
- multi.hincrby BASE_CATEGORY_KEY + category, word, - count
95
- end
103
+ word_occurrences.each do|word,count|
104
+ @redis.hincrby BASE_CATEGORY_KEY + category, word, - count
96
105
  end
97
106
  end
98
107
 
@@ -1,4 +1,27 @@
1
1
  # Train or untrain documents from the Bayesian corpus.
2
+ #
3
+ # lt = Linnaeus::Trainer.new(<options hash>)
4
+ # lt.train 'category', 'a string of text'
5
+ # lt.train 'differentcategory', 'another string of text'
6
+ # lt.untrain 'category', 'a document we just removed'
7
+ #
8
+ # == Constructor Options
9
+ # persistence_class::
10
+ # A class implementing persistence - the default (Linnaeus::Persistence) uses redis.
11
+ # stopwords_class::
12
+ # A class that emits a set of stopwords. The default is Linnaeus::Stopwords
13
+ # skip_stemming::
14
+ # Set to true to skip porter stemming.
15
+ # encoding::
16
+ # Force text to use this character set. UTF-8 by default.
17
+ # redis_host::
18
+ # Passed to persistence class constructor. Defaults to "127.0.0.1"
19
+ # redis_port::
20
+ # Passed to persistence class constructor. Defaults to "6379".
21
+ # redis_db::
22
+ # Passed to persistence class constructor. Defaults to "0".
23
+ # redis_*::
24
+ # Please see Linnaeus::Persistence for the rest of the options that are passed through directly to the Redis client connection.
2
25
  class Linnaeus::Trainer < Linnaeus
3
26
 
4
27
  # Add a document to the training corpus.
@@ -0,0 +1,82 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE DIRECTLY
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
4
+ # -*- encoding: utf-8 -*-
5
+
6
+ Gem::Specification.new do |s|
7
+ s.name = "linnaeus"
8
+ s.version = "1.0.2"
9
+
10
+ s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
+ s.authors = ["djcp"]
12
+ s.date = "2012-10-31"
13
+ s.description = "Linnaeus provides a redis-backed Bayesian classifier. Words are stemmed, stopwords are stopped, and redis is used to allow for persistent and concurrent training and classification."
14
+ s.email = "dan@collispuro.net"
15
+ s.extra_rdoc_files = [
16
+ "LICENSE.txt",
17
+ "README.rdoc"
18
+ ]
19
+ s.files = [
20
+ ".document",
21
+ ".rspec",
22
+ ".travis.yml",
23
+ "Gemfile",
24
+ "Gemfile.lock",
25
+ "LICENSE.txt",
26
+ "README.rdoc",
27
+ "Rakefile",
28
+ "VERSION",
29
+ "images/linnaeus.jpg",
30
+ "lib/linnaeus.rb",
31
+ "lib/linnaeus/classifier.rb",
32
+ "lib/linnaeus/persistence.rb",
33
+ "lib/linnaeus/stopwords.rb",
34
+ "lib/linnaeus/trainer.rb",
35
+ "linnaeus.gemspec",
36
+ "spec/linnaeus_classifier_spec.rb",
37
+ "spec/linnaeus_persistence_spec.rb",
38
+ "spec/linnaeus_spec.rb",
39
+ "spec/linnaeus_stopwords_spec.rb",
40
+ "spec/linnaeus_trainer_spec.rb",
41
+ "spec/spec_helper.rb"
42
+ ]
43
+ s.homepage = "http://github.com/djcp/linnaeus"
44
+ s.licenses = ["MIT"]
45
+ s.require_paths = ["lib"]
46
+ s.rubygems_version = "1.8.24"
47
+ s.summary = "Another redis-backed Bayesian classifier"
48
+
49
+ if s.respond_to? :specification_version then
50
+ s.specification_version = 3
51
+
52
+ if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
53
+ s.add_runtime_dependency(%q<redis>, ["~> 3.0.0"])
54
+ s.add_runtime_dependency(%q<stemmer>, ["~> 1.0.0"])
55
+ s.add_development_dependency(%q<rspec>, ["~> 2.11.0"])
56
+ s.add_development_dependency(%q<yard>, ["~> 0.7"])
57
+ s.add_development_dependency(%q<rdoc>, ["~> 3.12"])
58
+ s.add_development_dependency(%q<bundler>, [">= 0"])
59
+ s.add_development_dependency(%q<jeweler>, [">= 0"])
60
+ s.add_development_dependency(%q<simplecov>, [">= 0"])
61
+ else
62
+ s.add_dependency(%q<redis>, ["~> 3.0.0"])
63
+ s.add_dependency(%q<stemmer>, ["~> 1.0.0"])
64
+ s.add_dependency(%q<rspec>, ["~> 2.11.0"])
65
+ s.add_dependency(%q<yard>, ["~> 0.7"])
66
+ s.add_dependency(%q<rdoc>, ["~> 3.12"])
67
+ s.add_dependency(%q<bundler>, [">= 0"])
68
+ s.add_dependency(%q<jeweler>, [">= 0"])
69
+ s.add_dependency(%q<simplecov>, [">= 0"])
70
+ end
71
+ else
72
+ s.add_dependency(%q<redis>, ["~> 3.0.0"])
73
+ s.add_dependency(%q<stemmer>, ["~> 1.0.0"])
74
+ s.add_dependency(%q<rspec>, ["~> 2.11.0"])
75
+ s.add_dependency(%q<yard>, ["~> 0.7"])
76
+ s.add_dependency(%q<rdoc>, ["~> 3.12"])
77
+ s.add_dependency(%q<bundler>, [">= 0"])
78
+ s.add_dependency(%q<jeweler>, [">= 0"])
79
+ s.add_dependency(%q<simplecov>, [">= 0"])
80
+ end
81
+ end
82
+
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: linnaeus
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.0
4
+ version: 1.0.2
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-10-30 00:00:00.000000000 Z
12
+ date: 2012-10-31 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: redis
@@ -164,6 +164,7 @@ files:
164
164
  - lib/linnaeus/persistence.rb
165
165
  - lib/linnaeus/stopwords.rb
166
166
  - lib/linnaeus/trainer.rb
167
+ - linnaeus.gemspec
167
168
  - spec/linnaeus_classifier_spec.rb
168
169
  - spec/linnaeus_persistence_spec.rb
169
170
  - spec/linnaeus_spec.rb
@@ -185,7 +186,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
185
186
  version: '0'
186
187
  segments:
187
188
  - 0
188
- hash: 494428062127756217
189
+ hash: 2735128872899197737
189
190
  required_rubygems_version: !ruby/object:Gem::Requirement
190
191
  none: false
191
192
  requirements: