linnaeus 1.0.0 → 1.0.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -2,7 +2,7 @@
2
2
 
3
3
  https://raw.github.com/djcp/linnaeus/master/images/linnaeus.jpg
4
4
 
5
- Linnaeus is a redis-backed Bayesian classification system. Please see the generated rdocs for more information.
5
+ Linnaeus is a redis-backed Bayesian classification system. Please see the generated rdocs for more information. Ruby 1.9 is required.
6
6
 
7
7
  == Examples
8
8
 
@@ -23,7 +23,6 @@ Linnaeus is a redis-backed Bayesian classification system. Please see the genera
23
23
 
24
24
  == The Future
25
25
 
26
- * Make sure we're unicode OK
27
26
  * Create additional storage backends - sqlite, postgresql, mongodb, etc.
28
27
  * Allow for weighting tweaks.
29
28
 
data/VERSION CHANGED
@@ -1 +1 @@
1
- 1.0.0
1
+ 1.0.2
@@ -10,12 +10,14 @@ class Linnaeus
10
10
  options = {
11
11
  persistence_class: Persistence,
12
12
  stopwords_class: Stopwords,
13
- skip_stemming: false
13
+ skip_stemming: false,
14
+ encoding: 'UTF-8'
14
15
  }.merge(opts)
15
16
 
16
17
  @db = options[:persistence_class].new(options)
17
18
  @stopword_generator = options[:stopwords_class].new
18
19
  @skip_stemming = options[:skip_stemming]
20
+ @encoding = options[:encoding]
19
21
  end
20
22
 
21
23
  # Count occurrences of words in a text corpus.
@@ -25,7 +27,7 @@ class Linnaeus
25
27
  # A string representing a document. Stopwords are removed and words are stemmed using the "Stemmer" gem.
26
28
  def count_word_occurrences(text = '')
27
29
  count = {}
28
- text.downcase.split.each do |word|
30
+ text.encode(@encoding).downcase.split.each do |word|
29
31
  stemmed_word = (@skip_stemming) ? word : word.stem_porter
30
32
  unless stopwords.include? stemmed_word
31
33
  count[stemmed_word] = count[stemmed_word] ? count[stemmed_word] + 1 : 1
@@ -1,4 +1,26 @@
1
1
  # Classify documents against the Bayesian corpus.
2
+ #
3
+ # lc = Linnaeus::Classifier.new(<options hash>)
4
+ # lc.classify 'a string of text' #a wild category appears
5
+ # lc.classification_scores 'a different string of text' #a hash of categories and scores
6
+ #
7
+ # == Constructor Options
8
+ # persistence_class::
9
+ # A class implementing persistence - the default (Linnaeus::Persistence) uses redis.
10
+ # stopwords_class::
11
+ # A class that emits a set of stopwords. The default is Linnaeus::Stopwords
12
+ # skip_stemming::
13
+ # Set to true to skip porter stemming.
14
+ # encoding::
15
+ # Force text to use this character set. UTF-8 by default.
16
+ # redis_host::
17
+ # Passed to persistence class constructor. Defaults to "127.0.0.1"
18
+ # redis_port::
19
+ # Passed to persistence class constructor. Defaults to "6379".
20
+ # redis_db::
21
+ # Passed to persistence class constructor. Defaults to "0".
22
+ # redis_*::
23
+ # Please see Linnaeus::Persistence for the rest of the options that are passed through directly to the Redis client connection.
2
24
  class Linnaeus::Classifier < Linnaeus
3
25
 
4
26
  # Returns a hash of scores for each category in the Bayesian corpus.
@@ -11,14 +11,27 @@ class Linnaeus::Persistence < Linnaeus
11
11
  options = {
12
12
  redis_host: '127.0.0.1',
13
13
  redis_port: '6379',
14
- redis_db: 0
14
+ redis_db: 0,
15
+ redis_scheme: "redis",
16
+ redis_path: nil,
17
+ redis_timeout: 5.0,
18
+ redis_password: nil,
19
+ redis_id: nil,
20
+ redis_tcp_keepalive: 0
15
21
  }.merge(opts)
16
22
 
17
23
  @redis = Redis.new(
18
24
  host: options[:redis_host],
19
25
  port: options[:redis_port],
20
- db: options[:redis_db]
26
+ db: options[:redis_db],
27
+ scheme: options[:redis_scheme],
28
+ path: options[:redis_path],
29
+ timeout: options[:redis_timeout],
30
+ password: options[:redis_password],
31
+ id: options[:redis_id],
32
+ tcp_keepalive: options[:redis_tcp_keepalive]
21
33
  )
34
+
22
35
  self
23
36
  end
24
37
 
@@ -74,10 +87,8 @@ class Linnaeus::Persistence < Linnaeus
74
87
  # word_occurrences::
75
88
  # A hash containing a count of the number of word occurrences in a document
76
89
  def increment_word_counts_for_category(category, word_occurrences)
77
- @redis.multi do |multi|
78
- word_occurrences.each do|word,count|
79
- multi.hincrby BASE_CATEGORY_KEY + category, word, count
80
- end
90
+ word_occurrences.each do|word,count|
91
+ @redis.hincrby BASE_CATEGORY_KEY + category, word, count
81
92
  end
82
93
  end
83
94
 
@@ -89,10 +100,8 @@ class Linnaeus::Persistence < Linnaeus
89
100
  # word_occurrences::
90
101
  # A hash containing a count of the number of word occurrences in a document
91
102
  def decrement_word_counts_for_category(category, word_occurrences)
92
- @redis.multi do |multi|
93
- word_occurrences.each do|word,count|
94
- multi.hincrby BASE_CATEGORY_KEY + category, word, - count
95
- end
103
+ word_occurrences.each do|word,count|
104
+ @redis.hincrby BASE_CATEGORY_KEY + category, word, - count
96
105
  end
97
106
  end
98
107
 
@@ -1,4 +1,27 @@
1
1
  # Train or untrain documents from the Bayesian corpus.
2
+ #
3
+ # lt = Linnaeus::Trainer.new(<options hash>)
4
+ # lt.train 'category', 'a string of text'
5
+ # lt.train 'differentcategory', 'another string of text'
6
+ # lt.untrain 'category', 'a document we just removed'
7
+ #
8
+ # == Constructor Options
9
+ # persistence_class::
10
+ # A class implementing persistence - the default (Linnaeus::Persistence) uses redis.
11
+ # stopwords_class::
12
+ # A class that emits a set of stopwords. The default is Linnaeus::Stopwords
13
+ # skip_stemming::
14
+ # Set to true to skip porter stemming.
15
+ # encoding::
16
+ # Force text to use this character set. UTF-8 by default.
17
+ # redis_host::
18
+ # Passed to persistence class constructor. Defaults to "127.0.0.1"
19
+ # redis_port::
20
+ # Passed to persistence class constructor. Defaults to "6379".
21
+ # redis_db::
22
+ # Passed to persistence class constructor. Defaults to "0".
23
+ # redis_*::
24
+ # Please see Linnaeus::Persistence for the rest of the options that are passed through directly to the Redis client connection.
2
25
  class Linnaeus::Trainer < Linnaeus
3
26
 
4
27
  # Add a document to the training corpus.
@@ -0,0 +1,82 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE DIRECTLY
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
4
+ # -*- encoding: utf-8 -*-
5
+
6
+ Gem::Specification.new do |s|
7
+ s.name = "linnaeus"
8
+ s.version = "1.0.2"
9
+
10
+ s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
+ s.authors = ["djcp"]
12
+ s.date = "2012-10-31"
13
+ s.description = "Linnaeus provides a redis-backed Bayesian classifier. Words are stemmed, stopwords are stopped, and redis is used to allow for persistent and concurrent training and classification."
14
+ s.email = "dan@collispuro.net"
15
+ s.extra_rdoc_files = [
16
+ "LICENSE.txt",
17
+ "README.rdoc"
18
+ ]
19
+ s.files = [
20
+ ".document",
21
+ ".rspec",
22
+ ".travis.yml",
23
+ "Gemfile",
24
+ "Gemfile.lock",
25
+ "LICENSE.txt",
26
+ "README.rdoc",
27
+ "Rakefile",
28
+ "VERSION",
29
+ "images/linnaeus.jpg",
30
+ "lib/linnaeus.rb",
31
+ "lib/linnaeus/classifier.rb",
32
+ "lib/linnaeus/persistence.rb",
33
+ "lib/linnaeus/stopwords.rb",
34
+ "lib/linnaeus/trainer.rb",
35
+ "linnaeus.gemspec",
36
+ "spec/linnaeus_classifier_spec.rb",
37
+ "spec/linnaeus_persistence_spec.rb",
38
+ "spec/linnaeus_spec.rb",
39
+ "spec/linnaeus_stopwords_spec.rb",
40
+ "spec/linnaeus_trainer_spec.rb",
41
+ "spec/spec_helper.rb"
42
+ ]
43
+ s.homepage = "http://github.com/djcp/linnaeus"
44
+ s.licenses = ["MIT"]
45
+ s.require_paths = ["lib"]
46
+ s.rubygems_version = "1.8.24"
47
+ s.summary = "Another redis-backed Bayesian classifier"
48
+
49
+ if s.respond_to? :specification_version then
50
+ s.specification_version = 3
51
+
52
+ if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
53
+ s.add_runtime_dependency(%q<redis>, ["~> 3.0.0"])
54
+ s.add_runtime_dependency(%q<stemmer>, ["~> 1.0.0"])
55
+ s.add_development_dependency(%q<rspec>, ["~> 2.11.0"])
56
+ s.add_development_dependency(%q<yard>, ["~> 0.7"])
57
+ s.add_development_dependency(%q<rdoc>, ["~> 3.12"])
58
+ s.add_development_dependency(%q<bundler>, [">= 0"])
59
+ s.add_development_dependency(%q<jeweler>, [">= 0"])
60
+ s.add_development_dependency(%q<simplecov>, [">= 0"])
61
+ else
62
+ s.add_dependency(%q<redis>, ["~> 3.0.0"])
63
+ s.add_dependency(%q<stemmer>, ["~> 1.0.0"])
64
+ s.add_dependency(%q<rspec>, ["~> 2.11.0"])
65
+ s.add_dependency(%q<yard>, ["~> 0.7"])
66
+ s.add_dependency(%q<rdoc>, ["~> 3.12"])
67
+ s.add_dependency(%q<bundler>, [">= 0"])
68
+ s.add_dependency(%q<jeweler>, [">= 0"])
69
+ s.add_dependency(%q<simplecov>, [">= 0"])
70
+ end
71
+ else
72
+ s.add_dependency(%q<redis>, ["~> 3.0.0"])
73
+ s.add_dependency(%q<stemmer>, ["~> 1.0.0"])
74
+ s.add_dependency(%q<rspec>, ["~> 2.11.0"])
75
+ s.add_dependency(%q<yard>, ["~> 0.7"])
76
+ s.add_dependency(%q<rdoc>, ["~> 3.12"])
77
+ s.add_dependency(%q<bundler>, [">= 0"])
78
+ s.add_dependency(%q<jeweler>, [">= 0"])
79
+ s.add_dependency(%q<simplecov>, [">= 0"])
80
+ end
81
+ end
82
+
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: linnaeus
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.0
4
+ version: 1.0.2
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-10-30 00:00:00.000000000 Z
12
+ date: 2012-10-31 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: redis
@@ -164,6 +164,7 @@ files:
164
164
  - lib/linnaeus/persistence.rb
165
165
  - lib/linnaeus/stopwords.rb
166
166
  - lib/linnaeus/trainer.rb
167
+ - linnaeus.gemspec
167
168
  - spec/linnaeus_classifier_spec.rb
168
169
  - spec/linnaeus_persistence_spec.rb
169
170
  - spec/linnaeus_spec.rb
@@ -185,7 +186,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
185
186
  version: '0'
186
187
  segments:
187
188
  - 0
188
- hash: 494428062127756217
189
+ hash: 2735128872899197737
189
190
  required_rubygems_version: !ruby/object:Gem::Requirement
190
191
  none: false
191
192
  requirements: