stuff-classifier 0.2 → 0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.md CHANGED
@@ -43,7 +43,16 @@ cls = StuffClassifier::Bayes.new("Cats or Dogs")
43
43
 
44
44
  # for the Tf-Idf based implementation
45
45
  cls = StuffClassifier::TfIdf.new("Cats or Dogs")
46
- ```
46
+
47
+ # these classifiers use word stemming by default, but if it has weird
48
+ # behavior, then you can disable it on init:
49
+ cls = StuffClassifier::TfIdf.new("Cats or Dogs", :stemming => false)
50
+
51
+ # also by default, the parsing phase filters out stop words, to
52
+ # disable or to come up with your own list of stop words, on a
53
+ # classifier instance you can do this:
54
+ cls.ignore_words = [ 'the', 'my', 'i', 'dont' ]
55
+ ```
47
56
 
48
57
  Training the classifier:
49
58
 
@@ -93,10 +102,46 @@ cls.classify("Who is eating my meat?")
93
102
  #=> :dog
94
103
  ```
95
104
 
96
- ## TODO
105
+ ## Persistence
106
+
107
+ Two persistence layers for saving the training data are implemented:
108
+
109
+ - in memory (by default)
110
+ - on disk
111
+
112
+ To persist the data on disk, you can do this:
113
+
114
+ ```ruby
115
+ store = StuffClassifier::FileStorage.new(@storage_path)
116
+
117
+ # global setting
118
+ StuffClassifier::Base.storage = store
97
119
 
98
- - provide more implementations
99
- - plugable storage mechanism (in-memory, on disk, database)
120
+ # or alternative local setting on instantiation, by means of an
121
+ # optional param ...
122
+ cls = StuffClassifier::Bayes.new("Cats or Dogs", :storage => store)
123
+
124
+ # after training is done, to persist the data ...
125
+ cls.save_state
126
+
127
+ # or you could just do this:
128
+ StuffClassifier::Bayes.open("Cats or Dogs") do |cls|
129
+ # when the block finishes, save_state is called automatically
130
+ end
131
+
132
+ # to start fresh, deleting the saved training data for this classifier
133
+ StuffClassifier::Bayes.new("Cats or Dogs", :purge_state => true)
134
+ ```
135
+
136
+ The name you give your classifier is important, as the data is
137
+ loaded and saved under that name. For instance, the following 3 classifiers
138
+ are stored in different buckets, independent of each other.
139
+
140
+ ```ruby
141
+ cls1 = StuffClassifier::Bayes.new("Cats or Dogs")
142
+ cls2 = StuffClassifier::Bayes.new("True or False")
143
+ cls3 = StuffClassifier::Bayes.new("Spam or Ham")
144
+ ```
100
145
 
101
146
  ## License
102
147
 
data/Rakefile CHANGED
@@ -16,15 +16,15 @@ Rcov::RcovTask.new do |test|
16
16
  test.rcov_opts << '--exclude "gems/*"'
17
17
  end
18
18
 
19
- require 'rdoc/task'
20
- RDoc::Task.new do |rdoc|
21
- version = StuffClassifier::VERSION
22
-
23
- rdoc.rdoc_dir = 'rdoc'
24
- rdoc.title = "stuff-classifier #{version}"
25
- rdoc.rdoc_files.include('README*')
26
- rdoc.rdoc_files.include('lib/**/*.rb')
27
- end
19
+ #require 'rdoc/task'
20
+ #RDoc::Task.new do |rdoc|
21
+ # version = StuffClassifier::VERSION
22
+ #
23
+ # rdoc.rdoc_dir = 'rdoc'
24
+ # rdoc.title = "stuff-classifier #{version}"
25
+ # rdoc.rdoc_files.include('README*')
26
+ # rdoc.rdoc_files.include('lib/**/*.rb')
27
+ #end
28
28
 
29
29
  task :default => :test
30
30
 
@@ -1,8 +1,12 @@
1
1
  module StuffClassifier
2
2
  autoload :VERSION, 'stuff-classifier/version'
3
3
  autoload :STOP_WORDS, 'stuff-classifier/stop_words'
4
+
4
5
  autoload :Tokenizer, 'stuff-classifier/tokenizer'
5
6
  autoload :Base, 'stuff-classifier/base'
6
7
  autoload :Bayes, 'stuff-classifier/bayes'
7
8
  autoload :TfIdf, 'stuff-classifier/tf-idf'
9
+
10
+ autoload :InMemoryStorage, 'stuff-classifier/storage'
11
+ autoload :FileStorage, 'stuff-classifier/storage'
8
12
  end
@@ -1,12 +1,22 @@
1
1
  class StuffClassifier::Base
2
2
  include StuffClassifier::Tokenizer
3
+ attr_reader :name
3
4
 
4
5
  def initialize(name, opts={})
5
- @name = name
6
6
  @stemming = opts.key?(:stemming) ? opts[:stemming] : true
7
+ purge_state = opts[:purge_state]
8
+
9
+ @name = name
7
10
  @wcount = {}
8
11
  @ccount = {}
9
12
  @ignore_words = nil
13
+
14
+ @storage = opts[:storage] || StuffClassifier::Base.storage
15
+ unless purge_state
16
+ @storage.load_state(self)
17
+ else
18
+ @storage.purge_state(self)
19
+ end
10
20
  end
11
21
 
12
22
  def incr_word(word, category)
@@ -63,4 +73,27 @@ class StuffClassifier::Base
63
73
  # the final weighted average
64
74
  (weight * assumed_prob + totals * basic_prob) / (weight + totals)
65
75
  end
76
+
77
+ def save_state
78
+ @storage.save_state(self)
79
+ end
80
+
81
+ class << self
82
+ attr_writer :storage
83
+
84
+ def storage
85
+ @storage = StuffClassifier::InMemoryStorage.new unless defined? @storage
86
+ @storage
87
+ end
88
+
89
+ def open(name)
90
+ inst = self.new(name)
91
+ if block_given?
92
+ yield inst
93
+ inst.save_state
94
+ else
95
+ inst
96
+ end
97
+ end
98
+ end
66
99
  end
@@ -2,6 +2,8 @@
2
2
  class StuffClassifier::Bayes < StuffClassifier::Base
3
3
  # http://en.wikipedia.org/wiki/Naive_Bayes_classifier
4
4
 
5
+ attr_writer :thresholds
6
+
5
7
  def initialize(name, opts={})
6
8
  super(name, opts)
7
9
  @thresholds = {}
@@ -1,3 +1,5 @@
1
+ require 'set'
2
+
1
3
  StuffClassifier::STOP_WORDS = Set.new [
2
4
  'a', 'about', 'above', 'across', 'after', 'afterwards',
3
5
  'again', 'against', 'all', 'almost', 'alone', 'along',
@@ -0,0 +1,71 @@
1
+ require 'msgpack'
2
+
3
+ module StuffClassifier
4
+ class InMemoryStorage
5
+ def initialize
6
+ @storage = {}
7
+ end
8
+
9
+ def load_state(classifier)
10
+ if @storage.key? classifier.name
11
+ _wcount, _ccount = @storage[classifier.name]
12
+ classifier.instance_eval do
13
+ @wcount = _wcount
14
+ @ccount = _ccount
15
+ end
16
+ end
17
+ end
18
+
19
+ def save_state(classifier)
20
+ name = classifier.name
21
+ wcount = classifier.instance_variable_get :@wcount
22
+ ccount = classifier.instance_variable_get :@ccount
23
+ @storage[name] = [wcount, ccount]
24
+ end
25
+
26
+ def purge_state(classifier)
27
+ @storage.delete(classifier.name)
28
+ end
29
+ end
30
+
31
+ class FileStorage
32
+ def initialize(path)
33
+ @storage = {}
34
+ @path = path
35
+ end
36
+
37
+ def load_state(classifier)
38
+ if @storage.length == 0 && File.exists?(@path)
39
+ @storage = MessagePack.unpack(File.read(@path))
40
+ end
41
+
42
+ if @storage.key? classifier.name
43
+ _wcount, _ccount = @storage[classifier.name]
44
+ classifier.instance_eval do
45
+ @wcount = _wcount
46
+ @ccount = _ccount
47
+ end
48
+ end
49
+ end
50
+
51
+ def save_state(classifier)
52
+ name = classifier.name
53
+ wcount = classifier.instance_variable_get :@wcount
54
+ ccount = classifier.instance_variable_get :@ccount
55
+ @storage[name] = [wcount, ccount]
56
+ _write_to_file
57
+ end
58
+
59
+ def purge_state(classifier)
60
+ @storage.delete(classifier.name)
61
+ _write_to_file
62
+ end
63
+
64
+ def _write_to_file
65
+ File.open(@path, 'w') do |fh|
66
+ fh.flock(File::LOCK_EX)
67
+ fh.write(@storage.to_msgpack)
68
+ end
69
+ end
70
+ end
71
+ end
@@ -1,6 +1,7 @@
1
1
  require 'fast_stemmer'
2
2
 
3
3
  module StuffClassifier::Tokenizer
4
+ attr_writer :stemming
4
5
 
5
6
  def ignore_words=(value)
6
7
  @ignore_words = value
@@ -14,10 +15,6 @@ module StuffClassifier::Tokenizer
14
15
  defined?(@stemming) ? @stemming : false
15
16
  end
16
17
 
17
- def stemming=(value)
18
- @stemming = value
19
- end
20
-
21
18
  def each_word(string)
22
19
  string = string.strip
23
20
  return if string == ''
@@ -37,8 +34,7 @@ module StuffClassifier::Tokenizer
37
34
  w = w.downcase
38
35
  end
39
36
 
40
- yield w if block_given?
41
- words << w
37
+ words << (block_given? ? (yield w) : w)
42
38
  end
43
39
  end
44
40
 
@@ -1,3 +1,3 @@
1
1
  module StuffClassifier
2
- VERSION = '0.2'
2
+ VERSION = '0.4'
3
3
  end
@@ -17,7 +17,11 @@ Gem::Specification.new do |s|
17
17
  s.require_paths = ["lib"]
18
18
 
19
19
  s.add_runtime_dependency "fast-stemmer", ">= 1.0"
20
+ s.add_runtime_dependency "sqlite3"
21
+ s.add_runtime_dependency "sequel"
22
+ s.add_runtime_dependency "msgpack"
20
23
 
24
+ s.add_development_dependency "ruby-debug19"
21
25
  s.add_development_dependency "bundler"
22
26
  s.add_development_dependency "rake", ">= 0.9.2"
23
27
  s.add_development_dependency "minitest", ">= 2.10"
@@ -0,0 +1,31 @@
1
+ require 'helper'
2
+
3
+
4
+ class Test004InMemoryStorage < TestBase
5
+ before do
6
+ StuffClassifier::Base.storage = StuffClassifier::InMemoryStorage.new
7
+
8
+ StuffClassifier::Bayes.open("Cats or Dogs") do |cls|
9
+ cls.train(:dog, "Dogs are awesome, cats too. I love my dog")
10
+ cls.train(:cat, "Cats are more preferred by software developers. I never could stand cats. I have a dog")
11
+ end
12
+ end
13
+
14
+ def test_for_persistance
15
+ test = self
16
+ StuffClassifier::Bayes.new("Cats or Dogs").instance_eval do
17
+ test.assert @storage.instance_of?(StuffClassifier::InMemoryStorage),
18
+ "@storage should be an instance of FileStorage"
19
+ test.assert @wcount.length > 0, "Word count should be persisted"
20
+ test.assert @ccount.length > 0, "Category count should be persisted"
21
+ end
22
+ end
23
+
24
+ def test_purge_state
25
+ test = self
26
+ StuffClassifier::Bayes.new("Cats or Dogs", :purge_state => true).instance_eval do
27
+ test.assert @wcount.length == 0, "Word count should be purged"
28
+ test.assert @ccount.length == 0, "Category count should be purged"
29
+ end
30
+ end
31
+ end
@@ -0,0 +1,54 @@
1
+ require 'helper'
2
+
3
+
4
+ class Test005FileStorage < TestBase
5
+ before do
6
+ @storage_path = "/tmp/test_classifier.db"
7
+ @storage = StuffClassifier::FileStorage.new(@storage_path)
8
+ StuffClassifier::Base.storage = @storage
9
+
10
+ StuffClassifier::Bayes.open("Cats or Dogs") do |cls|
11
+ cls.train(:dog, "Dogs are awesome, cats too. I love my dog")
12
+ cls.train(:cat, "Cats are more preferred by software developers. I never could stand cats. I have a dog")
13
+ end
14
+
15
+ # redefining storage instance, forcing it to read from file again
16
+ StuffClassifier::Base.storage = StuffClassifier::FileStorage.new(@storage_path)
17
+ end
18
+
19
+ def teardown
20
+ File.unlink @storage_path if File.exists? @storage_path
21
+ end
22
+
23
+ def test_for_persistance
24
+ assert ! @storage.equal?(StuffClassifier::Base.storage),
25
+ "Storage instance should not be the same"
26
+
27
+ test = self
28
+ StuffClassifier::Bayes.new("Cats or Dogs").instance_eval do
29
+ test.assert @storage.instance_of?(StuffClassifier::FileStorage),
30
+ "@storage should be an instance of FileStorage"
31
+ test.assert @wcount.length > 0, "Word count should be persisted"
32
+ test.assert @ccount.length > 0, "Category count should be persisted"
33
+ end
34
+ end
35
+
36
+ def test_file_created
37
+ assert File.exist?(@storage_path),
38
+ "File #@storage_path should exist"
39
+
40
+ content = File.read(@storage_path)
41
+ assert content.length > 100,
42
+ "Serialized content should have more than 100 chars"
43
+ end
44
+
45
+ def test_purge_state
46
+ test = self
47
+ StuffClassifier::Bayes.new("Cats or Dogs", :purge_state => true).instance_eval do
48
+ test.assert @storage.instance_of?(StuffClassifier::FileStorage),
49
+ "@storage should be an instance of FileStorage"
50
+ test.assert @wcount.length == 0, "Word count should be purged"
51
+ test.assert @ccount.length == 0, "Category count should be purged"
52
+ end
53
+ end
54
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: stuff-classifier
3
3
  version: !ruby/object:Gem::Version
4
- version: '0.2'
4
+ version: '0.4'
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-01-19 00:00:00.000000000Z
12
+ date: 2012-01-20 00:00:00.000000000Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: fast-stemmer
16
- requirement: &77488640 !ruby/object:Gem::Requirement
16
+ requirement: &77637680 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,10 +21,54 @@ dependencies:
21
21
  version: '1.0'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *77488640
24
+ version_requirements: *77637680
25
+ - !ruby/object:Gem::Dependency
26
+ name: sqlite3
27
+ requirement: &77637470 !ruby/object:Gem::Requirement
28
+ none: false
29
+ requirements:
30
+ - - ! '>='
31
+ - !ruby/object:Gem::Version
32
+ version: '0'
33
+ type: :runtime
34
+ prerelease: false
35
+ version_requirements: *77637470
36
+ - !ruby/object:Gem::Dependency
37
+ name: sequel
38
+ requirement: &77637240 !ruby/object:Gem::Requirement
39
+ none: false
40
+ requirements:
41
+ - - ! '>='
42
+ - !ruby/object:Gem::Version
43
+ version: '0'
44
+ type: :runtime
45
+ prerelease: false
46
+ version_requirements: *77637240
47
+ - !ruby/object:Gem::Dependency
48
+ name: msgpack
49
+ requirement: &77637030 !ruby/object:Gem::Requirement
50
+ none: false
51
+ requirements:
52
+ - - ! '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ type: :runtime
56
+ prerelease: false
57
+ version_requirements: *77637030
58
+ - !ruby/object:Gem::Dependency
59
+ name: ruby-debug19
60
+ requirement: &77636820 !ruby/object:Gem::Requirement
61
+ none: false
62
+ requirements:
63
+ - - ! '>='
64
+ - !ruby/object:Gem::Version
65
+ version: '0'
66
+ type: :development
67
+ prerelease: false
68
+ version_requirements: *77636820
25
69
  - !ruby/object:Gem::Dependency
26
70
  name: bundler
27
- requirement: &77549150 !ruby/object:Gem::Requirement
71
+ requirement: &77636610 !ruby/object:Gem::Requirement
28
72
  none: false
29
73
  requirements:
30
74
  - - ! '>='
@@ -32,10 +76,10 @@ dependencies:
32
76
  version: '0'
33
77
  type: :development
34
78
  prerelease: false
35
- version_requirements: *77549150
79
+ version_requirements: *77636610
36
80
  - !ruby/object:Gem::Dependency
37
81
  name: rake
38
- requirement: &77548880 !ruby/object:Gem::Requirement
82
+ requirement: &77636360 !ruby/object:Gem::Requirement
39
83
  none: false
40
84
  requirements:
41
85
  - - ! '>='
@@ -43,10 +87,10 @@ dependencies:
43
87
  version: 0.9.2
44
88
  type: :development
45
89
  prerelease: false
46
- version_requirements: *77548880
90
+ version_requirements: *77636360
47
91
  - !ruby/object:Gem::Dependency
48
92
  name: minitest
49
- requirement: &77548630 !ruby/object:Gem::Requirement
93
+ requirement: &77636110 !ruby/object:Gem::Requirement
50
94
  none: false
51
95
  requirements:
52
96
  - - ! '>='
@@ -54,10 +98,10 @@ dependencies:
54
98
  version: '2.10'
55
99
  type: :development
56
100
  prerelease: false
57
- version_requirements: *77548630
101
+ version_requirements: *77636110
58
102
  - !ruby/object:Gem::Dependency
59
103
  name: turn
60
- requirement: &77548400 !ruby/object:Gem::Requirement
104
+ requirement: &77635880 !ruby/object:Gem::Requirement
61
105
  none: false
62
106
  requirements:
63
107
  - - ! '>='
@@ -65,10 +109,10 @@ dependencies:
65
109
  version: 0.8.3
66
110
  type: :development
67
111
  prerelease: false
68
- version_requirements: *77548400
112
+ version_requirements: *77635880
69
113
  - !ruby/object:Gem::Dependency
70
114
  name: rcov
71
- requirement: &77548170 !ruby/object:Gem::Requirement
115
+ requirement: &77635650 !ruby/object:Gem::Requirement
72
116
  none: false
73
117
  requirements:
74
118
  - - ! '>='
@@ -76,7 +120,7 @@ dependencies:
76
120
  version: '0.9'
77
121
  type: :development
78
122
  prerelease: false
79
- version_requirements: *77548170
123
+ version_requirements: *77635650
80
124
  description: 2 methods are provided for now - (1) naive bayes implementation + (2)
81
125
  tf-idf weights
82
126
  email:
@@ -85,9 +129,7 @@ executables: []
85
129
  extensions: []
86
130
  extra_rdoc_files: []
87
131
  files:
88
- - .gitignore
89
132
  - Gemfile
90
- - Gemfile.lock
91
133
  - LICENSE.txt
92
134
  - README.md
93
135
  - Rakefile
@@ -95,15 +137,17 @@ files:
95
137
  - lib/stuff-classifier/base.rb
96
138
  - lib/stuff-classifier/bayes.rb
97
139
  - lib/stuff-classifier/stop_words.rb
140
+ - lib/stuff-classifier/storage.rb
98
141
  - lib/stuff-classifier/tf-idf.rb
99
142
  - lib/stuff-classifier/tokenizer.rb
100
143
  - lib/stuff-classifier/version.rb
101
- - stuff-classifier-0.1.gem
102
144
  - stuff-classifier.gemspec
103
145
  - test/helper.rb
104
146
  - test/test_001_tokenizer.rb
105
147
  - test/test_002_naive_bayes.rb
106
148
  - test/test_003_tf_idf.rb
149
+ - test/test_004_in_memory_storage.rb
150
+ - test/test_005_file_storage.rb
107
151
  homepage: https://github.com/alexandru/stuff-classifier/
108
152
  licenses: []
109
153
  post_install_message:
data/.gitignore DELETED
@@ -1,48 +0,0 @@
1
- # rcov generated
2
- coverage
3
-
4
- # rdoc generated
5
- rdoc
6
-
7
- # yard generated
8
- doc
9
- .yardoc
10
-
11
- # bundler
12
- .bundle
13
-
14
- # jeweler generated
15
- pkg
16
-
17
- # Have editor/IDE/OS specific files you need to ignore? Consider using a global gitignore:
18
- #
19
- # * Create a file at ~/.gitignore
20
- # * Include files you want ignored
21
- # * Run: git config --global core.excludesfile ~/.gitignore
22
- #
23
- # After doing this, these files will be ignored in all your git projects,
24
- # saving you from having to 'pollute' every project you touch with them
25
- #
26
- # Not sure what to needs to be ignored for particular editors/OSes? Here's some ideas to get you started. (Remember, remove the leading # of the line)
27
- #
28
- # For MacOS:
29
- #
30
- #.DS_Store
31
-
32
- # For TextMate
33
- #*.tmproj
34
- #tmtags
35
-
36
- # For emacs:
37
- #*~
38
- #\#*
39
- #.\#*
40
-
41
- # For vim:
42
- #*.swp
43
-
44
- # For redcar:
45
- #.redcar
46
-
47
- # For rubinius:
48
- #*.rbc
@@ -1,46 +0,0 @@
1
- PATH
2
- remote: .
3
- specs:
4
- stuff-classifier (0.1)
5
- fast-stemmer (>= 1.0)
6
-
7
- GEM
8
- remote: http://rubygems.org/
9
- specs:
10
- ansi (1.4.1)
11
- archive-tar-minitar (0.5.2)
12
- columnize (0.3.4)
13
- fast-stemmer (1.0.0)
14
- json (1.6.5)
15
- linecache19 (0.5.12)
16
- ruby_core_source (>= 0.1.4)
17
- minitest (2.10.1)
18
- rake (0.9.2.2)
19
- rcov (0.9.11)
20
- rdoc (3.12)
21
- json (~> 1.4)
22
- ruby-debug-base19 (0.11.25)
23
- columnize (>= 0.3.1)
24
- linecache19 (>= 0.5.11)
25
- ruby_core_source (>= 0.1.4)
26
- ruby-debug19 (0.11.6)
27
- columnize (>= 0.3.1)
28
- linecache19 (>= 0.5.11)
29
- ruby-debug-base19 (>= 0.11.19)
30
- ruby_core_source (0.1.5)
31
- archive-tar-minitar (>= 0.5.2)
32
- turn (0.8.3)
33
- ansi
34
-
35
- PLATFORMS
36
- ruby
37
-
38
- DEPENDENCIES
39
- bundler
40
- minitest (>= 2.10)
41
- rake (>= 0.9.2)
42
- rcov (>= 0.9)
43
- rdoc (>= 3.1)
44
- ruby-debug19
45
- stuff-classifier!
46
- turn (>= 0.8.3)
Binary file