stuff-classifier 0.2 → 0.4

Sign up to get free protection for your applications and to get access to all the features.
data/README.md CHANGED
@@ -43,7 +43,16 @@ cls = StuffClassifier::Bayes.new("Cats or Dogs")
43
43
 
44
44
  # for the Tf-Idf based implementation
45
45
  cls = StuffClassifier::TfIdf.new("Cats or Dogs")
46
- ```
46
+
47
+ # these classifiers use word stemming by default; if stemming gives
48
+ # unexpected results, you can disable it on init:
49
+ cls = StuffClassifier::TfIdf.new("Cats or Dogs", :stemming => false)
50
+
51
+ # also by default, the parsing phase filters out stop words; to
52
+ # disable this, or to supply your own list of stop words, you can
53
+ # do this on a classifier instance:
54
+ cls.ignore_words = [ 'the', 'my', 'i', 'dont' ]
55
+ ```
47
56
 
48
57
  Training the classifier:
49
58
 
@@ -93,10 +102,46 @@ cls.classify("Who is eating my meat?")
93
102
  #=> :dog
94
103
  ```
95
104
 
96
- ## TODO
105
+ ## Persistence
106
+
107
+ Two persistence layers for saving the training data are implemented:
108
+
109
+ - in memory (by default)
110
+ - on disk
111
+
112
+ To persist the data on disk, you can do this:
113
+
114
+ ```ruby
115
+ store = StuffClassifier::FileStorage.new(@storage_path)
116
+
117
+ # global setting
118
+ StuffClassifier::Base.storage = store
97
119
 
98
- - provide more implementations
99
- - plugable storage mechanism (in-memory, on disk, database)
120
+ # or, alternatively, a local setting on instantiation, by means of an
121
+ # optional param ...
122
+ cls = StuffClassifier::Bayes.new("Cats or Dogs", :storage => store)
123
+
124
+ # after training is done, to persist the data ...
125
+ cls.save_state
126
+
127
+ # or you could just do this:
128
+ StuffClassifier::Bayes.open("Cats or Dogs") do |cls|
129
+ # when the block finishes, save_state is called automatically
130
+ end
131
+
132
+ # to start fresh, deleting the saved training data for this classifier
133
+ StuffClassifier::Bayes.new("Cats or Dogs", :purge_state => true)
134
+ ```
135
+
136
+ The name you give your classifier is important, as the data is loaded
137
+ and saved based on it. For instance, the following 3 classifiers
138
+ will be stored in different buckets, independent of each other.
139
+
140
+ ```ruby
141
+ cls1 = StuffClassifier::Bayes.new("Cats or Dogs")
142
+ cls2 = StuffClassifier::Bayes.new("True or False")
143
+ cls3 = StuffClassifier::Bayes.new("Spam or Ham")
144
+ ```
100
145
 
101
146
  ## License
102
147
 
data/Rakefile CHANGED
@@ -16,15 +16,15 @@ Rcov::RcovTask.new do |test|
16
16
  test.rcov_opts << '--exclude "gems/*"'
17
17
  end
18
18
 
19
- require 'rdoc/task'
20
- RDoc::Task.new do |rdoc|
21
- version = StuffClassifier::VERSION
22
-
23
- rdoc.rdoc_dir = 'rdoc'
24
- rdoc.title = "stuff-classifier #{version}"
25
- rdoc.rdoc_files.include('README*')
26
- rdoc.rdoc_files.include('lib/**/*.rb')
27
- end
19
+ #require 'rdoc/task'
20
+ #RDoc::Task.new do |rdoc|
21
+ # version = StuffClassifier::VERSION
22
+ #
23
+ # rdoc.rdoc_dir = 'rdoc'
24
+ # rdoc.title = "stuff-classifier #{version}"
25
+ # rdoc.rdoc_files.include('README*')
26
+ # rdoc.rdoc_files.include('lib/**/*.rb')
27
+ #end
28
28
 
29
29
  task :default => :test
30
30
 
@@ -1,8 +1,12 @@
1
1
  module StuffClassifier
2
2
  autoload :VERSION, 'stuff-classifier/version'
3
3
  autoload :STOP_WORDS, 'stuff-classifier/stop_words'
4
+
4
5
  autoload :Tokenizer, 'stuff-classifier/tokenizer'
5
6
  autoload :Base, 'stuff-classifier/base'
6
7
  autoload :Bayes, 'stuff-classifier/bayes'
7
8
  autoload :TfIdf, 'stuff-classifier/tf-idf'
9
+
10
+ autoload :InMemoryStorage, 'stuff-classifier/storage'
11
+ autoload :FileStorage, 'stuff-classifier/storage'
8
12
  end
@@ -1,12 +1,22 @@
1
1
  class StuffClassifier::Base
2
2
  include StuffClassifier::Tokenizer
3
+ attr_reader :name
3
4
 
4
5
  def initialize(name, opts={})
5
- @name = name
6
6
  @stemming = opts.key?(:stemming) ? opts[:stemming] : true
7
+ purge_state = opts[:purge_state]
8
+
9
+ @name = name
7
10
  @wcount = {}
8
11
  @ccount = {}
9
12
  @ignore_words = nil
13
+
14
+ @storage = opts[:storage] || StuffClassifier::Base.storage
15
+ unless purge_state
16
+ @storage.load_state(self)
17
+ else
18
+ @storage.purge_state(self)
19
+ end
10
20
  end
11
21
 
12
22
  def incr_word(word, category)
@@ -63,4 +73,27 @@ class StuffClassifier::Base
63
73
  # the final weighted average
64
74
  (weight * assumed_prob + totals * basic_prob) / (weight + totals)
65
75
  end
76
+
77
+ def save_state
78
+ @storage.save_state(self)
79
+ end
80
+
81
+ class << self
82
+ attr_writer :storage
83
+
84
+ def storage
85
+ @storage = StuffClassifier::InMemoryStorage.new unless defined? @storage
86
+ @storage
87
+ end
88
+
89
+ def open(name)
90
+ inst = self.new(name)
91
+ if block_given?
92
+ yield inst
93
+ inst.save_state
94
+ else
95
+ inst
96
+ end
97
+ end
98
+ end
66
99
  end
@@ -2,6 +2,8 @@
2
2
  class StuffClassifier::Bayes < StuffClassifier::Base
3
3
  # http://en.wikipedia.org/wiki/Naive_Bayes_classifier
4
4
 
5
+ attr_writer :thresholds
6
+
5
7
  def initialize(name, opts={})
6
8
  super(name, opts)
7
9
  @thresholds = {}
@@ -1,3 +1,5 @@
1
+ require 'set'
2
+
1
3
  StuffClassifier::STOP_WORDS = Set.new [
2
4
  'a', 'about', 'above', 'across', 'after', 'afterwards',
3
5
  'again', 'against', 'all', 'almost', 'alone', 'along',
@@ -0,0 +1,71 @@
1
+ require 'msgpack'
2
+
3
+ module StuffClassifier
4
+ class InMemoryStorage
5
+ def initialize
6
+ @storage = {}
7
+ end
8
+
9
+ def load_state(classifier)
10
+ if @storage.key? classifier.name
11
+ _wcount, _ccount = @storage[classifier.name]
12
+ classifier.instance_eval do
13
+ @wcount = _wcount
14
+ @ccount = _ccount
15
+ end
16
+ end
17
+ end
18
+
19
+ def save_state(classifier)
20
+ name = classifier.name
21
+ wcount = classifier.instance_variable_get :@wcount
22
+ ccount = classifier.instance_variable_get :@ccount
23
+ @storage[name] = [wcount, ccount]
24
+ end
25
+
26
+ def purge_state(classifier)
27
+ @storage.delete(classifier.name)
28
+ end
29
+ end
30
+
31
+ class FileStorage
32
+ def initialize(path)
33
+ @storage = {}
34
+ @path = path
35
+ end
36
+
37
+ def load_state(classifier)
38
+ if @storage.length == 0 && File.exists?(@path)
39
+ @storage = MessagePack.unpack(File.read(@path))
40
+ end
41
+
42
+ if @storage.key? classifier.name
43
+ _wcount, _ccount = @storage[classifier.name]
44
+ classifier.instance_eval do
45
+ @wcount = _wcount
46
+ @ccount = _ccount
47
+ end
48
+ end
49
+ end
50
+
51
+ def save_state(classifier)
52
+ name = classifier.name
53
+ wcount = classifier.instance_variable_get :@wcount
54
+ ccount = classifier.instance_variable_get :@ccount
55
+ @storage[name] = [wcount, ccount]
56
+ _write_to_file
57
+ end
58
+
59
+ def purge_state(classifier)
60
+ @storage.delete(classifier.name)
61
+ _write_to_file
62
+ end
63
+
64
+ def _write_to_file
65
+ File.open(@path, 'w') do |fh|
66
+ fh.flock(File::LOCK_EX)
67
+ fh.write(@storage.to_msgpack)
68
+ end
69
+ end
70
+ end
71
+ end
@@ -1,6 +1,7 @@
1
1
  require 'fast_stemmer'
2
2
 
3
3
  module StuffClassifier::Tokenizer
4
+ attr_writer :stemming
4
5
 
5
6
  def ignore_words=(value)
6
7
  @ignore_words = value
@@ -14,10 +15,6 @@ module StuffClassifier::Tokenizer
14
15
  defined?(@stemming) ? @stemming : false
15
16
  end
16
17
 
17
- def stemming=(value)
18
- @stemming = value
19
- end
20
-
21
18
  def each_word(string)
22
19
  string = string.strip
23
20
  return if string == ''
@@ -37,8 +34,7 @@ module StuffClassifier::Tokenizer
37
34
  w = w.downcase
38
35
  end
39
36
 
40
- yield w if block_given?
41
- words << w
37
+ words << (block_given? ? (yield w) : w)
42
38
  end
43
39
  end
44
40
 
@@ -1,3 +1,3 @@
1
1
  module StuffClassifier
2
- VERSION = '0.2'
2
+ VERSION = '0.4'
3
3
  end
@@ -17,7 +17,11 @@ Gem::Specification.new do |s|
17
17
  s.require_paths = ["lib"]
18
18
 
19
19
  s.add_runtime_dependency "fast-stemmer", ">= 1.0"
20
+ s.add_runtime_dependency "sqlite3"
21
+ s.add_runtime_dependency "sequel"
22
+ s.add_runtime_dependency "msgpack"
20
23
 
24
+ s.add_development_dependency "ruby-debug19"
21
25
  s.add_development_dependency "bundler"
22
26
  s.add_development_dependency "rake", ">= 0.9.2"
23
27
  s.add_development_dependency "minitest", ">= 2.10"
@@ -0,0 +1,31 @@
1
+ require 'helper'
2
+
3
+
4
+ class Test004InMemoryStorage < TestBase
5
+ before do
6
+ StuffClassifier::Base.storage = StuffClassifier::InMemoryStorage.new
7
+
8
+ StuffClassifier::Bayes.open("Cats or Dogs") do |cls|
9
+ cls.train(:dog, "Dogs are awesome, cats too. I love my dog")
10
+ cls.train(:cat, "Cats are more preferred by software developers. I never could stand cats. I have a dog")
11
+ end
12
+ end
13
+
14
+ def test_for_persistance
15
+ test = self
16
+ StuffClassifier::Bayes.new("Cats or Dogs").instance_eval do
17
+ test.assert @storage.instance_of?(StuffClassifier::InMemoryStorage),
18
+ "@storage should be an instance of FileStorage"
19
+ test.assert @wcount.length > 0, "Word count should be persisted"
20
+ test.assert @ccount.length > 0, "Category count should be persisted"
21
+ end
22
+ end
23
+
24
+ def test_purge_state
25
+ test = self
26
+ StuffClassifier::Bayes.new("Cats or Dogs", :purge_state => true).instance_eval do
27
+ test.assert @wcount.length == 0, "Word count should be purged"
28
+ test.assert @ccount.length == 0, "Category count should be purged"
29
+ end
30
+ end
31
+ end
@@ -0,0 +1,54 @@
1
+ require 'helper'
2
+
3
+
4
+ class Test005FileStorage < TestBase
5
+ before do
6
+ @storage_path = "/tmp/test_classifier.db"
7
+ @storage = StuffClassifier::FileStorage.new(@storage_path)
8
+ StuffClassifier::Base.storage = @storage
9
+
10
+ StuffClassifier::Bayes.open("Cats or Dogs") do |cls|
11
+ cls.train(:dog, "Dogs are awesome, cats too. I love my dog")
12
+ cls.train(:cat, "Cats are more preferred by software developers. I never could stand cats. I have a dog")
13
+ end
14
+
15
+ # redefining storage instance, forcing it to read from file again
16
+ StuffClassifier::Base.storage = StuffClassifier::FileStorage.new(@storage_path)
17
+ end
18
+
19
+ def teardown
20
+ File.unlink @storage_path if File.exists? @storage_path
21
+ end
22
+
23
+ def test_for_persistance
24
+ assert ! @storage.equal?(StuffClassifier::Base.storage),
25
+ "Storage instance should not be the same"
26
+
27
+ test = self
28
+ StuffClassifier::Bayes.new("Cats or Dogs").instance_eval do
29
+ test.assert @storage.instance_of?(StuffClassifier::FileStorage),
30
+ "@storage should be an instance of FileStorage"
31
+ test.assert @wcount.length > 0, "Word count should be persisted"
32
+ test.assert @ccount.length > 0, "Category count should be persisted"
33
+ end
34
+ end
35
+
36
+ def test_file_created
37
+ assert File.exist?(@storage_path),
38
+ "File #@storage_path should exist"
39
+
40
+ content = File.read(@storage_path)
41
+ assert content.length > 100,
42
+ "Serialized content should have more than 100 chars"
43
+ end
44
+
45
+ def test_purge_state
46
+ test = self
47
+ StuffClassifier::Bayes.new("Cats or Dogs", :purge_state => true).instance_eval do
48
+ test.assert @storage.instance_of?(StuffClassifier::FileStorage),
49
+ "@storage should be an instance of FileStorage"
50
+ test.assert @wcount.length == 0, "Word count should be purged"
51
+ test.assert @ccount.length == 0, "Category count should be purged"
52
+ end
53
+ end
54
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: stuff-classifier
3
3
  version: !ruby/object:Gem::Version
4
- version: '0.2'
4
+ version: '0.4'
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-01-19 00:00:00.000000000Z
12
+ date: 2012-01-20 00:00:00.000000000Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: fast-stemmer
16
- requirement: &77488640 !ruby/object:Gem::Requirement
16
+ requirement: &77637680 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,10 +21,54 @@ dependencies:
21
21
  version: '1.0'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *77488640
24
+ version_requirements: *77637680
25
+ - !ruby/object:Gem::Dependency
26
+ name: sqlite3
27
+ requirement: &77637470 !ruby/object:Gem::Requirement
28
+ none: false
29
+ requirements:
30
+ - - ! '>='
31
+ - !ruby/object:Gem::Version
32
+ version: '0'
33
+ type: :runtime
34
+ prerelease: false
35
+ version_requirements: *77637470
36
+ - !ruby/object:Gem::Dependency
37
+ name: sequel
38
+ requirement: &77637240 !ruby/object:Gem::Requirement
39
+ none: false
40
+ requirements:
41
+ - - ! '>='
42
+ - !ruby/object:Gem::Version
43
+ version: '0'
44
+ type: :runtime
45
+ prerelease: false
46
+ version_requirements: *77637240
47
+ - !ruby/object:Gem::Dependency
48
+ name: msgpack
49
+ requirement: &77637030 !ruby/object:Gem::Requirement
50
+ none: false
51
+ requirements:
52
+ - - ! '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ type: :runtime
56
+ prerelease: false
57
+ version_requirements: *77637030
58
+ - !ruby/object:Gem::Dependency
59
+ name: ruby-debug19
60
+ requirement: &77636820 !ruby/object:Gem::Requirement
61
+ none: false
62
+ requirements:
63
+ - - ! '>='
64
+ - !ruby/object:Gem::Version
65
+ version: '0'
66
+ type: :development
67
+ prerelease: false
68
+ version_requirements: *77636820
25
69
  - !ruby/object:Gem::Dependency
26
70
  name: bundler
27
- requirement: &77549150 !ruby/object:Gem::Requirement
71
+ requirement: &77636610 !ruby/object:Gem::Requirement
28
72
  none: false
29
73
  requirements:
30
74
  - - ! '>='
@@ -32,10 +76,10 @@ dependencies:
32
76
  version: '0'
33
77
  type: :development
34
78
  prerelease: false
35
- version_requirements: *77549150
79
+ version_requirements: *77636610
36
80
  - !ruby/object:Gem::Dependency
37
81
  name: rake
38
- requirement: &77548880 !ruby/object:Gem::Requirement
82
+ requirement: &77636360 !ruby/object:Gem::Requirement
39
83
  none: false
40
84
  requirements:
41
85
  - - ! '>='
@@ -43,10 +87,10 @@ dependencies:
43
87
  version: 0.9.2
44
88
  type: :development
45
89
  prerelease: false
46
- version_requirements: *77548880
90
+ version_requirements: *77636360
47
91
  - !ruby/object:Gem::Dependency
48
92
  name: minitest
49
- requirement: &77548630 !ruby/object:Gem::Requirement
93
+ requirement: &77636110 !ruby/object:Gem::Requirement
50
94
  none: false
51
95
  requirements:
52
96
  - - ! '>='
@@ -54,10 +98,10 @@ dependencies:
54
98
  version: '2.10'
55
99
  type: :development
56
100
  prerelease: false
57
- version_requirements: *77548630
101
+ version_requirements: *77636110
58
102
  - !ruby/object:Gem::Dependency
59
103
  name: turn
60
- requirement: &77548400 !ruby/object:Gem::Requirement
104
+ requirement: &77635880 !ruby/object:Gem::Requirement
61
105
  none: false
62
106
  requirements:
63
107
  - - ! '>='
@@ -65,10 +109,10 @@ dependencies:
65
109
  version: 0.8.3
66
110
  type: :development
67
111
  prerelease: false
68
- version_requirements: *77548400
112
+ version_requirements: *77635880
69
113
  - !ruby/object:Gem::Dependency
70
114
  name: rcov
71
- requirement: &77548170 !ruby/object:Gem::Requirement
115
+ requirement: &77635650 !ruby/object:Gem::Requirement
72
116
  none: false
73
117
  requirements:
74
118
  - - ! '>='
@@ -76,7 +120,7 @@ dependencies:
76
120
  version: '0.9'
77
121
  type: :development
78
122
  prerelease: false
79
- version_requirements: *77548170
123
+ version_requirements: *77635650
80
124
  description: 2 methods are provided for now - (1) naive bayes implementation + (2)
81
125
  tf-idf weights
82
126
  email:
@@ -85,9 +129,7 @@ executables: []
85
129
  extensions: []
86
130
  extra_rdoc_files: []
87
131
  files:
88
- - .gitignore
89
132
  - Gemfile
90
- - Gemfile.lock
91
133
  - LICENSE.txt
92
134
  - README.md
93
135
  - Rakefile
@@ -95,15 +137,17 @@ files:
95
137
  - lib/stuff-classifier/base.rb
96
138
  - lib/stuff-classifier/bayes.rb
97
139
  - lib/stuff-classifier/stop_words.rb
140
+ - lib/stuff-classifier/storage.rb
98
141
  - lib/stuff-classifier/tf-idf.rb
99
142
  - lib/stuff-classifier/tokenizer.rb
100
143
  - lib/stuff-classifier/version.rb
101
- - stuff-classifier-0.1.gem
102
144
  - stuff-classifier.gemspec
103
145
  - test/helper.rb
104
146
  - test/test_001_tokenizer.rb
105
147
  - test/test_002_naive_bayes.rb
106
148
  - test/test_003_tf_idf.rb
149
+ - test/test_004_in_memory_storage.rb
150
+ - test/test_005_file_storage.rb
107
151
  homepage: https://github.com/alexandru/stuff-classifier/
108
152
  licenses: []
109
153
  post_install_message:
data/.gitignore DELETED
@@ -1,48 +0,0 @@
1
- # rcov generated
2
- coverage
3
-
4
- # rdoc generated
5
- rdoc
6
-
7
- # yard generated
8
- doc
9
- .yardoc
10
-
11
- # bundler
12
- .bundle
13
-
14
- # jeweler generated
15
- pkg
16
-
17
- # Have editor/IDE/OS specific files you need to ignore? Consider using a global gitignore:
18
- #
19
- # * Create a file at ~/.gitignore
20
- # * Include files you want ignored
21
- # * Run: git config --global core.excludesfile ~/.gitignore
22
- #
23
- # After doing this, these files will be ignored in all your git projects,
24
- # saving you from having to 'pollute' every project you touch with them
25
- #
26
- # Not sure what to needs to be ignored for particular editors/OSes? Here's some ideas to get you started. (Remember, remove the leading # of the line)
27
- #
28
- # For MacOS:
29
- #
30
- #.DS_Store
31
-
32
- # For TextMate
33
- #*.tmproj
34
- #tmtags
35
-
36
- # For emacs:
37
- #*~
38
- #\#*
39
- #.\#*
40
-
41
- # For vim:
42
- #*.swp
43
-
44
- # For redcar:
45
- #.redcar
46
-
47
- # For rubinius:
48
- #*.rbc
@@ -1,46 +0,0 @@
1
- PATH
2
- remote: .
3
- specs:
4
- stuff-classifier (0.1)
5
- fast-stemmer (>= 1.0)
6
-
7
- GEM
8
- remote: http://rubygems.org/
9
- specs:
10
- ansi (1.4.1)
11
- archive-tar-minitar (0.5.2)
12
- columnize (0.3.4)
13
- fast-stemmer (1.0.0)
14
- json (1.6.5)
15
- linecache19 (0.5.12)
16
- ruby_core_source (>= 0.1.4)
17
- minitest (2.10.1)
18
- rake (0.9.2.2)
19
- rcov (0.9.11)
20
- rdoc (3.12)
21
- json (~> 1.4)
22
- ruby-debug-base19 (0.11.25)
23
- columnize (>= 0.3.1)
24
- linecache19 (>= 0.5.11)
25
- ruby_core_source (>= 0.1.4)
26
- ruby-debug19 (0.11.6)
27
- columnize (>= 0.3.1)
28
- linecache19 (>= 0.5.11)
29
- ruby-debug-base19 (>= 0.11.19)
30
- ruby_core_source (0.1.5)
31
- archive-tar-minitar (>= 0.5.2)
32
- turn (0.8.3)
33
- ansi
34
-
35
- PLATFORMS
36
- ruby
37
-
38
- DEPENDENCIES
39
- bundler
40
- minitest (>= 2.10)
41
- rake (>= 0.9.2)
42
- rcov (>= 0.9)
43
- rdoc (>= 3.1)
44
- ruby-debug19
45
- stuff-classifier!
46
- turn (>= 0.8.3)
Binary file