classifier 1.3.3 → 1.3.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/Gemfile +5 -0
- data/Gemfile.lock +26 -0
- data/{README → README.markdown} +37 -28
- data/Rakefile +1 -13
- data/lib/classifier/bayes.rb +7 -0
- data/lib/classifier/extensions/vector.rb +1 -1
- data/lib/classifier/extensions/word_hash.rb +21 -10
- data/test/extensions/word_hash_test.rb +21 -0
- metadata +40 -43
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 3c53668ddd328fb78862c67723b185df9c2aa717
|
4
|
+
data.tar.gz: 3655405d082fdd8a01e4ca893a70360ca9f62322
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 40b7395e2f04f56bdbabb49a4d0013dba36e9c1325ae66e5bff92451059c5b559677aaea30e50f8f2fbbae58e50bf0f084925ef38e0e3d3fb729e37e357469d4
|
7
|
+
data.tar.gz: 150f8f387706d870a37e86b0418c5e68ad386b82518294bdf21585ab3509fd98515648bc5e06dfb78b97f1e544099fe1da5ddcd69413826e0ccc39780d457940
|
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
GEM
|
2
|
+
remote: https://rubygems.org/
|
3
|
+
specs:
|
4
|
+
diff-lcs (1.2.5)
|
5
|
+
fast-stemmer (1.0.2)
|
6
|
+
json (1.8.1)
|
7
|
+
rake (10.1.1)
|
8
|
+
rdoc (4.1.0)
|
9
|
+
json (~> 1.4)
|
10
|
+
rspec (2.14.1)
|
11
|
+
rspec-core (~> 2.14.0)
|
12
|
+
rspec-expectations (~> 2.14.0)
|
13
|
+
rspec-mocks (~> 2.14.0)
|
14
|
+
rspec-core (2.14.7)
|
15
|
+
rspec-expectations (2.14.4)
|
16
|
+
diff-lcs (>= 1.1.3, < 2.0)
|
17
|
+
rspec-mocks (2.14.4)
|
18
|
+
|
19
|
+
PLATFORMS
|
20
|
+
ruby
|
21
|
+
|
22
|
+
DEPENDENCIES
|
23
|
+
fast-stemmer
|
24
|
+
rake
|
25
|
+
rdoc
|
26
|
+
rspec
|
data/{README → README.markdown}
RENAMED
@@ -1,16 +1,18 @@
|
|
1
|
-
|
1
|
+
## Welcome to Classifier
|
2
2
|
|
3
3
|
Classifier is a general module to allow Bayesian and other types of classifications.
|
4
4
|
|
5
|
-
|
5
|
+
## Download
|
6
6
|
|
7
|
-
*
|
7
|
+
* https://github.com/cardmagic/classifier
|
8
8
|
* gem install classifier
|
9
|
-
*
|
9
|
+
* git clone https://github.com/cardmagic/classifier.git
|
10
10
|
|
11
|
-
|
12
|
-
|
13
|
-
|
11
|
+
## Dependencies
|
12
|
+
|
13
|
+
If you install Classifier from source, you'll need to install Roman Shterenzon's fast-stemmer gem with RubyGems as follows:
|
14
|
+
|
15
|
+
gem install fast-stemmer
|
14
16
|
|
15
17
|
If you would like to speed up LSI classification by at least 10x, please install the following libraries:
|
16
18
|
GNU GSL:: http://www.gnu.org/software/gsl
|
@@ -18,10 +20,12 @@ rb-gsl:: http://rb-gsl.rubyforge.org
|
|
18
20
|
|
19
21
|
Notice that LSI will work without these libraries, but as soon as they are installed, Classifier will make use of them. No configuration changes are needed; we like to keep things ridiculously easy for you.
|
20
22
|
|
21
|
-
|
23
|
+
## Bayes
|
24
|
+
|
22
25
|
A Bayesian classifier by Lucas Carlson. Bayesian Classifiers are accurate, fast, and have modest memory requirements.
|
23
26
|
|
24
|
-
|
27
|
+
### Usage
|
28
|
+
|
25
29
|
require 'classifier'
|
26
30
|
b = Classifier::Bayes.new 'Interesting', 'Uninteresting'
|
27
31
|
b.train_interesting "here are some good words. I hope you love them"
|
@@ -39,50 +43,55 @@ A Bayesian classifier by Lucas Carlson. Bayesian Classifiers are accurate, fast,
|
|
39
43
|
|
40
44
|
Using Madeleine, your application can persist the learned data over time.
|
41
45
|
|
42
|
-
|
46
|
+
### Bayesian Classification
|
43
47
|
|
44
48
|
* http://www.process.com/precisemail/bayesian_filtering.htm
|
45
49
|
* http://en.wikipedia.org/wiki/Bayesian_filtering
|
46
50
|
* http://www.paulgraham.com/spam.html
|
47
51
|
|
48
|
-
|
52
|
+
## LSI
|
53
|
+
|
49
54
|
A Latent Semantic Indexer by David Fayram. Latent Semantic Indexing engines
|
50
55
|
are not as fast or as small as Bayesian classifiers, but are more flexible, providing
|
51
56
|
fast search and clustering detection as well as semantic analysis of the text that
|
52
57
|
theoretically simulates human learning.
|
53
58
|
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
59
|
+
### Usage
|
60
|
+
|
61
|
+
require 'classifier'
|
62
|
+
lsi = Classifier::LSI.new
|
63
|
+
strings = [ ["This text deals with dogs. Dogs.", :dog],
|
58
64
|
["This text involves dogs too. Dogs! ", :dog],
|
59
65
|
["This text revolves around cats. Cats.", :cat],
|
60
66
|
["This text also involves cats. Cats!", :cat],
|
61
67
|
["This text involves birds. Birds.",:bird ]]
|
62
|
-
|
68
|
+
strings.each {|x| lsi.add_item x.first, x.last}
|
63
69
|
|
64
|
-
|
65
|
-
|
66
|
-
|
70
|
+
lsi.search("dog", 3)
|
71
|
+
# returns => ["This text deals with dogs. Dogs.", "This text involves dogs too. Dogs! ",
|
72
|
+
# "This text also involves cats. Cats!"]
|
67
73
|
|
68
|
-
|
69
|
-
|
74
|
+
lsi.find_related(strings[2], 2)
|
75
|
+
# returns => ["This text revolves around cats. Cats.", "This text also involves cats. Cats!"]
|
70
76
|
|
71
|
-
|
72
|
-
|
77
|
+
lsi.classify "This text is also about dogs!"
|
78
|
+
# returns => :dog
|
73
79
|
|
74
80
|
Please see the Classifier::LSI documentation for more information. It is possible to index, search and classify
|
75
81
|
with more than just simple strings.
|
76
82
|
|
77
|
-
|
83
|
+
### Latent Semantic Indexing
|
84
|
+
|
78
85
|
* http://www.c2.com/cgi/wiki?LatentSemanticIndexing
|
79
86
|
* http://www.chadfowler.com/index.cgi/Computing/LatentSemanticIndexing.rdoc
|
80
87
|
* http://en.wikipedia.org/wiki/Latent_semantic_analysis
|
81
88
|
|
82
|
-
|
83
|
-
|
84
|
-
*
|
85
|
-
*
|
89
|
+
## Authors
|
90
|
+
|
91
|
+
* Lucas Carlson (lucas@rufy.com)
|
92
|
+
* David Fayram II (dfayram@gmail.com)
|
93
|
+
* Cameron McBride (cameron.mcbride@gmail.com)
|
94
|
+
* Ivan Acosta-Rubio (ivan@softwarecriollo.com)
|
86
95
|
|
87
96
|
This library is released under the terms of the GNU LGPL. See LICENSE for more details.
|
88
97
|
|
data/Rakefile
CHANGED
@@ -1,16 +1,9 @@
|
|
1
1
|
require 'rubygems'
|
2
2
|
require 'rake'
|
3
3
|
require 'rake/testtask'
|
4
|
-
require '
|
5
|
-
require 'rake/gempackagetask'
|
4
|
+
require 'rdoc/task'
|
6
5
|
require 'rake/contrib/rubyforgepublisher'
|
7
6
|
|
8
|
-
PKG_VERSION = "1.3.3"
|
9
|
-
|
10
|
-
PKG_FILES = FileList[
|
11
|
-
"lib/**/*", "bin/*", "test/**/*", "[A-Z]*", "Rakefile", "html/**/*"
|
12
|
-
]
|
13
|
-
|
14
7
|
desc "Default Task"
|
15
8
|
task :default => [ :test ]
|
16
9
|
|
@@ -75,11 +68,6 @@ spec = Gem::Specification.new do |s|
|
|
75
68
|
s.homepage = "http://classifier.rufy.com/"
|
76
69
|
end
|
77
70
|
|
78
|
-
Rake::GemPackageTask.new(spec) do |pkg|
|
79
|
-
pkg.need_zip = true
|
80
|
-
pkg.need_tar = true
|
81
|
-
end
|
82
|
-
|
83
71
|
desc "Report code statistics (KLOCs, etc) from the application"
|
84
72
|
task :stats do
|
85
73
|
require 'code_statistics'
|
data/lib/classifier/bayes.rb
CHANGED
@@ -12,6 +12,7 @@ class Bayes
|
|
12
12
|
@categories = Hash.new
|
13
13
|
categories.each { |category| @categories[category.prepare_category_name] = Hash.new }
|
14
14
|
@total_words = 0
|
15
|
+
@category_counts = Hash.new(0)
|
15
16
|
end
|
16
17
|
|
17
18
|
#
|
@@ -23,6 +24,7 @@ class Bayes
|
|
23
24
|
# b.train "The other", "The other text"
|
24
25
|
def train(category, text)
|
25
26
|
category = category.prepare_category_name
|
27
|
+
@category_counts[category] += 1
|
26
28
|
text.word_hash.each do |word, count|
|
27
29
|
@categories[category][word] ||= 0
|
28
30
|
@categories[category][word] += count
|
@@ -40,6 +42,7 @@ class Bayes
|
|
40
42
|
# b.untrain :this, "This text"
|
41
43
|
def untrain(category, text)
|
42
44
|
category = category.prepare_category_name
|
45
|
+
@category_counts[category] -= 1
|
43
46
|
text.word_hash.each do |word, count|
|
44
47
|
if @total_words >= 0
|
45
48
|
orig = @categories[category][word]
|
@@ -61,6 +64,7 @@ class Bayes
|
|
61
64
|
# The largest of these scores (the one closest to 0) is the one picked out by #classify
|
62
65
|
def classifications(text)
|
63
66
|
score = Hash.new
|
67
|
+
training_count = @category_counts.values.inject { |x,y| x+y }.to_f
|
64
68
|
@categories.each do |category, category_words|
|
65
69
|
score[category.to_s] = 0
|
66
70
|
total = category_words.values.inject(0) {|sum, element| sum+element}
|
@@ -68,6 +72,9 @@ class Bayes
|
|
68
72
|
s = category_words.has_key?(word) ? category_words[word] : 0.1
|
69
73
|
score[category.to_s] += Math.log(s/total.to_f)
|
70
74
|
end
|
75
|
+
# now add prior probability for the category
|
76
|
+
s = @category_counts.has_key?(category) ? @category_counts[category] : 0.1
|
77
|
+
score[category.to_s] += Math.log(s / training_count)
|
71
78
|
end
|
72
79
|
return score
|
73
80
|
end
|
@@ -2,6 +2,8 @@
|
|
2
2
|
# Copyright:: Copyright (c) 2005 Lucas Carlson
|
3
3
|
# License:: LGPL
|
4
4
|
|
5
|
+
require "set"
|
6
|
+
|
5
7
|
# These are extensions to the String class to provide convenience
|
6
8
|
# methods for the Classifier package.
|
7
9
|
class String
|
@@ -17,7 +19,9 @@ class String
|
|
17
19
|
# Return a Hash of strings => ints. Each word in the string is stemmed,
|
18
20
|
# interned, and mapped to its frequency in the document.
|
19
21
|
def word_hash
|
20
|
-
|
22
|
+
word_hash = clean_word_hash()
|
23
|
+
symbol_hash = word_hash_for_symbols(gsub(/[\w]/," ").split)
|
24
|
+
return word_hash.merge(symbol_hash)
|
21
25
|
end
|
22
26
|
|
23
27
|
# Return a word hash without extra punctuation or short symbols, just stemmed words
|
@@ -28,19 +32,26 @@ class String
|
|
28
32
|
private
|
29
33
|
|
30
34
|
def word_hash_for_words(words)
|
31
|
-
d = Hash.new
|
35
|
+
d = Hash.new(0)
|
32
36
|
words.each do |word|
|
33
|
-
word.downcase!
|
34
|
-
|
35
|
-
|
36
|
-
d[key] ||= 0
|
37
|
-
d[key] += 1
|
37
|
+
word.downcase!
|
38
|
+
if ! CORPUS_SKIP_WORDS.include?(word) && word.length > 2
|
39
|
+
d[word.stem.intern] += 1
|
38
40
|
end
|
39
41
|
end
|
40
42
|
return d
|
41
43
|
end
|
44
|
+
|
45
|
+
|
46
|
+
def word_hash_for_symbols(words)
|
47
|
+
d = Hash.new(0)
|
48
|
+
words.each do |word|
|
49
|
+
d[word.intern] += 1
|
50
|
+
end
|
51
|
+
return d
|
52
|
+
end
|
42
53
|
|
43
|
-
CORPUS_SKIP_WORDS = [
|
54
|
+
CORPUS_SKIP_WORDS = Set.new([
|
44
55
|
"a",
|
45
56
|
"again",
|
46
57
|
"all",
|
@@ -121,5 +132,5 @@ class String
|
|
121
132
|
"yes",
|
122
133
|
"you",
|
123
134
|
"youll",
|
124
|
-
]
|
125
|
-
end
|
135
|
+
])
|
136
|
+
end
|
@@ -12,3 +12,24 @@ class StringExtensionsTest < Test::Unit::TestCase
|
|
12
12
|
end
|
13
13
|
|
14
14
|
end
|
15
|
+
|
16
|
+
|
17
|
+
class ArrayExtensionsTest < Test::Unit::TestCase
|
18
|
+
|
19
|
+
def test_plays_nicely_with_any_array
|
20
|
+
assert_equal [Array].sum, Array
|
21
|
+
end
|
22
|
+
|
23
|
+
def test_monkey_path_array_sum
|
24
|
+
assert_equal [1,2,3].sum, 6
|
25
|
+
end
|
26
|
+
|
27
|
+
def test_summing_an_empty_array
|
28
|
+
assert_equal [nil].sum, 0
|
29
|
+
end
|
30
|
+
|
31
|
+
def test_summing_an_empty_array
|
32
|
+
assert_equal Array[].sum, 0
|
33
|
+
end
|
34
|
+
|
35
|
+
end
|
metadata
CHANGED
@@ -1,82 +1,79 @@
|
|
1
|
-
--- !ruby/object:Gem::Specification
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
2
|
name: classifier
|
3
|
-
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.3.
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.3.4
|
5
5
|
platform: ruby
|
6
|
-
authors:
|
6
|
+
authors:
|
7
7
|
- Lucas Carlson
|
8
8
|
autorequire: classifier
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
dependencies:
|
15
|
-
- !ruby/object:Gem::Dependency
|
11
|
+
date: 2013-12-31 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
16
14
|
name: fast-stemmer
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - '>='
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: 1.0.0
|
17
20
|
type: :runtime
|
18
|
-
|
19
|
-
version_requirements: !ruby/object:Gem::Requirement
|
20
|
-
requirements:
|
21
|
-
- -
|
22
|
-
- !ruby/object:Gem::Version
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - '>='
|
25
|
+
- !ruby/object:Gem::Version
|
23
26
|
version: 1.0.0
|
24
|
-
|
25
|
-
|
27
|
+
description: |2
|
28
|
+
A general classifier module to allow Bayesian and other types of classifications.
|
26
29
|
email: lucas@rufy.com
|
27
30
|
executables: []
|
28
|
-
|
29
31
|
extensions: []
|
30
|
-
|
31
32
|
extra_rdoc_files: []
|
32
|
-
|
33
|
-
|
33
|
+
files:
|
34
|
+
- lib/classifier.rb
|
34
35
|
- lib/classifier/bayes.rb
|
35
36
|
- lib/classifier/extensions/string.rb
|
36
37
|
- lib/classifier/extensions/vector.rb
|
37
38
|
- lib/classifier/extensions/vector_serialize.rb
|
38
39
|
- lib/classifier/extensions/word_hash.rb
|
40
|
+
- lib/classifier/lsi.rb
|
39
41
|
- lib/classifier/lsi/content_node.rb
|
40
42
|
- lib/classifier/lsi/summary.rb
|
41
43
|
- lib/classifier/lsi/word_list.rb
|
42
|
-
- lib/classifier/lsi.rb
|
43
|
-
- lib/classifier.rb
|
44
44
|
- bin/bayes.rb
|
45
45
|
- bin/summarize.rb
|
46
46
|
- test/bayes/bayesian_test.rb
|
47
47
|
- test/extensions/word_hash_test.rb
|
48
48
|
- test/lsi/lsi_test.rb
|
49
49
|
- test/test_helper.rb
|
50
|
+
- Gemfile
|
51
|
+
- Gemfile.lock
|
50
52
|
- LICENSE
|
53
|
+
- README.markdown
|
51
54
|
- Rakefile
|
52
|
-
- README
|
53
|
-
has_rdoc: true
|
54
55
|
homepage: http://classifier.rufy.com/
|
55
56
|
licenses: []
|
56
|
-
|
57
|
+
metadata: {}
|
57
58
|
post_install_message:
|
58
59
|
rdoc_options: []
|
59
|
-
|
60
|
-
require_paths:
|
60
|
+
require_paths:
|
61
61
|
- lib
|
62
|
-
required_ruby_version: !ruby/object:Gem::Requirement
|
63
|
-
requirements:
|
64
|
-
- -
|
65
|
-
- !ruby/object:Gem::Version
|
66
|
-
version:
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
version:
|
74
|
-
requirements:
|
62
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
63
|
+
requirements:
|
64
|
+
- - '>='
|
65
|
+
- !ruby/object:Gem::Version
|
66
|
+
version: '0'
|
67
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
68
|
+
requirements:
|
69
|
+
- - '>='
|
70
|
+
- !ruby/object:Gem::Version
|
71
|
+
version: '0'
|
72
|
+
requirements:
|
75
73
|
- A porter-stemmer module to split word stems.
|
76
74
|
rubyforge_project:
|
77
|
-
rubygems_version:
|
75
|
+
rubygems_version: 2.0.3
|
78
76
|
signing_key:
|
79
|
-
specification_version:
|
77
|
+
specification_version: 4
|
80
78
|
summary: A general classifier module to allow Bayesian and other types of classifications.
|
81
79
|
test_files: []
|
82
|
-
|