classifier_atsukamoto 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: e6eca98ba96b5157ddcfef0ba3f02e129652c5ce
4
+ data.tar.gz: bc219f04544083e8a017b548ca2fede7a942fa45
5
+ SHA512:
6
+ metadata.gz: 1c78965de0ffd493b57ebf013deb1baf92c8e554f21157bb681dca1ae980edcd1d4a7fdafe601983270ecb0bbd86214a6b19ec3a1e9ebebc12789ebdbc1f0131
7
+ data.tar.gz: a9eb9c3ebac570198f25800f4c02677aea29ad8e9f152b41585242bd8667cf8b6de3812cf28ce68386c1090f5764fbc0c610972a0041342cb58bfb2fddccaf15
data/Rakefile ADDED
@@ -0,0 +1,97 @@
1
+ require 'rubygems'
2
+ require 'rake'
3
+ require 'rake/testtask'
4
+ require 'rake/rdoctask'
5
+ require 'rake/gempackagetask'
6
+ require 'rake/contrib/rubyforgepublisher'
7
+
8
+ PKG_VERSION = "0.0.1"
9
+
10
+ PKG_FILES = FileList[
11
+ "lib/**/*", "bin/*", "test/**/*", "[A-Z]*", "Rakefile", "html/**/*"
12
+ ]
13
+
14
+ desc "Default Task"
15
+ task :default => [ :test ]
16
+
17
+ # Run the unit tests
18
+ desc "Run all unit tests"
19
+ Rake::TestTask.new("test") { |t|
20
+ t.libs << "lib"
21
+ t.pattern = 'test/*/*_test.rb'
22
+ t.verbose = true
23
+ }
24
+
25
+ # Make a console, useful when working on tests
26
+ desc "Generate a test console"
27
+ task :console do
28
+ verbose( false ) { sh "irb -I lib/ -r 'classifier'" }
29
+ end
30
+
31
+ # Genereate the RDoc documentation
32
+ desc "Create documentation"
33
+ Rake::RDocTask.new("doc") { |rdoc|
34
+ rdoc.title = "Ruby Classifier Fork by ATsukamoto - Bayesian and LSI classification library with Redis for persistence
35
+ "
36
+ rdoc.rdoc_dir = 'html'
37
+ rdoc.rdoc_files.include('README')
38
+ rdoc.rdoc_files.include('lib/**/*.rb')
39
+ }
40
+
41
+ # Genereate the package
42
+ spec = Gem::Specification.new do |s|
43
+
44
+ #### Basic information.
45
+
46
+ s.name = 'classifier'
47
+ s.version = PKG_VERSION
48
+ s.summary = <<-EOF
49
+ A general classifier module to allow Bayesian and other types of classifications.
50
+ EOF
51
+ s.description = <<-EOF
52
+ A general classifier module to allow Bayesian and other types of classifications.
53
+ EOF
54
+
55
+ #### Which files are to be included in this gem? Everything! (Except CVS directories.)
56
+
57
+ s.files = PKG_FILES
58
+
59
+ #### Load-time details: library and application (you will need one or both).
60
+
61
+ s.require_path = 'lib'
62
+ s.autorequire = 'classifier'
63
+
64
+ #### Documentation and testing.
65
+
66
+ s.has_rdoc = true
67
+
68
+ #### Dependencies and requirements.
69
+
70
+ s.add_dependency('fast-stemmer', '>= 1.0.0')
71
+ s.requirements << "A porter-stemmer module to split word stems."
72
+
73
+ #### Author and project details.
74
+ s.author = "Lucas Carlson"
75
+ s.email = "lucas@rufy.com"
76
+ s.homepage = "http://classifier.rufy.com/"
77
+ end
78
+
79
+ Rake::GemPackageTask.new(spec) do |pkg|
80
+ pkg.need_zip = true
81
+ pkg.need_tar = true
82
+ end
83
+
84
+ desc "Report code statistics (KLOCs, etc) from the application"
85
+ task :stats do
86
+ require 'code_statistics'
87
+ CodeStatistics.new(
88
+ ["Library", "lib"],
89
+ ["Units", "test"]
90
+ ).to_s
91
+ end
92
+
93
+ desc "Publish new documentation"
94
+ task :publish do
95
+ `ssh rufy update-classifier-doc`
96
+ Rake::RubyForgePublisher.new('classifier', 'cardmagic').upload
97
+ end
@@ -0,0 +1,156 @@
1
+ # Author:: Lucas Carlson (mailto:lucas@rufy.com)
2
+ # Copyright:: Copyright (c) 2005 Lucas Carlson
3
+ # License:: LGPL
4
+
5
+ module Classifier
6
+
7
+ require 'lingua/stemmer'
8
+
9
+ class Bayes
10
+ # The class can be created with one or more categories, each of which will be
11
+ # initialized and given a training method. E.g.,
12
+ # b = Classifier::Bayes.new 'Interesting', 'Uninteresting', 'Spam'
13
+ def initialize(lang, *categories)
14
+ #@categories = Hash.new
15
+ #categories.each { |category| @categories[category.prepare_category_name] = Hash.new }
16
+ # RedisStore.total_words = 0
17
+ @categories = RedisStore.new lang, categories
18
+ @categories.init_total
19
+ @stemmer = Lingua::Stemmer.new(:language => lang.downcase)
20
+ end
21
+
22
+ #
23
+ # Provides a general training method for all categories specified in Bayes#new
24
+ # For example:
25
+ # b = Classifier::Bayes.new 'This', 'That', 'the_other'
26
+ # b.train :this, "This text"
27
+ # b.train "that", "That text"
28
+ # b.train "The other", "The other text"
29
+ def train(category, text)
30
+ category = category.prepare_category_name
31
+ text.word_hash(@stemmer).each do |word, count|
32
+ # @categories[category][word] ||= 0
33
+ @categories.init(category, word)
34
+
35
+ # @categories[category][word] += count
36
+ @categories.incr(category, word, count)
37
+
38
+ # @total_words += count
39
+ @categories.incr_total(count)
40
+ end
41
+ end
42
+
43
+ #
44
+ # Provides a untraining method for all categories specified in Bayes#new
45
+ # Be very careful with this method.
46
+ #
47
+ # For example:
48
+ # b = Classifier::Bayes.new 'This', 'That', 'the_other'
49
+ # b.train :this, "This text"
50
+ # b.untrain :this, "This text"
51
+ def untrain(category, text)
52
+ category = category.prepare_category_name
53
+ text.word_hash(@stemmer).each do |word, count|
54
+ # @total_words >= 0
55
+ if @categories.total_words >= 0
56
+ # orig = @categories[category][word]
57
+ orig = @categories.get(category,word)
58
+
59
+ # @categories[category][word] ||= 0
60
+ @categories.init(category, word)
61
+
62
+ # @categories[category][word] -= count
63
+ @categories.decr(category, word, count)
64
+
65
+
66
+ #if @categories[category][word] <= 0
67
+ if @categories.get(category,word) <= 0
68
+ # @categories[category].delete(word)
69
+ @categories.remove(category,word)
70
+ count = orig
71
+ end
72
+ #@total_words -= count
73
+ @categories.decr_total(count)
74
+ end
75
+ end
76
+ end
77
+
78
+ #
79
+ # Returns the scores in each category the provided +text+. E.g.,
80
+ # b.classifications "I hate bad words and you"
81
+ # => {"Uninteresting"=>-12.6997928013932, "Interesting"=>-18.4206807439524}
82
+ # The largest of these scores (the one closest to 0) is the one picked out by #classify
83
+ def classifications(text)
84
+ score = Hash.new
85
+ # actual categories saved in the beggining but each do |category|
86
+ @categories.names.each do |category, category_words|
87
+ score[category.to_s] = 0
88
+
89
+ # total = category_words.values.inject(0) {|sum, element| sum+element}
90
+ total = category_words.inject(0) { |sum, element| sum + element }
91
+
92
+ text.word_hash(@stemmer).each do |word, count|
93
+ #s = category_words.has_key?(word) ? category_words[word] : 0.1
94
+ s = @categories.has_word?(category, word) ? @categories.get(category, word) : 0.1
95
+
96
+ score[category.to_s] += Math.log(s/total.to_f)
97
+ end
98
+ end
99
+ return score
100
+ end
101
+
102
+ #
103
+ # Returns the classification of the provided +text+, which is one of the
104
+ # categories given in the initializer. E.g.,
105
+ # b.classify "I hate bad words and you"
106
+ # => 'Uninteresting'
107
+ def classify(text)
108
+ (classifications(text).sort_by { |a| -a[1] })[0][0]
109
+ end
110
+
111
+ #
112
+ # Provides training and untraining methods for the categories specified in Bayes#new
113
+ # For example:
114
+ # b = Classifier::Bayes.new 'This', 'That', 'the_other'
115
+ # b.train_this "This text"
116
+ # b.train_that "That text"
117
+ # b.untrain_that "That text"
118
+ # b.train_the_other "The other text"
119
+ def method_missing(name, *args)
120
+ category = name.to_s.gsub(/(un)?train_([\w]+)/, '\2').prepare_category_name
121
+ # categories.has_key?(key)
122
+ if @categories.names.include? category
123
+ args.each { |text| eval("#{$1}train(category, text)") }
124
+ elsif name.to_s =~ /(un)?train_([\w]+)/
125
+ raise StandardError, "No such category: #{category}"
126
+ else
127
+ super #raise StandardError, "No such method: #{name}"
128
+ end
129
+ end
130
+
131
+ #
132
+ # Provides a list of category names
133
+ # For example:
134
+ # b.categories
135
+ # => ['This', 'That', 'the_other']
136
+ def categories # :nodoc:
137
+ @categories
138
+ end
139
+
140
+ #
141
+ # Allows you to add categories to the classifier.
142
+ # For example:
143
+ # b.add_category "Not spam"
144
+ #
145
+ # WARNING: Adding categories to a trained classifier will
146
+ # result in an undertrained category that will tend to match
147
+ # more criteria than the trained selective categories. In short,
148
+ # try to initialize your categories at initialization.
149
+ def add_category(category)
150
+ @categories[category.prepare_category_name] = Hash.new
151
+ end
152
+
153
+ alias append_category add_category
154
+ end
155
+
156
+ end
@@ -0,0 +1,10 @@
1
+ # Author:: Lucas Carlson (mailto:lucas@rufy.com)
2
+ # Copyright:: Copyright (c) 2005 Lucas Carlson
3
+ # License:: LGPL
4
+
5
+ # require 'fast_stemmer'
6
+ require 'classifier/extensions/word_hash'
7
+
8
+ class Object
9
+ def prepare_category_name; to_s.gsub("_"," ").capitalize.intern end
10
+ end
@@ -0,0 +1,113 @@
1
+ # Author:: Ernest Ellingson
2
+ # Copyright:: Copyright (c) 2005
3
+
4
+ # These are extensions to the std-lib 'matrix' to allow an all ruby SVD
5
+
6
+ require 'matrix'
7
+ require 'mathn'
8
+
9
+ class Array
10
+ # TODO! Change name!
11
+ def a_sum(identity = 0, &block)
12
+ return identity unless size > 0
13
+
14
+ if block_given?
15
+ map(&block).sum
16
+ else
17
+ inject { |sum, element| sum + element }.to_f
18
+ end
19
+ end
20
+ end
21
+
22
+ class Vector
23
+ def magnitude
24
+ sumsqs = 0.0
25
+ self.size.times do |i|
26
+ sumsqs += self[i] ** 2.0
27
+ end
28
+ Math.sqrt(sumsqs)
29
+ end
30
+ def normalize
31
+ nv = []
32
+ mag = self.magnitude
33
+ self.size.times do |i|
34
+
35
+ nv << (self[i] / mag)
36
+
37
+ end
38
+ Vector[*nv]
39
+ end
40
+ end
41
+
42
+ class Matrix
43
+ def Matrix.diag(s)
44
+ Matrix.diagonal(*s)
45
+ end
46
+
47
+ alias :trans :transpose
48
+
49
+ def SV_decomp(maxSweeps = 20)
50
+ if self.row_size >= self.column_size
51
+ q = self.trans * self
52
+ else
53
+ q = self * self.trans
54
+ end
55
+
56
+ qrot = q.dup
57
+ v = Matrix.identity(q.row_size)
58
+ azrot = nil
59
+ mzrot = nil
60
+ cnt = 0
61
+ s_old = nil
62
+ mu = nil
63
+
64
+ while true do
65
+ cnt += 1
66
+ for row in (0...qrot.row_size-1) do
67
+ for col in (1..qrot.row_size-1) do
68
+ next if row == col
69
+ h = Math.atan((2 * qrot[row,col])/(qrot[row,row]-qrot[col,col]))/2.0
70
+ hcos = Math.cos(h)
71
+ hsin = Math.sin(h)
72
+ mzrot = Matrix.identity(qrot.row_size)
73
+ mzrot[row,row] = hcos
74
+ mzrot[row,col] = -hsin
75
+ mzrot[col,row] = hsin
76
+ mzrot[col,col] = hcos
77
+ qrot = mzrot.trans * qrot * mzrot
78
+ v = v * mzrot
79
+ end
80
+ end
81
+ s_old = qrot.dup if cnt == 1
82
+ sum_qrot = 0.0
83
+ if cnt > 1
84
+ qrot.row_size.times do |r|
85
+ sum_qrot += (qrot[r,r]-s_old[r,r]).abs if (qrot[r,r]-s_old[r,r]).abs > 0.001
86
+ end
87
+ s_old = qrot.dup
88
+ end
89
+ break if (sum_qrot <= 0.001 and cnt > 1) or cnt >= maxSweeps
90
+ end # of do while true
91
+ s = []
92
+ qrot.row_size.times do |r|
93
+ s << Math.sqrt(qrot[r,r])
94
+ end
95
+ #puts "cnt = #{cnt}"
96
+ if self.row_size >= self.column_size
97
+ mu = self * v * Matrix.diagonal(*s).inverse
98
+ return [mu, v, s]
99
+ else
100
+ puts v.row_size
101
+ puts v.column_size
102
+ puts self.row_size
103
+ puts self.column_size
104
+ puts s.size
105
+
106
+ mu = (self.trans * v * Matrix.diagonal(*s).inverse)
107
+ return [mu, v, s]
108
+ end
109
+ end
110
+ def []=(i,j,val)
111
+ @rows[i][j] = val
112
+ end
113
+ end
@@ -0,0 +1,20 @@
1
+ module GSL
2
+
3
+ class Vector
4
+ def _dump(v)
5
+ Marshal.dump( self.to_a )
6
+ end
7
+
8
+ def self._load(arr)
9
+ arry = Marshal.load(arr)
10
+ return GSL::Vector.alloc(arry)
11
+ end
12
+
13
+ end
14
+
15
+ class Matrix
16
+ class <<self
17
+ alias :diag :diagonal
18
+ end
19
+ end
20
+ end
@@ -0,0 +1,129 @@
1
+ # Author:: Lucas Carlson (mailto:lucas@rufy.com)
2
+ # Copyright:: Copyright (c) 2005 Lucas Carlson
3
+ # License:: LGPL
4
+
5
+ # These are extensions to the String class to provide convenience
6
+ # methods for the Classifier package.
7
+ require 'lingua/stemmer'
8
+
9
+ class String
10
+
11
+ # Removes common punctuation symbols, returning a new string.
12
+ # E.g.,
13
+ # "Hello (greeting's), with {braces} < >...?".without_punctuation
14
+ # => "Hello greetings with braces "
15
+ def without_punctuation
16
+ tr( ',?.!;:"@#$%^&*()_=+[]{}\|<>/`~', " " ) .tr( "'\-", "")
17
+ end
18
+
19
+ # Return a Hash of strings => ints. Each word in the string is stemmed,
20
+ # interned, and indexes to its frequency in the document.
21
+ def word_hash(stemmer)
22
+ word_hash_for_words(gsub(/[^\w\s]/,"").split + gsub(/[\w]/," ").split, stemmer)
23
+ end
24
+
25
+ # Return a word hash without extra punctuation or short symbols, just stemmed words
26
+ def clean_word_hash
27
+ word_hash_for_words gsub(/[^\w\s]/,"").split
28
+ end
29
+
30
+ private
31
+
32
+ def word_hash_for_words(words, stemmer)
33
+ d = Hash.new
34
+ words.each do |word|
35
+ word.downcase! if word =~ /[\w]+/
36
+ #key = word.stem.intern
37
+ key = stemmer.stem(word).intern
38
+ if word =~ /[^\w]/ || ! CORPUS_SKIP_WORDS.include?(word) && word.length > 2
39
+ d[key] ||= 0
40
+ d[key] += 1
41
+ end
42
+ end
43
+ return d
44
+ end
45
+
46
+ # TODO! Actualize for each language
47
+ CORPUS_SKIP_WORDS = [
48
+ "a",
49
+ "again",
50
+ "all",
51
+ "along",
52
+ "are",
53
+ "also",
54
+ "an",
55
+ "and",
56
+ "as",
57
+ "at",
58
+ "but",
59
+ "by",
60
+ "came",
61
+ "can",
62
+ "cant",
63
+ "couldnt",
64
+ "did",
65
+ "didn",
66
+ "didnt",
67
+ "do",
68
+ "doesnt",
69
+ "dont",
70
+ "ever",
71
+ "first",
72
+ "from",
73
+ "have",
74
+ "her",
75
+ "here",
76
+ "him",
77
+ "how",
78
+ "i",
79
+ "if",
80
+ "in",
81
+ "into",
82
+ "is",
83
+ "isnt",
84
+ "it",
85
+ "itll",
86
+ "just",
87
+ "last",
88
+ "least",
89
+ "like",
90
+ "most",
91
+ "my",
92
+ "new",
93
+ "no",
94
+ "not",
95
+ "now",
96
+ "of",
97
+ "on",
98
+ "or",
99
+ "should",
100
+ "sinc",
101
+ "so",
102
+ "some",
103
+ "th",
104
+ "than",
105
+ "this",
106
+ "that",
107
+ "the",
108
+ "their",
109
+ "then",
110
+ "those",
111
+ "to",
112
+ "told",
113
+ "too",
114
+ "true",
115
+ "try",
116
+ "until",
117
+ "url",
118
+ "us",
119
+ "were",
120
+ "when",
121
+ "whether",
122
+ "while",
123
+ "with",
124
+ "within",
125
+ "yes",
126
+ "you",
127
+ "youll",
128
+ ]
129
+ end
@@ -0,0 +1,72 @@
1
+ # Author:: David Fayram (mailto:dfayram@lensmen.net)
2
+ # Copyright:: Copyright (c) 2005 David Fayram II
3
+ # License:: LGPL
4
+
5
+ module Classifier
6
+
7
+ # This is an internal data structure class for the LSI node. Save for
8
+ # raw_vector_with, it should be fairly straightforward to understand.
9
+ # You should never have to use it directly.
10
+ class ContentNode
11
+ attr_accessor :raw_vector, :raw_norm,
12
+ :lsi_vector, :lsi_norm,
13
+ :categories
14
+
15
+ attr_reader :word_hash
16
+ # If text_proc is not specified, the source will be duck-typed
17
+ # via source.to_s
18
+ def initialize( word_hash, *categories )
19
+ @categories = categories || []
20
+ @word_hash = word_hash
21
+ end
22
+
23
+ # Use this to fetch the appropriate search vector.
24
+ def search_vector
25
+ @lsi_vector || @raw_vector
26
+ end
27
+
28
+ # Use this to fetch the appropriate search vector in normalized form.
29
+ def search_norm
30
+ @lsi_norm || @raw_norm
31
+ end
32
+
33
+ # Creates the raw vector out of word_hash using word_list as the
34
+ # key for mapping the vector space.
35
+ def raw_vector_with( word_list )
36
+ if $GSL
37
+ vec = GSL::Vector.alloc(word_list.size)
38
+ else
39
+ vec = Array.new(word_list.size, 0)
40
+ end
41
+
42
+ @word_hash.each_key do |word|
43
+ vec[word_list[word]] = @word_hash[word] if word_list[word]
44
+ end
45
+
46
+ # Perform the scaling transform
47
+ total_words = vec.a_sum
48
+
49
+ # Perform first-order association transform if this vector has more
50
+ # than one word in it.
51
+ if total_words > 1.0
52
+ weighted_total = 0.0
53
+ vec.each do |term|
54
+ if ( term > 0 )
55
+ weighted_total += (( term / total_words ) * Math.log( term / total_words ))
56
+ end
57
+ end
58
+ vec = vec.collect { |val| Math.log( val + 1 ) / -weighted_total }
59
+ end
60
+
61
+ if $GSL
62
+ @raw_norm = vec.normalize
63
+ @raw_vector = vec
64
+ else
65
+ @raw_norm = Vector[*vec].normalize
66
+ @raw_vector = Vector[*vec]
67
+ end
68
+ end
69
+
70
+ end
71
+
72
+ end
@@ -0,0 +1,31 @@
1
+ # Author:: Lucas Carlson (mailto:lucas@rufy.com)
2
+ # Copyright:: Copyright (c) 2005 Lucas Carlson
3
+ # License:: LGPL
4
+
5
+ class String
6
+ def summary( count=10, separator=" [...] " )
7
+ perform_lsi split_sentences, count, separator
8
+ end
9
+
10
+ def paragraph_summary( count=1, separator=" [...] " )
11
+ perform_lsi split_paragraphs, count, separator
12
+ end
13
+
14
+ def split_sentences
15
+ split /(\.|\!|\?)/ # TODO: make this less primitive
16
+ end
17
+
18
+ def split_paragraphs
19
+ split /(\n\n|\r\r|\r\n\r\n)/ # TODO: make this less primitive
20
+ end
21
+
22
+ private
23
+
24
+ def perform_lsi(chunks, count, separator)
25
+ lsi = Classifier::LSI.new :auto_rebuild => false
26
+ chunks.each { |chunk| lsi << chunk unless chunk.strip.empty? || chunk.strip.split.size == 1 }
27
+ lsi.build_index
28
+ summaries = lsi.highest_relative_content count
29
+ return summaries.reject { |chunk| !summaries.include? chunk }.map { |x| x.strip }.join(separator)
30
+ end
31
+ end
@@ -0,0 +1,36 @@
1
+ # Author:: David Fayram (mailto:dfayram@lensmen.net)
2
+ # Copyright:: Copyright (c) 2005 David Fayram II
3
+ # License:: LGPL
4
+
5
+ module Classifier
6
+ # This class keeps a word => index mapping. It is used to map stemmed words
7
+ # to dimensions of a vector.
8
+
9
+ class WordList
10
+ def initialize
11
+ @location_table = Hash.new
12
+ end
13
+
14
+ # Adds a word (if it is new) and assigns it a unique dimension.
15
+ def add_word(word)
16
+ term = word
17
+ @location_table[term] = @location_table.size unless @location_table[term]
18
+ end
19
+
20
+ # Returns the dimension of the word or nil if the word is not in the space.
21
+ def [](lookup)
22
+ term = lookup
23
+ @location_table[term]
24
+ end
25
+
26
+ def word_for_index(ind)
27
+ @location_table.invert[ind]
28
+ end
29
+
30
+ # Returns the number of words mapped.
31
+ def size
32
+ @location_table.size
33
+ end
34
+
35
+ end
36
+ end
@@ -0,0 +1,318 @@
1
+ # Author:: David Fayram (mailto:dfayram@lensmen.net)
2
+ # Copyright:: Copyright (c) 2005 David Fayram II
3
+ # License:: LGPL
4
+
5
+ begin
6
+ raise LoadError if ENV['NATIVE_VECTOR'] == "true" # to test the native vector class, try `rake test NATIVE_VECTOR=true`
7
+
8
+ require 'gsl' # requires http://rb-gsl.rubyforge.org/
9
+ require 'classifier/extensions/vector_serialize'
10
+ $GSL = true
11
+
12
+ rescue LoadError
13
+ warn "Notice: for 10x faster LSI support, please install http://rb-gsl.rubyforge.org/"
14
+ require 'classifier/extensions/vector'
15
+ end
16
+
17
+ require 'classifier/lsi/word_list'
18
+ require 'classifier/lsi/content_node'
19
+ require 'classifier/lsi/summary'
20
+
21
+ module Classifier
22
+
23
+ # This class implements a Latent Semantic Indexer, which can search, classify and cluster
24
+ # data based on underlying semantic relations. For more information on the algorithms used,
25
+ # please consult Wikipedia[http://en.wikipedia.org/wiki/Latent_Semantic_Indexing].
26
+ class LSI
27
+
28
+ attr_reader :word_list
29
+ attr_accessor :auto_rebuild
30
+
31
+ # Create a fresh index.
32
+ # If you want to call #build_index manually, use
33
+ # Classifier::LSI.new :auto_rebuild => false
34
+ #
35
+ def initialize(options = {})
36
+ @auto_rebuild = true unless options[:auto_rebuild] == false
37
+ @word_list, @items = WordList.new, {}
38
+ @version, @built_at_version = 0, -1
39
+ end
40
+
41
+ # Returns true if the index needs to be rebuilt. The index needs
42
+ # to be built after all informaton is added, but before you start
43
+ # using it for search, classification and cluster detection.
44
+ def needs_rebuild?
45
+ (@items.keys.size > 1) && (@version != @built_at_version)
46
+ end
47
+
48
+ # Adds an item to the index. item is assumed to be a string, but
49
+ # any item may be indexed so long as it responds to #to_s or if
50
+ # you provide an optional block explaining how the indexer can
51
+ # fetch fresh string data. This optional block is passed the item,
52
+ # so the item may only be a reference to a URL or file name.
53
+ #
54
+ # For example:
55
+ # lsi = Classifier::LSI.new
56
+ # lsi.add_item "This is just plain text"
57
+ # lsi.add_item "/home/me/filename.txt" { |x| File.read x }
58
+ # ar = ActiveRecordObject.find( :all )
59
+ # lsi.add_item ar, *ar.categories { |x| ar.content }
60
+ #
61
+ def add_item( item, *categories, &block )
62
+ clean_word_hash = block ? block.call(item).clean_word_hash : item.to_s.clean_word_hash
63
+ @items[item] = ContentNode.new(clean_word_hash, *categories)
64
+ @version += 1
65
+ build_index if @auto_rebuild
66
+ end
67
+
68
+ # A less flexible shorthand for add_item that assumes
69
+ # you are passing in a string with no categorries. item
70
+ # will be duck typed via to_s .
71
+ #
72
+ def <<( item )
73
+ add_item item
74
+ end
75
+
76
+ # Returns the categories for a given indexed items. You are free to add and remove
77
+ # items from this as you see fit. It does not invalide an index to change its categories.
78
+ def categories_for(item)
79
+ return [] unless @items[item]
80
+ return @items[item].categories
81
+ end
82
+
83
+ # Removes an item from the database, if it is indexed.
84
+ #
85
+ def remove_item( item )
86
+ if @items.keys.contain? item
87
+ @items.remove item
88
+ @version += 1
89
+ end
90
+ end
91
+
92
+ # Returns an array of items that are indexed.
93
+ def items
94
+ @items.keys
95
+ end
96
+
97
+ # Returns the categories for a given indexed items. You are free to add and remove
98
+ # items from this as you see fit. It does not invalide an index to change its categories.
99
+ def categories_for(item)
100
+ return [] unless @items[item]
101
+ return @items[item].categories
102
+ end
103
+
104
+ # This function rebuilds the index if needs_rebuild? returns true.
105
+ # For very large document spaces, this indexing operation may take some
106
+ # time to complete, so it may be wise to place the operation in another
107
+ # thread.
108
+ #
109
+ # As a rule, indexing will be fairly swift on modern machines until
110
+ # you have well over 500 documents indexed, or have an incredibly diverse
111
+ # vocabulary for your documents.
112
+ #
113
+ # The optional parameter "cutoff" is a tuning parameter. When the index is
114
+ # built, a certain number of s-values are discarded from the system. The
115
+ # cutoff parameter tells the indexer how many of these values to keep.
116
+ # A value of 1 for cutoff means that no semantic analysis will take place,
117
+ # turning the LSI class into a simple vector search engine.
118
+ def build_index( cutoff=0.75 )
119
+ return unless needs_rebuild?
120
+ make_word_list
121
+
122
+ doc_list = @items.values
123
+ tda = doc_list.collect { |node| node.raw_vector_with( @word_list ) }
124
+
125
+ if $GSL
126
+ tdm = GSL::Matrix.alloc(*tda).trans
127
+ ntdm = build_reduced_matrix(tdm, cutoff)
128
+
129
+ ntdm.size[1].times do |col|
130
+ vec = GSL::Vector.alloc( ntdm.column(col) ).row
131
+ doc_list[col].lsi_vector = vec
132
+ doc_list[col].lsi_norm = vec.normalize
133
+ end
134
+ else
135
+ tdm = Matrix.rows(tda).trans
136
+ ntdm = build_reduced_matrix(tdm, cutoff)
137
+
138
+ ntdm.row_size.times do |col|
139
+ doc_list[col].lsi_vector = ntdm.column(col) if doc_list[col]
140
+ doc_list[col].lsi_norm = ntdm.column(col).normalize if doc_list[col]
141
+ end
142
+ end
143
+
144
+ @built_at_version = @version
145
+ end
146
+
147
+ # This method returns max_chunks entries, ordered by their average semantic rating.
148
+ # Essentially, the average distance of each entry from all other entries is calculated,
149
+ # the highest are returned.
150
+ #
151
+ # This can be used to build a summary service, or to provide more information about
152
+ # your dataset's general content. For example, if you were to use categorize on the
153
+ # results of this data, you could gather information on what your dataset is generally
154
+ # about.
155
+ def highest_relative_content( max_chunks=10 )
156
+ return [] if needs_rebuild?
157
+
158
+ avg_density = Hash.new
159
+ @items.each_key { |x| avg_density[x] = proximity_array_for_content(x).inject(0.0) { |x,y| x + y[1]} }
160
+
161
+ avg_density.keys.sort_by { |x| avg_density[x] }.reverse[0..max_chunks-1].map
162
+ end
163
+
164
+ # This function is the primitive that find_related and classify
165
+ # build upon. It returns an array of 2-element arrays. The first element
166
+ # of this array is a document, and the second is its "score", defining
167
+ # how "close" it is to other indexed items.
168
+ #
169
+ # These values are somewhat arbitrary, having to do with the vector space
170
+ # created by your content, so the magnitude is interpretable but not always
171
+ # meaningful between indexes.
172
+ #
173
+ # The parameter doc is the content to compare. If that content is not
174
+ # indexed, you can pass an optional block to define how to create the
175
+ # text data. See add_item for examples of how this works.
176
+ def proximity_array_for_content( doc, &block )
177
+ return [] if needs_rebuild?
178
+
179
+ content_node = node_for_content( doc, &block )
180
+ result =
181
+ @items.keys.collect do |item|
182
+ if $GSL
183
+ val = content_node.search_vector * @items[item].search_vector.col
184
+ else
185
+ val = (Matrix[content_node.search_vector] * @items[item].search_vector)[0]
186
+ end
187
+ [item, val]
188
+ end
189
+ result.sort_by { |x| x[1] }.reverse
190
+ end
191
+
192
+ # Similar to proximity_array_for_content, this function takes similar
193
+ # arguments and returns a similar array. However, it uses the normalized
194
+ # calculated vectors instead of their full versions. This is useful when
195
+ # you're trying to perform operations on content that is much smaller than
196
+ # the text you're working with. search uses this primitive.
197
+ def proximity_norms_for_content( doc, &block )
198
+ return [] if needs_rebuild?
199
+
200
+ content_node = node_for_content( doc, &block )
201
+ result =
202
+ @items.keys.collect do |item|
203
+ if $GSL
204
+ val = content_node.search_norm * @items[item].search_norm.col
205
+ else
206
+ val = (Matrix[content_node.search_norm] * @items[item].search_norm)[0]
207
+ end
208
+ [item, val]
209
+ end
210
+ result.sort_by { |x| x[1] }.reverse
211
+ end
212
+
213
+ # This function allows for text-based search of your index. Unlike other functions
214
+ # like find_related and classify, search only takes short strings. It will also ignore
215
+ # factors like repeated words. It is best for short, google-like search terms.
216
+ # A search will first priortize lexical relationships, then semantic ones.
217
+ #
218
+ # While this may seem backwards compared to the other functions that LSI supports,
219
+ # it is actually the same algorithm, just applied on a smaller document.
220
+ def search( string, max_nearest=3 )
221
+ return [] if needs_rebuild?
222
+ carry = proximity_norms_for_content( string )
223
+ result = carry.collect { |x| x[0] }
224
+ return result[0..max_nearest-1]
225
+ end
226
+
227
+ # This function takes content and finds other documents
228
+ # that are semantically "close", returning an array of documents sorted
229
+ # from most to least relavant.
230
+ # max_nearest specifies the number of documents to return. A value of
231
+ # 0 means that it returns all the indexed documents, sorted by relavence.
232
+ #
233
+ # This is particularly useful for identifing clusters in your document space.
234
+ # For example you may want to identify several "What's Related" items for weblog
235
+ # articles, or find paragraphs that relate to each other in an essay.
236
+ def find_related( doc, max_nearest=3, &block )
237
+ carry =
238
+ proximity_array_for_content( doc, &block ).reject { |pair| pair[0] == doc }
239
+ result = carry.collect { |x| x[0] }
240
+ return result[0..max_nearest-1]
241
+ end
242
+
243
+ # This function uses a voting system to categorize documents, based on
244
+ # the categories of other documents. It uses the same logic as the
245
+ # find_related function to find related documents, then returns the
246
+ # most obvious category from this list.
247
+ #
248
+ # cutoff signifies the number of documents to consider when clasifying
249
+ # text. A cutoff of 1 means that every document in the index votes on
250
+ # what category the document is in. This may not always make sense.
251
+ #
252
+ def classify( doc, cutoff=0.30, &block )
253
+ icutoff = (@items.size * cutoff).round
254
+ carry = proximity_array_for_content( doc, &block )
255
+ carry = carry[0..icutoff-1]
256
+ votes = {}
257
+ carry.each do |pair|
258
+ categories = @items[pair[0]].categories
259
+ categories.each do |category|
260
+ votes[category] ||= 0.0
261
+ votes[category] += pair[1]
262
+ end
263
+ end
264
+
265
+ ranking = votes.keys.sort_by { |x| votes[x] }
266
+ return ranking[-1]
267
+ end
268
+
269
+ # Prototype, only works on indexed documents.
270
+ # I have no clue if this is going to work, but in theory
271
+ # it's supposed to.
272
+ def highest_ranked_stems( doc, count=3 )
273
+ raise "Requested stem ranking on non-indexed content!" unless @items[doc]
274
+ arr = node_for_content(doc).lsi_vector.to_a
275
+ top_n = arr.sort.reverse[0..count-1]
276
+ return top_n.collect { |x| @word_list.word_for_index(arr.index(x))}
277
+ end
278
+
279
+ private
280
+ def build_reduced_matrix( matrix, cutoff=0.75 )
281
+ # TODO: Check that M>=N on these dimensions! Transpose helps assure this
282
+ u, v, s = matrix.SV_decomp
283
+
284
+ # TODO: Better than 75% term, please. :\
285
+ s_cutoff = s.sort.reverse[(s.size * cutoff).round - 1]
286
+ s.size.times do |ord|
287
+ s[ord] = 0.0 if s[ord] < s_cutoff
288
+ end
289
+ # Reconstruct the term document matrix, only with reduced rank
290
+ u * ($GSL ? GSL::Matrix : ::Matrix).diag( s ) * v.trans
291
+ end
292
+
293
+ def node_for_content(item, &block)
294
+ if @items[item]
295
+ return @items[item]
296
+ else
297
+ clean_word_hash = block ? block.call(item).clean_word_hash : item.to_s.clean_word_hash
298
+
299
+ cn = ContentNode.new(clean_word_hash, &block) # make the node and extract the data
300
+
301
+ unless needs_rebuild?
302
+ cn.raw_vector_with( @word_list ) # make the lsi raw and norm vectors
303
+ end
304
+ end
305
+
306
+ return cn
307
+ end
308
+
309
+ def make_word_list
310
+ @word_list = WordList.new
311
+ @items.each_value do |node|
312
+ node.word_hash.each_key { |key| @word_list.add_word key }
313
+ end
314
+ end
315
+
316
+ end
317
+ end
318
+
@@ -0,0 +1,125 @@
1
+ module Classifier
2
+ require 'redis'
3
+
4
+ #if !String.instance_methods.include?(:underscore)
5
# Back-port of Rails' String#underscore: converts CamelCase to
# snake_case and namespace separators to path separators
# ("Foo::BarBaz" -> "foo/bar_baz").
#
# FIX: this reopen sits lexically inside `module Classifier`, so a bare
# `class String` defines a brand-new Classifier::String rather than
# patching the core class — which is what the (commented-out)
# instance_methods guard around it clearly intended. `class ::String`
# targets the top-level String explicitly.
class ::String
  def underscore
    gsub(/::/, '/').
      gsub(/([A-Z]+)([A-Z][a-z])/,'\1_\2').
      gsub(/([a-z\d])([A-Z])/,'\1_\2').
      tr("-", "_").
      downcase
  end
end
14
+ #end
15
+
16
# Redis-backed persistence for Bayes word counts.
#
# Each (category, word) counter lives under the key
# "<lang>:<category>:<word>"; a single "redis_bayes_store_<lang>" key
# holds the running total of trained words.
# NOTE(review): connects via the $redis global to a default local Redis,
# matching the original implementation — confirm that is intended.
class RedisStore
  include Enumerable

  # Normalized category names (via String#prepare_category_name,
  # defined in the extensions — TODO confirm it symbolizes/cleans).
  attr_accessor :names

  # lang       - language tag used to namespace every key.
  # categories - list of category names to track.
  def initialize(lang, categories)
    $redis = Redis.new
    @lang = lang
    # The original used each_with_index but never used the index.
    @names = categories.map { |category| category.prepare_category_name }
  end

  # Creates the counter for (category, word) at zero unless it exists.
  def init(category, word)
    insert(category, word, 0) unless key_for?(category, word)
  end

  # Resets the global word total to zero.
  def init_total
    $redis.set redis_total_key, 0
  end

  # Total number of words trained across all categories.
  def total_words
    $redis.get(redis_total_key).to_i
  end

  # True when a counter exists for (category, word).
  def key_for?(category, word)
    $redis.exists(redis_key(category, word))
  end

  alias :has_word? :key_for?

  # Stores val (stringified) as the count for (category, word).
  def insert(category, word, val)
    $redis.set(redis_key(category, word), "#{val}")
  end

  # Integer count for (category, word), or nil when absent.
  def get(category, word)
    val = $redis.get redis_key(category, word)
    val.nil? ? nil : val.to_i
  end

  # Deletes the counter for (category, word).
  def remove(category, word)
    $redis.del redis_key(category, word)
  end

  # Atomically increments the (category, word) counter by count.
  def incr(category, word, count)
    $redis.incrby redis_key(category, word), count.to_i
  end

  # Atomically increments the global total by count.
  def incr_total(count)
    $redis.incrby redis_total_key, count.to_i
  end

  # Atomically decrements the (category, word) counter by count.
  # FIX: the original `def decr` took no parameters yet referenced
  # `category`, `word` and `count`, so every call raised NameError.
  # The signature now mirrors #incr.
  def decr(category, word, count)
    $redis.decrby redis_key(category, word), count.to_i
  end

  # Atomically decrements the global total by count.
  def decr_total(count)
    $redis.decrby redis_total_key, count.to_i
  end

  # Yields (category, counts_for_category) to an explicit block, or just
  # the category name otherwise, for each known category.
  def each(&block)
    @names.each do |category|
      if block_given?
        block.call(category, get_by_wild_keys(category))
      else
        yield category
      end
    end
  end

  # Key for a single (category, word) counter.
  def redis_key(category, word)
    "#{escape_lang}:#{escape_category(category)}:#{escape_word(word)}"
  end

  # Key holding the per-language total word count.
  def redis_total_key
    "redis_bayes_store_#{@lang}"
  end

  def escape_category(category)
    category.to_s.gsub(" ", "_").downcase
  end

  def escape_word(word)
    word.to_s.force_encoding('UTF-8')
  end

  def escape_lang
    @lang.to_s.downcase
  end

  # All counts stored under a category.
  # FIX: the original scanned "<category>:*", which can never match the
  # keys written by #redis_key ("<lang>:<category>:<word>") — include the
  # language prefix so the scan actually finds them.
  def get_by_wild_keys(category)
    $redis.keys("#{escape_lang}:#{escape_category(category)}:*").map do |key|
      get_by_key(key).to_i
    end
  end

  # FIX: the original ran eval() on whatever string came back from
  # Redis — an arbitrary-code-execution hole on stored data. Values are
  # always integer counts, so parse numerically instead.
  def get_by_key(key)
    val = $redis.get(key)
    val && val.to_i
  end
end
125
+ end
data/lib/classifier.rb ADDED
@@ -0,0 +1,31 @@
1
+ #--
2
+ # Copyright (c) 2005 Lucas Carlson
3
+ #
4
+ # Permission is hereby granted, free of charge, to any person obtaining
5
+ # a copy of this software and associated documentation files (the
6
+ # "Software"), to deal in the Software without restriction, including
7
+ # without limitation the rights to use, copy, modify, merge, publish,
8
+ # distribute, sublicense, and/or sell copies of the Software, and to
9
+ # permit persons to whom the Software is furnished to do so, subject to
10
+ # the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be
13
+ # included in all copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
19
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
20
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
21
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22
+ #++
23
+ # Author:: Lucas Carlson (mailto:lucas@rufy.com)
24
+ # Copyright:: Copyright (c) 2005 Lucas Carlson
25
+ # License:: LGPL
26
+
27
+ require 'rubygems'
28
+ require 'classifier/extensions/string'
29
+ require 'classifier/bayes'
30
+ require 'classifier/lsi'
31
+ require 'classifier/redis_store'
@@ -0,0 +1,33 @@
1
+ require File.dirname(__FILE__) + '/../test_helper'
2
# Unit tests for the Bayes classifier: training, category management
# and end-to-end classification.
class BayesianTest < Test::Unit::TestCase
  # Fresh two-category classifier for every test.
  def setup
    @classifier = Classifier::Bayes.new 'Interesting', 'Uninteresting'
  end

  # Training an existing category works.
  def test_good_training
    assert_nothing_raised { @classifier.train_interesting "love" }
  end

  # Training an unknown category raises.
  def test_bad_training
    assert_raise(StandardError) { @classifier.train_no_category "words" }
  end

  # Completely unknown methods still raise NoMethodError.
  def test_bad_method
    assert_raise(NoMethodError) { @classifier.forget_everything_you_know "" }
  end

  # The constructor's categories are reported back.
  def test_categories
    expected = ['Interesting', 'Uninteresting']
    assert_equal expected.sort, @classifier.categories.sort
  end

  # Categories can be added after construction.
  def test_add_category
    @classifier.add_category 'Test'
    expected = ['Test', 'Interesting', 'Uninteresting']
    assert_equal expected.sort, @classifier.categories.sort
  end

  # After minimal training, classification picks the closer category.
  def test_classification
    @classifier.train_interesting "here are some good words. I hope you love them"
    @classifier.train_uninteresting "here are some bad words, I hate you"
    assert_equal 'Uninteresting', @classifier.classify("I hate bad words and you")
  end
end
metadata ADDED
@@ -0,0 +1,57 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: classifier_atsukamoto
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Lucas Carlson
8
+ - Afonso Tsukamoto
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-12-13 00:00:00.000000000 Z
13
+ dependencies: []
14
+ description: Classifier with redis
15
+ email: atsukamoto@faber-ventures.com
16
+ executables: []
17
+ extensions: []
18
+ extra_rdoc_files: []
19
+ files:
20
+ - Rakefile
21
+ - lib/classifier/extensions/string.rb
22
+ - lib/classifier/extensions/vector.rb
23
+ - lib/classifier/extensions/vector_serialize.rb
24
+ - lib/classifier/extensions/word_hash.rb
25
+ - lib/classifier/lsi/content_node.rb
26
+ - lib/classifier/lsi/summary.rb
27
+ - lib/classifier/lsi/word_list.rb
28
+ - lib/classifier/bayes.rb
29
+ - lib/classifier/lsi.rb
30
+ - lib/classifier/redis_store.rb
31
+ - lib/classifier.rb
32
+ - test/bayes/bayesian_test.rb
33
+ homepage: http://rubygems.org/gems/classifier_atsukamoto
34
+ licenses:
35
+ - LGPL
36
+ metadata: {}
37
+ post_install_message:
38
+ rdoc_options: []
39
+ require_paths:
40
+ - lib
41
+ required_ruby_version: !ruby/object:Gem::Requirement
42
+ requirements:
43
+ - - '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ required_rubygems_version: !ruby/object:Gem::Requirement
47
+ requirements:
48
+ - - '>='
49
+ - !ruby/object:Gem::Version
50
+ version: '0'
51
+ requirements: []
52
+ rubyforge_project:
53
+ rubygems_version: 2.1.11
54
+ signing_key:
55
+ specification_version: 4
56
+ summary: Classifier with Redis
57
+ test_files: []