classifier_atsukamoto 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: e6eca98ba96b5157ddcfef0ba3f02e129652c5ce
4
+ data.tar.gz: bc219f04544083e8a017b548ca2fede7a942fa45
5
+ SHA512:
6
+ metadata.gz: 1c78965de0ffd493b57ebf013deb1baf92c8e554f21157bb681dca1ae980edcd1d4a7fdafe601983270ecb0bbd86214a6b19ec3a1e9ebebc12789ebdbc1f0131
7
+ data.tar.gz: a9eb9c3ebac570198f25800f4c02677aea29ad8e9f152b41585242bd8667cf8b6de3812cf28ce68386c1090f5764fbc0c610972a0041342cb58bfb2fddccaf15
data/Rakefile ADDED
@@ -0,0 +1,97 @@
1
require 'rubygems'
require 'rake'
require 'rake/testtask'
require 'rake/rdoctask'
require 'rake/gempackagetask'
require 'rake/contrib/rubyforgepublisher'

PKG_VERSION = "0.0.1"

PKG_FILES = FileList[
    "lib/**/*", "bin/*", "test/**/*", "[A-Z]*", "Rakefile", "html/**/*"
]

desc "Default Task"
task :default => [ :test ]

# Run the unit tests
desc "Run all unit tests"
Rake::TestTask.new("test") { |t|
  t.libs << "lib"
  t.pattern = 'test/*/*_test.rb'
  t.verbose = true
}

# Make a console, useful when working on tests
desc "Generate a test console"
task :console do
  verbose( false ) { sh "irb -I lib/ -r 'classifier'" }
end

# Generate the RDoc documentation
desc "Create documentation"
Rake::RDocTask.new("doc") { |rdoc|
  # FIX: the title previously contained a stray embedded newline.
  rdoc.title = "Ruby Classifier Fork by ATsukamoto - Bayesian and LSI classification library with Redis for persistence"
  rdoc.rdoc_dir = 'html'
  rdoc.rdoc_files.include('README')
  rdoc.rdoc_files.include('lib/**/*.rb')
}

# Generate the gem package.
# NOTE(review): rake/gempackagetask, s.autorequire and s.has_rdoc are all
# deprecated in modern Rake/RubyGems; kept as-is for the legacy toolchain
# this Rakefile targets.
spec = Gem::Specification.new do |s|

  #### Basic information.

  # NOTE(review): the gem ships as "classifier_atsukamoto" but the spec
  # name is 'classifier' — confirm which name is intended.
  s.name = 'classifier'
  s.version = PKG_VERSION
  s.summary = <<-EOF
    A general classifier module to allow Bayesian and other types of classifications.
  EOF
  s.description = <<-EOF
    A general classifier module to allow Bayesian and other types of classifications.
  EOF

  #### Which files are to be included in this gem? Everything! (Except CVS directories.)

  s.files = PKG_FILES

  #### Load-time details: library and application (you will need one or both).

  s.require_path = 'lib'
  s.autorequire = 'classifier'

  #### Documentation and testing.

  s.has_rdoc = true

  #### Dependencies and requirements.

  s.add_dependency('fast-stemmer', '>= 1.0.0')
  s.requirements << "A porter-stemmer module to split word stems."

  #### Author and project details.
  s.author = "Lucas Carlson"
  s.email = "lucas@rufy.com"
  s.homepage = "http://classifier.rufy.com/"
end

Rake::GemPackageTask.new(spec) do |pkg|
  pkg.need_zip = true
  pkg.need_tar = true
end

desc "Report code statistics (KLOCs, etc) from the application"
task :stats do
  require 'code_statistics'
  CodeStatistics.new(
    ["Library", "lib"],
    ["Units", "test"]
  ).to_s
end

desc "Publish new documentation"
task :publish do
  `ssh rufy update-classifier-doc`
  Rake::RubyForgePublisher.new('classifier', 'cardmagic').upload
end
@@ -0,0 +1,156 @@
1
# Author:: Lucas Carlson (mailto:lucas@rufy.com)
# Copyright:: Copyright (c) 2005 Lucas Carlson
# License:: LGPL

module Classifier

require 'lingua/stemmer'

  # Bayesian classifier whose word counts are persisted in Redis via
  # Classifier::RedisStore rather than in-memory hashes.
  class Bayes
    # The class is created with a stemmer language plus one or more
    # categories, e.g.
    #      b = Classifier::Bayes.new 'en', 'Interesting', 'Uninteresting', 'Spam'
    def initialize(lang, *categories)
      @categories = RedisStore.new lang, categories
      # NOTE(review): init_total resets the persisted grand total to 0 on
      # every startup — confirm this is intended for a persistent store.
      @categories.init_total
      @stemmer = Lingua::Stemmer.new(:language => lang.downcase)
    end

    #
    # Provides a general training method for all categories specified in Bayes#new
    # For example:
    #      b = Classifier::Bayes.new 'en', 'This', 'That', 'the_other'
    #      b.train :this, "This text"
    #      b.train "that", "That text"
    #      b.train "The other", "The other text"
    def train(category, text)
      category = category.prepare_category_name
      text.word_hash(@stemmer).each do |word, count|
        @categories.init(category, word)            # ensure the key exists
        @categories.incr(category, word, count)     # per-category word count
        @categories.incr_total(count)               # global word count
      end
    end

    #
    # Provides an untraining method for all categories specified in Bayes#new.
    # Be very careful with this method.
    #
    # For example:
    #      b = Classifier::Bayes.new 'en', 'This', 'That', 'the_other'
    #      b.train :this, "This text"
    #      b.untrain :this, "This text"
    def untrain(category, text)
      category = category.prepare_category_name
      text.word_hash(@stemmer).each do |word, count|
        if @categories.total_words >= 0
          orig = @categories.get(category, word)    # may be nil when word unseen
          @categories.init(category, word)
          @categories.decr(category, word, count)
          if @categories.get(category, word) <= 0
            @categories.remove(category, word)
            # Charge the total only for what was actually stored.
            # NOTE(review): orig can be nil here, making decr_total a no-op
            # (nil.to_i == 0) — preserved from the original implementation.
            count = orig
          end
          @categories.decr_total(count)
        end
      end
    end

    #
    # Returns the scores in each category for the provided +text+. E.g.,
    #      b.classifications "I hate bad words and you"
    #      =>  {"Uninteresting"=>-12.6997928013932, "Interesting"=>-18.4206807439524}
    # The largest of these scores (the one closest to 0) is the one picked out by #classify
    def classifications(text)
      score = Hash.new
      # FIX: the original iterated @categories.names with a two-argument
      # block, which left category_words nil and crashed on #inject.
      # RedisStore#each yields both the category and its stored counts.
      @categories.each do |category, category_words|
        score[category.to_s] = 0
        total = category_words.inject(0) { |sum, element| sum + element }
        text.word_hash(@stemmer).each do |word, count|
          # Unseen words get a small pseudo-count of 0.1.
          s = @categories.has_word?(category, word) ? @categories.get(category, word) : 0.1
          score[category.to_s] += Math.log(s / total.to_f)
        end
      end
      return score
    end

    #
    # Returns the classification of the provided +text+, which is one of the
    # categories given in the initializer. E.g.,
    #      b.classify "I hate bad words and you"
    #      =>  'Uninteresting'
    def classify(text)
      (classifications(text).sort_by { |a| -a[1] })[0][0]
    end

    #
    # Provides dynamic training and untraining methods for the categories
    # specified in Bayes#new. For example:
    #      b.train_this "This text"
    #      b.untrain_that "That text"
    def method_missing(name, *args)
      if name.to_s =~ /(un)?train_([\w]+)/
        action = $1 ? :untrain : :train
        category = name.to_s.gsub(/(un)?train_([\w]+)/, '\2').prepare_category_name
        if @categories.names.include? category
          # FIX: replaced string eval of "#{$1}train(category, text)" with
          # an explicit send.
          args.each { |text| send(action, category, text) }
        else
          raise StandardError, "No such category: #{category}"
        end
      else
        super
      end
    end

    # Pair for method_missing so respond_to? reports the dynamic helpers.
    def respond_to_missing?(name, include_private = false)
      name.to_s =~ /(un)?train_([\w]+)/ ? true : super
    end

    #
    # Provides the backing category store.
    # For example:
    #      b.categories.names
    #      =>  [:This, :That, :"The other"]
    def categories # :nodoc:
      @categories
    end

    #
    # Allows you to add categories to the classifier.
    # For example:
    #      b.add_category "Not spam"
    #
    # WARNING: Adding categories to a trained classifier will
    # result in an undertrained category that will tend to match
    # more criteria than the trained selective categories. In short,
    # try to initialize your categories at initialization.
    def add_category(category)
      # FIXME(review): RedisStore does not appear to define []=; this line
      # is left over from the Hash-backed implementation and likely raises
      # NoMethodError. Left unchanged pending a RedisStore API for it.
      @categories[category.prepare_category_name] = Hash.new
    end

    alias append_category add_category
  end

end
@@ -0,0 +1,10 @@
1
+ # Author:: Lucas Carlson (mailto:lucas@rufy.com)
2
+ # Copyright:: Copyright (c) 2005 Lucas Carlson
3
+ # License:: LGPL
4
+
5
+ # require 'fast_stemmer'
6
+ require 'classifier/extensions/word_hash'
7
+
8
# Canonical category naming for any object: underscores become spaces
# and the result is capitalized and interned, e.g.
#   "the_other".prepare_category_name #=> :"The other"
class Object
  def prepare_category_name
    to_s.tr("_", " ").capitalize.to_sym
  end
end
@@ -0,0 +1,113 @@
1
+ # Author:: Ernest Ellingson
2
+ # Copyright:: Copyright (c) 2005
3
+
4
+ # These are extensions to the std-lib 'matrix' to allow an all ruby SVD
5
+
6
+ require 'matrix'
7
+ require 'mathn'
8
+
9
class Array
  # Sums the array (helper for the all-Ruby SVD code). With a block, each
  # element is transformed first and the block results are summed as-is;
  # without a block the plain sum is returned as a Float. An empty array
  # yields +identity+.
  # TODO! Change name!
  def a_sum(identity = 0, &block)
    return identity if empty?

    if block
      map(&block).sum
    else
      reduce { |acc, el| acc + el }.to_f
    end
  end
end
21
+
22
class Vector
  # Euclidean (L2) length of the vector.
  def magnitude
    Math.sqrt((0...size).reduce(0.0) { |acc, i| acc + self[i] ** 2.0 })
  end

  # Returns a new Vector scaled to unit length.
  def normalize
    mag = magnitude
    Vector[*(0...size).map { |i| self[i] / mag }]
  end
end
41
+
42
class Matrix
  # Builds a diagonal matrix from an array of values.
  def Matrix.diag(s)
    Matrix.diagonal(*s)
  end

  alias :trans :transpose

  # All-Ruby singular value decomposition via cyclic Jacobi rotations on
  # the Gram matrix. Returns [u, v, s] where s is an Array of singular
  # values. Iterates until the diagonal stops drifting (tolerance 0.001)
  # or maxSweeps sweeps have run.
  def SV_decomp(maxSweeps = 20)
    q = row_size >= column_size ? trans * self : self * trans

    qrot = q.dup
    v = Matrix.identity(q.row_size)
    sweep = 0
    s_old = nil

    loop do
      sweep += 1
      (0...qrot.row_size - 1).each do |row|
        (1..qrot.row_size - 1).each do |col|
          next if row == col
          # Jacobi rotation angle annihilating qrot[row, col].
          angle = Math.atan((2 * qrot[row, col]) / (qrot[row, row] - qrot[col, col])) / 2.0
          c = Math.cos(angle)
          sn = Math.sin(angle)
          rot = Matrix.identity(qrot.row_size)
          rot[row, row] = c
          rot[row, col] = -sn
          rot[col, row] = sn
          rot[col, col] = c
          qrot = rot.trans * qrot * rot
          v *= rot
        end
      end

      s_old = qrot.dup if sweep == 1
      drift = 0.0
      if sweep > 1
        # Accumulate how much the diagonal moved since the last sweep.
        qrot.row_size.times do |r|
          delta = (qrot[r, r] - s_old[r, r]).abs
          drift += delta if delta > 0.001
        end
        s_old = qrot.dup
      end
      break if (drift <= 0.001 && sweep > 1) || sweep >= maxSweeps
    end

    # Singular values are the square roots of the converged diagonal.
    s = (0...qrot.row_size).map { |r| Math.sqrt(qrot[r, r]) }

    if row_size >= column_size
      [self * v * Matrix.diagonal(*s).inverse, v, s]
    else
      # Debug output preserved from the original implementation.
      puts v.row_size
      puts v.column_size
      puts self.row_size
      puts self.column_size
      puts s.size
      [trans * v * Matrix.diagonal(*s).inverse, v, s]
    end
  end

  # Element assignment (reaches into the stdlib Matrix internals).
  def []=(i, j, val)
    @rows[i][j] = val
  end
end
@@ -0,0 +1,20 @@
1
module GSL

  class Vector
    # Custom Marshal support: a GSL vector serializes as a plain Array.
    def _dump(depth)
      Marshal.dump(to_a)
    end

    def self._load(str)
      GSL::Vector.alloc(Marshal.load(str))
    end
  end

  class Matrix
    # Give GSL::Matrix the same .diag shorthand the stdlib Matrix
    # extension defines, so LSI code can use either backend.
    class << self
      alias :diag :diagonal
    end
  end
end
@@ -0,0 +1,129 @@
1
+ # Author:: Lucas Carlson (mailto:lucas@rufy.com)
2
+ # Copyright:: Copyright (c) 2005 Lucas Carlson
3
+ # License:: LGPL
4
+
5
+ # These are extensions to the String class to provide convenience
6
+ # methods for the Classifier package.
7
+ require 'lingua/stemmer'
8
+
9
class String

  # Removes common punctuation symbols, returning a new string.
  # Apostrophes and hyphens are deleted outright (not replaced with
  # spaces), so "it's" becomes "its". E.g.,
  #      "Hello (greeting's), with {braces} < >...?".without_punctuation
  def without_punctuation
    tr( ',?.!;:"@#$%^&*()_=+[]{}\|<>/`~', " " ) .tr( "'\-", "")
  end

  # Return a Hash of Symbol => Integer frequency counts for this document.
  # Each word is stemmed with +stemmer+ (any object responding to #stem);
  # punctuation-only tokens are counted too.
  def word_hash(stemmer)
    word_hash_for_words(gsub(/[^\w\s]/,"").split + gsub(/[\w]/," ").split, stemmer)
  end

  # Return a word hash without extra punctuation or short symbols.
  # FIX: the original called word_hash_for_words without its required
  # stemmer argument and raised ArgumentError on every call. A stemmer
  # may now be passed; without one, words are interned unstemmed.
  def clean_word_hash(stemmer = nil)
    word_hash_for_words gsub(/[^\w\s]/,"").split, stemmer
  end

  private

  # Tallies +words+ into a Hash of Symbol => Integer. A word is counted
  # when it contains non-word characters, or when it is longer than two
  # characters and not in CORPUS_SKIP_WORDS. With a nil +stemmer+ the
  # downcased word itself becomes the key.
  def word_hash_for_words(words, stemmer = nil)
    d = Hash.new
    words.each do |word|
      word.downcase! if word =~ /[\w]+/
      key = (stemmer ? stemmer.stem(word) : word).intern
      if word =~ /[^\w]/ || ! CORPUS_SKIP_WORDS.include?(word) && word.length > 2
        d[key] ||= 0
        d[key] += 1
      end
    end
    return d
  end

  # Common English stop words excluded from the hash.
  # TODO! Actualize for each language
  CORPUS_SKIP_WORDS = [
      "a",
      "again",
      "all",
      "along",
      "are",
      "also",
      "an",
      "and",
      "as",
      "at",
      "but",
      "by",
      "came",
      "can",
      "cant",
      "couldnt",
      "did",
      "didn",
      "didnt",
      "do",
      "doesnt",
      "dont",
      "ever",
      "first",
      "from",
      "have",
      "her",
      "here",
      "him",
      "how",
      "i",
      "if",
      "in",
      "into",
      "is",
      "isnt",
      "it",
      "itll",
      "just",
      "last",
      "least",
      "like",
      "most",
      "my",
      "new",
      "no",
      "not",
      "now",
      "of",
      "on",
      "or",
      "should",
      "sinc",
      "so",
      "some",
      "th",
      "than",
      "this",
      "that",
      "the",
      "their",
      "then",
      "those",
      "to",
      "told",
      "too",
      "true",
      "try",
      "until",
      "url",
      "us",
      "were",
      "when",
      "whether",
      "while",
      "with",
      "within",
      "yes",
      "you",
      "youll",
      ].freeze
end
@@ -0,0 +1,72 @@
1
+ # Author:: David Fayram (mailto:dfayram@lensmen.net)
2
+ # Copyright:: Copyright (c) 2005 David Fayram II
3
+ # License:: LGPL
4
+
5
module Classifier

  # Internal data structure for the LSI index: a document's word
  # frequencies plus the raw and LSI-projected vectors computed for it.
  # You should never have to use it directly.
  class ContentNode
    attr_accessor :raw_vector, :raw_norm,
                  :lsi_vector, :lsi_norm,
                  :categories

    attr_reader :word_hash

    # word_hash:: Hash of word => frequency for the source document.
    # categories:: zero or more category labels for the document.
    def initialize( word_hash, *categories )
      @categories = categories || []
      @word_hash = word_hash
    end

    # The vector used for searching: the LSI projection once the index
    # has been built, the raw frequency vector until then.
    def search_vector
      @lsi_vector || @raw_vector
    end

    # Normalized counterpart of #search_vector.
    def search_norm
      @lsi_norm || @raw_norm
    end

    # Builds @raw_vector and @raw_norm from @word_hash, using +word_list+
    # to map each word to its vector dimension.
    def raw_vector_with( word_list )
      vec =
        if $GSL
          GSL::Vector.alloc(word_list.size)
        else
          Array.new(word_list.size, 0)
        end

      @word_hash.each_key do |word|
        dim = word_list[word]
        vec[dim] = @word_hash[word] if dim
      end

      # Scaling transform: total mass of the vector.
      total_words = vec.a_sum

      # First-order association transform, applied only when the vector
      # holds more than one word.
      if total_words > 1.0
        weighted_total = 0.0
        vec.each do |term|
          if term > 0
            weighted_total += (( term / total_words ) * Math.log( term / total_words ))
          end
        end
        vec = vec.collect { |val| Math.log( val + 1 ) / -weighted_total }
      end

      if $GSL
        @raw_norm = vec.normalize
        @raw_vector = vec
      else
        @raw_norm = Vector[*vec].normalize
        @raw_vector = Vector[*vec]
      end
    end

  end

end
@@ -0,0 +1,31 @@
1
+ # Author:: Lucas Carlson (mailto:lucas@rufy.com)
2
+ # Copyright:: Copyright (c) 2005 Lucas Carlson
3
+ # License:: LGPL
4
+
5
class String
  # Returns a summary of the text: the +count+ most semantically dense
  # sentences, joined by +separator+.
  def summary( count=10, separator=" [...] " )
    perform_lsi split_sentences, count, separator
  end

  # Like #summary, but summarizes whole paragraphs.
  def paragraph_summary( count=1, separator=" [...] " )
    perform_lsi split_paragraphs, count, separator
  end

  # Splits on sentence-ending punctuation. The capture group keeps the
  # separators in the returned array.
  def split_sentences
    split /(\.|\!|\?)/ # TODO: make this less primitive
  end

  # Splits on blank lines (LF, CR, or CRLF conventions).
  def split_paragraphs
    split /(\n\n|\r\r|\r\n\r\n)/ # TODO: make this less primitive
  end

  private

  # Indexes the chunks with LSI and joins the highest-ranked ones.
  def perform_lsi(chunks, count, separator)
    lsi = Classifier::LSI.new :auto_rebuild => false
    chunks.each { |chunk| lsi << chunk unless chunk.strip.empty? || chunk.strip.split.size == 1 }
    lsi.build_index
    summaries = lsi.highest_relative_content count
    # FIX: the original rejected non-members of +summaries+ from
    # +summaries+ itself, which filters nothing; filtering +chunks+
    # keeps the selected text in document order.
    return chunks.reject { |chunk| !summaries.include? chunk }.map { |x| x.strip }.join(separator)
  end
end
@@ -0,0 +1,36 @@
1
+ # Author:: David Fayram (mailto:dfayram@lensmen.net)
2
+ # Copyright:: Copyright (c) 2005 David Fayram II
3
+ # License:: LGPL
4
+
5
module Classifier
  # Keeps a word => index mapping, used to assign stemmed words to
  # dimensions of a vector.

  class WordList
    def initialize
      @location_table = {}
    end

    # Registers +word+, assigning it the next free dimension if unseen.
    def add_word(word)
      @location_table[word] = @location_table.size unless @location_table.key?(word)
    end

    # Dimension of +lookup+, or nil when the word is not in the space.
    def [](lookup)
      @location_table[lookup]
    end

    # Reverse lookup: the word occupying dimension +ind+, or nil.
    def word_for_index(ind)
      @location_table.key(ind)
    end

    # Number of words mapped.
    def size
      @location_table.size
    end
  end
end
@@ -0,0 +1,318 @@
1
+ # Author:: David Fayram (mailto:dfayram@lensmen.net)
2
+ # Copyright:: Copyright (c) 2005 David Fayram II
3
+ # License:: LGPL
4
+
5
+ begin
6
+ raise LoadError if ENV['NATIVE_VECTOR'] == "true" # to test the native vector class, try `rake test NATIVE_VECTOR=true`
7
+
8
+ require 'gsl' # requires http://rb-gsl.rubyforge.org/
9
+ require 'classifier/extensions/vector_serialize'
10
+ $GSL = true
11
+
12
+ rescue LoadError
13
+ warn "Notice: for 10x faster LSI support, please install http://rb-gsl.rubyforge.org/"
14
+ require 'classifier/extensions/vector'
15
+ end
16
+
17
+ require 'classifier/lsi/word_list'
18
+ require 'classifier/lsi/content_node'
19
+ require 'classifier/lsi/summary'
20
+
21
module Classifier

  # This class implements a Latent Semantic Indexer, which can search, classify and cluster
  # data based on underlying semantic relations. For more information on the algorithms used,
  # please consult Wikipedia[http://en.wikipedia.org/wiki/Latent_Semantic_Indexing].
  class LSI

    attr_reader :word_list
    attr_accessor :auto_rebuild

    # Create a fresh index.
    # If you want to call #build_index manually, use
    #      Classifier::LSI.new :auto_rebuild => false
    #
    def initialize(options = {})
      @auto_rebuild = true unless options[:auto_rebuild] == false
      @word_list, @items = WordList.new, {}
      @version, @built_at_version = 0, -1
    end

    # Returns true if the index needs to be rebuilt. The index needs
    # to be built after all informaton is added, but before you start
    # using it for search, classification and cluster detection.
    def needs_rebuild?
      (@items.keys.size > 1) && (@version != @built_at_version)
    end

    # Adds an item to the index. item is assumed to be a string, but
    # any item may be indexed so long as it responds to #to_s or if
    # you provide an optional block explaining how the indexer can
    # fetch fresh string data. This optional block is passed the item,
    # so the item may only be a reference to a URL or file name.
    #
    # For example:
    #      lsi = Classifier::LSI.new
    #      lsi.add_item "This is just plain text"
    #      lsi.add_item "/home/me/filename.txt" { |x| File.read x }
    #      ar = ActiveRecordObject.find( :all )
    #      lsi.add_item ar, *ar.categories { |x| ar.content }
    #
    def add_item( item, *categories, &block )
      clean_word_hash = block ? block.call(item).clean_word_hash : item.to_s.clean_word_hash
      @items[item] = ContentNode.new(clean_word_hash, *categories)
      @version += 1
      build_index if @auto_rebuild
    end

    # A less flexible shorthand for add_item that assumes
    # you are passing in a string with no categories. item
    # will be duck typed via to_s .
    #
    def <<( item )
      add_item item
    end

    # Returns the categories for a given indexed item. You are free to add and remove
    # items from this as you see fit. It does not invalidate an index to change its categories.
    def categories_for(item)
      return [] unless @items[item]
      return @items[item].categories
    end

    # Removes an item from the database, if it is indexed.
    # FIX: the original called Array#contain? and Hash#remove, neither of
    # which exists; use Hash#key? and Hash#delete.
    def remove_item( item )
      if @items.key? item
        @items.delete item
        @version += 1
      end
    end

    # Returns an array of items that are indexed.
    def items
      @items.keys
    end

    # This function rebuilds the index if needs_rebuild? returns true.
    # For very large document spaces, this indexing operation may take some
    # time to complete, so it may be wise to place the operation in another
    # thread.
    #
    # As a rule, indexing will be fairly swift on modern machines until
    # you have well over 500 documents indexed, or have an incredibly diverse
    # vocabulary for your documents.
    #
    # The optional parameter "cutoff" is a tuning parameter. When the index is
    # built, a certain number of s-values are discarded from the system. The
    # cutoff parameter tells the indexer how many of these values to keep.
    # A value of 1 for cutoff means that no semantic analysis will take place,
    # turning the LSI class into a simple vector search engine.
    def build_index( cutoff=0.75 )
      return unless needs_rebuild?
      make_word_list

      doc_list = @items.values
      tda = doc_list.collect { |node| node.raw_vector_with( @word_list ) }

      if $GSL
        tdm = GSL::Matrix.alloc(*tda).trans
        ntdm = build_reduced_matrix(tdm, cutoff)

        ntdm.size[1].times do |col|
          vec = GSL::Vector.alloc( ntdm.column(col) ).row
          doc_list[col].lsi_vector = vec
          doc_list[col].lsi_norm = vec.normalize
        end
      else
        tdm = Matrix.rows(tda).trans
        ntdm = build_reduced_matrix(tdm, cutoff)

        ntdm.row_size.times do |col|
          doc_list[col].lsi_vector = ntdm.column(col) if doc_list[col]
          doc_list[col].lsi_norm = ntdm.column(col).normalize if doc_list[col]
        end
      end

      @built_at_version = @version
    end

    # This method returns max_chunks entries, ordered by their average semantic rating.
    # Essentially, the average distance of each entry from all other entries is calculated,
    # the highest are returned.
    #
    # This can be used to build a summary service, or to provide more information about
    # your dataset's general content. For example, if you were to use categorize on the
    # results of this data, you could gather information on what your dataset is generally
    # about.
    def highest_relative_content( max_chunks=10 )
      return [] if needs_rebuild?

      avg_density = Hash.new
      @items.each_key { |item| avg_density[item] = proximity_array_for_content(item).inject(0.0) { |sum, pair| sum + pair[1] } }

      # FIX: the original chain ended in a blockless .map, which returned
      # an Enumerator rather than the Array of items.
      avg_density.keys.sort_by { |item| avg_density[item] }.reverse[0..max_chunks-1]
    end

    # This function is the primitive that find_related and classify
    # build upon. It returns an array of 2-element arrays. The first element
    # of this array is a document, and the second is its "score", defining
    # how "close" it is to other indexed items.
    #
    # These values are somewhat arbitrary, having to do with the vector space
    # created by your content, so the magnitude is interpretable but not always
    # meaningful between indexes.
    #
    # The parameter doc is the content to compare. If that content is not
    # indexed, you can pass an optional block to define how to create the
    # text data. See add_item for examples of how this works.
    def proximity_array_for_content( doc, &block )
      return [] if needs_rebuild?

      content_node = node_for_content( doc, &block )
      result =
        @items.keys.collect do |item|
          if $GSL
            val = content_node.search_vector * @items[item].search_vector.col
          else
            val = (Matrix[content_node.search_vector] * @items[item].search_vector)[0]
          end
          [item, val]
        end
      result.sort_by { |pair| pair[1] }.reverse
    end

    # Similar to proximity_array_for_content, this function takes similar
    # arguments and returns a similar array. However, it uses the normalized
    # calculated vectors instead of their full versions. This is useful when
    # you're trying to perform operations on content that is much smaller than
    # the text you're working with. search uses this primitive.
    def proximity_norms_for_content( doc, &block )
      return [] if needs_rebuild?

      content_node = node_for_content( doc, &block )
      result =
        @items.keys.collect do |item|
          if $GSL
            val = content_node.search_norm * @items[item].search_norm.col
          else
            val = (Matrix[content_node.search_norm] * @items[item].search_norm)[0]
          end
          [item, val]
        end
      result.sort_by { |pair| pair[1] }.reverse
    end

    # This function allows for text-based search of your index. Unlike other functions
    # like find_related and classify, search only takes short strings. It will also ignore
    # factors like repeated words. It is best for short, google-like search terms.
    # A search will first priortize lexical relationships, then semantic ones.
    #
    # While this may seem backwards compared to the other functions that LSI supports,
    # it is actually the same algorithm, just applied on a smaller document.
    def search( string, max_nearest=3 )
      return [] if needs_rebuild?
      carry = proximity_norms_for_content( string )
      result = carry.collect { |pair| pair[0] }
      return result[0..max_nearest-1]
    end

    # This function takes content and finds other documents
    # that are semantically "close", returning an array of documents sorted
    # from most to least relavant.
    # max_nearest specifies the number of documents to return. A value of
    # 0 means that it returns all the indexed documents, sorted by relavence.
    #
    # This is particularly useful for identifing clusters in your document space.
    # For example you may want to identify several "What's Related" items for weblog
    # articles, or find paragraphs that relate to each other in an essay.
    def find_related( doc, max_nearest=3, &block )
      carry =
        proximity_array_for_content( doc, &block ).reject { |pair| pair[0] == doc }
      result = carry.collect { |pair| pair[0] }
      return result[0..max_nearest-1]
    end

    # This function uses a voting system to categorize documents, based on
    # the categories of other documents. It uses the same logic as the
    # find_related function to find related documents, then returns the
    # most obvious category from this list.
    #
    # cutoff signifies the number of documents to consider when clasifying
    # text. A cutoff of 1 means that every document in the index votes on
    # what category the document is in. This may not always make sense.
    #
    def classify( doc, cutoff=0.30, &block )
      icutoff = (@items.size * cutoff).round
      carry = proximity_array_for_content( doc, &block )
      carry = carry[0..icutoff-1]
      votes = {}
      carry.each do |pair|
        categories = @items[pair[0]].categories
        categories.each do |category|
          votes[category] ||= 0.0
          votes[category] += pair[1]
        end
      end

      ranking = votes.keys.sort_by { |x| votes[x] }
      return ranking[-1]
    end

    # Prototype, only works on indexed documents.
    # I have no clue if this is going to work, but in theory
    # it's supposed to.
    def highest_ranked_stems( doc, count=3 )
      raise "Requested stem ranking on non-indexed content!" unless @items[doc]
      arr = node_for_content(doc).lsi_vector.to_a
      top_n = arr.sort.reverse[0..count-1]
      return top_n.collect { |x| @word_list.word_for_index(arr.index(x)) }
    end

    private

    # Performs the rank-reduced SVD reconstruction used by build_index.
    def build_reduced_matrix( matrix, cutoff=0.75 )
      # TODO: Check that M>=N on these dimensions! Transpose helps assure this
      u, v, s = matrix.SV_decomp

      # TODO: Better than 75% term, please. :\
      s_cutoff = s.sort.reverse[(s.size * cutoff).round - 1]
      s.size.times do |ord|
        s[ord] = 0.0 if s[ord] < s_cutoff
      end
      # Reconstruct the term document matrix, only with reduced rank
      u * ($GSL ? GSL::Matrix : ::Matrix).diag( s ) * v.trans
    end

    # Returns the indexed node for +item+, or builds a transient node
    # (with raw vectors when the index is current) for unindexed content.
    def node_for_content(item, &block)
      return @items[item] if @items[item]

      clean_word_hash = block ? block.call(item).clean_word_hash : item.to_s.clean_word_hash
      cn = ContentNode.new(clean_word_hash, &block) # make the node and extract the data

      unless needs_rebuild?
        cn.raw_vector_with( @word_list ) # make the lsi raw and norm vectors
      end

      cn
    end

    # Rebuilds @word_list from the words of every indexed document.
    def make_word_list
      @word_list = WordList.new
      @items.each_value do |node|
        node.word_hash.each_key { |key| @word_list.add_word key }
      end
    end

  end
end
318
+
@@ -0,0 +1,125 @@
1
+ module Classifier
2
+ require 'redis'
3
+
4
# Define String#underscore only when no other library (e.g. ActiveSupport)
# already provides it, so a more complete implementation is not clobbered.
# (The original had this guard commented out.)
unless String.method_defined?(:underscore)
  class String
    # Convert a CamelCase / namespaced constant name to snake_case path
    # form, e.g. "Net::HTTPServer" => "net/http_server".
    def underscore
      self.gsub(/::/, '/').
        gsub(/([A-Z]+)([A-Z][a-z])/,'\1_\2').
        gsub(/([a-z\d])([A-Z])/,'\1_\2').
        tr("-", "_").
        downcase
    end
  end
end
15
+
16
# Redis-backed word/count store for the Bayes classifier.
# Counters are namespaced as "<lang>:<category>:<word>" (see #redis_key);
# a per-language total-word counter lives under #redis_total_key.
class RedisStore
  include Enumerable

  attr_accessor :names

  # lang       - language identifier used to namespace every key
  # categories - category names; each is normalized with
  #              String#prepare_category_name before being kept in #names
  def initialize(lang, categories)
    $redis = Redis.new # NOTE(review): global connection, default host/port
    @lang = lang
    @names = []
    categories.each do |category|
      @names << category.prepare_category_name
    end
  end

  # Create the (category, word) counter at zero unless it already exists.
  def init(category, word)
    insert(category, word, 0) unless key_for?(category, word)
  end

  # Reset the per-language total word counter.
  def init_total
    $redis.set redis_total_key, 0
  end

  def total_words
    $redis.get(redis_total_key).to_i
  end

  def key_for?(category, word)
    $redis.exists(redis_key(category, word))
  end

  alias :has_word? :key_for?

  def insert(category, word, val)
    $redis.set(redis_key(category, word), "#{val}")
  end

  # Returns the stored count as an Integer, or nil when absent.
  def get(category, word)
    val = $redis.get redis_key(category, word)
    val.nil? ? nil : val.to_i
  end

  def remove(category, word)
    $redis.del redis_key(category, word)
  end

  def incr(category, word, count)
    $redis.incrby redis_key(category, word), count.to_i
  end

  def incr_total(count)
    $redis.incrby redis_total_key, count.to_i
  end

  # FIX: the original `def decr` took no parameters yet referenced
  # category/word/count, so every call raised NameError. It now mirrors
  # #incr's signature.
  def decr(category, word, count)
    $redis.decrby redis_key(category, word), count.to_i
  end

  def decr_total(count)
    $redis.decrby redis_total_key, count.to_i
  end

  # Yields each category name; a 2-arity block additionally receives the
  # list of counts stored under that category.
  def each(&block)
    @names.each do |category|
      if block_given?
        block.call(category, get_by_wild_keys(category))
      else
        yield category
      end
    end
  end

  #protected

  def redis_key(category, word)
    "#{escape_lang}:#{escape_category(category)}:#{escape_word(word)}"
  end

  def redis_total_key
    "redis_bayes_store_#{@lang}"
  end

  def escape_category(category)
    category.to_s.gsub(" ", "_").downcase
  end

  def escape_word(word)
    word.to_s.force_encoding('UTF-8')
  end

  def escape_lang
    @lang.to_s.downcase
  end

  # All counts stored under `category`.
  # FIX: keys are written as "<lang>:<category>:<word>" (see #redis_key),
  # so the wildcard must carry the language prefix; the original pattern
  # ("<category>:*") could never match any stored key.
  def get_by_wild_keys(category)
    $redis.keys("#{escape_lang}:#{escape_category(category)}:*").collect do |key|
      get_by_key(key).to_i
    end
  end

  # FIX: replaced `eval(val)` — evaluating data read back from Redis is a
  # code-injection risk. Every value this store writes is an integer
  # counter (set/incrby/decrby), so numeric conversion suffices.
  def get_by_key(key)
    val = $redis.get(key)
    val.is_a?(String) ? val.to_i : val
  end
end
125
+ end
data/lib/classifier.rb ADDED
@@ -0,0 +1,31 @@
1
+ #--
2
+ # Copyright (c) 2005 Lucas Carlson
3
+ #
4
+ # Permission is hereby granted, free of charge, to any person obtaining
5
+ # a copy of this software and associated documentation files (the
6
+ # "Software"), to deal in the Software without restriction, including
7
+ # without limitation the rights to use, copy, modify, merge, publish,
8
+ # distribute, sublicense, and/or sell copies of the Software, and to
9
+ # permit persons to whom the Software is furnished to do so, subject to
10
+ # the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be
13
+ # included in all copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
19
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
20
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
21
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22
+ #++
23
+ # Author:: Lucas Carlson (mailto:lucas@rufy.com)
24
+ # Copyright:: Copyright (c) 2005 Lucas Carlson
25
+ # License:: LGPL
26
+
27
+ require 'rubygems'
28
+ require 'classifier/extensions/string'
29
+ require 'classifier/bayes'
30
+ require 'classifier/lsi'
31
+ require 'classifier/redis_store'
@@ -0,0 +1,33 @@
1
+ require File.dirname(__FILE__) + '/../test_helper'
2
# Exercises the Bayes classifier's training, category management and
# classification behaviour.
class BayesianTest < Test::Unit::TestCase
  def setup
    @classifier = Classifier::Bayes.new('Interesting', 'Uninteresting')
  end

  # Training against a known category succeeds.
  def test_good_training
    assert_nothing_raised { @classifier.train_interesting 'love' }
  end

  # Training against an unknown category raises.
  def test_bad_training
    assert_raise(StandardError) { @classifier.train_no_category 'words' }
  end

  # Genuinely unknown methods still raise NoMethodError.
  def test_bad_method
    assert_raise(NoMethodError) { @classifier.forget_everything_you_know '' }
  end

  def test_categories
    expected = %w[Interesting Uninteresting]
    assert_equal expected.sort, @classifier.categories.sort
  end

  def test_add_category
    @classifier.add_category 'Test'
    expected = %w[Test Interesting Uninteresting]
    assert_equal expected.sort, @classifier.categories.sort
  end

  def test_classification
    @classifier.train_interesting 'here are some good words. I hope you love them'
    @classifier.train_uninteresting 'here are some bad words, I hate you'
    assert_equal 'Uninteresting', @classifier.classify('I hate bad words and you')
  end
end
metadata ADDED
@@ -0,0 +1,57 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: classifier_atsukamoto
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Lucas Carlson
8
+ - Afonso Tsukamoto
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-12-13 00:00:00.000000000 Z
13
+ dependencies: []
14
+ description: Classifier with redis
15
+ email: atsukamoto@faber-ventures.com
16
+ executables: []
17
+ extensions: []
18
+ extra_rdoc_files: []
19
+ files:
20
+ - Rakefile
21
+ - lib/classifier/extensions/string.rb
22
+ - lib/classifier/extensions/vector.rb
23
+ - lib/classifier/extensions/vector_serialize.rb
24
+ - lib/classifier/extensions/word_hash.rb
25
+ - lib/classifier/lsi/content_node.rb
26
+ - lib/classifier/lsi/summary.rb
27
+ - lib/classifier/lsi/word_list.rb
28
+ - lib/classifier/bayes.rb
29
+ - lib/classifier/lsi.rb
30
+ - lib/classifier/redis_store.rb
31
+ - lib/classifier.rb
32
+ - test/bayes/bayesian_test.rb
33
+ homepage: http://rubygems.org/gems/classifier_atsukamoto
34
+ licenses:
35
+ - GNU
36
+ metadata: {}
37
+ post_install_message:
38
+ rdoc_options: []
39
+ require_paths:
40
+ - lib
41
+ required_ruby_version: !ruby/object:Gem::Requirement
42
+ requirements:
43
+ - - '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ required_rubygems_version: !ruby/object:Gem::Requirement
47
+ requirements:
48
+ - - '>='
49
+ - !ruby/object:Gem::Version
50
+ version: '0'
51
+ requirements: []
52
+ rubyforge_project:
53
+ rubygems_version: 2.1.11
54
+ signing_key:
55
+ specification_version: 4
56
+ summary: Classifier with Redis
57
+ test_files: []