reclassifier 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,87 @@
1
+ # Reclassifier
2
+
3
+ Reclassifier is a gem that provides [classification](http://en.wikipedia.org/wiki/Statistical_classification) of strings.
4
+
5
+ Classification can be done via [Naïve Bayes](https://en.wikipedia.org/wiki/Naive_Bayes_classifier) or [Latent Semantic Indexing](http://en.wikipedia.org/wiki/Latent_semantic_indexing).
6
+
7
+ It is a fork of the original [Classifier](https://github.com/cardmagic/classifier) gem, which appears to be unmaintained as of a couple of years ago.
8
+
9
+ ## Installation
10
+
11
+ Add this line to your application's Gemfile:
12
+
13
+ gem 'reclassifier'
14
+
15
+ And then execute:
16
+
17
+ $ bundle
18
+
19
+ Or install it yourself as:
20
+
21
+ $ gem install reclassifier
22
+
23
+ ## Dependencies
24
+
25
+ Currently you need to install the [GNU GSL](http://www.gnu.org/software/gsl) library in order to use Reclassifier.
26
+
27
+ ## Usage
28
+
29
+ ### Bayes
30
+ Bayesian Classifiers are accurate, fast, and have modest memory requirements.
31
+
32
+ #### Usage
33
+ require 'reclassifier'
34
+ b = Reclassifier::Bayes.new 'Interesting', 'Uninteresting'
35
+ b.train_interesting "here are some good words. I hope you love them"
36
+ b.train_uninteresting "here are some bad words, I hate you"
37
+ b.classify "I hate bad words and you" # returns 'Uninteresting'
38
+
39
+ require 'madeleine'
40
+ m = SnapshotMadeleine.new("bayes_data") {
41
+ Reclassifier::Bayes.new 'Interesting', 'Uninteresting'
42
+ }
43
+ m.system.train_interesting "here are some good words. I hope you love them"
44
+ m.system.train_uninteresting "here are some bad words, I hate you"
45
+ m.take_snapshot
46
+ m.system.classify "I love you" # returns 'Interesting'
47
+
48
+ Using Madeleine, your application can persist the learned data over time.
49
+
50
+ ### LSI
51
+ Latent Semantic Indexing engines are not as fast or as small as Bayesian classifiers, but are more flexible, providing
52
+ fast search and clustering detection as well as semantic analysis of the text that theoretically simulates human learning.
53
+
54
+ #### Usage
55
+ require 'reclassifier'
56
+ lsi = Reclassifier::LSI.new
57
+ strings = [ ["This text deals with dogs. Dogs.", :dog],
58
+ ["This text involves dogs too. Dogs! ", :dog],
59
+ ["This text revolves around cats. Cats.", :cat],
60
+ ["This text also involves cats. Cats!", :cat],
61
+ ["This text involves birds. Birds.",:bird ]]
62
+ strings.each {|x| lsi.add_item x.first, x.last}
63
+
64
+ lsi.search("dog", 3)
65
+ # returns => ["This text deals with dogs. Dogs.", "This text involves dogs too. Dogs! ",
66
+ # "This text also involves cats. Cats!"]
67
+
68
+ lsi.find_related(strings[2], 2)
69
+ # returns => ["This text revolves around cats. Cats.", "This text also involves cats. Cats!"]
70
+
71
+ lsi.classify "This text is also about dogs!"
72
+ # returns => :dog
73
+
74
+ Please see the Reclassifier::LSI documentation for more information. It is possible to index, search and classify
75
+ with more than just simple strings.
76
+
77
+ ## Contributing
78
+
79
+ 1. Fork it
80
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
81
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
82
+ 4. Push to the branch (`git push origin my-new-feature`)
83
+ 5. Create new Pull Request
84
+
85
+ ## License
86
+
87
+ This library is released under the terms of the GNU LGPL. See LICENSE for more details.
@@ -0,0 +1,7 @@
1
require "bundler/gem_tasks"
require 'rake/testtask'

# Defines the `rake test` task: loads the test/ directory onto the load
# path and runs every file matching test/**/*_test.rb.
Rake::TestTask.new do |t|
  t.libs << 'test'
  t.test_files = FileList['test/**/*_test.rb']
end
@@ -0,0 +1,12 @@
1
# Adds Marshal support to GSL::Vector so objects holding GSL vectors
# (e.g. LSI indexes persisted via Madeleine snapshots) can be serialized.
module GSL
  class Vector
    # Serializes the vector as a marshalled Array of its elements.
    # +v+ is the recursion-depth limit Marshal passes to _dump (unused here).
    def _dump(v)
      Marshal.dump(self.to_a)
    end

    # Reconstructs a vector from the string produced by _dump.
    # NOTE(review): Marshal.load must only be fed trusted data.
    def self._load(arr)
      arry = Marshal.load(arr)
      return GSL::Vector.alloc(arry)
    end
  end
end
@@ -0,0 +1,19 @@
1
# gems
require 'matrix'
require 'fast-stemmer'
require 'gsl'

# files
require 'reclassifier/version'
require 'reclassifier/core_ext/array'
require 'reclassifier/core_ext/matrix'
require 'reclassifier/core_ext/object'
require 'reclassifier/core_ext/string'
require 'gsl/vector'

# Top-level namespace for the gem. The classifier implementations are
# autoloaded so they are only parsed on first reference.
module Reclassifier
  autoload :Bayes, 'reclassifier/bayes'
  autoload :LSI, 'reclassifier/lsi'
  autoload :ContentNode, 'reclassifier/content_node'
  autoload :WordList, 'reclassifier/word_list'
end
@@ -0,0 +1,129 @@
1
module Reclassifier
  # A Naive Bayes classifier over word-frequency hashes (see String#word_hash).
  #
  # The class can be created with one or more categories, each of which will be
  # initialized and given a training method. E.g.,
  #   b = Reclassifier::Bayes.new 'Interesting', 'Uninteresting', 'Spam'
  class Bayes
    # Matches the dynamic training method names dispatched by
    # method_missing, e.g. train_interesting / untrain_interesting.
    TRAIN_METHOD_PATTERN = /(un)?train_([\w]+)/

    def initialize(*categories)
      @categories = Hash.new
      categories.each { |category| @categories[category.prepare_category_name] = Hash.new }
      @total_words = 0
      @category_counts = Hash.new(0)
    end

    #
    # Provides a general training method for all categories specified in Bayes#new
    # For example:
    #     b = Reclassifier::Bayes.new 'This', 'That', 'the_other'
    #     b.train :this, "This text"
    #     b.train "that", "That text"
    #     b.train "The other", "The other text"
    def train(category, text)
      category = category.prepare_category_name
      @category_counts[category] += 1
      text.word_hash.each do |word, count|
        @categories[category][word] ||= 0
        @categories[category][word] += count
        @total_words += count
      end
    end

    #
    # Provides an untraining method for all categories specified in Bayes#new.
    # Be very careful with this method.
    #
    # For example:
    #     b = Reclassifier::Bayes.new 'This', 'That', 'the_other'
    #     b.train :this, "This text"
    #     b.untrain :this, "This text"
    def untrain(category, text)
      category = category.prepare_category_name
      @category_counts[category] -= 1
      text.word_hash.each do |word, count|
        if @total_words >= 0
          # Remember how many occurrences were actually recorded so we never
          # subtract more from @total_words than #train added. Defaults to 0
          # for words that were never trained -- the previous version kept nil
          # here and crashed with a TypeError on `@total_words -= nil`.
          orig = @categories[category][word] || 0
          @categories[category][word] ||= 0
          @categories[category][word] -= count
          if @categories[category][word] <= 0
            @categories[category].delete(word)
            count = orig
          end
          @total_words -= count
        end
      end
    end

    #
    # Returns the scores in each category for the provided +text+. E.g.,
    #     b.classifications "I hate bad words and you"
    #     => {"Uninteresting"=>-12.6997928013932, "Interesting"=>-18.4206807439524}
    # The largest of these scores (the one closest to 0) is the one picked out by #classify.
    def classifications(text)
      score = Hash.new
      training_count = @category_counts.values.inject { |x, y| x + y }.to_f
      @categories.each do |category, category_words|
        score[category.to_s] = 0
        total = category_words.values.inject(0) { |sum, element| sum + element }
        text.word_hash.each do |word, count|
          # Words never seen in this category get a small smoothing weight.
          s = category_words.has_key?(word) ? category_words[word] : 0.1
          # NOTE(review): a completely untrained category has total == 0,
          # which makes this log(Infinity) -- confirm callers train every
          # category before classifying.
          score[category.to_s] += Math.log(s / total.to_f)
        end
        # Now add the prior probability for the category.
        s = @category_counts.has_key?(category) ? @category_counts[category] : 0.1
        score[category.to_s] += Math.log(s / training_count)
      end
      score
    end

    #
    # Returns the classification of the provided +text+, which is one of the
    # categories given in the initializer. E.g.,
    #     b.classify "I hate bad words and you"
    #     => 'Uninteresting'
    def classify(text)
      (classifications(text).sort_by { |a| -a[1] })[0][0]
    end

    #
    # Provides dynamic training and untraining methods for the categories
    # specified in Bayes#new. For example:
    #     b = Reclassifier::Bayes.new 'This', 'That', 'the_other'
    #     b.train_this "This text"
    #     b.train_that "That text"
    #     b.untrain_that "That text"
    #     b.train_the_other "The other text"
    def method_missing(name, *args)
      match = TRAIN_METHOD_PATTERN.match(name.to_s)
      category = name.to_s.gsub(TRAIN_METHOD_PATTERN, '\2').prepare_category_name
      if @categories.has_key? category
        # public_send replaces the original eval-based dispatch; match[1] is
        # "un" for untrain_* calls and nil otherwise.
        args.each { |text| public_send("#{match && match[1]}train", category, text) }
      elsif match
        raise StandardError, "No such category: #{category}"
      else
        super
      end
    end

    # Keeps respond_to? consistent with the dynamic methods handled above.
    def respond_to_missing?(name, include_private = false)
      match = TRAIN_METHOD_PATTERN.match(name.to_s)
      (!match.nil? && @categories.has_key?(match[2].prepare_category_name)) || super
    end

    #
    # Provides a list of category names
    # For example:
    #     b.categories
    #     => ['This', 'That', 'the_other']
    def categories # :nodoc:
      @categories.keys.collect { |c| c.to_s }
    end

    #
    # Allows you to add categories to the classifier.
    # For example:
    #     b.add_category "Not spam"
    #
    # WARNING: Adding categories to a trained classifier will
    # result in an undertrained category that will tend to match
    # more criteria than the trained selective categories. In short,
    # try to initialize your categories at initialization.
    def add_category(category)
      @categories[category.prepare_category_name] = Hash.new
    end

    alias append_category add_category
  end
end
@@ -0,0 +1,66 @@
1
module Reclassifier

  # This is an internal data structure class for the LSI node. Save for
  # raw_vector_with, it should be fairly straightforward to understand.
  # You should never have to use it directly.
  class ContentNode
    # Term-space vector/norm and reduced-rank LSI vector/norm for this node.
    attr_accessor :raw_vector, :raw_norm,
                  :lsi_vector, :lsi_norm,
                  :categories

    # The word-frequency hash this node was built from.
    attr_reader :word_hash

    # word_hash:: Hash of word keys to occurrence counts.
    # categories:: optional category labels attached to this node.
    def initialize( word_hash, *categories )
      # categories is a splat array and can never be nil; the || [] guard
      # is vestigial but harmless.
      @categories = categories || []
      @word_hash = word_hash
    end

    # Use this to fetch the appropriate search vector.
    # Prefers the LSI (reduced) vector when the index has been built.
    def search_vector
      @lsi_vector || @raw_vector
    end

    # Use this to fetch the appropriate search vector in normalized form.
    def search_norm
      @lsi_norm || @raw_norm
    end

    # Creates the raw vector out of word_hash using word_list as the
    # key for mapping the vector space. Uses GSL::Vector when the global
    # $GSL flag is set, otherwise a plain Array later wrapped in the
    # stdlib Vector class. Sets @raw_vector and @raw_norm as a side effect.
    def raw_vector_with( word_list )
      if $GSL
        vec = GSL::Vector.alloc(word_list.size)
      else
        vec = Array.new(word_list.size, 0)
      end

      # Place each known word's count at its index in the vector space;
      # words absent from word_list are ignored.
      @word_hash.each_key do |word|
        vec[word_list[word]] = @word_hash[word] if word_list[word]
      end

      # Perform the scaling transform
      total_words = $GSL ? vec.sum : vec.sum_with_identity

      # Perform first-order association transform if this vector has more
      # than one word in it.
      if total_words > 1.0
        # Entropy-style weight: sum of p * log(p) over the terms present.
        weighted_total = 0.0
        vec.each do |term|
          if ( term > 0 )
            weighted_total += (( term / total_words ) * Math.log( term / total_words ))
          end
        end
        # Log-scaled counts divided by the (negated) entropy weight.
        vec = vec.collect { |val| Math.log( val + 1 ) / -weighted_total }
      end

      if $GSL
        @raw_norm = vec.normalize
        @raw_vector = vec
      else
        @raw_norm = Vector[*vec].normalize
        @raw_vector = Vector[*vec]
      end
    end
  end
end
@@ -0,0 +1,11 @@
1
class Array
  # Sums the elements of the array (or the block-mapped elements when a
  # block is given). An empty array yields +identity+ instead of nil.
  def sum_with_identity(identity = 0)
    return identity if empty?

    if block_given?
      map { |element| yield(element) }.sum
    else
      reduce(:+)
    end
  end
end
@@ -0,0 +1,72 @@
1
class Matrix
  # Shorthand for Matrix.diagonal that accepts the diagonal as an array.
  def Matrix.diag(s)
    Matrix.diagonal(*s)
  end

  alias :trans :transpose

  # Singular value decomposition via cyclic Jacobi sweeps on the Gram
  # matrix. Returns [mu, v, s] where +s+ is an Array of singular values,
  # +v+ the accumulated rotation matrix, and +mu+ the corresponding
  # left-singular-vector matrix. Iterates until the diagonal converges
  # (change <= 0.001) or +maxSweeps+ sweeps have run.
  #
  # Changes from the original: removed debug `puts` statements that
  # polluted stdout in the row_size < column_size branch, and the unused
  # local `azrot`.
  def SV_decomp(maxSweeps = 20)
    # Work on the smaller Gram matrix A'A or AA'.
    if self.row_size >= self.column_size
      q = self.trans * self
    else
      q = self * self.trans
    end

    qrot = q.dup
    v = Matrix.identity(q.row_size)
    mzrot = nil
    cnt = 0
    s_old = nil
    mu = nil

    while true do
      cnt += 1
      for row in (0...qrot.row_size-1) do
        for col in (1..qrot.row_size-1) do
          next if row == col
          # Rotation angle chosen to annihilate the off-diagonal element.
          h = Math.atan((2 * qrot[row,col])/(qrot[row,row]-qrot[col,col]))/2.0
          hcos = Math.cos(h)
          hsin = Math.sin(h)
          mzrot = Matrix.identity(qrot.row_size)
          mzrot[row,row] = hcos
          mzrot[row,col] = -hsin
          mzrot[col,row] = hsin
          mzrot[col,col] = hcos
          qrot = mzrot.trans * qrot * mzrot
          v = v * mzrot
        end
      end
      s_old = qrot.dup if cnt == 1
      # Convergence check: total significant change of the diagonal since
      # the previous sweep.
      sum_qrot = 0.0
      if cnt > 1
        qrot.row_size.times do |r|
          sum_qrot += (qrot[r,r]-s_old[r,r]).abs if (qrot[r,r]-s_old[r,r]).abs > 0.001
        end
        s_old = qrot.dup
      end
      break if (sum_qrot <= 0.001 and cnt > 1) or cnt >= maxSweeps
    end

    # Singular values are the square roots of the Gram matrix eigenvalues.
    s = []
    qrot.row_size.times do |r|
      s << Math.sqrt(qrot[r,r])
    end

    if self.row_size >= self.column_size
      mu = self * v * Matrix.diagonal(*s).inverse
    else
      mu = (self.trans * v * Matrix.diagonal(*s).inverse)
    end
    [mu, v, s]
  end

  # Element assignment; stdlib Matrix is nominally immutable, so this
  # reaches into @rows directly. Needed by SV_decomp to build rotations.
  def []=(i,j,val)
    @rows[i][j] = val
  end
end
@@ -0,0 +1,3 @@
1
class Object
  # Coerces the receiver into a canonical category name: its string form
  # with underscores turned into spaces, capitalized, and interned as a
  # Symbol. E.g. :the_other.prepare_category_name => :"The other"
  def prepare_category_name
    to_s.tr("_", " ").capitalize.to_sym
  end
end
@@ -0,0 +1,143 @@
1
class String

  # Removes common punctuation symbols, returning a new string.
  # E.g.,
  #   "Hello (greeting's), with {braces} < >...?".without_punctuation
  #   => "Hello greetings with braces "
  # Listed symbols become spaces; apostrophes and hyphens are deleted.
  def without_punctuation
    tr( ',?.!;:"@#$%^&*()_=+[]{}\|<>/`~', " " ).tr( "'\-", "")
  end

  # Returns a Hash of symbols => ints. Each word in the string is stemmed,
  # symbolized, and indexed to its frequency in the document. Runs of
  # punctuation are indexed as separate tokens.
  def word_hash
    word_hash_for_words(gsub(/[^\w\s]/,"").split + gsub(/[\w]/," ").split)
  end

  # Returns a word hash without extra punctuation or short symbols, just stemmed words.
  def clean_word_hash
    word_hash_for_words gsub(/[^\w\s]/,"").split
  end

  # Builds a frequency hash (stemmed symbol => count) from +words+.
  # Pure-punctuation tokens always count; word tokens count only when
  # longer than 2 characters and not in CORPUS_SKIP_WORDS.
  # Requires String#stem from the fast-stemmer gem.
  def word_hash_for_words(words)
    d = Hash.new
    words.each do |word|
      word.downcase! if word =~ /[\w]+/
      key = word.stem.to_sym
      # Parentheses make the precedence the original relied on explicit
      # (&& binds tighter than ||).
      if word =~ /[^\w]/ || (!CORPUS_SKIP_WORDS.include?(word) && word.length > 2)
        d[key] ||= 0
        d[key] += 1
      end
    end
    return d
  end

  # Common English stop words (some pre-stemmed, e.g. "sinc") skipped when
  # building word hashes. Frozen so it cannot be mutated accidentally.
  CORPUS_SKIP_WORDS = %w[
    a again all along are also an and as at but by came can cant couldnt
    did didn didnt do doesnt dont ever first from have her here him how
    i if in into is isnt it itll just last least like most my new no not
    now of on or should sinc so some th than this that the their then
    those to told too true try until url us were when whether while with
    within yes you youll
  ].freeze

  # Returns a +count+-sentence LSI summary of the string, joined by +separator+.
  def summary( count=10, separator=" [...] " )
    perform_lsi split_sentences, count, separator
  end

  # Returns a +count+-paragraph LSI summary of the string, joined by +separator+.
  def paragraph_summary( count=1, separator=" [...] " )
    perform_lsi split_paragraphs, count, separator
  end

  # Splits on sentence-ending punctuation, keeping the separators as
  # their own array elements (the regex capture group).
  def split_sentences
    split /(\.|\!|\?)/ # TODO: make this less primitive
  end

  # Splits on blank lines, keeping the separators.
  def split_paragraphs
    split /(\n\n|\r\r|\r\n\r\n)/ # TODO: make this less primitive
  end

  private

  # Indexes the given chunks in a throwaway LSI instance and joins the
  # +count+ most representative chunks with +separator+. Blank and
  # single-word chunks are skipped.
  def perform_lsi(chunks, count, separator)
    lsi = Reclassifier::LSI.new :auto_rebuild => false
    chunks.each { |chunk| lsi << chunk unless chunk.strip.empty? || chunk.strip.split.size == 1 }
    lsi.build_index
    summaries = lsi.highest_relative_content count
    # The original filtered summaries against themselves here -- a no-op
    # (`reject { !summaries.include?(chunk) }`) -- so it has been dropped.
    summaries.map { |x| x.strip }.join(separator)
  end
end