reclassifier 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,87 @@
+ # Reclassifier
+
+ Reclassifier is a gem that provides [classification](http://en.wikipedia.org/wiki/Statistical_classification) of strings.
+
+ Classification can be done via [Naïve Bayes](https://en.wikipedia.org/wiki/Naive_Bayes_classifier) or [Latent Semantic Indexing](http://en.wikipedia.org/wiki/Latent_semantic_indexing).
+
+ It is a fork of the original [Classifier](https://github.com/cardmagic/classifier) gem, which appears to have been unmaintained for the past couple of years.
+
+ ## Installation
+
+ Add this line to your application's Gemfile:
+
+     gem 'reclassifier'
+
+ And then execute:
+
+     $ bundle
+
+ Or install it yourself as:
+
+     $ gem install reclassifier
+
+ ## Dependencies
+
+ Reclassifier currently requires the GNU GSL library: http://www.gnu.org/software/gsl
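+
+ For example, on most systems the C library can be installed with the package manager and the Ruby bindings with RubyGems (Reclassifier does `require 'gsl'` internally). Exact package names vary by platform, so treat the following as a rough sketch rather than canonical instructions:
+
+     # GNU GSL (C library)
+     $ brew install gsl                   # OS X / Homebrew
+     $ sudo apt-get install libgsl-dev    # Debian/Ubuntu (older releases: libgsl0-dev)
+
+     # Ruby bindings
+     $ gem install gsl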
+
+ ## Usage
+
+ ### Bayes
+
+ Bayesian classifiers are accurate, fast, and have modest memory requirements.
+
+ #### Usage
+
+     require 'reclassifier'
+     b = Reclassifier::Bayes.new 'Interesting', 'Uninteresting'
+     b.train_interesting "here are some good words. I hope you love them"
+     b.train_uninteresting "here are some bad words, I hate you"
+     b.classify "I hate bad words and you" # returns 'Uninteresting'
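+
+ If you want the raw per-category scores rather than just the winning label, `Bayes#classifications` returns a hash of log-scaled scores, and `#classify` simply picks the largest (the one closest to zero). The exact numbers below are illustrative:
+
+     b.classifications "I hate bad words and you"
+     # returns => {"Uninteresting" => -12.69..., "Interesting" => -18.42...}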
+
+     require 'madeleine'
+     m = SnapshotMadeleine.new("bayes_data") {
+       Reclassifier::Bayes.new 'Interesting', 'Uninteresting'
+     }
+     m.system.train_interesting "here are some good words. I hope you love them"
+     m.system.train_uninteresting "here are some bad words, I hate you"
+     m.take_snapshot
+     m.system.classify "I love you" # returns 'Interesting'
+
+ Using Madeleine, your application can persist the learned data over time.
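+
+ When the process is restarted, constructing a `SnapshotMadeleine` on the same directory restores the classifier from the most recent snapshot, so earlier training is not lost. A minimal sketch, assuming the snapshot above was taken:
+
+     m = SnapshotMadeleine.new("bayes_data") {
+       Reclassifier::Bayes.new 'Interesting', 'Uninteresting'
+     }
+     m.system.classify "I love you" # returns 'Interesting', using the persisted training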
+
+ ### LSI
+
+ Latent Semantic Indexing engines are not as fast or as small as Bayesian classifiers, but they are more flexible, providing
+ fast search and cluster detection as well as semantic analysis of the text that, in theory, simulates human learning.
+
+ #### Usage
+
+     require 'reclassifier'
+     lsi = Reclassifier::LSI.new
+     strings = [ ["This text deals with dogs. Dogs.", :dog],
+                 ["This text involves dogs too. Dogs! ", :dog],
+                 ["This text revolves around cats. Cats.", :cat],
+                 ["This text also involves cats. Cats!", :cat],
+                 ["This text involves birds. Birds.", :bird] ]
+     strings.each { |x| lsi.add_item x.first, x.last }
+
+     lsi.search("dog", 3)
+     # returns => ["This text deals with dogs. Dogs.", "This text involves dogs too. Dogs! ",
+     #             "This text also involves cats. Cats!"]
+
+     lsi.find_related(strings[2], 2)
+     # returns => ["This text revolves around cats. Cats.", "This text also involves cats. Cats!"]
+
+     lsi.classify "This text is also about dogs!"
+     # returns => :dog
+
+ Please see the Reclassifier::LSI documentation for more information. It is possible to index, search and classify
+ with more than just simple strings.
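+
+ For instance, the original Classifier gem's `add_item` accepts arbitrary objects plus a block that extracts their text, and search results are returned as the objects themselves. Assuming the fork keeps that signature (the `Post` struct below is purely illustrative), usage would look roughly like:
+
+     Post = Struct.new(:title, :body)
+     posts = [Post.new("Dogs", "This post deals with dogs. Dogs."),
+              Post.new("Cats", "This post revolves around cats. Cats.")]
+
+     lsi = Reclassifier::LSI.new
+     posts.each do |post|
+       lsi.add_item(post) { |p| "#{p.title} #{p.body}" }
+     end
+     lsi.search("dogs", 1) # returns the matching Post objects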
+
+ ## Contributing
+
+ 1. Fork it
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
+ 4. Push to the branch (`git push origin my-new-feature`)
+ 5. Create a new Pull Request
+
+ ## License
+
+ This library is released under the terms of the GNU LGPL. See LICENSE for more details.
@@ -0,0 +1,7 @@
+ require "bundler/gem_tasks"
+ require 'rake/testtask'
+
+ Rake::TestTask.new do |t|
+   t.libs << 'test'
+   t.test_files = FileList['test/**/*_test.rb']
+ end
@@ -0,0 +1,12 @@
+ module GSL
+   class Vector
+     def _dump(v)
+       Marshal.dump(self.to_a)
+     end
+
+     def self._load(arr)
+       arry = Marshal.load(arr)
+       return GSL::Vector.alloc(arry)
+     end
+   end
+ end
@@ -0,0 +1,19 @@
+ # gems
+ require 'matrix'
+ require 'fast-stemmer'
+ require 'gsl'
+
+ # files
+ require 'reclassifier/version'
+ require 'reclassifier/core_ext/array'
+ require 'reclassifier/core_ext/matrix'
+ require 'reclassifier/core_ext/object'
+ require 'reclassifier/core_ext/string'
+ require 'gsl/vector'
+
+ module Reclassifier
+   autoload :Bayes,       'reclassifier/bayes'
+   autoload :LSI,         'reclassifier/lsi'
+   autoload :ContentNode, 'reclassifier/content_node'
+   autoload :WordList,    'reclassifier/word_list'
+ end
@@ -0,0 +1,129 @@
+ module Reclassifier
+   class Bayes
+     # The class can be created with one or more categories, each of which will be
+     # initialized and given a training method. E.g.,
+     #   b = Reclassifier::Bayes.new 'Interesting', 'Uninteresting', 'Spam'
+     def initialize(*categories)
+       @categories = Hash.new
+       categories.each { |category| @categories[category.prepare_category_name] = Hash.new }
+       @total_words = 0
+       @category_counts = Hash.new(0)
+     end
+
+     #
+     # Provides a general training method for all categories specified in Bayes#new
+     # For example:
+     #   b = Reclassifier::Bayes.new 'This', 'That', 'the_other'
+     #   b.train :this, "This text"
+     #   b.train "that", "That text"
+     #   b.train "The other", "The other text"
+     def train(category, text)
+       category = category.prepare_category_name
+       @category_counts[category] += 1
+       text.word_hash.each do |word, count|
+         @categories[category][word] ||= 0
+         @categories[category][word] += count
+         @total_words += count
+       end
+     end
+
+     #
+     # Provides an untraining method for all categories specified in Bayes#new
+     # Be very careful with this method.
+     #
+     # For example:
+     #   b = Reclassifier::Bayes.new 'This', 'That', 'the_other'
+     #   b.train :this, "This text"
+     #   b.untrain :this, "This text"
+     def untrain(category, text)
+       category = category.prepare_category_name
+       @category_counts[category] -= 1
+       text.word_hash.each do |word, count|
+         if @total_words >= 0
+           orig = @categories[category][word]
+           @categories[category][word] ||= 0
+           @categories[category][word] -= count
+           if @categories[category][word] <= 0
+             @categories[category].delete(word)
+             count = orig
+           end
+           @total_words -= count
+         end
+       end
+     end
+
+     #
+     # Returns the scores in each category for the provided +text+. E.g.,
+     #   b.classifications "I hate bad words and you"
+     #   => {"Uninteresting"=>-12.6997928013932, "Interesting"=>-18.4206807439524}
+     # The largest of these scores (the one closest to 0) is the one picked out by #classify
+     def classifications(text)
+       score = Hash.new
+       training_count = @category_counts.values.inject { |x, y| x + y }.to_f
+       @categories.each do |category, category_words|
+         score[category.to_s] = 0
+         total = category_words.values.inject(0) { |sum, element| sum + element }
+         text.word_hash.each do |word, count|
+           s = category_words.has_key?(word) ? category_words[word] : 0.1
+           score[category.to_s] += Math.log(s / total.to_f)
+         end
+         # now add the prior probability for the category
+         s = @category_counts.has_key?(category) ? @category_counts[category] : 0.1
+         score[category.to_s] += Math.log(s / training_count)
+       end
+       return score
+     end
+
+     #
+     # Returns the classification of the provided +text+, which is one of the
+     # categories given in the initializer. E.g.,
+     #   b.classify "I hate bad words and you"
+     #   => 'Uninteresting'
+     def classify(text)
+       (classifications(text).sort_by { |a| -a[1] })[0][0]
+     end
+
+     #
+     # Provides training and untraining methods for the categories specified in Bayes#new
+     # For example:
+     #   b = Reclassifier::Bayes.new 'This', 'That', 'the_other'
+     #   b.train_this "This text"
+     #   b.train_that "That text"
+     #   b.untrain_that "That text"
+     #   b.train_the_other "The other text"
+     def method_missing(name, *args)
+       category = name.to_s.gsub(/(un)?train_([\w]+)/, '\2').prepare_category_name
+       if @categories.has_key? category
+         args.each { |text| eval("#{$1}train(category, text)") }
+       elsif name.to_s =~ /(un)?train_([\w]+)/
+         raise StandardError, "No such category: #{category}"
+       else
+         super # raise StandardError, "No such method: #{name}"
+       end
+     end
+
+     #
+     # Provides a list of category names
+     # For example:
+     #   b.categories
+     #   => ['This', 'That', 'the_other']
+     def categories # :nodoc:
+       @categories.keys.collect { |c| c.to_s }
+     end
+
+     #
+     # Allows you to add categories to the classifier.
+     # For example:
+     #   b.add_category "Not spam"
+     #
+     # WARNING: Adding categories to a trained classifier will
+     # result in an undertrained category that will tend to match
+     # more criteria than the trained selective categories. In short,
+     # try to define all of your categories when the classifier is created.
+     def add_category(category)
+       @categories[category.prepare_category_name] = Hash.new
+     end
+
+     alias append_category add_category
+   end
+ end
@@ -0,0 +1,66 @@
+ module Reclassifier
+
+   # This is an internal data structure class for the LSI node. Save for
+   # raw_vector_with, it should be fairly straightforward to understand.
+   # You should never have to use it directly.
+   class ContentNode
+     attr_accessor :raw_vector, :raw_norm,
+                   :lsi_vector, :lsi_norm,
+                   :categories
+
+     attr_reader :word_hash
+
+     # Creates a node from a word hash (word => frequency) and an
+     # optional list of categories.
+     def initialize(word_hash, *categories)
+       @categories = categories || []
+       @word_hash = word_hash
+     end
+
+     # Use this to fetch the appropriate search vector.
+     def search_vector
+       @lsi_vector || @raw_vector
+     end
+
+     # Use this to fetch the appropriate search vector in normalized form.
+     def search_norm
+       @lsi_norm || @raw_norm
+     end
+
+     # Creates the raw vector out of word_hash using word_list as the
+     # key for mapping the vector space.
+     def raw_vector_with(word_list)
+       if $GSL
+         vec = GSL::Vector.alloc(word_list.size)
+       else
+         vec = Array.new(word_list.size, 0)
+       end
+
+       @word_hash.each_key do |word|
+         vec[word_list[word]] = @word_hash[word] if word_list[word]
+       end
+
+       # Perform the scaling transform
+       total_words = $GSL ? vec.sum : vec.sum_with_identity
+
+       # Perform first-order association transform if this vector has more
+       # than one word in it.
+       if total_words > 1.0
+         weighted_total = 0.0
+         vec.each do |term|
+           if term > 0
+             weighted_total += ((term / total_words) * Math.log(term / total_words))
+           end
+         end
+         vec = vec.collect { |val| Math.log(val + 1) / -weighted_total }
+       end
+
+       if $GSL
+         @raw_norm   = vec.normalize
+         @raw_vector = vec
+       else
+         @raw_norm   = Vector[*vec].normalize
+         @raw_vector = Vector[*vec]
+       end
+     end
+   end
+ end
@@ -0,0 +1,11 @@
+ class Array
+   def sum_with_identity(identity = 0, &block)
+     return identity unless size > 0
+
+     if block_given?
+       map(&block).sum
+     else
+       reduce(:+)
+     end
+   end
+ end
@@ -0,0 +1,72 @@
+ class Matrix
+   def Matrix.diag(s)
+     Matrix.diagonal(*s)
+   end
+
+   alias :trans :transpose
+
+   def SV_decomp(maxSweeps = 20)
+     if self.row_size >= self.column_size
+       q = self.trans * self
+     else
+       q = self * self.trans
+     end
+
+     qrot = q.dup
+     v = Matrix.identity(q.row_size)
+     azrot = nil
+     mzrot = nil
+     cnt = 0
+     s_old = nil
+     mu = nil
+
+     while true do
+       cnt += 1
+       for row in (0...qrot.row_size-1) do
+         for col in (1..qrot.row_size-1) do
+           next if row == col
+           h = Math.atan((2 * qrot[row,col])/(qrot[row,row]-qrot[col,col]))/2.0
+           hcos = Math.cos(h)
+           hsin = Math.sin(h)
+           mzrot = Matrix.identity(qrot.row_size)
+           mzrot[row,row] = hcos
+           mzrot[row,col] = -hsin
+           mzrot[col,row] = hsin
+           mzrot[col,col] = hcos
+           qrot = mzrot.trans * qrot * mzrot
+           v = v * mzrot
+         end
+       end
+       s_old = qrot.dup if cnt == 1
+       sum_qrot = 0.0
+       if cnt > 1
+         qrot.row_size.times do |r|
+           sum_qrot += (qrot[r,r]-s_old[r,r]).abs if (qrot[r,r]-s_old[r,r]).abs > 0.001
+         end
+         s_old = qrot.dup
+       end
+       break if (sum_qrot <= 0.001 and cnt > 1) or cnt >= maxSweeps
+     end # of do while true
+
+     s = []
+     qrot.row_size.times do |r|
+       s << Math.sqrt(qrot[r,r])
+     end
+     #puts "cnt = #{cnt}"
+     if self.row_size >= self.column_size
+       mu = self * v * Matrix.diagonal(*s).inverse
+       return [mu, v, s]
+     else
+       puts v.row_size
+       puts v.column_size
+       puts self.row_size
+       puts self.column_size
+       puts s.size
+
+       mu = (self.trans * v * Matrix.diagonal(*s).inverse)
+       return [mu, v, s]
+     end
+   end
+
+   def []=(i,j,val)
+     @rows[i][j] = val
+   end
+ end
@@ -0,0 +1,3 @@
+ class Object
+   def prepare_category_name
+     to_s.gsub("_", " ").capitalize.intern
+   end
+ end
@@ -0,0 +1,143 @@
+ class String
+
+   # Removes common punctuation symbols, returning a new string.
+   # E.g.,
+   #   "Hello (greeting's), with {braces} < >...?".without_punctuation
+   #   => "Hello greetings with braces "
+   def without_punctuation
+     tr( ',?.!;:"@#$%^&*()_=+[]{}\|<>/`~', " " ).tr( "'\-", "")
+   end
+
+   # Return a Hash of symbols => ints. Each word in the string is stemmed,
+   # symbolized, and indexed to its frequency in the document.
+   def word_hash
+     word_hash_for_words(gsub(/[^\w\s]/,"").split + gsub(/[\w]/," ").split)
+   end
+
+   # Return a word hash without extra punctuation or short symbols, just stemmed words
+   def clean_word_hash
+     word_hash_for_words gsub(/[^\w\s]/,"").split
+   end
+
+   def word_hash_for_words(words)
+     d = Hash.new
+     words.each do |word|
+       word.downcase! if word =~ /[\w]+/
+       key = word.stem.to_sym
+       if word =~ /[^\w]/ || ! CORPUS_SKIP_WORDS.include?(word) && word.length > 2
+         d[key] ||= 0
+         d[key] += 1
+       end
+     end
+     return d
+   end
+
+   CORPUS_SKIP_WORDS = [
+     "a",
+     "again",
+     "all",
+     "along",
+     "are",
+     "also",
+     "an",
+     "and",
+     "as",
+     "at",
+     "but",
+     "by",
+     "came",
+     "can",
+     "cant",
+     "couldnt",
+     "did",
+     "didn",
+     "didnt",
+     "do",
+     "doesnt",
+     "dont",
+     "ever",
+     "first",
+     "from",
+     "have",
+     "her",
+     "here",
+     "him",
+     "how",
+     "i",
+     "if",
+     "in",
+     "into",
+     "is",
+     "isnt",
+     "it",
+     "itll",
+     "just",
+     "last",
+     "least",
+     "like",
+     "most",
+     "my",
+     "new",
+     "no",
+     "not",
+     "now",
+     "of",
+     "on",
+     "or",
+     "should",
+     "sinc",
+     "so",
+     "some",
+     "th",
+     "than",
+     "this",
+     "that",
+     "the",
+     "their",
+     "then",
+     "those",
+     "to",
+     "told",
+     "too",
+     "true",
+     "try",
+     "until",
+     "url",
+     "us",
+     "were",
+     "when",
+     "whether",
+     "while",
+     "with",
+     "within",
+     "yes",
+     "you",
+     "youll",
+   ]
+
+   def summary( count=10, separator=" [...] " )
+     perform_lsi split_sentences, count, separator
+   end
+
+   def paragraph_summary( count=1, separator=" [...] " )
+     perform_lsi split_paragraphs, count, separator
+   end
+
+   def split_sentences
+     split /(\.|\!|\?)/ # TODO: make this less primitive
+   end
+
+   def split_paragraphs
+     split /(\n\n|\r\r|\r\n\r\n)/ # TODO: make this less primitive
+   end
+
+   private
+
+   def perform_lsi(chunks, count, separator)
+     lsi = Reclassifier::LSI.new :auto_rebuild => false
+     chunks.each { |chunk| lsi << chunk unless chunk.strip.empty? || chunk.strip.split.size == 1 }
+     lsi.build_index
+     summaries = lsi.highest_relative_content count
+     return summaries.reject { |chunk| !summaries.include? chunk }.map { |x| x.strip }.join(separator)
+   end
+ end