classifier 1.1.1 → 1.2.0

This diff shows the content of publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (77)
  1. data/LICENSE +341 -0
  2. data/README +59 -6
  3. data/Rakefile +16 -4
  4. data/bin/bayes.rb +8 -2
  5. data/doc/classes/Classifier.html +15 -10
  6. data/doc/classes/Classifier/Bayes.html +68 -38
  7. data/doc/classes/Classifier/Bayes.src/{M000005.html → M000023.html} +1 -1
  8. data/doc/classes/Classifier/Bayes.src/{M000006.html → M000024.html} +1 -1
  9. data/doc/classes/Classifier/Bayes.src/M000025.html +30 -0
  10. data/doc/classes/Classifier/Bayes.src/{M000007.html → M000026.html} +1 -1
  11. data/doc/classes/Classifier/Bayes.src/{M000008.html → M000027.html} +1 -1
  12. data/doc/classes/Classifier/Bayes.src/{M000009.html → M000028.html} +4 -4
  13. data/doc/classes/Classifier/Bayes.src/{M000010.html → M000029.html} +2 -2
  14. data/doc/classes/Classifier/ContentNode.html +252 -0
  15. data/doc/classes/Classifier/ContentNode.src/M000031.html +21 -0
  16. data/doc/classes/Classifier/ContentNode.src/M000032.html +18 -0
  17. data/doc/classes/Classifier/ContentNode.src/M000033.html +18 -0
  18. data/doc/classes/Classifier/ContentNode.src/M000034.html +41 -0
  19. data/doc/classes/Classifier/LSI.html +449 -0
  20. data/doc/classes/Classifier/LSI.src/M000011.html +20 -0
  21. data/doc/classes/Classifier/LSI.src/M000012.html +18 -0
  22. data/doc/classes/Classifier/LSI.src/M000013.html +20 -0
  23. data/doc/classes/Classifier/LSI.src/M000014.html +18 -0
  24. data/doc/classes/Classifier/LSI.src/M000015.html +21 -0
  25. data/doc/classes/Classifier/LSI.src/M000016.html +18 -0
  26. data/doc/classes/Classifier/LSI.src/M000017.html +32 -0
  27. data/doc/classes/Classifier/LSI.src/M000018.html +26 -0
  28. data/doc/classes/Classifier/LSI.src/M000019.html +26 -0
  29. data/doc/classes/Classifier/LSI.src/M000020.html +23 -0
  30. data/doc/classes/Classifier/LSI.src/M000021.html +21 -0
  31. data/doc/classes/Classifier/LSI.src/M000022.html +31 -0
  32. data/doc/classes/Classifier/WordList.html +202 -0
  33. data/doc/classes/Classifier/WordList.src/M000007.html +18 -0
  34. data/doc/classes/Classifier/WordList.src/M000008.html +19 -0
  35. data/doc/classes/Classifier/WordList.src/M000009.html +19 -0
  36. data/doc/classes/Classifier/WordList.src/M000010.html +18 -0
  37. data/doc/classes/GSL.html +111 -0
  38. data/doc/classes/GSL/Vector.html +156 -0
  39. data/doc/classes/GSL/Vector.src/M000005.html +18 -0
  40. data/doc/classes/GSL/Vector.src/M000006.html +19 -0
  41. data/doc/classes/Object.html +139 -0
  42. data/doc/classes/Object.src/M000001.html +16 -0
  43. data/doc/classes/String.html +95 -9
  44. data/doc/classes/{Classifier/WordHash.src/M000001.html → String.src/M000002.html} +3 -3
  45. data/doc/classes/String.src/M000003.html +18 -0
  46. data/doc/classes/String.src/M000004.html +18 -0
  47. data/doc/created.rid +1 -1
  48. data/doc/files/README.html +102 -12
  49. data/doc/files/lib/classifier/bayes_rb.html +1 -1
  50. data/doc/files/lib/classifier/{string_extensions/porter_stemmer_rb.html → extensions/vector_serialize_rb.html} +4 -15
  51. data/doc/files/lib/classifier/{string_extensions → extensions}/word_hash_rb.html +2 -2
  52. data/doc/files/lib/classifier/extensions/word_list_rb.html +115 -0
  53. data/doc/files/lib/classifier/lsi/content_node_rb.html +115 -0
  54. data/doc/files/lib/classifier/lsi_rb.html +125 -0
  55. data/doc/files/lib/classifier/string_extensions_rb.html +2 -3
  56. data/doc/files/lib/classifier_rb.html +3 -1
  57. data/doc/fr_class_index.html +6 -2
  58. data/doc/fr_file_index.html +5 -2
  59. data/doc/fr_method_index.html +34 -11
  60. data/lib/classifier.rb +3 -1
  61. data/lib/classifier/bayes.rb +34 -9
  62. data/lib/classifier/extensions/vector_serialize.rb +14 -0
  63. data/lib/classifier/extensions/word_hash.rb +125 -0
  64. data/lib/classifier/extensions/word_list.rb +31 -0
  65. data/lib/classifier/lsi.rb +248 -0
  66. data/lib/classifier/lsi/content_node.rb +67 -0
  67. data/lib/classifier/string_extensions.rb +10 -5
  68. data/test/bayes/bayesian_test.rb +2 -2
  69. data/test/lsi/lsi_test.rb +88 -0
  70. data/test/string_extensions/word_hash_test.rb +7 -5
  71. metadata +79 -24
  72. data/doc/classes/Classifier/Stemmable.html +0 -243
  73. data/doc/classes/Classifier/Stemmable.src/M000003.html +0 -102
  74. data/doc/classes/Classifier/WordHash.html +0 -178
  75. data/doc/classes/Classifier/WordHash.src/M000002.html +0 -28
  76. data/lib/classifier/string_extensions/porter_stemmer.rb +0 -199
  77. data/lib/classifier/string_extensions/word_hash.rb +0 -119
data/lib/classifier/extensions/vector_serialize.rb
@@ -0,0 +1,14 @@
+ module GSL
+
+   class Vector
+     def _dump(v)
+       Marshal.dump( self.to_a )
+     end
+
+     def self._load(arr)
+       arry = Marshal.load(arr)
+       return GSL::Vector.new(arry)
+     end
+
+   end
+ end
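
With this patch, GSL vectors survive a Marshal round trip, which is what lets LSI state containing vectors be serialized. A minimal sketch of the behavior (an editor's illustration, assuming rb-gsl is installed; note that _load above already relies on GSL::Vector.new accepting an array):

    require 'gsl'
    require 'classifier/extensions/vector_serialize'

    v = GSL::Vector.new([1.0, 2.0, 3.0])
    data = Marshal.dump(v)      # invokes GSL::Vector#_dump
    copy = Marshal.load(data)   # invokes GSL::Vector._load
    copy.to_a                   # => [1.0, 2.0, 3.0]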
data/lib/classifier/extensions/word_hash.rb
@@ -0,0 +1,125 @@
+ # Author:: Lucas Carlson (mailto:lucas@rufy.com)
+ # Copyright:: Copyright (c) 2005 Lucas Carlson
+ # License:: LGPL
+
+ # These are extensions to the String class to provide convenience
+ # methods for the Classifier package.
+ class String
+
+   # Removes common punctuation symbols, returning a new string.
+   # E.g.,
+   #   "Hello (greeting's), with {braces} < >...?".without_punctuation
+   #   => "Hello greetings with braces "
+   def without_punctuation
+     tr( ',?.!;:"@#$%^&*()_=+[]{}\|<>/`~', " " ).tr( "'\-", "" )
+   end
+
+   # Returns a Hash of strings => ints. Each word in the string is stemmed,
+   # interned, and mapped to its frequency in the document.
+   def word_hash
+     word_hash_for_words(gsub(/[^\w\s]/,"").split + gsub(/[\w]/," ").split)
+   end
+
+   # Returns a word hash without extra punctuation or short symbols, just stemmed words.
+   def clean_word_hash
+     word_hash_for_words gsub(/[^\w\s]/,"").split
+   end
+
+   private
+
+   def word_hash_for_words(words)
+     d = Hash.new
+     words.each do |word|
+       word.downcase! if word =~ /[\w]+/
+       key = word.stem.intern
+       if word =~ /[^\w]/ || ! CORPUS_SKIP_WORDS.include?(word) && word.length > 2
+         d[key] ||= 0
+         d[key] += 1
+       end
+     end
+     return d
+   end
+
+   CORPUS_SKIP_WORDS = [
+     "a",
+     "again",
+     "all",
+     "along",
+     "are",
+     "also",
+     "an",
+     "and",
+     "as",
+     "at",
+     "but",
+     "by",
+     "came",
+     "can",
+     "cant",
+     "couldnt",
+     "did",
+     "didn",
+     "didnt",
+     "do",
+     "doesnt",
+     "dont",
+     "ever",
+     "first",
+     "from",
+     "have",
+     "her",
+     "here",
+     "him",
+     "how",
+     "i",
+     "if",
+     "in",
+     "into",
+     "is",
+     "isnt",
+     "it",
+     "itll",
+     "just",
+     "last",
+     "least",
+     "like",
+     "most",
+     "my",
+     "new",
+     "no",
+     "not",
+     "now",
+     "of",
+     "on",
+     "or",
+     "should",
+     "sinc",
+     "so",
+     "some",
+     "th",
+     "than",
+     "this",
+     "that",
+     "the",
+     "their",
+     "then",
+     "those",
+     "to",
+     "told",
+     "too",
+     "true",
+     "try",
+     "until",
+     "url",
+     "us",
+     "were",
+     "when",
+     "whether",
+     "while",
+     "with",
+     "within",
+     "yes",
+     "you",
+     "youll",
+   ]
+ end
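
To illustrate the API above, a small sketch (an editor's example with hypothetical output; it assumes the classifier gem is loaded and that these words are left unchanged by the bundled Porter stemmer):

    require 'classifier'

    "Hello hello, world!".clean_word_hash
    # => { :hello => 2, :world => 1 }
    # punctuation is stripped, words are downcased and stemmed, and
    # CORPUS_SKIP_WORDS plus words shorter than three letters are dropped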
data/lib/classifier/extensions/word_list.rb
@@ -0,0 +1,31 @@
+ # Author:: David Fayram (mailto:dfayram@lensmen.net)
+ # Copyright:: Copyright (c) 2005 David Fayram II
+ # License:: GPL
+
+ module Classifier
+   # This class keeps a word => index mapping. It is used to map stemmed words
+   # to dimensions of a vector.
+   class WordList
+     def initialize
+       @location_table = {}
+     end
+
+     # Adds a word (if it is new) and assigns it a unique dimension.
+     def add_word(word)
+       term = word
+       @location_table[term] = @location_table.size unless @location_table[term]
+     end
+
+     # Returns the dimension of the word, or nil if the word is not in the space.
+     def [](lookup)
+       term = lookup
+       @location_table[term]
+     end
+
+     # Returns the number of words mapped.
+     def size
+       @location_table.size
+     end
+
+   end
+ end
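
The mapping behaves as follows (a sketch following directly from the code above; LSI feeds it the symbol keys of a word hash):

    list = Classifier::WordList.new
    list.add_word :dog    # assigned dimension 0
    list.add_word :cat    # assigned dimension 1
    list.add_word :dog    # already mapped; keeps dimension 0
    list[:cat]            # => 1
    list[:fish]           # => nil, not in the space
    list.size             # => 2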
data/lib/classifier/lsi.rb
@@ -0,0 +1,248 @@
+ # Author:: David Fayram (mailto:dfayram@lensmen.net)
+ # Copyright:: Copyright (c) 2005 David Fayram II
+ # License:: GPL
+
+ begin
+
+   require 'gsl' # requires http://rb-gsl.rubyforge.org/
+
+   require 'classifier/extensions/word_list'
+   require 'classifier/extensions/vector_serialize'
+   require 'classifier/lsi/content_node'
+
+   module Classifier
+
+     # This class implements a Latent Semantic Indexer, which can search, classify and cluster
+     # data based on underlying semantic relations. For more information on the algorithms used,
+     # please consult Wikipedia[http://en.wikipedia.org/wiki/Latent_Semantic_Indexing].
+     class LSI
+
+       attr_reader :word_list
+
+       # Create a fresh index.
+       # If you want to call #build_index manually, use
+       #   Classifier::LSI.new :auto_rebuild => false
+       #
+       def initialize(options = {})
+         @auto_rebuild = true unless options[:auto_rebuild] == false
+         @word_list, @items = WordList.new, {}
+         @version, @built_at_version = 0, -1
+       end
+
+       # Returns true if the index needs to be rebuilt. The index needs
+       # to be built after all information is added, but before you start
+       # using it for search, classification and cluster detection.
+       def needs_rebuild?
+         @version != @built_at_version
+       end
+
+       # Adds an item to the index. item is assumed to be a string, but
+       # any item may be indexed so long as it responds to #to_s or if
+       # you provide an optional block explaining how the indexer can
+       # fetch fresh string data. This optional block is passed the item,
+       # so the item may only be a reference to a URL or file name.
+       #
+       # For example:
+       #   lsi = Classifier::LSI.new
+       #   lsi.add_item "This is just plain text"
+       #   lsi.add_item("/home/me/filename.txt") { |x| File.read x }
+       #   ar = ActiveRecordObject.find( :all )
+       #   lsi.add_item(ar, *ar.categories) { |x| x.content }
+       #
+       def add_item( item, *categories, &block )
+         @items[item] = ContentNode.new(item, categories, block)
+         @version += 1
+         build_index if @auto_rebuild
+       end
+
+       # A less flexible shorthand for add_item that assumes
+       # you are passing in a string with no categories. item
+       # will be duck typed via to_s .
+       #
+       def <<( item )
+         add_item item
+       end
+
+       # Removes an item from the database, if it is indexed.
+       #
+       def remove_item( item )
+         if @items.keys.include? item
+           @items.delete item
+           @version += 1
+         end
+       end
+
+       # Returns an array of items that are indexed.
+       def items
+         @items.keys
+       end
+
+       # This function rebuilds the index if needs_rebuild? returns true.
+       # For very large document spaces, this indexing operation may take some
+       # time to complete, so it may be wise to place the operation in another
+       # thread.
+       #
+       # As a rule, indexing will be fairly swift on modern machines until
+       # you have well over 500 documents indexed, or have an incredibly diverse
+       # vocabulary for your documents.
+       #
+       # The optional parameter "cutoff" is a tuning parameter. When the index is
+       # built, a certain number of s-values are discarded from the system. The
+       # cutoff parameter tells the indexer what fraction of these values to keep.
+       # A value of 1 for cutoff means that no semantic analysis will take place,
+       # turning the LSI class into a simple vector search engine.
+       def build_index( cutoff=0.75 )
+         return unless needs_rebuild?
+         make_word_list
+
+         doc_list = @items.values
+         tda = doc_list.collect { |node| node.raw_vector_with( @word_list ) }
+         tdm = GSL::Matrix.new( *tda ).trans
+         ntdm = build_reduced_matrix(tdm, cutoff)
+
+         ntdm.size[1].times do |col|
+           vec = GSL::Vector.new( ntdm.column(col) ).row
+           doc_list[col].lsi_vector = vec
+           doc_list[col].lsi_norm = vec.normalize
+         end
+
+         @built_at_version = @version
+       end
+
+       # This function is the primitive that find_related and classify
+       # build upon. It returns an array of 2-element arrays. The first element
+       # of this array is a document, and the second is its "score", defining
+       # how "close" it is to other indexed items.
+       #
+       # These values are somewhat arbitrary, having to do with the vector space
+       # created by your content, so the magnitude is interpretable but not always
+       # meaningful between indexes.
+       #
+       # The parameter doc is the content to compare. If that content is not
+       # indexed, you can pass an optional block to define how to create the
+       # text data. See add_item for examples of how this works.
+       def proximity_array_for_content( doc, &block )
+         return [] if needs_rebuild?
+
+         content_node = node_for_content( doc, &block )
+         result =
+           @items.keys.collect do |item|
+             val = content_node.search_vector * @items[item].search_vector.col
+             [item, val]
+           end
+         result.sort_by { |x| x[1] }.reverse
+       end
+
+       # Similar to proximity_array_for_content, this function takes similar
+       # arguments and returns a similar array. However, it uses the normalized
+       # calculated vectors instead of their full versions. This is useful when
+       # you're trying to perform operations on content that is much smaller than
+       # the text you're working with. search uses this primitive.
+       def proximity_norms_for_content( doc, &block )
+         return [] if needs_rebuild?
+
+         content_node = node_for_content( doc, &block )
+         result =
+           @items.keys.collect do |item|
+             val = content_node.search_norm * @items[item].search_norm.col
+             [item, val]
+           end
+         result.sort_by { |x| x[1] }.reverse
+       end
+
+       # This function allows for text-based search of your index. Unlike other functions
+       # like find_related and classify, search only takes short strings. It will also ignore
+       # factors like repeated words. It is best for short, google-like search terms.
+       # A search will first prioritize lexical relationships, then semantic ones.
+       #
+       # While this may seem backwards compared to the other functions that LSI supports,
+       # it is actually the same algorithm, just applied on a smaller document.
+       def search( string, max_nearest=3 )
+         return [] if needs_rebuild?
+
+         carry =
+           proximity_norms_for_content( string )
+         result = carry.collect { |x| x[0] }
+         return result[0..max_nearest-1]
+       end
+
+       # This function takes content and finds other documents
+       # that are semantically "close", returning an array of documents sorted
+       # from most to least relevant.
+       # max_nearest specifies the number of documents to return. A value of
+       # 0 means that it returns all the indexed documents, sorted by relevance.
+       #
+       # This is particularly useful for identifying clusters in your document space.
+       # For example you may want to identify several "What's Related" items for weblog
+       # articles, or find paragraphs that relate to each other in an essay.
+       def find_related( doc, max_nearest=3, &block )
+         carry =
+           proximity_array_for_content( doc, &block ).reject { |pair| pair[0] == doc }
+         result = carry.collect { |x| x[0] }
+         return result[0..max_nearest-1]
+       end
+
+       # This function uses a voting system to categorize documents, based on
+       # the categories of other documents. It uses the same logic as the
+       # find_related function to find related documents, then returns the
+       # most obvious category from this list.
+       #
+       # cutoff signifies the fraction of documents to consider when classifying
+       # text. A cutoff of 1 means that every document in the index votes on
+       # what category the document is in. This may not always make sense.
+       #
+       def classify( doc, cutoff=0.30, &block )
+         icutoff = (@items.size * cutoff).round
+         carry = proximity_array_for_content( doc, &block )
+         carry = carry[0..icutoff-1]
+         votes = {}
+         carry.each do |pair|
+           categories = @items[pair[0]].categories
+           categories.each do |category|
+             votes[category] ||= 0.0
+             votes[category] += pair[1]
+           end
+         end
+
+         ranking = votes.keys.sort_by { |x| votes[x] }
+         return ranking[-1]
+       end
+
+       private
+
+       def build_reduced_matrix( matrix, cutoff=0.75 )
+         # TODO: Check that M>=N on these dimensions! Transpose helps assure this
+         u, v, s = matrix.SV_decomp
+         # TODO: Better than 75% term, please. :\
+         s_cutoff = s.sort.reverse[(s.size * cutoff).round - 1]
+         s.size.times do |ord|
+           s[ord] = 0.0 if s[ord] < s_cutoff
+         end
+
+         # Reconstruct the term document matrix, only with reduced rank
+         u * Matrix.diagonal( s ) * v.trans
+       end
+
+       def node_for_content(item, &block)
+         if @items[item]
+           return @items[item]
+         else
+           cn = ContentNode.new(item, &block) # make the node and extract the data
+           cn.raw_vector_with( @word_list )   # make the lsi raw and norm vectors
+         end
+
+         cn
+       end
+
+       def make_word_list
+         @word_list = WordList.new
+         @items.each_value do |node|
+           node.word_hash.each_key { |key| @word_list.add_word key }
+         end
+       end
+
+     end
+   end
+
+ rescue LoadError
+   $stderr.puts "For LSI support, you need to install http://rb-gsl.rubyforge.org/"
+ end
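
A minimal end-to-end sketch of the class above (an editor's illustration with invented strings and categories, assuming rb-gsl is installed):

    lsi = Classifier::LSI.new
    lsi.add_item "This text deals with dogs. Dogs.", :dog
    lsi.add_item "This text involves dogs too. Dogs!", :dog
    lsi.add_item "This text revolves around cats. Cats.", :cat
    lsi.add_item "This text also involves cats. Cats!", :cat

    lsi.search("dog", 2)                          # two items closest to "dog"
    lsi.find_related(lsi.items.first, 2)          # semantically related documents
    lsi.classify "This text is also about dogs!"  # most likely :dog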
data/lib/classifier/lsi/content_node.rb
@@ -0,0 +1,67 @@
+ # Author:: David Fayram (mailto:dfayram@lensmen.net)
+ # Copyright:: Copyright (c) 2005 David Fayram II
+ # License:: GPL
+
+ module Classifier
+
+
+   # This is an internal data structure class for the LSI node. Save for
+   # raw_vector_with, it should be fairly straightforward to understand.
+   # You should never have to use it directly.
+   class ContentNode
+     attr_accessor :word_hash, :raw_vector, :raw_norm,
+                   :lsi_vector, :lsi_norm,
+                   :categories
+     attr_reader :source
+
+     # If text_proc is not specified, the source will be duck-typed
+     # via source.to_s
+     def initialize( source, categories=nil, text_proc=nil )
+       text_proc = text_proc || (proc { |x| x.to_s })
+       @categories = categories || []
+       @source = source
+       @word_hash = text_proc.call( @source ).clean_word_hash
+     end
+
+     # Use this to fetch the appropriate search vector.
+     def search_vector
+       @lsi_vector || @raw_vector
+     end
+
+     # Use this to fetch the appropriate search vector in normalized form.
+     def search_norm
+       @lsi_norm || @raw_norm
+     end
+
+     # Creates the raw vector out of word_hash using word_list as the
+     # key for mapping the vector space.
+     def raw_vector_with( word_list )
+       vec = Array.new(word_list.size, 0)
+
+       @word_hash.each_key do |word|
+         vec[word_list[word]] = @word_hash[word] if word_list[word]
+       end
+
+       # Perform the scaling transform
+       total_words = vec.inject(0) { |sum,term| sum += term }.to_f
+
+       # Perform first-order association transform if this vector has more
+       # than one word in it.
+       if total_words > 1.0
+         weighted_total = vec.inject(0.0) do |sum,term|
+           if( term > 0 )
+             sum += (( term / total_words ) * Math.log( term / total_words ))
+           else
+             sum
+           end
+         end
+         vec.map! { |val| Math.log( val + 1 ) / -weighted_total }
+       end
+
+       @raw_norm   = GSL::Vector.new( vec ).normalize
+       @raw_vector = GSL::Vector.new( vec )
+     end
+
+   end
+
+ end
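
The scaling step in raw_vector_with reads as a log-entropy style weighting: each nonzero term count t is replaced by ln(t + 1) divided by the negative entropy of the document's term distribution (this interpretation is inferred from the code, not stated by the author). A usage sketch, assuming rb-gsl and the String extensions above are loaded:

    node = Classifier::ContentNode.new("Cats cats cats chase dogs")
    list = Classifier::WordList.new
    node.word_hash.each_key { |k| list.add_word k }

    node.raw_vector_with(list)
    node.raw_vector     # GSL::Vector of weighted term frequencies
    node.raw_norm       # the same vector, normalized to unit length
    node.search_vector  # falls back to raw_vector until LSI#build_index sets lsi_vector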