classifier 1.1.1 → 1.2.0

Files changed (77)
  1. data/LICENSE +341 -0
  2. data/README +59 -6
  3. data/Rakefile +16 -4
  4. data/bin/bayes.rb +8 -2
  5. data/doc/classes/Classifier.html +15 -10
  6. data/doc/classes/Classifier/Bayes.html +68 -38
  7. data/doc/classes/Classifier/Bayes.src/{M000005.html → M000023.html} +1 -1
  8. data/doc/classes/Classifier/Bayes.src/{M000006.html → M000024.html} +1 -1
  9. data/doc/classes/Classifier/Bayes.src/M000025.html +30 -0
  10. data/doc/classes/Classifier/Bayes.src/{M000007.html → M000026.html} +1 -1
  11. data/doc/classes/Classifier/Bayes.src/{M000008.html → M000027.html} +1 -1
  12. data/doc/classes/Classifier/Bayes.src/{M000009.html → M000028.html} +4 -4
  13. data/doc/classes/Classifier/Bayes.src/{M000010.html → M000029.html} +2 -2
  14. data/doc/classes/Classifier/ContentNode.html +252 -0
  15. data/doc/classes/Classifier/ContentNode.src/M000031.html +21 -0
  16. data/doc/classes/Classifier/ContentNode.src/M000032.html +18 -0
  17. data/doc/classes/Classifier/ContentNode.src/M000033.html +18 -0
  18. data/doc/classes/Classifier/ContentNode.src/M000034.html +41 -0
  19. data/doc/classes/Classifier/LSI.html +449 -0
  20. data/doc/classes/Classifier/LSI.src/M000011.html +20 -0
  21. data/doc/classes/Classifier/LSI.src/M000012.html +18 -0
  22. data/doc/classes/Classifier/LSI.src/M000013.html +20 -0
  23. data/doc/classes/Classifier/LSI.src/M000014.html +18 -0
  24. data/doc/classes/Classifier/LSI.src/M000015.html +21 -0
  25. data/doc/classes/Classifier/LSI.src/M000016.html +18 -0
  26. data/doc/classes/Classifier/LSI.src/M000017.html +32 -0
  27. data/doc/classes/Classifier/LSI.src/M000018.html +26 -0
  28. data/doc/classes/Classifier/LSI.src/M000019.html +26 -0
  29. data/doc/classes/Classifier/LSI.src/M000020.html +23 -0
  30. data/doc/classes/Classifier/LSI.src/M000021.html +21 -0
  31. data/doc/classes/Classifier/LSI.src/M000022.html +31 -0
  32. data/doc/classes/Classifier/WordList.html +202 -0
  33. data/doc/classes/Classifier/WordList.src/M000007.html +18 -0
  34. data/doc/classes/Classifier/WordList.src/M000008.html +19 -0
  35. data/doc/classes/Classifier/WordList.src/M000009.html +19 -0
  36. data/doc/classes/Classifier/WordList.src/M000010.html +18 -0
  37. data/doc/classes/GSL.html +111 -0
  38. data/doc/classes/GSL/Vector.html +156 -0
  39. data/doc/classes/GSL/Vector.src/M000005.html +18 -0
  40. data/doc/classes/GSL/Vector.src/M000006.html +19 -0
  41. data/doc/classes/Object.html +139 -0
  42. data/doc/classes/Object.src/M000001.html +16 -0
  43. data/doc/classes/String.html +95 -9
  44. data/doc/classes/{Classifier/WordHash.src/M000001.html → String.src/M000002.html} +3 -3
  45. data/doc/classes/String.src/M000003.html +18 -0
  46. data/doc/classes/String.src/M000004.html +18 -0
  47. data/doc/created.rid +1 -1
  48. data/doc/files/README.html +102 -12
  49. data/doc/files/lib/classifier/bayes_rb.html +1 -1
  50. data/doc/files/lib/classifier/{string_extensions/porter_stemmer_rb.html → extensions/vector_serialize_rb.html} +4 -15
  51. data/doc/files/lib/classifier/{string_extensions → extensions}/word_hash_rb.html +2 -2
  52. data/doc/files/lib/classifier/extensions/word_list_rb.html +115 -0
  53. data/doc/files/lib/classifier/lsi/content_node_rb.html +115 -0
  54. data/doc/files/lib/classifier/lsi_rb.html +125 -0
  55. data/doc/files/lib/classifier/string_extensions_rb.html +2 -3
  56. data/doc/files/lib/classifier_rb.html +3 -1
  57. data/doc/fr_class_index.html +6 -2
  58. data/doc/fr_file_index.html +5 -2
  59. data/doc/fr_method_index.html +34 -11
  60. data/lib/classifier.rb +3 -1
  61. data/lib/classifier/bayes.rb +34 -9
  62. data/lib/classifier/extensions/vector_serialize.rb +14 -0
  63. data/lib/classifier/extensions/word_hash.rb +125 -0
  64. data/lib/classifier/extensions/word_list.rb +31 -0
  65. data/lib/classifier/lsi.rb +248 -0
  66. data/lib/classifier/lsi/content_node.rb +67 -0
  67. data/lib/classifier/string_extensions.rb +10 -5
  68. data/test/bayes/bayesian_test.rb +2 -2
  69. data/test/lsi/lsi_test.rb +88 -0
  70. data/test/string_extensions/word_hash_test.rb +7 -5
  71. metadata +79 -24
  72. data/doc/classes/Classifier/Stemmable.html +0 -243
  73. data/doc/classes/Classifier/Stemmable.src/M000003.html +0 -102
  74. data/doc/classes/Classifier/WordHash.html +0 -178
  75. data/doc/classes/Classifier/WordHash.src/M000002.html +0 -28
  76. data/lib/classifier/string_extensions/porter_stemmer.rb +0 -199
  77. data/lib/classifier/string_extensions/word_hash.rb +0 -119
data/lib/classifier/extensions/vector_serialize.rb
@@ -0,0 +1,14 @@
+ module GSL
+
+   class Vector
+     def _dump(v)
+       Marshal.dump( self.to_a )
+     end
+
+     def self._load(arr)
+       arry = Marshal.load(arr)
+       return GSL::Vector.new(arry)
+     end
+
+   end
+ end
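
The hunk above hooks GSL::Vector into Ruby's Marshal protocol: _dump serializes the vector as a plain array (ignoring the depth argument), and _load rebuilds it. A minimal round-trip sketch, assuming the rb-gsl gem is installed:

  require 'gsl'
  require 'classifier/extensions/vector_serialize'

  v = GSL::Vector.new([1.0, 2.0, 3.0])
  data = Marshal.dump(v)        # calls GSL::Vector#_dump
  restored = Marshal.load(data) # calls GSL::Vector._load
  restored.to_a                 # => [1.0, 2.0, 3.0]

This is what lets a built LSI index, whose ContentNodes hold GSL vectors, be persisted with Marshal like any other Ruby object.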
data/lib/classifier/extensions/word_hash.rb
@@ -0,0 +1,125 @@
+ # Author:: Lucas Carlson (mailto:lucas@rufy.com)
+ # Copyright:: Copyright (c) 2005 Lucas Carlson
+ # License:: LGPL
+
+ # These are extensions to the String class to provide convenience
+ # methods for the Classifier package.
+ class String
+
+   # Removes common punctuation symbols, returning a new string.
+   # E.g.,
+   #   "Hello (greeting's), with {braces} < >...?".without_punctuation
+   #   => "Hello greetings with braces "
+   def without_punctuation
+     tr( ',?.!;:"@#$%^&*()_=+[]{}\|<>/`~', " " ).tr( "'\-", "" )
+   end
+
+   # Return a Hash of strings => ints. Each word in the string is stemmed,
+   # interned, and indexes to its frequency in the document.
+   def word_hash
+     word_hash_for_words(gsub(/[^\w\s]/,"").split + gsub(/[\w]/," ").split)
+   end
+
+   # Return a word hash without extra punctuation or short symbols, just stemmed words
+   def clean_word_hash
+     word_hash_for_words gsub(/[^\w\s]/,"").split
+   end
+
+   private
+
+   def word_hash_for_words(words)
+     d = Hash.new
+     words.each do |word|
+       word.downcase! if word =~ /[\w]+/
+       key = word.stem.intern
+       if word =~ /[^\w]/ || ! CORPUS_SKIP_WORDS.include?(word) && word.length > 2
+         d[key] ||= 0
+         d[key] += 1
+       end
+     end
+     return d
+   end
+
+   CORPUS_SKIP_WORDS = [
+     "a",
+     "again",
+     "all",
+     "along",
+     "are",
+     "also",
+     "an",
+     "and",
+     "as",
+     "at",
+     "but",
+     "by",
+     "came",
+     "can",
+     "cant",
+     "couldnt",
+     "did",
+     "didn",
+     "didnt",
+     "do",
+     "doesnt",
+     "dont",
+     "ever",
+     "first",
+     "from",
+     "have",
+     "her",
+     "here",
+     "him",
+     "how",
+     "i",
+     "if",
+     "in",
+     "into",
+     "is",
+     "isnt",
+     "it",
+     "itll",
+     "just",
+     "last",
+     "least",
+     "like",
+     "most",
+     "my",
+     "new",
+     "no",
+     "not",
+     "now",
+     "of",
+     "on",
+     "or",
+     "should",
+     "sinc",
+     "so",
+     "some",
+     "th",
+     "than",
+     "this",
+     "that",
+     "the",
+     "their",
+     "then",
+     "those",
+     "to",
+     "told",
+     "too",
+     "true",
+     "try",
+     "until",
+     "url",
+     "us",
+     "were",
+     "when",
+     "whether",
+     "while",
+     "with",
+     "within",
+     "yes",
+     "you",
+     "youll",
+   ]
+ end
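
For context, here is roughly what these String helpers produce. String#stem is supplied by the external stemmer dependency (the bundled porter_stemmer.rb is removed in this release), so the stemmed keys below are illustrative rather than guaranteed:

  require 'classifier'

  # word_hash keeps stemmed words plus punctuation symbols as keys
  "A walrus is tusked!".word_hash
  # => e.g. {:walru=>1, :tusk=>1, :"!"=>1}  ("a" and "is" are skip words)

  # clean_word_hash keeps only the stemmed words
  "A walrus is tusked!".clean_word_hash
  # => e.g. {:walru=>1, :tusk=>1}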
data/lib/classifier/extensions/word_list.rb
@@ -0,0 +1,31 @@
+ # Author:: David Fayram (mailto:dfayram@lensmen.net)
+ # Copyright:: Copyright (c) 2005 David Fayram II
+ # License:: GPL
+
+ module Classifier
+   # This class keeps a word => index mapping. It is used to map stemmed words
+   # to dimensions of a vector.
+   class WordList
+     def initialize
+       @location_table = {}
+     end
+
+     # Adds a word (if it is new) and assigns it a unique dimension.
+     def add_word(word)
+       term = word
+       @location_table[term] = @location_table.size unless @location_table[term]
+     end
+
+     # Returns the dimension of the word or nil if the word is not in the space.
+     def [](lookup)
+       term = lookup
+       @location_table[term]
+     end
+
+     # Returns the number of words mapped.
+     def size
+       @location_table.size
+     end
+
+   end
+ end
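
A WordList usage sketch, based directly on the methods above (in practice the LSI indexer feeds it the stemmed symbol keys from word_hash):

  list = Classifier::WordList.new
  list.add_word :dog    # assigned dimension 0
  list.add_word :cat    # assigned dimension 1
  list.add_word :dog    # already mapped, keeps dimension 0
  list[:cat]            # => 1
  list[:bird]           # => nil, not in the space
  list.size             # => 2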
data/lib/classifier/lsi.rb
@@ -0,0 +1,248 @@
+ # Author:: David Fayram (mailto:dfayram@lensmen.net)
+ # Copyright:: Copyright (c) 2005 David Fayram II
+ # License:: GPL
+
+ begin
+
+ require 'gsl' # requires http://rb-gsl.rubyforge.org/
+
+ require 'classifier/extensions/word_list'
+ require 'classifier/extensions/vector_serialize'
+ require 'classifier/lsi/content_node'
+
+ module Classifier
+
+   # This class implements a Latent Semantic Indexer, which can search, classify and cluster
+   # data based on underlying semantic relations. For more information on the algorithms used,
+   # please consult Wikipedia[http://en.wikipedia.org/wiki/Latent_Semantic_Indexing].
+   class LSI
+
+     attr_reader :word_list
+
+     # Create a fresh index.
+     # If you want to call #build_index manually, use
+     #   Classifier::LSI.new :auto_rebuild => false
+     #
+     def initialize(options = {})
+       @auto_rebuild = true unless options[:auto_rebuild] == false
+       @word_list, @items = WordList.new, {}
+       @version, @built_at_version = 0, -1
+     end
+
+     # Returns true if the index needs to be rebuilt. The index needs
+     # to be built after all information is added, but before you start
+     # using it for search, classification and cluster detection.
+     def needs_rebuild?
+       @version != @built_at_version
+     end
+
+     # Adds an item to the index. item is assumed to be a string, but
+     # any item may be indexed so long as it responds to #to_s or if
+     # you provide an optional block explaining how the indexer can
+     # fetch fresh string data. This optional block is passed the item,
+     # so the item may only be a reference to a URL or file name.
+     #
+     # For example:
+     #   lsi = Classifier::LSI.new
+     #   lsi.add_item "This is just plain text"
+     #   lsi.add_item("/home/me/filename.txt") { |x| File.read x }
+     #   ar = ActiveRecordObject.find( :all )
+     #   lsi.add_item(ar, *ar.categories) { |x| ar.content }
+     #
+     def add_item( item, *categories, &block )
+       @items[item] = ContentNode.new(item, categories, block)
+       @version += 1
+       build_index if @auto_rebuild
+     end
+
+     # A less flexible shorthand for add_item that assumes
+     # you are passing in a string with no categories. item
+     # will be duck typed via #to_s.
+     #
+     def <<( item )
+       add_item item
+     end
+
+     # Removes an item from the database, if it is indexed.
+     #
+     def remove_item( item )
+       if @items.keys.include? item
+         @items.delete item
+         @version += 1
+       end
+     end
+
+     # Returns an array of items that are indexed.
+     def items
+       @items.keys
+     end
+
+     # This function rebuilds the index if needs_rebuild? returns true.
+     # For very large document spaces, this indexing operation may take some
+     # time to complete, so it may be wise to place the operation in another
+     # thread.
+     #
+     # As a rule, indexing will be fairly swift on modern machines until
+     # you have well over 500 documents indexed, or have an incredibly diverse
+     # vocabulary for your documents.
+     #
+     # The optional parameter "cutoff" is a tuning parameter. When the index is
+     # built, a certain number of s-values are discarded from the system. The
+     # cutoff parameter tells the indexer how many of these values to keep.
+     # A value of 1 for cutoff means that no semantic analysis will take place,
+     # turning the LSI class into a simple vector search engine.
+     def build_index( cutoff=0.75 )
+       return unless needs_rebuild?
+       make_word_list
+
+       doc_list = @items.values
+       tda = doc_list.collect { |node| node.raw_vector_with( @word_list ) }
+       tdm = GSL::Matrix.new( *tda ).trans
+       ntdm = build_reduced_matrix(tdm, cutoff)
+
+       ntdm.size[1].times do |col|
+         vec = GSL::Vector.new( ntdm.column(col) ).row
+         doc_list[col].lsi_vector = vec
+         doc_list[col].lsi_norm = vec.normalize
+       end
+
+       @built_at_version = @version
+     end
+
+     # This function is the primitive that find_related and classify
+     # build upon. It returns an array of 2-element arrays. The first element
+     # of this array is a document, and the second is its "score", defining
+     # how "close" it is to other indexed items.
+     #
+     # These values are somewhat arbitrary, having to do with the vector space
+     # created by your content, so the magnitude is interpretable but not always
+     # meaningful between indexes.
+     #
+     # The parameter doc is the content to compare. If that content is not
+     # indexed, you can pass an optional block to define how to create the
+     # text data. See add_item for examples of how this works.
+     def proximity_array_for_content( doc, &block )
+       return [] if needs_rebuild?
+
+       content_node = node_for_content( doc, &block )
+       result =
+         @items.keys.collect do |item|
+           val = content_node.search_vector * @items[item].search_vector.col
+           [item, val]
+         end
+       result.sort_by { |x| x[1] }.reverse
+     end
+
+     # Similar to proximity_array_for_content, this function takes similar
+     # arguments and returns a similar array. However, it uses the normalized
+     # calculated vectors instead of their full versions. This is useful when
+     # you're trying to perform operations on content that is much smaller than
+     # the text you're working with. search uses this primitive.
+     def proximity_norms_for_content( doc, &block )
+       return [] if needs_rebuild?
+
+       content_node = node_for_content( doc, &block )
+       result =
+         @items.keys.collect do |item|
+           val = content_node.search_norm * @items[item].search_norm.col
+           [item, val]
+         end
+       result.sort_by { |x| x[1] }.reverse
+     end
+
+     # This function allows for text-based search of your index. Unlike other functions
+     # like find_related and classify, search only takes short strings. It will also ignore
+     # factors like repeated words. It is best for short, Google-like search terms.
+     # A search will first prioritize lexical relationships, then semantic ones.
+     #
+     # While this may seem backwards compared to the other functions that LSI supports,
+     # it is actually the same algorithm, just applied on a smaller document.
+     def search( string, max_nearest=3 )
+       return [] if needs_rebuild?
+
+       carry =
+         proximity_norms_for_content( string )
+       result = carry.collect { |x| x[0] }
+       return result[0..max_nearest-1]
+     end
+
+     # This function takes content and finds other documents
+     # that are semantically "close", returning an array of documents sorted
+     # from most to least relevant.
+     # max_nearest specifies the number of documents to return. A value of
+     # 0 means that it returns all the indexed documents, sorted by relevance.
+     #
+     # This is particularly useful for identifying clusters in your document space.
+     # For example you may want to identify several "What's Related" items for weblog
+     # articles, or find paragraphs that relate to each other in an essay.
+     def find_related( doc, max_nearest=3, &block )
+       carry =
+         proximity_array_for_content( doc, &block ).reject { |pair| pair[0] == doc }
+       result = carry.collect { |x| x[0] }
+       return result[0..max_nearest-1]
+     end
+
+     # This function uses a voting system to categorize documents, based on
+     # the categories of other documents. It uses the same logic as the
+     # find_related function to find related documents, then returns the
+     # most obvious category from this list.
+     #
+     # cutoff signifies the fraction of documents to consider when classifying
+     # text. A cutoff of 1 means that every document in the index votes on
+     # what category the document is in. This may not always make sense.
+     #
+     def classify( doc, cutoff=0.30, &block )
+       icutoff = (@items.size * cutoff).round
+       carry = proximity_array_for_content( doc, &block )
+       carry = carry[0..icutoff-1]
+       votes = {}
+       carry.each do |pair|
+         categories = @items[pair[0]].categories
+         categories.each do |category|
+           votes[category] ||= 0.0
+           votes[category] += pair[1]
+         end
+       end
+
+       ranking = votes.keys.sort_by { |x| votes[x] }
+       return ranking[-1]
+     end
+
+     private
+     def build_reduced_matrix( matrix, cutoff=0.75 )
+       # TODO: Check that M>=N on these dimensions! Transpose helps assure this
+       u, v, s = matrix.SV_decomp
+       # TODO: Better than 75% term, please. :\
+       s_cutoff = s.sort.reverse[(s.size * cutoff).round - 1]
+       s.size.times do |ord|
+         s[ord] = 0.0 if s[ord] < s_cutoff
+       end
+
+       # Reconstruct the term document matrix, only with reduced rank
+       u * Matrix.diagonal( s ) * v.trans
+     end
+
+     def node_for_content(item, &block)
+       if @items[item]
+         return @items[item]
+       else
+         cn = ContentNode.new(item, &block) # make the node and extract the data
+         cn.raw_vector_with( @word_list )   # make the lsi raw and norm vectors
+       end
+
+       cn
+     end
+
+     def make_word_list
+       @word_list = WordList.new
+       @items.each_value do |node|
+         node.word_hash.each_key { |key| @word_list.add_word key }
+       end
+     end
+
+   end
+ end
+
+ rescue LoadError
+   $stderr.puts "For LSI support, you need to install http://rb-gsl.rubyforge.org/"
+ end
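
Tying the LSI API together, a short usage sketch (the texts and categories are illustrative; with :auto_rebuild left at its default, the index is rebuilt on every add_item):

  require 'classifier'

  lsi = Classifier::LSI.new
  [ ["This text deals with dogs.",      :dog],
    ["This text involves dogs too.",    :dog],
    ["This text revolves around cats.", :cat],
    ["This text also involves cats.",   :cat],
    ["This text involves birds.",       :bird] ].each do |text, category|
    lsi.add_item text, category
  end

  lsi.search("dog", 3)                            # the three items nearest to "dog"
  lsi.find_related("This text deals with dogs.", 2)
  lsi.classify("This text is also about dogs!")   # => :dog, most likely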
data/lib/classifier/lsi/content_node.rb
@@ -0,0 +1,67 @@
+ # Author:: David Fayram (mailto:dfayram@lensmen.net)
+ # Copyright:: Copyright (c) 2005 David Fayram II
+ # License:: GPL
+
+ module Classifier
+
+
+   # This is an internal data structure class for the LSI node. Save for
+   # raw_vector_with, it should be fairly straightforward to understand.
+   # You should never have to use it directly.
+   class ContentNode
+     attr_accessor :word_hash, :raw_vector, :raw_norm,
+                   :lsi_vector, :lsi_norm,
+                   :categories
+     attr_reader :source
+
+     # If text_proc is not specified, the source will be duck-typed
+     # via source.to_s
+     def initialize( source, categories=nil, text_proc=nil )
+       text_proc = text_proc || (proc { |x| x.to_s })
+       @categories = categories || []
+       @source = source
+       @word_hash = text_proc.call( @source ).clean_word_hash
+     end
+
+     # Use this to fetch the appropriate search vector.
+     def search_vector
+       @lsi_vector || @raw_vector
+     end
+
+     # Use this to fetch the appropriate search vector in normalized form.
+     def search_norm
+       @lsi_norm || @raw_norm
+     end
+
+     # Creates the raw vector out of word_hash using word_list as the
+     # key for mapping the vector space.
+     def raw_vector_with( word_list )
+       vec = Array.new(word_list.size, 0)
+
+       @word_hash.each_key do |word|
+         vec[word_list[word]] = @word_hash[word] if word_list[word]
+       end
+
+       # Perform the scaling transform
+       total_words = vec.inject(0) { |sum,term| sum += term }.to_f
+
+       # Perform first-order association transform if this vector has more
+       # than one word in it.
+       if total_words > 1.0
+         weighted_total = vec.inject(0.0) do |sum,term|
+           if( term > 0 )
+             sum += (( term / total_words ) * Math.log( term / total_words ))
+           else
+             sum
+           end
+         end
+         vec.map! { |val| Math.log( val + 1 ) / -weighted_total }
+       end
+
+       @raw_norm   = GSL::Vector.new( vec ).normalize
+       @raw_vector = GSL::Vector.new( vec )
+     end
+
+   end
+
+ end
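
The scaling step in raw_vector_with is a log-entropy style weighting: each raw count becomes Math.log(count + 1) divided by the negated entropy of the document's term distribution. A plain-Ruby sketch of the same arithmetic on a hypothetical count vector, without GSL:

  vec = [2, 1, 0, 1] # hypothetical raw term counts
  total = vec.inject(0) { |sum, term| sum + term }.to_f

  # entropy term: sum of p * log(p) over the nonzero entries (always negative)
  weighted_total = vec.inject(0.0) do |sum, term|
    term > 0 ? sum + (term / total) * Math.log(term / total) : sum
  end

  scaled = vec.map { |v| Math.log(v + 1) / -weighted_total }
  # => roughly [1.06, 0.67, 0.0, 0.67]: counts damped by the log,
  #    boosted for low-entropy (focused) documents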