classifier 1.3.4 → 1.3.5

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: 3c53668ddd328fb78862c67723b185df9c2aa717
4
- data.tar.gz: 3655405d082fdd8a01e4ca893a70360ca9f62322
2
+ SHA256:
3
+ metadata.gz: 1d453b4ca9e0a0c44a2b8d3c9ef55db5b55a17efe5ff0cfbcab93f24965cd536
4
+ data.tar.gz: 26f5d9595ddd35c8d7c239f946afa8251a4b68f24f5b8bc41e30be13ded60547
5
5
  SHA512:
6
- metadata.gz: 40b7395e2f04f56bdbabb49a4d0013dba36e9c1325ae66e5bff92451059c5b559677aaea30e50f8f2fbbae58e50bf0f084925ef38e0e3d3fb729e37e357469d4
7
- data.tar.gz: 150f8f387706d870a37e86b0418c5e68ad386b82518294bdf21585ab3509fd98515648bc5e06dfb78b97f1e544099fe1da5ddcd69413826e0ccc39780d457940
6
+ metadata.gz: c13c1666c7d2fe92d47ab7ced0a885fec7b13719aea25f283a013d5015744d6aea7d473706af54042972ba687b214c2a3a619f58d75560d90e57c3569d38f957
7
+ data.tar.gz: 23261ba6708307ecf6faac636d8572c50720fbf9a2e6db6ee736a07ce3de445daa431afb3bbf2c49dd4d2bd699327ca1c419029b4972236c52a9a5c1f00ab5a2
data/LICENSE CHANGED
@@ -146,7 +146,7 @@ such a program is covered only if its contents constitute a work based
146
146
  on the Library (independent of the use of the Library in a tool for
147
147
  writing it). Whether that is true depends on what the Library does
148
148
  and what the program that uses the Library does.
149
-
149
+
150
150
  1. You may copy and distribute verbatim copies of the Library's
151
151
  complete source code as you receive it, in any medium, provided that
152
152
  you conspicuously and appropriately publish on each copy an
@@ -426,4 +426,4 @@ the Free Software Foundation.
426
426
  14. If you wish to incorporate parts of the Library into other free
427
427
  programs whose distribution conditions are incompatible with these,
428
428
  write to the author to ask for permission. For software which is
429
- copyrighted by
429
+ copyrighted by
@@ -27,4 +27,4 @@
27
27
  require 'rubygems'
28
28
  require 'classifier/extensions/string'
29
29
  require 'classifier/bayes'
30
- require 'classifier/lsi'
30
+ require 'classifier/lsi'
@@ -6,7 +6,7 @@ module Classifier
6
6
 
7
7
  class Bayes
8
8
  # The class can be created with one or more categories, each of which will be
9
- # initialized and given a training method. E.g.,
9
+ # initialized and given a training method. E.g.,
10
10
  # b = Classifier::Bayes.new 'Interesting', 'Uninteresting', 'Spam'
11
11
  def initialize(*categories)
12
12
  @categories = Hash.new
@@ -56,7 +56,7 @@ class Bayes
56
56
  end
57
57
  end
58
58
  end
59
-
59
+
60
60
  #
61
61
  # Returns the scores in each category the provided +text+. E.g.,
62
62
  # b.classifications "I hate bad words and you"
@@ -80,14 +80,14 @@ class Bayes
80
80
  end
81
81
 
82
82
  #
83
- # Returns the classification of the provided +text+, which is one of the
83
+ # Returns the classification of the provided +text+, which is one of the
84
84
  # categories given in the initializer. E.g.,
85
85
  # b.classify "I hate bad words and you"
86
86
  # => 'Uninteresting'
87
87
  def classify(text)
88
88
  (classifications(text).sort_by { |a| -a[1] })[0][0]
89
89
  end
90
-
90
+
91
91
  #
92
92
  # Provides training and untraining methods for the categories specified in Bayes#new
93
93
  # For example:
@@ -106,7 +106,7 @@ class Bayes
106
106
  super #raise StandardError, "No such method: #{name}"
107
107
  end
108
108
  end
109
-
109
+
110
110
  #
111
111
  # Provides a list of category names
112
112
  # For example:
@@ -115,7 +115,7 @@ class Bayes
115
115
  def categories # :nodoc:
116
116
  @categories.keys.collect {|c| c.to_s}
117
117
  end
118
-
118
+
119
119
  #
120
120
  # Allows you to add categories to the classifier.
121
121
  # For example:
@@ -128,7 +128,7 @@ class Bayes
128
128
  def add_category(category)
129
129
  @categories[category.prepare_category_name] = Hash.new
130
130
  end
131
-
131
+
132
132
  alias append_category add_category
133
133
  end
134
134
 
@@ -1,5 +1,5 @@
1
1
  # Author:: Ernest Ellingson
2
- # Copyright:: Copyright (c) 2005
2
+ # Copyright:: Copyright (c) 2005
3
3
 
4
4
  # These are extensions to the std-lib 'matrix' to allow an all ruby SVD
5
5
 
@@ -9,7 +9,7 @@ require 'mathn'
9
9
  class Array
10
10
  def sum(identity = 0, &block)
11
11
  return identity unless size > 0
12
-
12
+
13
13
  if block_given?
14
14
  map(&block).sum
15
15
  else
@@ -22,7 +22,7 @@ class Vector
22
22
  def magnitude
23
23
  sumsqs = 0.0
24
24
  self.size.times do |i|
25
- sumsqs += self[i] ** 2.0
25
+ sumsqs += self[i] ** 2.0
26
26
  end
27
27
  Math.sqrt(sumsqs)
28
28
  end
@@ -42,7 +42,7 @@ class Matrix
42
42
  def Matrix.diag(s)
43
43
  Matrix.diagonal(*s)
44
44
  end
45
-
45
+
46
46
  alias :trans :transpose
47
47
 
48
48
  def SV_decomp(maxSweeps = 20)
@@ -51,7 +51,7 @@ class Matrix
51
51
  else
52
52
  q = self * self.trans
53
53
  end
54
-
54
+
55
55
  qrot = q.dup
56
56
  v = Matrix.identity(q.row_size)
57
57
  azrot = nil
@@ -75,16 +75,16 @@ class Matrix
75
75
  mzrot[col,col] = hcos
76
76
  qrot = mzrot.trans * qrot * mzrot
77
77
  v = v * mzrot
78
- end
78
+ end
79
79
  end
80
80
  s_old = qrot.dup if cnt == 1
81
- sum_qrot = 0.0
81
+ sum_qrot = 0.0
82
82
  if cnt > 1
83
83
  qrot.row_size.times do |r|
84
84
  sum_qrot += (qrot[r,r]-s_old[r,r]).abs if (qrot[r,r]-s_old[r,r]).abs > 0.001
85
85
  end
86
86
  s_old = qrot.dup
87
- end
87
+ end
88
88
  break if (sum_qrot <= 0.001 and cnt > 1) or cnt >= maxSweeps
89
89
  end # of do while true
90
90
  s = []
@@ -93,7 +93,7 @@ class Matrix
93
93
  end
94
94
  #puts "cnt = #{cnt}"
95
95
  if self.row_size >= self.column_size
96
- mu = self * v * Matrix.diagonal(*s).inverse
96
+ mu = self * v * Matrix.diagonal(*s).inverse
97
97
  return [mu, v, s]
98
98
  else
99
99
  puts v.row_size
@@ -1,17 +1,17 @@
1
1
  module GSL
2
-
2
+
3
3
  class Vector
4
4
  def _dump(v)
5
5
  Marshal.dump( self.to_a )
6
6
  end
7
-
7
+
8
8
  def self._load(arr)
9
9
  arry = Marshal.load(arr)
10
10
  return GSL::Vector.alloc(arry)
11
11
  end
12
-
12
+
13
13
  end
14
-
14
+
15
15
  class Matrix
16
16
  class <<self
17
17
  alias :diag :diagonal
@@ -4,20 +4,20 @@
4
4
 
5
5
  require "set"
6
6
 
7
- # These are extensions to the String class to provide convenience
7
+ # These are extensions to the String class to provide convenience
8
8
  # methods for the Classifier package.
9
9
  class String
10
-
11
- # Removes common punctuation symbols, returning a new string.
10
+
11
+ # Removes common punctuation symbols, returning a new string.
12
12
  # E.g.,
13
13
  # "Hello (greeting's), with {braces} < >...?".without_punctuation
14
14
  # => "Hello greetings with braces "
15
15
  def without_punctuation
16
16
  tr( ',?.!;:"@#$%^&*()_=+[]{}\|<>/`~', " " ) .tr( "'\-", "")
17
17
  end
18
-
18
+
19
19
  # Return a Hash of strings => ints. Each word in the string is stemmed,
20
- # interned, and indexes to its frequency in the document.
20
+ # interned, and indexes to its frequency in the document.
21
21
  def word_hash
22
22
  word_hash = clean_word_hash()
23
23
  symbol_hash = word_hash_for_symbols(gsub(/[\w]/," ").split)
@@ -28,9 +28,9 @@ class String
28
28
  def clean_word_hash
29
29
  word_hash_for_words gsub(/[^\w\s]/,"").split
30
30
  end
31
-
31
+
32
32
  private
33
-
33
+
34
34
  def word_hash_for_words(words)
35
35
  d = Hash.new(0)
36
36
  words.each do |word|
@@ -50,7 +50,7 @@ class String
50
50
  end
51
51
  return d
52
52
  end
53
-
53
+
54
54
  CORPUS_SKIP_WORDS = Set.new([
55
55
  "a",
56
56
  "again",
@@ -4,30 +4,30 @@
4
4
 
5
5
  begin
6
6
  raise LoadError if ENV['NATIVE_VECTOR'] == "true" # to test the native vector class, try `rake test NATIVE_VECTOR=true`
7
-
7
+
8
8
  require 'gsl' # requires http://rb-gsl.rubyforge.org/
9
9
  require 'classifier/extensions/vector_serialize'
10
10
  $GSL = true
11
-
11
+
12
12
  rescue LoadError
13
13
  warn "Notice: for 10x faster LSI support, please install http://rb-gsl.rubyforge.org/"
14
- require 'classifier/extensions/vector'
14
+ require 'classifier/extensions/vector'
15
15
  end
16
-
16
+
17
17
  require 'classifier/lsi/word_list'
18
18
  require 'classifier/lsi/content_node'
19
19
  require 'classifier/lsi/summary'
20
20
 
21
21
  module Classifier
22
-
22
+
23
23
  # This class implements a Latent Semantic Indexer, which can search, classify and cluster
24
24
  # data based on underlying semantic relations. For more information on the algorithms used,
25
25
  # please consult Wikipedia[http://en.wikipedia.org/wiki/Latent_Semantic_Indexing].
26
26
  class LSI
27
-
27
+
28
28
  attr_reader :word_list
29
29
  attr_accessor :auto_rebuild
30
-
30
+
31
31
  # Create a fresh index.
32
32
  # If you want to call #build_index manually, use
33
33
  # Classifier::LSI.new :auto_rebuild => false
@@ -37,20 +37,20 @@ module Classifier
37
37
  @word_list, @items = WordList.new, {}
38
38
  @version, @built_at_version = 0, -1
39
39
  end
40
-
40
+
41
41
  # Returns true if the index needs to be rebuilt. The index needs
42
42
  # to be built after all informaton is added, but before you start
43
43
  # using it for search, classification and cluster detection.
44
44
  def needs_rebuild?
45
45
  (@items.keys.size > 1) && (@version != @built_at_version)
46
46
  end
47
-
48
- # Adds an item to the index. item is assumed to be a string, but
47
+
48
+ # Adds an item to the index. item is assumed to be a string, but
49
49
  # any item may be indexed so long as it responds to #to_s or if
50
- # you provide an optional block explaining how the indexer can
50
+ # you provide an optional block explaining how the indexer can
51
51
  # fetch fresh string data. This optional block is passed the item,
52
52
  # so the item may only be a reference to a URL or file name.
53
- #
53
+ #
54
54
  # For example:
55
55
  # lsi = Classifier::LSI.new
56
56
  # lsi.add_item "This is just plain text"
@@ -65,14 +65,14 @@ module Classifier
65
65
  build_index if @auto_rebuild
66
66
  end
67
67
 
68
- # A less flexible shorthand for add_item that assumes
68
+ # A less flexible shorthand for add_item that assumes
69
69
  # you are passing in a string with no categorries. item
70
- # will be duck typed via to_s .
70
+ # will be duck typed via to_s .
71
71
  #
72
72
  def <<( item )
73
73
  add_item item
74
74
  end
75
-
75
+
76
76
  # Returns the categories for a given indexed items. You are free to add and remove
77
77
  # items from this as you see fit. It does not invalide an index to change its categories.
78
78
  def categories_for(item)
@@ -80,7 +80,7 @@ module Classifier
80
80
  return @items[item].categories
81
81
  end
82
82
 
83
- # Removes an item from the database, if it is indexed.
83
+ # Removes an item from the database, if it is indexed.
84
84
  #
85
85
  def remove_item( item )
86
86
  if @items.keys.contain? item
@@ -88,12 +88,12 @@ module Classifier
88
88
  @version += 1
89
89
  end
90
90
  end
91
-
92
- # Returns an array of items that are indexed.
91
+
92
+ # Returns an array of items that are indexed.
93
93
  def items
94
94
  @items.keys
95
95
  end
96
-
96
+
97
97
  # Returns the categories for a given indexed items. You are free to add and remove
98
98
  # items from this as you see fit. It does not invalide an index to change its categories.
99
99
  def categories_for(item)
@@ -103,30 +103,30 @@ module Classifier
103
103
 
104
104
  # This function rebuilds the index if needs_rebuild? returns true.
105
105
  # For very large document spaces, this indexing operation may take some
106
- # time to complete, so it may be wise to place the operation in another
107
- # thread.
106
+ # time to complete, so it may be wise to place the operation in another
107
+ # thread.
108
108
  #
109
109
  # As a rule, indexing will be fairly swift on modern machines until
110
- # you have well over 500 documents indexed, or have an incredibly diverse
111
- # vocabulary for your documents.
110
+ # you have well over 500 documents indexed, or have an incredibly diverse
111
+ # vocabulary for your documents.
112
112
  #
113
113
  # The optional parameter "cutoff" is a tuning parameter. When the index is
114
- # built, a certain number of s-values are discarded from the system. The
114
+ # built, a certain number of s-values are discarded from the system. The
115
115
  # cutoff parameter tells the indexer how many of these values to keep.
116
116
  # A value of 1 for cutoff means that no semantic analysis will take place,
117
117
  # turning the LSI class into a simple vector search engine.
118
118
  def build_index( cutoff=0.75 )
119
119
  return unless needs_rebuild?
120
120
  make_word_list
121
-
121
+
122
122
  doc_list = @items.values
123
123
  tda = doc_list.collect { |node| node.raw_vector_with( @word_list ) }
124
-
124
+
125
125
  if $GSL
126
126
  tdm = GSL::Matrix.alloc(*tda).trans
127
127
  ntdm = build_reduced_matrix(tdm, cutoff)
128
128
 
129
- ntdm.size[1].times do |col|
129
+ ntdm.size[1].times do |col|
130
130
  vec = GSL::Vector.alloc( ntdm.column(col) ).row
131
131
  doc_list[col].lsi_vector = vec
132
132
  doc_list[col].lsi_norm = vec.normalize
@@ -134,50 +134,50 @@ module Classifier
134
134
  else
135
135
  tdm = Matrix.rows(tda).trans
136
136
  ntdm = build_reduced_matrix(tdm, cutoff)
137
-
137
+
138
138
  ntdm.row_size.times do |col|
139
139
  doc_list[col].lsi_vector = ntdm.column(col) if doc_list[col]
140
140
  doc_list[col].lsi_norm = ntdm.column(col).normalize if doc_list[col]
141
141
  end
142
142
  end
143
-
143
+
144
144
  @built_at_version = @version
145
145
  end
146
-
146
+
147
147
  # This method returns max_chunks entries, ordered by their average semantic rating.
148
148
  # Essentially, the average distance of each entry from all other entries is calculated,
149
149
  # the highest are returned.
150
150
  #
151
151
  # This can be used to build a summary service, or to provide more information about
152
152
  # your dataset's general content. For example, if you were to use categorize on the
153
- # results of this data, you could gather information on what your dataset is generally
153
+ # results of this data, you could gather information on what your dataset is generally
154
154
  # about.
155
155
  def highest_relative_content( max_chunks=10 )
156
156
  return [] if needs_rebuild?
157
-
157
+
158
158
  avg_density = Hash.new
159
159
  @items.each_key { |x| avg_density[x] = proximity_array_for_content(x).inject(0.0) { |x,y| x + y[1]} }
160
-
160
+
161
161
  avg_density.keys.sort_by { |x| avg_density[x] }.reverse[0..max_chunks-1].map
162
162
  end
163
163
 
164
- # This function is the primitive that find_related and classify
164
+ # This function is the primitive that find_related and classify
165
165
  # build upon. It returns an array of 2-element arrays. The first element
166
166
  # of this array is a document, and the second is its "score", defining
167
167
  # how "close" it is to other indexed items.
168
- #
168
+ #
169
169
  # These values are somewhat arbitrary, having to do with the vector space
170
170
  # created by your content, so the magnitude is interpretable but not always
171
- # meaningful between indexes.
171
+ # meaningful between indexes.
172
172
  #
173
173
  # The parameter doc is the content to compare. If that content is not
174
- # indexed, you can pass an optional block to define how to create the
175
- # text data. See add_item for examples of how this works.
174
+ # indexed, you can pass an optional block to define how to create the
175
+ # text data. See add_item for examples of how this works.
176
176
  def proximity_array_for_content( doc, &block )
177
177
  return [] if needs_rebuild?
178
-
178
+
179
179
  content_node = node_for_content( doc, &block )
180
- result =
180
+ result =
181
181
  @items.keys.collect do |item|
182
182
  if $GSL
183
183
  val = content_node.search_vector * @items[item].search_vector.col
@@ -187,18 +187,18 @@ module Classifier
187
187
  [item, val]
188
188
  end
189
189
  result.sort_by { |x| x[1] }.reverse
190
- end
191
-
190
+ end
191
+
192
192
  # Similar to proximity_array_for_content, this function takes similar
193
193
  # arguments and returns a similar array. However, it uses the normalized
194
- # calculated vectors instead of their full versions. This is useful when
194
+ # calculated vectors instead of their full versions. This is useful when
195
195
  # you're trying to perform operations on content that is much smaller than
196
196
  # the text you're working with. search uses this primitive.
197
197
  def proximity_norms_for_content( doc, &block )
198
198
  return [] if needs_rebuild?
199
-
199
+
200
200
  content_node = node_for_content( doc, &block )
201
- result =
201
+ result =
202
202
  @items.keys.collect do |item|
203
203
  if $GSL
204
204
  val = content_node.search_norm * @items[item].search_norm.col
@@ -208,12 +208,12 @@ module Classifier
208
208
  [item, val]
209
209
  end
210
210
  result.sort_by { |x| x[1] }.reverse
211
- end
212
-
211
+ end
212
+
213
213
  # This function allows for text-based search of your index. Unlike other functions
214
214
  # like find_related and classify, search only takes short strings. It will also ignore
215
- # factors like repeated words. It is best for short, google-like search terms.
216
- # A search will first priortize lexical relationships, then semantic ones.
215
+ # factors like repeated words. It is best for short, google-like search terms.
216
+ # A search will first priortize lexical relationships, then semantic ones.
217
217
  #
218
218
  # While this may seem backwards compared to the other functions that LSI supports,
219
219
  # it is actually the same algorithm, just applied on a smaller document.
@@ -223,30 +223,30 @@ module Classifier
223
223
  result = carry.collect { |x| x[0] }
224
224
  return result[0..max_nearest-1]
225
225
  end
226
-
226
+
227
227
  # This function takes content and finds other documents
228
228
  # that are semantically "close", returning an array of documents sorted
229
229
  # from most to least relavant.
230
- # max_nearest specifies the number of documents to return. A value of
231
- # 0 means that it returns all the indexed documents, sorted by relavence.
230
+ # max_nearest specifies the number of documents to return. A value of
231
+ # 0 means that it returns all the indexed documents, sorted by relavence.
232
232
  #
233
- # This is particularly useful for identifing clusters in your document space.
233
+ # This is particularly useful for identifing clusters in your document space.
234
234
  # For example you may want to identify several "What's Related" items for weblog
235
235
  # articles, or find paragraphs that relate to each other in an essay.
236
236
  def find_related( doc, max_nearest=3, &block )
237
- carry =
237
+ carry =
238
238
  proximity_array_for_content( doc, &block ).reject { |pair| pair[0] == doc }
239
239
  result = carry.collect { |x| x[0] }
240
240
  return result[0..max_nearest-1]
241
241
  end
242
-
243
- # This function uses a voting system to categorize documents, based on
244
- # the categories of other documents. It uses the same logic as the
242
+
243
+ # This function uses a voting system to categorize documents, based on
244
+ # the categories of other documents. It uses the same logic as the
245
245
  # find_related function to find related documents, then returns the
246
- # most obvious category from this list.
246
+ # most obvious category from this list.
247
247
  #
248
- # cutoff signifies the number of documents to consider when clasifying
249
- # text. A cutoff of 1 means that every document in the index votes on
248
+ # cutoff signifies the number of documents to consider when clasifying
249
+ # text. A cutoff of 1 means that every document in the index votes on
250
250
  # what category the document is in. This may not always make sense.
251
251
  #
252
252
  def classify( doc, cutoff=0.30, &block )
@@ -256,16 +256,16 @@ module Classifier
256
256
  votes = {}
257
257
  carry.each do |pair|
258
258
  categories = @items[pair[0]].categories
259
- categories.each do |category|
259
+ categories.each do |category|
260
260
  votes[category] ||= 0.0
261
- votes[category] += pair[1]
261
+ votes[category] += pair[1]
262
262
  end
263
263
  end
264
-
264
+
265
265
  ranking = votes.keys.sort_by { |x| votes[x] }
266
266
  return ranking[-1]
267
267
  end
268
-
268
+
269
269
  # Prototype, only works on indexed documents.
270
270
  # I have no clue if this is going to work, but in theory
271
271
  # it's supposed to.
@@ -289,8 +289,8 @@ module Classifier
289
289
  # Reconstruct the term document matrix, only with reduced rank
290
290
  u * ($GSL ? GSL::Matrix : ::Matrix).diag( s ) * v.trans
291
291
  end
292
-
293
- def node_for_content(item, &block)
292
+
293
+ def node_for_content(item, &block)
294
294
  if @items[item]
295
295
  return @items[item]
296
296
  else
@@ -302,10 +302,10 @@ module Classifier
302
302
  cn.raw_vector_with( @word_list ) # make the lsi raw and norm vectors
303
303
  end
304
304
  end
305
-
305
+
306
306
  return cn
307
307
  end
308
-
308
+
309
309
  def make_word_list
310
310
  @word_list = WordList.new
311
311
  @items.each_value do |node|
@@ -4,14 +4,14 @@
4
4
 
5
5
  module Classifier
6
6
 
7
- # This is an internal data structure class for the LSI node. Save for
7
+ # This is an internal data structure class for the LSI node. Save for
8
8
  # raw_vector_with, it should be fairly straightforward to understand.
9
9
  # You should never have to use it directly.
10
10
  class ContentNode
11
- attr_accessor :raw_vector, :raw_norm,
11
+ attr_accessor :raw_vector, :raw_norm,
12
12
  :lsi_vector, :lsi_norm,
13
- :categories
14
-
13
+ :categories
14
+
15
15
  attr_reader :word_hash
16
16
  # If text_proc is not specified, the source will be duck-typed
17
17
  # via source.to_s
@@ -19,17 +19,17 @@ module Classifier
19
19
  @categories = categories || []
20
20
  @word_hash = word_hash
21
21
  end
22
-
22
+
23
23
  # Use this to fetch the appropriate search vector.
24
24
  def search_vector
25
25
  @lsi_vector || @raw_vector
26
26
  end
27
-
27
+
28
28
  # Use this to fetch the appropriate search vector in normalized form.
29
29
  def search_norm
30
30
  @lsi_norm || @raw_norm
31
31
  end
32
-
32
+
33
33
  # Creates the raw vector out of word_hash using word_list as the
34
34
  # key for mapping the vector space.
35
35
  def raw_vector_with( word_list )
@@ -42,22 +42,22 @@ module Classifier
42
42
  @word_hash.each_key do |word|
43
43
  vec[word_list[word]] = @word_hash[word] if word_list[word]
44
44
  end
45
-
45
+
46
46
  # Perform the scaling transform
47
47
  total_words = vec.sum
48
-
48
+
49
49
  # Perform first-order association transform if this vector has more
50
- # than one word in it.
51
- if total_words > 1.0
50
+ # than one word in it.
51
+ if total_words > 1.0
52
52
  weighted_total = 0.0
53
53
  vec.each do |term|
54
54
  if ( term > 0 )
55
55
  weighted_total += (( term / total_words ) * Math.log( term / total_words ))
56
56
  end
57
- end
57
+ end
58
58
  vec = vec.collect { |val| Math.log( val + 1 ) / -weighted_total }
59
59
  end
60
-
60
+
61
61
  if $GSL
62
62
  @raw_norm = vec.normalize
63
63
  @raw_vector = vec
@@ -65,8 +65,8 @@ module Classifier
65
65
  @raw_norm = Vector[*vec].normalize
66
66
  @raw_vector = Vector[*vec]
67
67
  end
68
- end
69
-
70
- end
71
-
68
+ end
69
+
70
+ end
71
+
72
72
  end
@@ -14,13 +14,13 @@ class String
14
14
  def split_sentences
15
15
  split /(\.|\!|\?)/ # TODO: make this less primitive
16
16
  end
17
-
17
+
18
18
  def split_paragraphs
19
19
  split /(\n\n|\r\r|\r\n\r\n)/ # TODO: make this less primitive
20
20
  end
21
-
21
+
22
22
  private
23
-
23
+
24
24
  def perform_lsi(chunks, count, separator)
25
25
  lsi = Classifier::LSI.new :auto_rebuild => false
26
26
  chunks.each { |chunk| lsi << chunk unless chunk.strip.empty? || chunk.strip.split.size == 1 }
@@ -28,4 +28,4 @@ class String
28
28
  summaries = lsi.highest_relative_content count
29
29
  return summaries.reject { |chunk| !summaries.include? chunk }.map { |x| x.strip }.join(separator)
30
30
  end
31
- end
31
+ end
@@ -2,35 +2,35 @@
2
2
  # Copyright:: Copyright (c) 2005 David Fayram II
3
3
  # License:: LGPL
4
4
 
5
- module Classifier
5
+ module Classifier
6
6
  # This class keeps a word => index mapping. It is used to map stemmed words
7
7
  # to dimensions of a vector.
8
-
8
+
9
9
  class WordList
10
10
  def initialize
11
11
  @location_table = Hash.new
12
12
  end
13
-
13
+
14
14
  # Adds a word (if it is new) and assigns it a unique dimension.
15
15
  def add_word(word)
16
16
  term = word
17
17
  @location_table[term] = @location_table.size unless @location_table[term]
18
18
  end
19
-
19
+
20
20
  # Returns the dimension of the word or nil if the word is not in the space.
21
21
  def [](lookup)
22
22
  term = lookup
23
23
  @location_table[term]
24
24
  end
25
-
25
+
26
26
  def word_for_index(ind)
27
27
  @location_table.invert[ind]
28
28
  end
29
-
29
+
30
30
  # Returns the number of words mapped.
31
31
  def size
32
32
  @location_table.size
33
33
  end
34
-
34
+
35
35
  end
36
36
  end
metadata CHANGED
@@ -1,36 +1,94 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: classifier
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.3.4
4
+ version: 1.3.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Lucas Carlson
8
- autorequire: classifier
8
+ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-12-31 00:00:00.000000000 Z
11
+ date: 2018-04-17 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: fast-stemmer
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
- - - '>='
17
+ - - "~>"
18
18
  - !ruby/object:Gem::Version
19
19
  version: 1.0.0
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
- - - '>='
24
+ - - "~>"
25
25
  - !ruby/object:Gem::Version
26
26
  version: 1.0.0
27
- description: |2
28
- A general classifier module to allow Bayesian and other types of classifications.
27
+ - !ruby/object:Gem::Dependency
28
+ name: mathn
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rake
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: minitest
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: rdoc
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ description: A general classifier module to allow Bayesian and other types of classifications.
29
84
  email: lucas@rufy.com
30
85
  executables: []
31
86
  extensions: []
32
87
  extra_rdoc_files: []
33
88
  files:
89
+ - LICENSE
90
+ - bin/bayes.rb
91
+ - bin/summarize.rb
34
92
  - lib/classifier.rb
35
93
  - lib/classifier/bayes.rb
36
94
  - lib/classifier/extensions/string.rb
@@ -41,19 +99,9 @@ files:
41
99
  - lib/classifier/lsi/content_node.rb
42
100
  - lib/classifier/lsi/summary.rb
43
101
  - lib/classifier/lsi/word_list.rb
44
- - bin/bayes.rb
45
- - bin/summarize.rb
46
- - test/bayes/bayesian_test.rb
47
- - test/extensions/word_hash_test.rb
48
- - test/lsi/lsi_test.rb
49
- - test/test_helper.rb
50
- - Gemfile
51
- - Gemfile.lock
52
- - LICENSE
53
- - README.markdown
54
- - Rakefile
55
- homepage: http://classifier.rufy.com/
56
- licenses: []
102
+ homepage: https://github.com/cardmagic/classifier
103
+ licenses:
104
+ - LGPL
57
105
  metadata: {}
58
106
  post_install_message:
59
107
  rdoc_options: []
@@ -61,18 +109,17 @@ require_paths:
61
109
  - lib
62
110
  required_ruby_version: !ruby/object:Gem::Requirement
63
111
  requirements:
64
- - - '>='
112
+ - - ">="
65
113
  - !ruby/object:Gem::Version
66
114
  version: '0'
67
115
  required_rubygems_version: !ruby/object:Gem::Requirement
68
116
  requirements:
69
- - - '>='
117
+ - - ">="
70
118
  - !ruby/object:Gem::Version
71
119
  version: '0'
72
- requirements:
73
- - A porter-stemmer module to split word stems.
120
+ requirements: []
74
121
  rubyforge_project:
75
- rubygems_version: 2.0.3
122
+ rubygems_version: 2.7.6
76
123
  signing_key:
77
124
  specification_version: 4
78
125
  summary: A general classifier module to allow Bayesian and other types of classifications.
data/Gemfile DELETED
@@ -1,5 +0,0 @@
1
- source 'https://rubygems.org'
2
- gem 'rake'
3
- gem 'rspec', :require => 'spec'
4
- gem 'rdoc'
5
- gem 'fast-stemmer'
@@ -1,26 +0,0 @@
1
- GEM
2
- remote: https://rubygems.org/
3
- specs:
4
- diff-lcs (1.2.5)
5
- fast-stemmer (1.0.2)
6
- json (1.8.1)
7
- rake (10.1.1)
8
- rdoc (4.1.0)
9
- json (~> 1.4)
10
- rspec (2.14.1)
11
- rspec-core (~> 2.14.0)
12
- rspec-expectations (~> 2.14.0)
13
- rspec-mocks (~> 2.14.0)
14
- rspec-core (2.14.7)
15
- rspec-expectations (2.14.4)
16
- diff-lcs (>= 1.1.3, < 2.0)
17
- rspec-mocks (2.14.4)
18
-
19
- PLATFORMS
20
- ruby
21
-
22
- DEPENDENCIES
23
- fast-stemmer
24
- rake
25
- rdoc
26
- rspec
@@ -1,97 +0,0 @@
1
- ## Welcome to Classifier
2
-
3
- Classifier is a general module to allow Bayesian and other types of classifications.
4
-
5
- ## Download
6
-
7
- * https://github.com/cardmagic/classifier
8
- * gem install classifier
9
- * git clone https://github.com/cardmagic/classifier.git
10
-
11
- ## Dependencies
12
-
13
- If you install Classifier from source, you'll need to install Roman Shterenzon's fast-stemmer gem with RubyGems as follows:
14
-
15
- gem install fast-stemmer
16
-
17
- If you would like to speed up LSI classification by at least 10x, please install the following libraries:
18
- GNU GSL:: http://www.gnu.org/software/gsl
19
- rb-gsl:: http://rb-gsl.rubyforge.org
20
-
21
- Notice that LSI will work without these libraries, but as soon as they are installed, Classifier will make use of them. No configuration changes are needed, we like to keep things ridiculously easy for you.
22
-
23
- ## Bayes
24
-
25
- A Bayesian classifier by Lucas Carlson. Bayesian Classifiers are accurate, fast, and have modest memory requirements.
26
-
27
- ### Usage
28
-
29
- require 'classifier'
30
- b = Classifier::Bayes.new 'Interesting', 'Uninteresting'
31
- b.train_interesting "here are some good words. I hope you love them"
32
- b.train_uninteresting "here are some bad words, I hate you"
33
- b.classify "I hate bad words and you" # returns 'Uninteresting'
34
-
35
- require 'madeleine'
36
- m = SnapshotMadeleine.new("bayes_data") {
37
- Classifier::Bayes.new 'Interesting', 'Uninteresting'
38
- }
39
- m.system.train_interesting "here are some good words. I hope you love them"
40
- m.system.train_uninteresting "here are some bad words, I hate you"
41
- m.take_snapshot
42
- m.system.classify "I love you" # returns 'Interesting'
43
-
44
- Using Madeleine, your application can persist the learned data over time.
45
-
46
- ### Bayesian Classification
47
-
48
- * http://www.process.com/precisemail/bayesian_filtering.htm
49
- * http://en.wikipedia.org/wiki/Bayesian_filtering
50
- * http://www.paulgraham.com/spam.html
51
-
52
- ## LSI
53
-
54
- A Latent Semantic Indexer by David Fayram. Latent Semantic Indexing engines
55
- are not as fast or as small as Bayesian classifiers, but are more flexible, providing
56
- fast search and clustering detection as well as semantic analysis of the text that
57
- theoretically simulates human learning.
58
-
59
- ### Usage
60
-
61
- require 'classifier'
62
- lsi = Classifier::LSI.new
63
- strings = [ ["This text deals with dogs. Dogs.", :dog],
64
- ["This text involves dogs too. Dogs! ", :dog],
65
- ["This text revolves around cats. Cats.", :cat],
66
- ["This text also involves cats. Cats!", :cat],
67
- ["This text involves birds. Birds.",:bird ]]
68
- strings.each {|x| lsi.add_item x.first, x.last}
69
-
70
- lsi.search("dog", 3)
71
- # returns => ["This text deals with dogs. Dogs.", "This text involves dogs too. Dogs! ",
72
- # "This text also involves cats. Cats!"]
73
-
74
- lsi.find_related(strings[2], 2)
75
- # returns => ["This text revolves around cats. Cats.", "This text also involves cats. Cats!"]
76
-
77
- lsi.classify "This text is also about dogs!"
78
- # returns => :dog
79
-
80
- Please see the Classifier::LSI documentation for more information. It is possible to index, search and classify
81
- with more than just simple strings.
82
-
83
- ### Latent Semantic Indexing
84
-
85
- * http://www.c2.com/cgi/wiki?LatentSemanticIndexing
86
- * http://www.chadfowler.com/index.cgi/Computing/LatentSemanticIndexing.rdoc
87
- * http://en.wikipedia.org/wiki/Latent_semantic_analysis
88
-
89
- ## Authors
90
-
91
- * Lucas Carlson (lucas@rufy.com)
92
- * David Fayram II (dfayram@gmail.com)
93
- * Cameron McBride (cameron.mcbride@gmail.com)
94
- * Ivan Acosta-Rubio (ivan@softwarecriollo.com)
95
-
96
- This library is released under the terms of the GNU LGPL. See LICENSE for more details.
97
-
data/Rakefile DELETED
@@ -1,84 +0,0 @@
1
- require 'rubygems'
2
- require 'rake'
3
- require 'rake/testtask'
4
- require 'rdoc/task'
5
- require 'rake/contrib/rubyforgepublisher'
6
-
7
- desc "Default Task"
8
- task :default => [ :test ]
9
-
10
- # Run the unit tests
11
- desc "Run all unit tests"
12
- Rake::TestTask.new("test") { |t|
13
- t.libs << "lib"
14
- t.pattern = 'test/*/*_test.rb'
15
- t.verbose = true
16
- }
17
-
18
- # Make a console, useful when working on tests
19
- desc "Generate a test console"
20
- task :console do
21
- verbose( false ) { sh "irb -I lib/ -r 'classifier'" }
22
- end
23
-
24
- # Genereate the RDoc documentation
25
- desc "Create documentation"
26
- Rake::RDocTask.new("doc") { |rdoc|
27
- rdoc.title = "Ruby Classifier - Bayesian and LSI classification library"
28
- rdoc.rdoc_dir = 'html'
29
- rdoc.rdoc_files.include('README')
30
- rdoc.rdoc_files.include('lib/**/*.rb')
31
- }
32
-
33
- # Genereate the package
34
- spec = Gem::Specification.new do |s|
35
-
36
- #### Basic information.
37
-
38
- s.name = 'classifier'
39
- s.version = PKG_VERSION
40
- s.summary = <<-EOF
41
- A general classifier module to allow Bayesian and other types of classifications.
42
- EOF
43
- s.description = <<-EOF
44
- A general classifier module to allow Bayesian and other types of classifications.
45
- EOF
46
-
47
- #### Which files are to be included in this gem? Everything! (Except CVS directories.)
48
-
49
- s.files = PKG_FILES
50
-
51
- #### Load-time details: library and application (you will need one or both).
52
-
53
- s.require_path = 'lib'
54
- s.autorequire = 'classifier'
55
-
56
- #### Documentation and testing.
57
-
58
- s.has_rdoc = true
59
-
60
- #### Dependencies and requirements.
61
-
62
- s.add_dependency('fast-stemmer', '>= 1.0.0')
63
- s.requirements << "A porter-stemmer module to split word stems."
64
-
65
- #### Author and project details.
66
- s.author = "Lucas Carlson"
67
- s.email = "lucas@rufy.com"
68
- s.homepage = "http://classifier.rufy.com/"
69
- end
70
-
71
- desc "Report code statistics (KLOCs, etc) from the application"
72
- task :stats do
73
- require 'code_statistics'
74
- CodeStatistics.new(
75
- ["Library", "lib"],
76
- ["Units", "test"]
77
- ).to_s
78
- end
79
-
80
- desc "Publish new documentation"
81
- task :publish do
82
- `ssh rufy update-classifier-doc`
83
- Rake::RubyForgePublisher.new('classifier', 'cardmagic').upload
84
- end
@@ -1,33 +0,0 @@
1
- require File.dirname(__FILE__) + '/../test_helper'
2
- class BayesianTest < Test::Unit::TestCase
3
- def setup
4
- @classifier = Classifier::Bayes.new 'Interesting', 'Uninteresting'
5
- end
6
-
7
- def test_good_training
8
- assert_nothing_raised { @classifier.train_interesting "love" }
9
- end
10
-
11
- def test_bad_training
12
- assert_raise(StandardError) { @classifier.train_no_category "words" }
13
- end
14
-
15
- def test_bad_method
16
- assert_raise(NoMethodError) { @classifier.forget_everything_you_know "" }
17
- end
18
-
19
- def test_categories
20
- assert_equal ['Interesting', 'Uninteresting'].sort, @classifier.categories.sort
21
- end
22
-
23
- def test_add_category
24
- @classifier.add_category 'Test'
25
- assert_equal ['Test', 'Interesting', 'Uninteresting'].sort, @classifier.categories.sort
26
- end
27
-
28
- def test_classification
29
- @classifier.train_interesting "here are some good words. I hope you love them"
30
- @classifier.train_uninteresting "here are some bad words, I hate you"
31
- assert_equal 'Uninteresting', @classifier.classify("I hate bad words and you")
32
- end
33
- end
@@ -1,35 +0,0 @@
1
- require File.dirname(__FILE__) + '/../test_helper'
2
- class StringExtensionsTest < Test::Unit::TestCase
3
- def test_word_hash
4
- hash = {:good=>1, :"!"=>1, :hope=>1, :"'"=>1, :"."=>1, :love=>1, :word=>1, :them=>1, :test=>1}
5
- assert_equal hash, "here are some good words of test's. I hope you love them!".word_hash
6
- end
7
-
8
-
9
- def test_clean_word_hash
10
- hash = {:good=>1, :word=>1, :hope=>1, :love=>1, :them=>1, :test=>1}
11
- assert_equal hash, "here are some good words of test's. I hope you love them!".clean_word_hash
12
- end
13
-
14
- end
15
-
16
-
17
- class ArrayExtensionsTest < Test::Unit::TestCase
18
-
19
- def test_plays_nicely_with_any_array
20
- assert_equal [Array].sum, Array
21
- end
22
-
23
- def test_monkey_path_array_sum
24
- assert_equal [1,2,3].sum, 6
25
- end
26
-
27
- def test_summing_an_empty_array
28
- assert_equal [nil].sum, 0
29
- end
30
-
31
- def test_summing_an_empty_array
32
- assert_equal Array[].sum, 0
33
- end
34
-
35
- end
@@ -1,123 +0,0 @@
1
- require File.dirname(__FILE__) + '/../test_helper'
2
- class LSITest < Test::Unit::TestCase
3
- def setup
4
- # we repeat principle words to help weight them.
5
- # This test is rather delicate, since this system is mostly noise.
6
- @str1 = "This text deals with dogs. Dogs."
7
- @str2 = "This text involves dogs too. Dogs! "
8
- @str3 = "This text revolves around cats. Cats."
9
- @str4 = "This text also involves cats. Cats!"
10
- @str5 = "This text involves birds. Birds."
11
- end
12
-
13
- def test_basic_indexing
14
- lsi = Classifier::LSI.new
15
- [@str1, @str2, @str3, @str4, @str5].each { |x| lsi << x }
16
- assert ! lsi.needs_rebuild?
17
-
18
- # note that the closest match to str1 is str2, even though it is not
19
- # the closest text match.
20
- assert_equal [@str2, @str5, @str3], lsi.find_related(@str1, 3)
21
- end
22
-
23
- def test_not_auto_rebuild
24
- lsi = Classifier::LSI.new :auto_rebuild => false
25
- lsi.add_item @str1, "Dog"
26
- lsi.add_item @str2, "Dog"
27
- assert lsi.needs_rebuild?
28
- lsi.build_index
29
- assert ! lsi.needs_rebuild?
30
- end
31
-
32
- def test_basic_categorizing
33
- lsi = Classifier::LSI.new
34
- lsi.add_item @str2, "Dog"
35
- lsi.add_item @str3, "Cat"
36
- lsi.add_item @str4, "Cat"
37
- lsi.add_item @str5, "Bird"
38
-
39
- assert_equal "Dog", lsi.classify( @str1 )
40
- assert_equal "Cat", lsi.classify( @str3 )
41
- assert_equal "Bird", lsi.classify( @str5 )
42
- end
43
-
44
- def test_external_classifying
45
- lsi = Classifier::LSI.new
46
- bayes = Classifier::Bayes.new 'Dog', 'Cat', 'Bird'
47
- lsi.add_item @str1, "Dog" ; bayes.train_dog @str1
48
- lsi.add_item @str2, "Dog" ; bayes.train_dog @str2
49
- lsi.add_item @str3, "Cat" ; bayes.train_cat @str3
50
- lsi.add_item @str4, "Cat" ; bayes.train_cat @str4
51
- lsi.add_item @str5, "Bird" ; bayes.train_bird @str5
52
-
53
- # We're talking about dogs. Even though the text matches the corpus on
54
- # cats better. Dogs have more semantic weight than cats. So bayes
55
- # will fail here, but the LSI recognizes content.
56
- tricky_case = "This text revolves around dogs."
57
- assert_equal "Dog", lsi.classify( tricky_case )
58
- assert_not_equal "Dog", bayes.classify( tricky_case )
59
- end
60
-
61
- def test_recategorize_interface
62
- lsi = Classifier::LSI.new
63
- lsi.add_item @str1, "Dog"
64
- lsi.add_item @str2, "Dog"
65
- lsi.add_item @str3, "Cat"
66
- lsi.add_item @str4, "Cat"
67
- lsi.add_item @str5, "Bird"
68
-
69
- tricky_case = "This text revolves around dogs."
70
- assert_equal "Dog", lsi.classify( tricky_case )
71
-
72
- # Recategorize as needed.
73
- lsi.categories_for(@str1).clear.push "Cow"
74
- lsi.categories_for(@str2).clear.push "Cow"
75
-
76
- assert !lsi.needs_rebuild?
77
- assert_equal "Cow", lsi.classify( tricky_case )
78
- end
79
-
80
- def test_search
81
- lsi = Classifier::LSI.new
82
- [@str1, @str2, @str3, @str4, @str5].each { |x| lsi << x }
83
-
84
- # Searching by content and text, note that @str2 comes up first, because
85
- # both "dog" and "involve" are present. But, the next match is @str1 instead
86
- # of @str4, because "dog" carries more weight than involves.
87
- assert_equal( [@str2, @str1, @str4, @str5, @str3],
88
- lsi.search("dog involves", 100) )
89
-
90
- # Keyword search shows how the space is mapped out in relation to
91
- # dog when magnitude is remove. Note the relations. We move from dog
92
- # through involve and then finally to other words.
93
- assert_equal( [@str1, @str2, @str4, @str5, @str3],
94
- lsi.search("dog", 5) )
95
- end
96
-
97
- def test_serialize_safe
98
- lsi = Classifier::LSI.new
99
- [@str1, @str2, @str3, @str4, @str5].each { |x| lsi << x }
100
-
101
- lsi_md = Marshal.dump lsi
102
- lsi_m = Marshal.load lsi_md
103
-
104
- assert_equal lsi_m.search("cat", 3), lsi.search("cat", 3)
105
- assert_equal lsi_m.find_related(@str1, 3), lsi.find_related(@str1, 3)
106
- end
107
-
108
- def test_keyword_search
109
- lsi = Classifier::LSI.new
110
- lsi.add_item @str1, "Dog"
111
- lsi.add_item @str2, "Dog"
112
- lsi.add_item @str3, "Cat"
113
- lsi.add_item @str4, "Cat"
114
- lsi.add_item @str5, "Bird"
115
-
116
- assert_equal [:dog, :text, :deal], lsi.highest_ranked_stems(@str1)
117
- end
118
-
119
- def test_summary
120
- assert_equal "This text involves dogs too [...] This text also involves cats", [@str1, @str2, @str3, @str4, @str5].join.summary(2)
121
- end
122
-
123
- end
@@ -1,4 +0,0 @@
1
- $:.unshift(File.dirname(__FILE__) + '/../lib')
2
-
3
- require 'test/unit'
4
- require 'classifier'