classifier-reborn 2.0.4 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.markdown +25 -3
- data/bin/bayes.rb +22 -22
- data/bin/summarize.rb +4 -4
- data/lib/classifier-reborn.rb +1 -1
- data/lib/classifier-reborn/bayes.rb +50 -46
- data/lib/classifier-reborn/category_namer.rb +4 -3
- data/lib/classifier-reborn/extensions/hasher.rb +19 -11
- data/lib/classifier-reborn/extensions/vector.rb +27 -26
- data/lib/classifier-reborn/extensions/vector_serialize.rb +8 -10
- data/lib/classifier-reborn/lsi.rb +89 -77
- data/lib/classifier-reborn/lsi/cached_content_node.rb +4 -5
- data/lib/classifier-reborn/lsi/content_node.rb +15 -18
- data/lib/classifier-reborn/lsi/summarizer.rb +5 -5
- data/lib/classifier-reborn/lsi/word_list.rb +1 -2
- data/lib/classifier-reborn/version.rb +1 -1
- metadata +46 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 53745cead2833e74e9d74f3359bae0bc7fd01fa4
+  data.tar.gz: 2926365890cf0bc43f7ff17570789ed5e032a45d
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 5def462eccbb9ef7a45d3968a5fbc2e2a886bfdda734a98e1f5904943ba1b4012ea331a51ff52e2c165db776dc8c6aed35fac9e01ea3e3f001f82daf98027f74
+  data.tar.gz: 22726b8f6c2acab5bb47b9faac2a2d615557bfff535327ef5bbe022a661c1be13c3b7cec95d1c0030f93158eb0942e121d97f2a0213a640eca00607de9d55f61
data/README.markdown
CHANGED
@@ -1,5 +1,12 @@
 ## Welcome to Classifier Reborn
 
+[][ruby-gems]
+[][travis]
+[][gemnasium]
+[ruby-gems]: https://rubygems.org/gems/jekyll/classifier-reborn
+[gemnasium]: https://gemnasium.com/jekyll/classifier-reborn
+[travis]: https://travis-ci.org/jekyll/classifier-reborn
+
 Classifier is a general module to allow Bayesian and other types of classifications.
 
 Classifier Reborn is a fork of cardmagic/classifier under more active development.
@@ -37,6 +44,8 @@ Notice that LSI will work without these libraries, but as soon as they are insta
 
 A Bayesian classifier by Lucas Carlson. Bayesian Classifiers are accurate, fast, and have modest memory requirements.
 
+*Note: Classifier only supports UTF-8 characters.*
+
 ### Usage
 
 ```ruby
@@ -60,7 +69,7 @@ trained_classifier.classify "I love" # returns 'Interesting'
 ```
 
 Beyond the basic example, the constructor and trainer can be used in a more
-flexible way to
+flexible way to accommodate non-trival applications. Consider the following
 program:
 
 ```ruby
@@ -72,7 +81,8 @@ require 'classifier-reborn'
 training_set = DATA.read.split("\n")
 categories = training_set.shift.split(',').map{|c| c.strip}
 
-
+# pass :auto_categorize option to allow feeding previously unknown categories
+classifier = ClassifierReborn::Bayes.new categories, auto_categorize: true
 
 training_set.each do |a_line|
   next if a_line.empty? || '#' == a_line.strip[0]
@@ -141,7 +151,7 @@ Or suppose you just want the ability to have multiple categories and a 'None of
 When you initialize the *ClassifierReborn::Bayes* classifier there are several options which can be set that control threshold processing.
 
 ```ruby
-b =
+b = ClassifierReborn::Bayes.new(
     'good', # one or more categories
     enable_threshold: true, # default: false
     threshold: -10.0 # default: 0.0
@@ -218,6 +228,18 @@ with more than just simple strings.
 * http://www.chadfowler.com/index.cgi/Computing/LatentSemanticIndexing.rdoc
 * http://en.wikipedia.org/wiki/Latent_semantic_analysis
 
+
+## Code of Conduct
+
+In order to have a more open and welcoming community, Classifier-Reborn adheres to the Jekyll
+[code of conduct](https://github.com/jekyll/jekyll/blob/master/CONDUCT.markdown) adapted from the Ruby on Rails code of
+conduct.
+
+Please adhere to this code of conduct in any interactions you have in the
+Classifier community. If you encounter someone violating
+these terms, please let [@chase](https://github.com/Ch4s3) know and we will address it as soon as possible.
+
+
 ## Authors
 
 * Lucas Carlson (lucas@rufy.com)
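The README hunks above add an `auto_categorize:` constructor flag and complete the previously truncated threshold example. A minimal sketch combining both (assuming the 2.1.0 gem is installed; the training text and return values are illustrative):

```ruby
require 'classifier-reborn'

# auto_categorize lets train() accept categories that were never declared
# up front; enable_threshold/threshold gate what classify() will return.
classifier = ClassifierReborn::Bayes.new(
  'Interesting',
  auto_categorize: true,
  enable_threshold: true,
  threshold: -10.0
)

classifier.train('Interesting', 'Here are some good words. I hope you love them.')
classifier.train('Uninteresting', 'Here are some bad words.') # category created on the fly

classifier.classify('I love you') # => "Interesting", or nil when the score falls below -10.0
```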
data/bin/bayes.rb
CHANGED
@@ -1,36 +1,36 @@
 #!/usr/bin/env ruby
 
 begin
-
-
+  require 'rubygems'
+  require 'classifier'
 rescue
-
+  require 'classifier'
 end
 
 require 'madeleine'
 
-m = SnapshotMadeleine.new(File.expand_path(
-
-
+m = SnapshotMadeleine.new(File.expand_path('~/.bayes_data')) do
+  ClassifierReborn::Bayes.new 'Interesting', 'Uninteresting'
+end
 
 case ARGV[0]
-when
-
-
-
-
-
-
-
-
-
-
-
-when
-
+when 'add'
+  case ARGV[1].downcase
+  when 'interesting'
+    m.system.train_interesting File.open(ARGV[2]).read
+    puts "#{ARGV[2]} has been classified as interesting"
+  when 'uninteresting'
+    m.system.train_uninteresting File.open(ARGV[2]).read
+    puts "#{ARGV[2]} has been classified as uninteresting"
+  else
+    puts 'Invalid category: choose between interesting and uninteresting'
+    exit(1)
+  end
+when 'classify'
+  puts m.system.classify(File.open(ARGV[1]).read)
 else
-
-
+  puts 'Invalid option: choose add [category] [file] or clasify [file]'
+  exit(-1)
 end
 
 m.take_snapshot
data/bin/summarize.rb
CHANGED
@@ -1,10 +1,10 @@
 #!/usr/bin/env ruby
 
 begin
-
-
+  require 'rubygems'
+  require 'classifier'
 rescue
-
+  require 'classifier'
 end
 
 require 'open-uri'
@@ -13,4 +13,4 @@ num = ARGV[1].to_i
 num = num < 1 ? 10 : num
 
 text = open(ARGV.first).read
-puts text.gsub(/<[^>]+>/,
+puts text.gsub(/<[^>]+>/, '').gsub(/[\s]+/, ' ').summary(num)
data/lib/classifier-reborn/bayes.rb
CHANGED
@@ -17,20 +17,22 @@ module ClassifierReborn
     # auto_categorize: false  When true, enables ability to dynamically declare a category
     # enable_threshold: false  When true, enables a threshold requirement for classifition
     # threshold: 0.0  Default threshold, only used when enabled
+    # enable_stemmer: true  When false, disables word stemming
     def initialize(*args)
-      @categories =
-      options = { language: 'en',
+      @categories = {}
+      options = { language: 'en',
                   auto_categorize: false,
                   enable_threshold: false,
-                  threshold: 0.0
+                  threshold: 0.0,
+                  enable_stemmer: true
                 }
-      args.flatten.each
-        if arg.
+      args.flatten.each do |arg|
+        if arg.is_a?(Hash)
           options.merge!(arg)
         else
           add_category(arg)
         end
-
+      end
 
       @total_words = 0
       @category_counts = Hash.new(0)
@@ -40,6 +42,7 @@ module ClassifierReborn
       @auto_categorize = options[:auto_categorize]
       @enable_threshold = options[:enable_threshold]
       @threshold = options[:threshold]
+      @enable_stemmer = options[:enable_stemmer]
     end
 
     # Provides a general training method for all categories specified in Bayes#new
@@ -52,18 +55,18 @@ module ClassifierReborn
       category = CategoryNamer.prepare_name(category)
 
       # Add the category dynamically or raise an error
-
+      unless @categories.key?(category)
         if @auto_categorize
           add_category(category)
         else
-          raise CategoryNotFoundError
+          raise CategoryNotFoundError, "Cannot train; category #{category} does not exist"
         end
       end
 
       @category_counts[category] += 1
-      Hasher.word_hash(text, @language).each do |word, count|
-        @categories[category][word]
-        @category_word_count[category]
+      Hasher.word_hash(text, @language, @enable_stemmer).each do |word, count|
+        @categories[category][word] += count
+        @category_word_count[category] += count
         @total_words += count
       end
     end
@@ -78,20 +81,17 @@ module ClassifierReborn
     def untrain(category, text)
       category = CategoryNamer.prepare_name(category)
       @category_counts[category] -= 1
-      Hasher.word_hash(text, @language).each do |word, count|
-        if @total_words
-
-
-
-
-
-        end
-
-        if @category_word_count[category] >= count
-          @category_word_count[category] -= count
-        end
-        @total_words -= count
+      Hasher.word_hash(text, @language, @enable_stemmer).each do |word, count|
+        next if @total_words < 0
+        orig = @categories[category][word] || 0
+        @categories[category][word] -= count
+        if @categories[category][word] <= 0
+          @categories[category].delete(word)
+          count = orig
        end
+
+        @category_word_count[category] -= count if @category_word_count[category] >= count
+        @total_words -= count
      end
     end
 
@@ -100,21 +100,21 @@ module ClassifierReborn
     #  => {"Uninteresting"=>-12.6997928013932, "Interesting"=>-18.4206807439524}
     # The largest of these scores (the one closest to 0) is the one picked out by #classify
     def classifications(text)
-      score =
-      word_hash = Hasher.word_hash(text, @language)
+      score = {}
+      word_hash = Hasher.word_hash(text, @language, @enable_stemmer)
       training_count = @category_counts.values.reduce(:+).to_f
       @categories.each do |category, category_words|
         score[category.to_s] = 0
         total = (@category_word_count[category] || 1).to_f
-        word_hash.each do |word,
-          s = category_words.
-          score[category.to_s] += Math.log(s/total)
+        word_hash.each do |word, _count|
+          s = category_words.key?(word) ? category_words[word] : 0.1
+          score[category.to_s] += Math.log(s / total)
         end
         # now add prior probability for the category
-        s = @category_counts.
+        s = @category_counts.key?(category) ? @category_counts[category] : 0.1
         score[category.to_s] += Math.log(s / training_count)
       end
-
+      score
     end
 
     # Returns the classification of the provided +text+, which is one of the
@@ -128,21 +128,15 @@ module ClassifierReborn
     # Return the classification without the score
     def classify(text)
       result, score = classify_with_score(text)
-      if threshold_enabled?
-
-      end
-      return result
+      result = nil if score < @threshold || score == Float::INFINITY if threshold_enabled?
+      result
     end
 
     # Retrieve the current threshold value
-
-      @threshold
-    end
+    attr_reader :threshold
 
     # Dynamically set the threshold value
-
-      @threshold = a_float
-    end
+    attr_writer :threshold
 
     # Dynamically enable threshold for classify results
     def enable_threshold
@@ -164,6 +158,16 @@ module ClassifierReborn
       !@enable_threshold
     end
 
+    # Is word stemming enabled?
+    def stemmer_enabled?
+      @enable_stemmer
+    end
+
+    # Is word stemming disabled?
+    def stemmer_disabled?
+      !@enable_stemmer
+    end
+
     # Provides training and untraining methods for the categories specified in Bayes#new
     # For example:
     #     b = ClassifierReborn::Bayes.new 'This', 'That', 'the_other'
@@ -174,12 +178,12 @@ module ClassifierReborn
     def method_missing(name, *args)
       cleaned_name = name.to_s.gsub(/(un)?train_([\w]+)/, '\2')
       category = CategoryNamer.prepare_name(cleaned_name)
-      if @categories.
-        args.each { |text| eval("#{
+      if @categories.key? category
+        args.each { |text| eval("#{Regexp.last_match(1)}train(category, text)") }
       elsif name.to_s =~ /(un)?train_([\w]+)/
         raise StandardError, "No such category: #{category}"
       else
-        super
+        super # raise StandardError, "No such method: #{name}"
       end
     end
 
@@ -188,7 +192,7 @@ module ClassifierReborn
     #     b.categories
     #      => ['This', 'That', 'the_other']
     def categories # :nodoc:
-      @categories.keys.collect
+      @categories.keys.collect(&:to_s)
     end
 
     # Allows you to add categories to the classifier.
@@ -203,6 +207,6 @@ module ClassifierReborn
       @categories[CategoryNamer.prepare_name(category)] ||= Hash.new(0)
     end
 
-
+    alias_method :append_category, :add_category
   end
 end
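Beyond the `enable_stemmer` plumbing, the hunks above collapse the hand-written threshold getter/setter into `attr_reader`/`attr_writer` and add `stemmer_enabled?`/`stemmer_disabled?` predicates. A short sketch of that surface (return values illustrative):

```ruby
require 'classifier-reborn'

# enable_stemmer: false indexes whole words instead of stems (new in 2.1.0)
b = ClassifierReborn::Bayes.new('Interesting', 'Uninteresting', enable_stemmer: false)

b.stemmer_enabled?   # => false
b.stemmer_disabled?  # => true

b.threshold          # => 0.0 (now a plain attr_reader)
b.threshold = -5.0   # now a plain attr_writer
b.enable_threshold
b.threshold_enabled? # => true
```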
data/lib/classifier-reborn/category_namer.rb
CHANGED
@@ -7,11 +7,12 @@ require 'classifier-reborn/extensions/hasher'
 
 module ClassifierReborn
   module CategoryNamer
-
-
+    module_function
+
+    def prepare_name(name)
       return name if name.is_a?(Symbol)
 
-      name.to_s.
+      name.to_s.tr('_', ' ').capitalize.intern
     end
   end
 end
data/lib/classifier-reborn/extensions/hasher.rb
CHANGED
@@ -9,29 +9,37 @@ module ClassifierReborn
   module Hasher
     STOPWORDS_PATH = [File.expand_path(File.dirname(__FILE__) + '/../../../data/stopwords')]
 
-
+    module_function
 
     # Return a Hash of strings => ints. Each word in the string is stemmed,
     # interned, and indexes to its frequency in the document.
-    def word_hash(str, language = 'en')
-      cleaned_word_hash = clean_word_hash(str, language)
+    def word_hash(str, language = 'en', enable_stemmer = true)
+      cleaned_word_hash = clean_word_hash(str, language, enable_stemmer)
       symbol_hash = word_hash_for_symbols(str.scan(/[^\s\p{WORD}]/))
-
+      cleaned_word_hash.merge(symbol_hash)
     end
 
     # Return a word hash without extra punctuation or short symbols, just stemmed words
-    def clean_word_hash(str, language = 'en')
-      word_hash_for_words str.gsub(/[^\p{WORD}\s]/,'').downcase.split, language
+    def clean_word_hash(str, language = 'en', enable_stemmer = true)
+      word_hash_for_words str.gsub(/[^\p{WORD}\s]/, '').downcase.split, language, enable_stemmer
     end
 
-    def word_hash_for_words(words, language = 'en')
+    def word_hash_for_words(words, language = 'en', enable_stemmer = true)
       d = Hash.new(0)
       words.each do |word|
-
+        next unless word.length > 2 && !STOPWORDS[language].include?(word)
+        if enable_stemmer
           d[word.stem.intern] += 1
+        else
+          d[word.intern] += 1
         end
       end
-
+      d
+    end
+
+    # Add custom path to a new stopword file created by user
+    def add_custom_stopword_path(path)
+      STOPWORDS_PATH.unshift(path)
     end
 
     def word_hash_for_symbols(words)
@@ -39,7 +47,7 @@ module ClassifierReborn
       words.each do |word|
         d[word.intern] += 1
       end
-
+      d
     end
 
     # Create a lazily-loaded hash of stopword data
@@ -48,7 +56,7 @@ module ClassifierReborn
 
       STOPWORDS_PATH.each do |path|
         if File.exist?(File.join(path, language))
-          hash[language] = Set.new File.read(File.join(path, language.to_s)).split
+          hash[language] = Set.new File.read(File.join(path, language.to_s)).force_encoding("utf-8").split
           break
         end
       end
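Two things fall out of the hasher hunks: every hashing method gains an `enable_stemmer` positional argument, and `add_custom_stopword_path` lets callers prepend their own stopword directory. A sketch (the path and the exact hash contents are illustrative; stems come from fast-stemmer):

```ruby
require 'classifier-reborn'

# A custom directory is searched before the bundled data/stopwords; it is
# assumed to hold one file per language code (e.g. 'en'), same format.
ClassifierReborn::Hasher.add_custom_stopword_path('/opt/my_stopwords')

# The third positional argument toggles stemming (defaults to true).
ClassifierReborn::Hasher.word_hash('running quickly', 'en', true)  # => { run: 1, quickli: 1 }
ClassifierReborn::Hasher.word_hash('running quickly', 'en', false) # => { running: 1, quickly: 1 }
```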
data/lib/classifier-reborn/extensions/vector.rb
CHANGED
@@ -6,17 +6,17 @@
 require 'matrix'
 
 class Matrix
-  def
-
+  def self.diag(s)
+    Matrix.diagonal(*s)
   end
 
-
+  alias_method :trans, :transpose
 
   def SV_decomp(maxSweeps = 20)
-    if
-      q =
+    if row_size >= column_size
+      q = trans * self
     else
-      q = self *
+      q = self * trans
     end
 
     qrot = q.dup
@@ -24,55 +24,56 @@ class Matrix
     mzrot = nil
     cnt = 0
     s_old = nil
-    mu = nil
 
-
+    loop do
       cnt += 1
-
-
+      (0...qrot.row_size - 1).each do |row|
+        (1..qrot.row_size - 1).each do |col|
           next if row == col
-
+
+          h = Math.atan((2 * qrot[row, col]) / (qrot[row, row] - qrot[col, col])) / 2.0
           hcos = Math.cos(h)
           hsin = Math.sin(h)
           mzrot = Matrix.identity(qrot.row_size)
-          mzrot[row,row] = hcos
-          mzrot[row,col] = -hsin
-          mzrot[col,row] = hsin
-          mzrot[col,col] = hcos
+          mzrot[row, row] = hcos
+          mzrot[row, col] = -hsin
+          mzrot[col, row] = hsin
+          mzrot[col, col] = hcos
           qrot = mzrot.trans * qrot * mzrot
-          v
+          v *= mzrot
         end
       end
       s_old = qrot.dup if cnt == 1
       sum_qrot = 0.0
       if cnt > 1
         qrot.row_size.times do |r|
-          sum_qrot += (qrot[r,r]-s_old[r,r]).abs if (qrot[r,r]-s_old[r,r]).abs > 0.001
+          sum_qrot += (qrot[r, r] - s_old[r, r]).abs if (qrot[r, r] - s_old[r, r]).abs > 0.001
         end
         s_old = qrot.dup
       end
-      break if (sum_qrot <= 0.001
+      break if (sum_qrot <= 0.001 && cnt > 1) || cnt >= maxSweeps
     end # of do while true
     s = []
     qrot.row_size.times do |r|
-      s << Math.sqrt(qrot[r,r])
+      s << Math.sqrt(qrot[r, r])
     end
-    #puts "cnt = #{cnt}"
-    if
-      mu = self *
+    # puts "cnt = #{cnt}"
+    if row_size >= column_size
+      mu = self * v * Matrix.diagonal(*s).inverse
       return [mu, v, s]
     else
       puts v.row_size
       puts v.column_size
-      puts
-      puts
+      puts row_size
+      puts column_size
       puts s.size
 
-      mu = (
+      mu = (trans * v * Matrix.diagonal(*s).inverse)
       return [mu, v, s]
     end
   end
-
+
+  def []=(i, j, val)
     @rows[i][j] = val
   end
 end
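The pure-Ruby fallback above fills in a Jacobi-rotation SVD (`SV_decomp`) plus the `diag` and `trans` helpers that `build_reduced_matrix` in lsi.rb relies on. A sketch of how the monkey-patched `Matrix` gets used, mirroring that caller (only valid when the extension is loaded, i.e. when GSL is absent):

```ruby
require 'matrix'
require 'classifier-reborn' # loads extensions/vector when the gsl gem is missing

m = Matrix[[2.0, 0.0], [0.0, 1.0], [1.0, 1.0]]
u, v, s = m.SV_decomp                  # left vectors, right vectors, singular values
approx = u * Matrix.diag(s) * v.trans  # rank-limited reconstruction of m
```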
data/lib/classifier-reborn/extensions/vector_serialize.rb
CHANGED
@@ -1,20 +1,18 @@
 module GSL
-
   class Vector
-    def _dump(
-      Marshal.dump(
+    def _dump(_v)
+      Marshal.dump(to_a)
     end
-
+
     def self._load(arr)
       arry = Marshal.load(arr)
-
+      GSL::Vector.alloc(arry)
     end
-
   end
-
+
   class Matrix
-
-
-
+    class <<self
+      alias_method :diag, :diagonal
+    end
   end
 end
data/lib/classifier-reborn/lsi.rb
CHANGED
@@ -3,13 +3,14 @@
 # License:: LGPL
 
 begin
-  raise LoadError if ENV['NATIVE_VECTOR'] ==
+  raise LoadError if ENV['NATIVE_VECTOR'] == 'true' # to test the native vector class, try `rake test NATIVE_VECTOR=true`
 
-  require 'gsl' # requires
+  require 'gsl' # requires https://github.com/SciRuby/rb-gsl
   require_relative 'extensions/vector_serialize'
   $GSL = true
 
 rescue LoadError
+  $GSL = false
   require_relative 'extensions/vector'
 end
 
@@ -19,12 +20,10 @@ require_relative 'lsi/cached_content_node'
 require_relative 'lsi/summarizer'
 
 module ClassifierReborn
-
   # This class implements a Latent Semantic Indexer, which can search, classify and cluster
   # data based on underlying semantic relations. For more information on the algorithms used,
   # please consult Wikipedia[http://en.wikipedia.org/wiki/Latent_Semantic_Indexing].
   class LSI
-
     attr_reader :word_list, :cache_node_vectors
     attr_accessor :auto_rebuild
 
@@ -36,12 +35,12 @@ module ClassifierReborn
     #
     def initialize(options = {})
       @auto_rebuild = options[:auto_rebuild] != false
-      @word_list
-      @
+      @word_list = WordList.new
+      @items = {}
+      @version = 0
+      @built_at_version = -1
       @language = options[:language] || 'en'
-      if @cache_node_vectors = options[:cache_node_vectors]
-        extend CachedContentNode::InstanceMethods
-      end
+      extend CachedContentNode::InstanceMethods if @cache_node_vectors = options[:cache_node_vectors]
     end
 
     # Returns true if the index needs to be rebuilt. The index needs
@@ -64,39 +63,44 @@ module ClassifierReborn
     #   ar = ActiveRecordObject.find( :all )
     #   lsi.add_item ar, *ar.categories { |x| ar.content }
     #
-    def add_item(
+    def add_item(item, *categories, &block)
       clean_word_hash = Hasher.clean_word_hash((block ? block.call(item) : item.to_s), @language)
-
-
+      if clean_word_hash.empty?
+        puts "Input: '#{item}' is entirely stopwords or words with 2 or fewer characters. Classifier-Reborn cannot handle this document properly."
       else
-
+        @items[item] = if @cache_node_vectors
+                         CachedContentNode.new(clean_word_hash, *categories)
+                       else
+                         ContentNode.new(clean_word_hash, *categories)
+                       end
+        @version += 1
+        build_index if @auto_rebuild
       end
-      @version += 1
-      build_index if @auto_rebuild
     end
 
     # A less flexible shorthand for add_item that assumes
     # you are passing in a string with no categorries. item
    # will be duck typed via to_s .
     #
-    def <<(
-      add_item
+    def <<(item)
+      add_item(item)
     end
 
     # Returns the categories for a given indexed items. You are free to add and remove
     # items from this as you see fit. It does not invalide an index to change its categories.
     def categories_for(item)
       return [] unless @items[item]
-
+
+      @items[item].categories
     end
 
     # Removes an item from the database, if it is indexed.
     #
-    def remove_item(
-
-
-
+    def remove_item(item)
+      return unless @items.key? item
+
+      @items.delete item
+      @version += 1
     end
 
     # Returns an array of items that are indexed.
@@ -118,30 +122,30 @@ module ClassifierReborn
     # cutoff parameter tells the indexer how many of these values to keep.
     # A value of 1 for cutoff means that no semantic analysis will take place,
     # turning the LSI class into a simple vector search engine.
-    def build_index(
+    def build_index(cutoff = 0.75)
       return unless needs_rebuild?
       make_word_list
 
       doc_list = @items.values
-      tda = doc_list.collect { |node| node.raw_vector_with(
+      tda = doc_list.collect { |node| node.raw_vector_with(@word_list) }
 
       if $GSL
-
-
-
-
-
-
-
-
+        tdm = GSL::Matrix.alloc(*tda).trans
+        ntdm = build_reduced_matrix(tdm, cutoff)
+
+        ntdm.size[1].times do |col|
+          vec = GSL::Vector.alloc(ntdm.column(col)).row
+          doc_list[col].lsi_vector = vec
+          doc_list[col].lsi_norm = vec.normalize
+        end
       else
-
-
+        tdm = Matrix.rows(tda).trans
+        ntdm = build_reduced_matrix(tdm, cutoff)
 
-
-
-
-
+        ntdm.row_size.times do |col|
+          doc_list[col].lsi_vector = ntdm.column(col) if doc_list[col]
+          doc_list[col].lsi_norm = ntdm.column(col).normalize if doc_list[col]
+        end
       end
 
       @built_at_version = @version
@@ -155,13 +159,13 @@ module ClassifierReborn
     # your dataset's general content. For example, if you were to use categorize on the
     # results of this data, you could gather information on what your dataset is generally
     # about.
-    def highest_relative_content(
-
+    def highest_relative_content(max_chunks = 10)
+      return [] if needs_rebuild?
 
-
-
+      avg_density = {}
+      @items.each_key { |item| avg_density[item] = proximity_array_for_content(item).inject(0.0) { |x, y| x + y[1] } }
 
-
+      avg_density.keys.sort_by { |x| avg_density[x] }.reverse[0..max_chunks - 1].map
     end
 
     # This function is the primitive that find_related and classify
@@ -176,10 +180,10 @@ module ClassifierReborn
     # The parameter doc is the content to compare. If that content is not
     # indexed, you can pass an optional block to define how to create the
     # text data. See add_item for examples of how this works.
-    def proximity_array_for_content(
+    def proximity_array_for_content(doc, &block)
      return [] if needs_rebuild?
 
-      content_node = node_for_content(
+      content_node = node_for_content(doc, &block)
       result =
         @items.keys.collect do |item|
           if $GSL
@@ -197,10 +201,18 @@ module ClassifierReborn
     # calculated vectors instead of their full versions. This is useful when
     # you're trying to perform operations on content that is much smaller than
     # the text you're working with. search uses this primitive.
-    def proximity_norms_for_content(
+    def proximity_norms_for_content(doc, &block)
       return [] if needs_rebuild?
 
-      content_node = node_for_content(
+      content_node = node_for_content(doc, &block)
+      if $GSL && content_node.raw_norm.isnan?.all?
+        puts "There are no documents that are similar to #{doc}"
+      else
+        content_node_norms(content_node)
+      end
+    end
+
+    def content_node_norms(content_node)
       result =
         @items.keys.collect do |item|
           if $GSL
@@ -220,11 +232,13 @@ module ClassifierReborn
     #
     # While this may seem backwards compared to the other functions that LSI supports,
     # it is actually the same algorithm, just applied on a smaller document.
-    def search(
+    def search(string, max_nearest = 3)
       return [] if needs_rebuild?
-      carry = proximity_norms_for_content(
-
-
+      carry = proximity_norms_for_content(string)
+      unless carry.nil?
+        result = carry.collect { |x| x[0] }
+        result[0..max_nearest - 1]
+      end
     end
 
     # This function takes content and finds other documents
@@ -236,21 +250,21 @@ module ClassifierReborn
     # This is particularly useful for identifing clusters in your document space.
     # For example you may want to identify several "What's Related" items for weblog
     # articles, or find paragraphs that relate to each other in an essay.
-    def find_related(
+    def find_related(doc, max_nearest = 3, &block)
       carry =
-        proximity_array_for_content(
+        proximity_array_for_content(doc, &block).reject { |pair| pair[0].eql? doc }
       result = carry.collect { |x| x[0] }
-
+      result[0..max_nearest - 1]
     end
 
     # Return the most obvious category with the score
-    def classify_with_score(
-
+    def classify_with_score(doc, cutoff = 0.30, &block)
+      scored_categories(doc, cutoff, &block).last
     end
 
     # Return the most obvious category without the score
-    def classify(
-
+    def classify(doc, cutoff = 0.30, &block)
+      scored_categories(doc, cutoff, &block).last.first
     end
 
     # This function uses a voting system to categorize documents, based on
@@ -262,10 +276,10 @@ module ClassifierReborn
     # text. A cutoff of 1 means that every document in the index votes on
     # what category the document is in. This may not always make sense.
     #
-    def scored_categories(
+    def scored_categories(doc, cutoff = 0.30, &block)
       icutoff = (@items.size * cutoff).round
-      carry = proximity_array_for_content(
-      carry = carry[0..icutoff-1]
+      carry = proximity_array_for_content(doc, &block)
+      carry = carry[0..icutoff - 1]
       votes = Hash.new(0.0)
       carry.each do |pair|
         @items[pair[0]].categories.each do |category|
@@ -273,31 +287,31 @@ module ClassifierReborn
         end
       end
 
-
+      votes.sort_by { |_, score| score }
     end
 
     # Prototype, only works on indexed documents.
     # I have no clue if this is going to work, but in theory
     # it's supposed to.
-    def highest_ranked_stems(
-      raise
-
-      top_n =
-
+    def highest_ranked_stems(doc, count = 3)
+      raise 'Requested stem ranking on non-indexed content!' unless @items[doc]
+      content_vector_array = node_for_content(doc).lsi_vector.to_a
+      top_n = content_vector_array.sort.reverse[0..count - 1]
+      top_n.collect { |x| @word_list.word_for_index(content_vector_array.index(x)) }
     end
 
     private
-
+
+    def build_reduced_matrix(matrix, cutoff = 0.75)
       # TODO: Check that M>=N on these dimensions! Transpose helps assure this
       u, v, s = matrix.SV_decomp
-
       # TODO: Better than 75% term, please. :\
       s_cutoff = s.sort.reverse[(s.size * cutoff).round - 1]
       s.size.times do |ord|
         s[ord] = 0.0 if s[ord] < s_cutoff
       end
       # Reconstruct the term document matrix, only with reduced rank
-      u * ($GSL ? GSL::Matrix : ::Matrix).diag(
+      u * ($GSL ? GSL::Matrix : ::Matrix).diag(s) * v.trans
     end
 
     def node_for_content(item, &block)
@@ -306,23 +320,21 @@ module ClassifierReborn
       else
         clean_word_hash = Hasher.clean_word_hash((block ? block.call(item) : item.to_s), @language)
 
-
+        content_node = ContentNode.new(clean_word_hash, &block) # make the node and extract the data
 
         unless needs_rebuild?
-
+          content_node.raw_vector_with(@word_list) # make the lsi raw and norm vectors
         end
       end
 
-
+      content_node
     end
 
     def make_word_list
       @word_list = WordList.new
       @items.each_value do |node|
-        node.word_hash.each_key { |key| @word_list.add_word
+        node.word_hash.each_key { |key| @word_list.add_word(key) }
       end
     end
-
   end
 end
-
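The lsi.rb hunks are mostly signature restorations (`doc`, `cutoff = 0.30`, `&block`) plus an explicit `$GSL = false` fallback and a NaN guard in `proximity_norms_for_content`. The public API they restore looks like this (a sketch; the categories and queries are illustrative):

```ruby
require 'classifier-reborn'

lsi = ClassifierReborn::LSI.new
lsi.add_item('Dogs are fun pets to walk', :dog)
lsi.add_item('Cats are quiet pets that purr', :cat)
lsi.add_item('Bird songs brighten mornings', :bird)

lsi.search('walking dogs', 2)          # the two nearest indexed items
lsi.classify('My pet purrs all night') # => :cat (vote over the nearest items)
lsi.find_related('Dogs are fun pets to walk', 1)
```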
data/lib/classifier-reborn/lsi/cached_content_node.rb
CHANGED
@@ -3,7 +3,6 @@
 # License:: LGPL
 
 module ClassifierReborn
-
   # Subclass of ContentNode which caches the search_vector transpositions.
   # Its great because its much faster for large indexes, but at the cost of more ram. Additionally,
   # if you Marshal your classifier and want to keep the size down, you'll need to manually
@@ -16,7 +15,7 @@ module ClassifierReborn
       end
     end
 
-    def initialize(
+    def initialize(word_hash, *categories)
       clear_cache!
       super
     end
@@ -29,13 +28,13 @@ module ClassifierReborn
     def transposed_search_vector
       @transposed_search_vector ||= super
     end
-
+
     # Clear the cache before we continue on
-    def raw_vector_with(
+    def raw_vector_with(word_list)
       clear_cache!
       super
     end
-
+
     # We don't want the cached_data here
     def marshal_dump
       [@lsi_vector, @lsi_norm, @raw_vector, @raw_norm, @categories, @word_hash]
data/lib/classifier-reborn/lsi/content_node.rb
CHANGED
@@ -3,10 +3,9 @@
 # License:: LGPL
 
 module ClassifierReborn
-
-  #
-  #
-  # You should never have to use it directly.
+  # This is an internal data structure class for the LSI node. Save for
+  # raw_vector_with, it should be fairly straightforward to understand.
+  # You should never have to use it directly.
   class ContentNode
     attr_accessor :raw_vector, :raw_norm,
                   :lsi_vector, :lsi_norm,
@@ -15,7 +14,7 @@ module ClassifierReborn
     attr_reader :word_hash
     # If text_proc is not specified, the source will be duck-typed
     # via source.to_s
-    def initialize(
+    def initialize(word_hash, *categories)
       @categories = categories || []
       @word_hash = word_hash
       @lsi_norm, @lsi_vector = nil
@@ -38,11 +37,11 @@ module ClassifierReborn
 
     # Creates the raw vector out of word_hash using word_list as the
     # key for mapping the vector space.
-    def raw_vector_with(
+    def raw_vector_with(word_list)
       if $GSL
-
+        vec = GSL::Vector.alloc(word_list.size)
       else
-
+        vec = Array.new(word_list.size, 0)
       end
 
       @word_hash.each_key do |word|
@@ -52,7 +51,7 @@ module ClassifierReborn
       # Perform the scaling transform and force floating point arithmetic
       if $GSL
         sum = 0.0
-        vec.each {|v| sum += v }
+        vec.each { |v| sum += v }
         total_words = sum
       else
         total_words = vec.reduce(0, :+).to_f
@@ -63,7 +62,7 @@ module ClassifierReborn
       if $GSL
         vec.each { |word| total_unique_words += 1 if word != 0.0 }
       else
-        total_unique_words = vec.count{ |word| word != 0 }
+        total_unique_words = vec.count { |word| word != 0 }
       end
 
       # Perform first-order association transform if this vector has more
@@ -71,9 +70,9 @@ module ClassifierReborn
       if total_words > 1.0 && total_unique_words > 1
         weighted_total = 0.0
         # Cache calculations, this takes too long on large indexes
-        cached_calcs = Hash.new
-          hash[term] = ((
-
+        cached_calcs = Hash.new do |hash, term|
+          hash[term] = ((term / total_words) * Math.log(term / total_words))
+        end
 
         vec.each do |term|
           weighted_total += cached_calcs[term] if term > 0.0
@@ -81,12 +80,12 @@ module ClassifierReborn
 
         # Cache calculations, this takes too long on large indexes
         cached_calcs = Hash.new do |hash, val|
-          hash[val] = Math.log(
+          hash[val] = Math.log(val + 1) / -weighted_total
         end
 
-        vec.collect!
+        vec.collect! do |val|
           cached_calcs[val]
-
+        end
       end
 
       if $GSL
@@ -97,7 +96,5 @@ module ClassifierReborn
         @raw_vector = Vector[*vec]
       end
     end
-
   end
-
 end
data/lib/classifier-reborn/lsi/summarizer.rb
CHANGED
@@ -4,13 +4,13 @@
 
 module ClassifierReborn
   module Summarizer
-
+    module_function
 
-    def summary(
+    def summary(str, count = 10, separator = ' [...] ')
       perform_lsi split_sentences(str), count, separator
     end
 
-    def paragraph_summary(
+    def paragraph_summary(str, count = 1, separator = ' [...] ')
       perform_lsi split_paragraphs(str), count, separator
     end
 
@@ -23,11 +23,11 @@ module ClassifierReborn
     end
 
     def perform_lsi(chunks, count, separator)
-      lsi = ClassifierReborn::LSI.new :
+      lsi = ClassifierReborn::LSI.new auto_rebuild: false
       chunks.each { |chunk| lsi << chunk unless chunk.strip.empty? || chunk.strip.split.size == 1 }
       lsi.build_index
       summaries = lsi.highest_relative_content count
-
+      summaries.reject { |chunk| !summaries.include? chunk }.map(&:strip).join(separator)
     end
   end
 end
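With `module_function` in place, the summarizer's methods double as module-level calls. A sketch of the signatures the hunk fills in (`count` and `separator` now have visible defaults; the input text is illustrative):

```ruby
require 'classifier-reborn'

text = 'Dogs love long walks in the park. Cats prefer napping near a warm window. ' \
       'Either way, pets reward patient owners with affection.'

# Keep the 2 most representative sentences, joined by the default ' [...] '
ClassifierReborn::Summarizer.summary(text, 2)

# Or summarize paragraph by paragraph
ClassifierReborn::Summarizer.paragraph_summary(text, 1)
```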
data/lib/classifier-reborn/lsi/word_list.rb
CHANGED
@@ -8,7 +8,7 @@ module ClassifierReborn
 
   class WordList
     def initialize
-      @location_table =
+      @location_table = {}
     end
 
     # Adds a word (if it is new) and assigns it a unique dimension.
@@ -31,6 +31,5 @@ module ClassifierReborn
     def size
       @location_table.size
     end
-
   end
 end
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: classifier-reborn
 version: !ruby/object:Gem::Version
-  version: 2.0.4
+  version: 2.1.0
 platform: ruby
 authors:
 - Lucas Carlson
@@ -10,7 +10,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date:
+date: 2017-01-01 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: fast-stemmer
@@ -55,7 +55,49 @@ dependencies:
   - !ruby/object:Gem::Version
     version: '0'
 - !ruby/object:Gem::Dependency
-  name:
+  name: minitest
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: minitest-reporters
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: rubocop
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: pry
   requirement: !ruby/object:Gem::Requirement
     requirements:
     - - ">="
@@ -134,9 +176,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 2.
+rubygems_version: 2.5.2
 signing_key:
 specification_version: 2
 summary: A general classifier module to allow Bayesian and other types of classifications.
 test_files: []
-has_rdoc: true