classifier-reborn 2.0.4 → 2.1.0
- checksums.yaml +4 -4
- data/README.markdown +25 -3
- data/bin/bayes.rb +22 -22
- data/bin/summarize.rb +4 -4
- data/lib/classifier-reborn.rb +1 -1
- data/lib/classifier-reborn/bayes.rb +50 -46
- data/lib/classifier-reborn/category_namer.rb +4 -3
- data/lib/classifier-reborn/extensions/hasher.rb +19 -11
- data/lib/classifier-reborn/extensions/vector.rb +27 -26
- data/lib/classifier-reborn/extensions/vector_serialize.rb +8 -10
- data/lib/classifier-reborn/lsi.rb +89 -77
- data/lib/classifier-reborn/lsi/cached_content_node.rb +4 -5
- data/lib/classifier-reborn/lsi/content_node.rb +15 -18
- data/lib/classifier-reborn/lsi/summarizer.rb +5 -5
- data/lib/classifier-reborn/lsi/word_list.rb +1 -2
- data/lib/classifier-reborn/version.rb +1 -1
- metadata +46 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 53745cead2833e74e9d74f3359bae0bc7fd01fa4
+  data.tar.gz: 2926365890cf0bc43f7ff17570789ed5e032a45d
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 5def462eccbb9ef7a45d3968a5fbc2e2a886bfdda734a98e1f5904943ba1b4012ea331a51ff52e2c165db776dc8c6aed35fac9e01ea3e3f001f82daf98027f74
+  data.tar.gz: 22726b8f6c2acab5bb47b9faac2a2d615557bfff535327ef5bbe022a661c1be13c3b7cec95d1c0030f93158eb0942e121d97f2a0213a640eca00607de9d55f61
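To spot-check a downloaded copy against these values, Ruby's standard Digest library is enough. A minimal sketch, assuming the gem archive has already been unpacked (for example with `tar -xf classifier-reborn-2.1.0.gem`) so that `metadata.gz` and `data.tar.gz` sit in the working directory:

```ruby
require 'digest'

# Compare the output against the SHA1/SHA512 values listed above.
puts Digest::SHA1.file('metadata.gz').hexdigest
puts Digest::SHA512.file('data.tar.gz').hexdigest
```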
data/README.markdown
CHANGED
@@ -1,5 +1,12 @@
 ## Welcome to Classifier Reborn
 
+[![Gem Version](https://img.shields.io/gem/v/classifier-reborn.svg)][ruby-gems]
+[![Build Status](https://img.shields.io/travis/jekyll/classifier-reborn/master.svg)][travis]
+[![Dependency Status](https://img.shields.io/gemnasium/jekyll/classifier-reborn.svg)][gemnasium]
+[ruby-gems]: https://rubygems.org/gems/jekyll/classifier-reborn
+[gemnasium]: https://gemnasium.com/jekyll/classifier-reborn
+[travis]: https://travis-ci.org/jekyll/classifier-reborn
+
 Classifier is a general module to allow Bayesian and other types of classifications.
 
 Classifier Reborn is a fork of cardmagic/classifier under more active development.
@@ -37,6 +44,8 @@ Notice that LSI will work without these libraries, but as soon as they are insta
 
 A Bayesian classifier by Lucas Carlson. Bayesian Classifiers are accurate, fast, and have modest memory requirements.
 
+*Note: Classifier only supports UTF-8 characters.*
+
 ### Usage
 
 ```ruby
@@ -60,7 +69,7 @@ trained_classifier.classify "I love" # returns 'Interesting'
 ```
 
 Beyond the basic example, the constructor and trainer can be used in a more
-flexible way to
+flexible way to accommodate non-trivial applications. Consider the following
 program:
 
 ```ruby
@@ -72,7 +81,8 @@ require 'classifier-reborn'
 training_set = DATA.read.split("\n")
 categories = training_set.shift.split(',').map{|c| c.strip}
 
-
+# pass :auto_categorize option to allow feeding previously unknown categories
+classifier = ClassifierReborn::Bayes.new categories, auto_categorize: true
 
 training_set.each do |a_line|
   next if a_line.empty? || '#' == a_line.strip[0]
@@ -141,7 +151,7 @@ Or suppose you just want the ability to have multiple categories and a 'None of
 When you initialize the *ClassifierReborn::Bayes* classifier there are several options which can be set that control threshold processing.
 
 ```ruby
-b =
+b = ClassifierReborn::Bayes.new(
   'good',                 # one or more categories
   enable_threshold: true, # default: false
   threshold: -10.0        # default: 0.0
@@ -218,6 +228,18 @@ with more than just simple strings.
 * http://www.chadfowler.com/index.cgi/Computing/LatentSemanticIndexing.rdoc
 * http://en.wikipedia.org/wiki/Latent_semantic_analysis
 
+
+## Code of Conduct
+
+In order to have a more open and welcoming community, Classifier-Reborn adheres to the Jekyll
+[code of conduct](https://github.com/jekyll/jekyll/blob/master/CONDUCT.markdown) adapted from the Ruby on Rails code of
+conduct.
+
+Please adhere to this code of conduct in any interactions you have in the
+Classifier community. If you encounter someone violating
+these terms, please let [@chase](https://github.com/Ch4s3) know and we will address it as soon as possible.
+
+
 ## Authors
 
 * Lucas Carlson (lucas@rufy.com)
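The threshold options added to the README above change what `classify` returns: when the winning score falls below the threshold, the classifier answers `nil` instead of guessing. A minimal sketch (categories and training strings are made up for illustration):

```ruby
require 'classifier-reborn'

b = ClassifierReborn::Bayes.new('good', 'bad',
                                enable_threshold: true,
                                threshold: -10.0)
b.train_good('honest kind helpful generous')
b.train_bad('dishonest cruel selfish')
b.classify('generous and kind')              # => 'Good'
b.classify('completely unrelated gibberish') # may return nil once the best score drops below -10.0
```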
data/bin/bayes.rb
CHANGED
@@ -1,36 +1,36 @@
 #!/usr/bin/env ruby
 
 begin
-
-
+  require 'rubygems'
+  require 'classifier'
 rescue
-
+  require 'classifier'
 end
 
 require 'madeleine'
 
-m = SnapshotMadeleine.new(File.expand_path(
-
-
+m = SnapshotMadeleine.new(File.expand_path('~/.bayes_data')) do
+  ClassifierReborn::Bayes.new 'Interesting', 'Uninteresting'
+end
 
 case ARGV[0]
-when
-
-
-
-
-
-
-
-
-
-
-
-when
-
+when 'add'
+  case ARGV[1].downcase
+  when 'interesting'
+    m.system.train_interesting File.open(ARGV[2]).read
+    puts "#{ARGV[2]} has been classified as interesting"
+  when 'uninteresting'
+    m.system.train_uninteresting File.open(ARGV[2]).read
+    puts "#{ARGV[2]} has been classified as uninteresting"
+  else
+    puts 'Invalid category: choose between interesting and uninteresting'
+    exit(1)
+  end
+when 'classify'
+  puts m.system.classify(File.open(ARGV[1]).read)
 else
-
-
+  puts 'Invalid option: choose add [category] [file] or classify [file]'
+  exit(-1)
 end
 
 m.take_snapshot
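The script persists its trained state with the madeleine gem; `m.system` is the wrapped classifier and `take_snapshot` writes it to disk. A condensed sketch of the same pattern, assuming madeleine is installed:

```ruby
require 'classifier-reborn'
require 'madeleine'

m = SnapshotMadeleine.new(File.expand_path('~/.bayes_data')) do
  ClassifierReborn::Bayes.new 'Interesting', 'Uninteresting'
end
m.system.train_interesting 'Ruby classifiers are fun'
m.take_snapshot # persist the trained classifier; the next run restores it
```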
data/bin/summarize.rb
CHANGED
@@ -1,10 +1,10 @@
 #!/usr/bin/env ruby
 
 begin
-
-
+  require 'rubygems'
+  require 'classifier'
 rescue
-
+  require 'classifier'
 end
 
 require 'open-uri'
@@ -13,4 +13,4 @@ num = ARGV[1].to_i
 num = num < 1 ? 10 : num
 
 text = open(ARGV.first).read
-puts text.gsub(/<[^>]+>/,
+puts text.gsub(/<[^>]+>/, '').gsub(/[\s]+/, ' ').summary(num)
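The script strips HTML tags and calls the `String#summary` extension loaded by its `require`; the same result can be had by calling the `Summarizer` module directly (it is a `module_function` module, as the summarizer diff further down shows). A small sketch with made-up text:

```ruby
require 'classifier-reborn'

text = 'Ruby is a dynamic language. It was designed for programmer happiness. ' \
       'Classifier Reborn adds Bayesian and LSI classification. ' \
       'It can also pick out the most representative sentences.'
puts ClassifierReborn::Summarizer.summary(text, 2) # the two most representative sentences
```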
data/lib/classifier-reborn/bayes.rb
CHANGED
@@ -17,20 +17,22 @@ module ClassifierReborn
 #      auto_categorize:  false  When true, enables ability to dynamically declare a category
 #      enable_threshold: false  When true, enables a threshold requirement for classification
 #      threshold:        0.0    Default threshold, only used when enabled
+#      enable_stemmer:   true   When false, disables word stemming
 def initialize(*args)
-  @categories =
-  options = { language: 'en',
+  @categories = {}
+  options = { language: 'en',
               auto_categorize: false,
               enable_threshold: false,
-              threshold: 0.0
+              threshold: 0.0,
+              enable_stemmer: true
             }
-  args.flatten.each
-    if arg.
+  args.flatten.each do |arg|
+    if arg.is_a?(Hash)
       options.merge!(arg)
     else
       add_category(arg)
     end
-
+  end
 
   @total_words = 0
   @category_counts = Hash.new(0)
@@ -40,6 +42,7 @@ module ClassifierReborn
   @auto_categorize = options[:auto_categorize]
   @enable_threshold = options[:enable_threshold]
   @threshold = options[:threshold]
+  @enable_stemmer = options[:enable_stemmer]
 end
 
 # Provides a general training method for all categories specified in Bayes#new
@@ -52,18 +55,18 @@ module ClassifierReborn
   category = CategoryNamer.prepare_name(category)
 
   # Add the category dynamically or raise an error
-
+  unless @categories.key?(category)
     if @auto_categorize
       add_category(category)
     else
-      raise CategoryNotFoundError
+      raise CategoryNotFoundError, "Cannot train; category #{category} does not exist"
     end
   end
 
   @category_counts[category] += 1
-  Hasher.word_hash(text, @language).each do |word, count|
-    @categories[category][word]
-    @category_word_count[category]
+  Hasher.word_hash(text, @language, @enable_stemmer).each do |word, count|
+    @categories[category][word] += count
+    @category_word_count[category] += count
     @total_words += count
   end
 end
@@ -78,20 +81,17 @@ module ClassifierReborn
 def untrain(category, text)
   category = CategoryNamer.prepare_name(category)
   @category_counts[category] -= 1
-  Hasher.word_hash(text, @language).each do |word, count|
-    if @total_words
-
-
-
-
-
-    end
-
-    if @category_word_count[category] >= count
-      @category_word_count[category] -= count
-    end
-    @total_words -= count
+  Hasher.word_hash(text, @language, @enable_stemmer).each do |word, count|
+    next if @total_words < 0
+    orig = @categories[category][word] || 0
+    @categories[category][word] -= count
+    if @categories[category][word] <= 0
+      @categories[category].delete(word)
+      count = orig
     end
+
+    @category_word_count[category] -= count if @category_word_count[category] >= count
+    @total_words -= count
   end
 end
@@ -100,21 +100,21 @@ module ClassifierReborn
 # => {"Uninteresting"=>-12.6997928013932, "Interesting"=>-18.4206807439524}
 # The largest of these scores (the one closest to 0) is the one picked out by #classify
 def classifications(text)
-  score =
-  word_hash = Hasher.word_hash(text, @language)
+  score = {}
+  word_hash = Hasher.word_hash(text, @language, @enable_stemmer)
   training_count = @category_counts.values.reduce(:+).to_f
   @categories.each do |category, category_words|
     score[category.to_s] = 0
     total = (@category_word_count[category] || 1).to_f
-    word_hash.each do |word,
-      s = category_words.
-      score[category.to_s] += Math.log(s/total)
+    word_hash.each do |word, _count|
+      s = category_words.key?(word) ? category_words[word] : 0.1
+      score[category.to_s] += Math.log(s / total)
     end
     # now add prior probability for the category
-    s = @category_counts.
+    s = @category_counts.key?(category) ? @category_counts[category] : 0.1
     score[category.to_s] += Math.log(s / training_count)
   end
-
+  score
 end
 
 # Returns the classification of the provided +text+, which is one of the
@@ -128,21 +128,15 @@ module ClassifierReborn
 # Return the classification without the score
 def classify(text)
   result, score = classify_with_score(text)
-  if threshold_enabled?
-
-  end
-  return result
+  result = nil if score < @threshold || score == Float::INFINITY if threshold_enabled?
+  result
 end
 
 # Retrieve the current threshold value
-
-  @threshold
-end
+attr_reader :threshold
 
 # Dynamically set the threshold value
-
-  @threshold = a_float
-end
+attr_writer :threshold
 
 # Dynamically enable threshold for classify results
 def enable_threshold
@@ -164,6 +158,16 @@ module ClassifierReborn
   !@enable_threshold
 end
 
+# Is word stemming enabled?
+def stemmer_enabled?
+  @enable_stemmer
+end
+
+# Is word stemming disabled?
+def stemmer_disabled?
+  !@enable_stemmer
+end
+
 # Provides training and untraining methods for the categories specified in Bayes#new
 # For example:
 #     b = ClassifierReborn::Bayes.new 'This', 'That', 'the_other'
@@ -174,12 +178,12 @@ module ClassifierReborn
 def method_missing(name, *args)
   cleaned_name = name.to_s.gsub(/(un)?train_([\w]+)/, '\2')
   category = CategoryNamer.prepare_name(cleaned_name)
-  if @categories.
-    args.each { |text| eval("#{
+  if @categories.key? category
+    args.each { |text| eval("#{Regexp.last_match(1)}train(category, text)") }
   elsif name.to_s =~ /(un)?train_([\w]+)/
     raise StandardError, "No such category: #{category}"
   else
-    super
+    super # raise StandardError, "No such method: #{name}"
   end
 end
 
@@ -188,7 +192,7 @@ module ClassifierReborn
 # b.categories
 # => ['This', 'That', 'the_other']
 def categories # :nodoc:
-  @categories.keys.collect
+  @categories.keys.collect(&:to_s)
 end
 
 # Allows you to add categories to the classifier.
@@ -203,6 +207,6 @@ module ClassifierReborn
   @categories[CategoryNamer.prepare_name(category)] ||= Hash.new(0)
 end
 
-
+alias_method :append_category, :add_category
 end
 end
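Taken together, the changes above add stemming control alongside the existing dynamic-category support. A minimal sketch of the new options (categories and training text are made up):

```ruby
require 'classifier-reborn'

b = ClassifierReborn::Bayes.new('Ruby', auto_categorize: true, enable_stemmer: false)
b.stemmer_disabled? # => true; 'testing' and 'tested' now count as distinct words
b.train('Ruby', 'testing tested tests')
b.train('Python', 'dynamic scripting language') # auto-added instead of raising CategoryNotFoundError
b.categories # => ['Ruby', 'Python']
```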
data/lib/classifier-reborn/category_namer.rb
CHANGED
@@ -7,11 +7,12 @@ require 'classifier-reborn/extensions/hasher'
 
 module ClassifierReborn
   module CategoryNamer
-
-
+    module_function
+
+    def prepare_name(name)
       return name if name.is_a?(Symbol)
 
-      name.to_s.
+      name.to_s.tr('_', ' ').capitalize.intern
     end
   end
 end
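With `module_function` in place, `prepare_name` is callable on the module itself; it is what normalizes every category name passed to the classifiers above. For example:

```ruby
require 'classifier-reborn'

ClassifierReborn::CategoryNamer.prepare_name('the_other') # => :"The other"
ClassifierReborn::CategoryNamer.prepare_name(:kept_as_is) # symbols pass through unchanged
```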
data/lib/classifier-reborn/extensions/hasher.rb
CHANGED
@@ -9,29 +9,37 @@ module ClassifierReborn
   module Hasher
     STOPWORDS_PATH = [File.expand_path(File.dirname(__FILE__) + '/../../../data/stopwords')]
 
-
+    module_function
 
     # Return a Hash of strings => ints. Each word in the string is stemmed,
     # interned, and indexes to its frequency in the document.
-    def word_hash(str, language = 'en')
-      cleaned_word_hash = clean_word_hash(str, language)
+    def word_hash(str, language = 'en', enable_stemmer = true)
+      cleaned_word_hash = clean_word_hash(str, language, enable_stemmer)
       symbol_hash = word_hash_for_symbols(str.scan(/[^\s\p{WORD}]/))
-
+      cleaned_word_hash.merge(symbol_hash)
     end
 
     # Return a word hash without extra punctuation or short symbols, just stemmed words
-    def clean_word_hash(str, language = 'en')
-      word_hash_for_words str.gsub(/[^\p{WORD}\s]/,'').downcase.split, language
+    def clean_word_hash(str, language = 'en', enable_stemmer = true)
+      word_hash_for_words str.gsub(/[^\p{WORD}\s]/, '').downcase.split, language, enable_stemmer
    end
 
-    def word_hash_for_words(words, language = 'en')
+    def word_hash_for_words(words, language = 'en', enable_stemmer = true)
       d = Hash.new(0)
       words.each do |word|
-
+        next unless word.length > 2 && !STOPWORDS[language].include?(word)
+        if enable_stemmer
           d[word.stem.intern] += 1
+        else
+          d[word.intern] += 1
         end
       end
-
+      d
+    end
+
+    # Add custom path to a new stopword file created by user
+    def add_custom_stopword_path(path)
+      STOPWORDS_PATH.unshift(path)
     end
 
     def word_hash_for_symbols(words)
@@ -39,7 +47,7 @@ module ClassifierReborn
       words.each do |word|
         d[word.intern] += 1
       end
-
+      d
     end
 
     # Create a lazily-loaded hash of stopword data
@@ -48,7 +56,7 @@ module ClassifierReborn
 
       STOPWORDS_PATH.each do |path|
         if File.exist?(File.join(path, language))
-          hash[language] = Set.new File.read(File.join(path, language.to_s)).split
+          hash[language] = Set.new File.read(File.join(path, language.to_s)).force_encoding("utf-8").split
           break
         end
       end
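The new third argument threads stemming control down to the tokenizer, and `add_custom_stopword_path` lets callers prepend their own stopword directory. A short sketch (the path is hypothetical; the directory is expected to hold one word list per language code, like the bundled `data/stopwords`):

```ruby
require 'classifier-reborn'

# With enable_stemmer = false the words are interned unstemmed:
ClassifierReborn::Hasher.word_hash('Words words stemming', 'en', false)
# => roughly {:words => 2, :stemming => 1}

# Hypothetical custom path, searched before the bundled stopword lists:
ClassifierReborn::Hasher.add_custom_stopword_path('/path/to/my/stopwords')
```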
data/lib/classifier-reborn/extensions/vector.rb
CHANGED
@@ -6,17 +6,17 @@
 require 'matrix'
 
 class Matrix
-  def
-
+  def self.diag(s)
+    Matrix.diagonal(*s)
   end
 
-
+  alias_method :trans, :transpose
 
   def SV_decomp(maxSweeps = 20)
-    if
-      q =
+    if row_size >= column_size
+      q = trans * self
     else
-      q = self *
+      q = self * trans
     end
 
     qrot = q.dup
@@ -24,55 +24,56 @@ class Matrix
     mzrot = nil
     cnt = 0
     s_old = nil
-    mu = nil
 
-
+    loop do
       cnt += 1
-
-
+      (0...qrot.row_size - 1).each do |row|
+        (1..qrot.row_size - 1).each do |col|
           next if row == col
-
+
+          h = Math.atan((2 * qrot[row, col]) / (qrot[row, row] - qrot[col, col])) / 2.0
           hcos = Math.cos(h)
           hsin = Math.sin(h)
           mzrot = Matrix.identity(qrot.row_size)
-          mzrot[row,row] = hcos
-          mzrot[row,col] = -hsin
-          mzrot[col,row] = hsin
-          mzrot[col,col] = hcos
+          mzrot[row, row] = hcos
+          mzrot[row, col] = -hsin
+          mzrot[col, row] = hsin
+          mzrot[col, col] = hcos
           qrot = mzrot.trans * qrot * mzrot
-          v
+          v *= mzrot
         end
       end
       s_old = qrot.dup if cnt == 1
       sum_qrot = 0.0
       if cnt > 1
         qrot.row_size.times do |r|
-          sum_qrot += (qrot[r,r]-s_old[r,r]).abs if (qrot[r,r]-s_old[r,r]).abs > 0.001
+          sum_qrot += (qrot[r, r] - s_old[r, r]).abs if (qrot[r, r] - s_old[r, r]).abs > 0.001
         end
         s_old = qrot.dup
       end
-      break if (sum_qrot <= 0.001
+      break if (sum_qrot <= 0.001 && cnt > 1) || cnt >= maxSweeps
     end # of do while true
     s = []
     qrot.row_size.times do |r|
-      s << Math.sqrt(qrot[r,r])
+      s << Math.sqrt(qrot[r, r])
     end
-    #puts "cnt = #{cnt}"
-    if
-      mu = self *
+    # puts "cnt = #{cnt}"
+    if row_size >= column_size
+      mu = self * v * Matrix.diagonal(*s).inverse
       return [mu, v, s]
    else
      puts v.row_size
      puts v.column_size
-      puts
-      puts
+      puts row_size
+      puts column_size
      puts s.size
 
-      mu = (
+      mu = (trans * v * Matrix.diagonal(*s).inverse)
      return [mu, v, s]
    end
  end
-
+
+  def []=(i, j, val)
    @rows[i][j] = val
  end
 end
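This pure-Ruby `Matrix` extension is the fallback SVD used when GSL is absent; `SV_decomp` returns the factors as `[u, v, s]`. A toy run, assuming rb-gsl is not installed so that this extension gets loaded:

```ruby
require 'classifier-reborn' # loads extensions/vector when the gsl gem is missing

m = Matrix[[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]]
u, v, s = m.SV_decomp # Jacobi rotations; s holds the singular values
```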
data/lib/classifier-reborn/extensions/vector_serialize.rb
CHANGED
@@ -1,20 +1,18 @@
 module GSL
-
   class Vector
-    def _dump(
-      Marshal.dump(
+    def _dump(_v)
+      Marshal.dump(to_a)
     end
-
+
     def self._load(arr)
       arry = Marshal.load(arr)
-
+      GSL::Vector.alloc(arry)
     end
-
   end
-
+
   class Matrix
-
-
-
+    class <<self
+      alias_method :diag, :diagonal
+    end
   end
 end
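These `_dump`/`_load` hooks exist so that a GSL-backed classifier can be marshaled like the pure-Ruby one. A round-trip sketch, assuming the rb-gsl gem is installed (otherwise this file is never loaded):

```ruby
require 'gsl'
require 'classifier-reborn'

v = GSL::Vector.alloc([1.0, 2.0, 3.0])
restored = Marshal.load(Marshal.dump(v)) # round-trips through _dump/_load above
```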
data/lib/classifier-reborn/lsi.rb
CHANGED
@@ -3,13 +3,14 @@
 # License:: LGPL
 
 begin
-  raise LoadError if ENV['NATIVE_VECTOR'] ==
+  raise LoadError if ENV['NATIVE_VECTOR'] == 'true' # to test the native vector class, try `rake test NATIVE_VECTOR=true`
 
-  require 'gsl' # requires
+  require 'gsl' # requires https://github.com/SciRuby/rb-gsl
   require_relative 'extensions/vector_serialize'
   $GSL = true
 
 rescue LoadError
+  $GSL = false
   require_relative 'extensions/vector'
 end
 
@@ -19,12 +20,10 @@ require_relative 'lsi/cached_content_node'
 require_relative 'lsi/summarizer'
 
 module ClassifierReborn
-
   # This class implements a Latent Semantic Indexer, which can search, classify and cluster
   # data based on underlying semantic relations. For more information on the algorithms used,
   # please consult Wikipedia[http://en.wikipedia.org/wiki/Latent_Semantic_Indexing].
   class LSI
-
     attr_reader :word_list, :cache_node_vectors
     attr_accessor :auto_rebuild
 
@@ -36,12 +35,12 @@ module ClassifierReborn
   #
   def initialize(options = {})
     @auto_rebuild = options[:auto_rebuild] != false
-    @word_list
-    @
+    @word_list = WordList.new
+    @items = {}
+    @version = 0
+    @built_at_version = -1
     @language = options[:language] || 'en'
-    if @cache_node_vectors = options[:cache_node_vectors]
-      extend CachedContentNode::InstanceMethods
-    end
+    extend CachedContentNode::InstanceMethods if @cache_node_vectors = options[:cache_node_vectors]
   end
 
   # Returns true if the index needs to be rebuilt. The index needs
@@ -64,39 +63,44 @@ module ClassifierReborn
   #   ar = ActiveRecordObject.find( :all )
   #   lsi.add_item ar, *ar.categories { |x| ar.content }
   #
-  def add_item(
+  def add_item(item, *categories, &block)
     clean_word_hash = Hasher.clean_word_hash((block ? block.call(item) : item.to_s), @language)
-
-
+    if clean_word_hash.empty?
+      puts "Input: '#{item}' is entirely stopwords or words with 2 or fewer characters. Classifier-Reborn cannot handle this document properly."
     else
-
+      @items[item] = if @cache_node_vectors
+                       CachedContentNode.new(clean_word_hash, *categories)
+                     else
+                       ContentNode.new(clean_word_hash, *categories)
+                     end
+      @version += 1
+      build_index if @auto_rebuild
     end
-    @version += 1
-    build_index if @auto_rebuild
   end
 
   # A less flexible shorthand for add_item that assumes
   # you are passing in a string with no categories. item
   # will be duck typed via to_s .
   #
-  def <<(
-    add_item
+  def <<(item)
+    add_item(item)
  end
 
   # Returns the categories for a given indexed items. You are free to add and remove
   # items from this as you see fit. It does not invalidate an index to change its categories.
   def categories_for(item)
     return [] unless @items[item]
-
+
+    @items[item].categories
   end
 
   # Removes an item from the database, if it is indexed.
   #
-  def remove_item(
-
-
-
+  def remove_item(item)
+    return unless @items.key? item
+
+    @items.delete item
+    @version += 1
   end
 
   # Returns an array of items that are indexed.
@@ -118,30 +122,30 @@ module ClassifierReborn
   # cutoff parameter tells the indexer how many of these values to keep.
   # A value of 1 for cutoff means that no semantic analysis will take place,
   # turning the LSI class into a simple vector search engine.
-  def build_index(
+  def build_index(cutoff = 0.75)
     return unless needs_rebuild?
     make_word_list
 
     doc_list = @items.values
-    tda = doc_list.collect { |node| node.raw_vector_with(
+    tda = doc_list.collect { |node| node.raw_vector_with(@word_list) }
 
     if $GSL
-
-
-
-
-
-
-
-
+      tdm = GSL::Matrix.alloc(*tda).trans
+      ntdm = build_reduced_matrix(tdm, cutoff)
+
+      ntdm.size[1].times do |col|
+        vec = GSL::Vector.alloc(ntdm.column(col)).row
+        doc_list[col].lsi_vector = vec
+        doc_list[col].lsi_norm = vec.normalize
+      end
     else
-
-
+      tdm = Matrix.rows(tda).trans
+      ntdm = build_reduced_matrix(tdm, cutoff)
 
-
-
-
-
+      ntdm.row_size.times do |col|
+        doc_list[col].lsi_vector = ntdm.column(col) if doc_list[col]
+        doc_list[col].lsi_norm = ntdm.column(col).normalize if doc_list[col]
+      end
     end
 
     @built_at_version = @version
@@ -155,13 +159,13 @@ module ClassifierReborn
   # your dataset's general content. For example, if you were to use categorize on the
   # results of this data, you could gather information on what your dataset is generally
   # about.
-  def highest_relative_content(
-
+  def highest_relative_content(max_chunks = 10)
+    return [] if needs_rebuild?
 
-
-
+    avg_density = {}
+    @items.each_key { |item| avg_density[item] = proximity_array_for_content(item).inject(0.0) { |x, y| x + y[1] } }
 
-
+    avg_density.keys.sort_by { |x| avg_density[x] }.reverse[0..max_chunks - 1].map
   end
 
   # This function is the primitive that find_related and classify
@@ -176,10 +180,10 @@ module ClassifierReborn
   # The parameter doc is the content to compare. If that content is not
   # indexed, you can pass an optional block to define how to create the
   # text data. See add_item for examples of how this works.
-  def proximity_array_for_content(
+  def proximity_array_for_content(doc, &block)
     return [] if needs_rebuild?
 
-    content_node = node_for_content(
+    content_node = node_for_content(doc, &block)
     result =
       @items.keys.collect do |item|
         if $GSL
@@ -197,10 +201,18 @@ module ClassifierReborn
   # calculated vectors instead of their full versions. This is useful when
   # you're trying to perform operations on content that is much smaller than
   # the text you're working with. search uses this primitive.
-  def proximity_norms_for_content(
+  def proximity_norms_for_content(doc, &block)
     return [] if needs_rebuild?
 
-    content_node = node_for_content(
+    content_node = node_for_content(doc, &block)
+    if $GSL && content_node.raw_norm.isnan?.all?
+      puts "There are no documents that are similar to #{doc}"
+    else
+      content_node_norms(content_node)
+    end
+  end
+
+  def content_node_norms(content_node)
     result =
       @items.keys.collect do |item|
         if $GSL
@@ -220,11 +232,13 @@ module ClassifierReborn
   #
   # While this may seem backwards compared to the other functions that LSI supports,
   # it is actually the same algorithm, just applied on a smaller document.
-  def search(
+  def search(string, max_nearest = 3)
     return [] if needs_rebuild?
-    carry = proximity_norms_for_content(
-
-
+    carry = proximity_norms_for_content(string)
+    unless carry.nil?
+      result = carry.collect { |x| x[0] }
+      result[0..max_nearest - 1]
+    end
   end
 
   # This function takes content and finds other documents
@@ -236,21 +250,21 @@ module ClassifierReborn
   # This is particularly useful for identifying clusters in your document space.
   # For example you may want to identify several "What's Related" items for weblog
   # articles, or find paragraphs that relate to each other in an essay.
-  def find_related(
+  def find_related(doc, max_nearest = 3, &block)
     carry =
-      proximity_array_for_content(
+      proximity_array_for_content(doc, &block).reject { |pair| pair[0].eql? doc }
     result = carry.collect { |x| x[0] }
-
+    result[0..max_nearest - 1]
   end
 
   # Return the most obvious category with the score
-  def classify_with_score(
-
+  def classify_with_score(doc, cutoff = 0.30, &block)
+    scored_categories(doc, cutoff, &block).last
  end
 
   # Return the most obvious category without the score
-  def classify(
-
+  def classify(doc, cutoff = 0.30, &block)
+    scored_categories(doc, cutoff, &block).last.first
  end
 
   # This function uses a voting system to categorize documents, based on
@@ -262,10 +276,10 @@ module ClassifierReborn
   # text. A cutoff of 1 means that every document in the index votes on
   # what category the document is in. This may not always make sense.
   #
-  def scored_categories(
+  def scored_categories(doc, cutoff = 0.30, &block)
     icutoff = (@items.size * cutoff).round
-    carry = proximity_array_for_content(
-    carry = carry[0..icutoff-1]
+    carry = proximity_array_for_content(doc, &block)
+    carry = carry[0..icutoff - 1]
     votes = Hash.new(0.0)
     carry.each do |pair|
       @items[pair[0]].categories.each do |category|
@@ -273,31 +287,31 @@ module ClassifierReborn
       end
     end
 
-
+    votes.sort_by { |_, score| score }
   end
 
   # Prototype, only works on indexed documents.
   # I have no clue if this is going to work, but in theory
   # it's supposed to.
-  def highest_ranked_stems(
-    raise
-
-    top_n =
-
+  def highest_ranked_stems(doc, count = 3)
+    raise 'Requested stem ranking on non-indexed content!' unless @items[doc]
+    content_vector_array = node_for_content(doc).lsi_vector.to_a
+    top_n = content_vector_array.sort.reverse[0..count - 1]
+    top_n.collect { |x| @word_list.word_for_index(content_vector_array.index(x)) }
  end
 
   private
-
+
+  def build_reduced_matrix(matrix, cutoff = 0.75)
     # TODO: Check that M>=N on these dimensions! Transpose helps assure this
     u, v, s = matrix.SV_decomp
-
     # TODO: Better than 75% term, please. :\
     s_cutoff = s.sort.reverse[(s.size * cutoff).round - 1]
     s.size.times do |ord|
       s[ord] = 0.0 if s[ord] < s_cutoff
     end
     # Reconstruct the term document matrix, only with reduced rank
-    u * ($GSL ? GSL::Matrix : ::Matrix).diag(
+    u * ($GSL ? GSL::Matrix : ::Matrix).diag(s) * v.trans
   end
 
   def node_for_content(item, &block)
@@ -306,23 +320,21 @@ module ClassifierReborn
     else
       clean_word_hash = Hasher.clean_word_hash((block ? block.call(item) : item.to_s), @language)
 
-
+      content_node = ContentNode.new(clean_word_hash, &block) # make the node and extract the data
 
       unless needs_rebuild?
-
+        content_node.raw_vector_with(@word_list) # make the lsi raw and norm vectors
       end
     end
 
-
+    content_node
   end
 
   def make_word_list
     @word_list = WordList.new
     @items.each_value do |node|
-      node.word_hash.each_key { |key| @word_list.add_word
+      node.word_hash.each_key { |key| @word_list.add_word(key) }
     end
   end
-
 end
 end
-
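The constructor changes make `cache_node_vectors` an ordinary option, and the reworked `add_item`/`search`/`classify` all take explicit parameters. A compact usage sketch in the spirit of the gem's README (strings and categories are illustrative):

```ruby
require 'classifier-reborn'

lsi = ClassifierReborn::LSI.new(cache_node_vectors: true)
strings = [['This text deals with dogs. Dogs.',      :dog],
           ['This text involves dogs too. Dogs!',    :dog],
           ['This text revolves around cats. Cats.', :cat],
           ['This text also involves cats. Cats!',   :cat]]
strings.each { |text, category| lsi.add_item(text, category) }

lsi.search('dog', 3)                          # the three nearest items
lsi.classify('This text is also about dogs!') # => :dog, by neighborhood vote
```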
data/lib/classifier-reborn/lsi/cached_content_node.rb
CHANGED
@@ -3,7 +3,6 @@
 # License:: LGPL
 
 module ClassifierReborn
-
   # Subclass of ContentNode which caches the search_vector transpositions.
   # It's great because it's much faster for large indexes, but at the cost of more ram. Additionally,
   # if you Marshal your classifier and want to keep the size down, you'll need to manually
@@ -16,7 +15,7 @@ module ClassifierReborn
     end
   end
 
-  def initialize(
+  def initialize(word_hash, *categories)
     clear_cache!
     super
   end
@@ -29,13 +28,13 @@ module ClassifierReborn
   def transposed_search_vector
     @transposed_search_vector ||= super
   end
-
+
   # Clear the cache before we continue on
-  def raw_vector_with(
+  def raw_vector_with(word_list)
     clear_cache!
     super
   end
-
+
   # We don't want the cached_data here
   def marshal_dump
     [@lsi_vector, @lsi_norm, @raw_vector, @raw_norm, @categories, @word_hash]
data/lib/classifier-reborn/lsi/content_node.rb
CHANGED
@@ -3,10 +3,9 @@
 # License:: LGPL
 
 module ClassifierReborn
-
-  #
-  #
-  # You should never have to use it directly.
+  # This is an internal data structure class for the LSI node. Save for
+  # raw_vector_with, it should be fairly straightforward to understand.
+  # You should never have to use it directly.
   class ContentNode
     attr_accessor :raw_vector, :raw_norm,
                   :lsi_vector, :lsi_norm,
@@ -15,7 +14,7 @@ module ClassifierReborn
     attr_reader :word_hash
     # If text_proc is not specified, the source will be duck-typed
     # via source.to_s
-    def initialize(
+    def initialize(word_hash, *categories)
       @categories = categories || []
       @word_hash = word_hash
       @lsi_norm, @lsi_vector = nil
@@ -38,11 +37,11 @@ module ClassifierReborn
 
     # Creates the raw vector out of word_hash using word_list as the
     # key for mapping the vector space.
-    def raw_vector_with(
+    def raw_vector_with(word_list)
       if $GSL
-
+        vec = GSL::Vector.alloc(word_list.size)
       else
-
+        vec = Array.new(word_list.size, 0)
       end
 
       @word_hash.each_key do |word|
@@ -52,7 +51,7 @@ module ClassifierReborn
       # Perform the scaling transform and force floating point arithmetic
       if $GSL
         sum = 0.0
-        vec.each {|v| sum += v }
+        vec.each { |v| sum += v }
         total_words = sum
       else
         total_words = vec.reduce(0, :+).to_f
@@ -63,7 +62,7 @@ module ClassifierReborn
       if $GSL
         vec.each { |word| total_unique_words += 1 if word != 0.0 }
       else
-        total_unique_words = vec.count{ |word| word != 0 }
+        total_unique_words = vec.count { |word| word != 0 }
       end
 
       # Perform first-order association transform if this vector has more
@@ -71,9 +70,9 @@ module ClassifierReborn
       if total_words > 1.0 && total_unique_words > 1
         weighted_total = 0.0
         # Cache calculations, this takes too long on large indexes
-        cached_calcs = Hash.new
-          hash[term] = ((
-
+        cached_calcs = Hash.new do |hash, term|
+          hash[term] = ((term / total_words) * Math.log(term / total_words))
+        end
 
         vec.each do |term|
           weighted_total += cached_calcs[term] if term > 0.0
@@ -81,12 +80,12 @@ module ClassifierReborn
 
         # Cache calculations, this takes too long on large indexes
         cached_calcs = Hash.new do |hash, val|
-          hash[val] = Math.log(
+          hash[val] = Math.log(val + 1) / -weighted_total
         end
 
-        vec.collect!
+        vec.collect! do |val|
           cached_calcs[val]
-
+        end
       end
 
       if $GSL
@@ -97,7 +96,5 @@ module ClassifierReborn
         @raw_vector = Vector[*vec]
       end
     end
-
   end
-
 end
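The reconstructed block above is the first-order association transform: each raw count `c` becomes `log(c + 1)` divided by the negative of the entropy-style weight `sum of (c/N) * log(c/N)` over the document. A toy reproduction outside the class, with illustrative numbers:

```ruby
# Raw count vector for a three-word vocabulary (illustrative numbers).
vec = [2.0, 1.0, 0.0]
total_words = vec.reduce(0.0, :+)
weighted_total = 0.0
vec.each do |term|
  weighted_total += (term / total_words) * Math.log(term / total_words) if term > 0.0
end
vec.collect! { |val| Math.log(val + 1) / -weighted_total }
```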
data/lib/classifier-reborn/lsi/summarizer.rb
CHANGED
@@ -4,13 +4,13 @@
 
 module ClassifierReborn
   module Summarizer
-
+    module_function
 
-    def summary(
+    def summary(str, count = 10, separator = ' [...] ')
       perform_lsi split_sentences(str), count, separator
     end
 
-    def paragraph_summary(
+    def paragraph_summary(str, count = 1, separator = ' [...] ')
       perform_lsi split_paragraphs(str), count, separator
     end
 
@@ -23,11 +23,11 @@ module ClassifierReborn
     end
 
     def perform_lsi(chunks, count, separator)
-      lsi = ClassifierReborn::LSI.new :
+      lsi = ClassifierReborn::LSI.new auto_rebuild: false
       chunks.each { |chunk| lsi << chunk unless chunk.strip.empty? || chunk.strip.split.size == 1 }
       lsi.build_index
       summaries = lsi.highest_relative_content count
-
+      summaries.reject { |chunk| !summaries.include? chunk }.map(&:strip).join(separator)
     end
   end
 end
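`paragraph_summary` works the same way as `summary` but over paragraphs; with `auto_rebuild: false` the internal LSI index is built once after all chunks are added. A small illustrative run:

```ruby
require 'classifier-reborn'

text = "Ruby favors programmer happiness.\n\n" \
       "Classification sorts documents into categories.\n\n" \
       "Summarization keeps only the most representative chunks."
puts ClassifierReborn::Summarizer.paragraph_summary(text, 1) # the single strongest paragraph
```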
data/lib/classifier-reborn/lsi/word_list.rb
CHANGED
@@ -8,7 +8,7 @@ module ClassifierReborn
 
   class WordList
     def initialize
-      @location_table =
+      @location_table = {}
     end
 
     # Adds a word (if it is new) and assigns it a unique dimension.
@@ -31,6 +31,5 @@ module ClassifierReborn
     def size
       @location_table.size
     end
-
   end
 end
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: classifier-reborn
 version: !ruby/object:Gem::Version
-  version: 2.0.4
+  version: 2.1.0
 platform: ruby
 authors:
 - Lucas Carlson
@@ -10,7 +10,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date:
+date: 2017-01-01 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: fast-stemmer
@@ -55,7 +55,49 @@ dependencies:
     - !ruby/object:Gem::Version
       version: '0'
 - !ruby/object:Gem::Dependency
-  name:
+  name: minitest
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: minitest-reporters
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: rubocop
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: pry
   requirement: !ruby/object:Gem::Requirement
     requirements:
     - - ">="
@@ -134,9 +176,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 2.
+rubygems_version: 2.5.2
 signing_key:
 specification_version: 2
 summary: A general classifier module to allow Bayesian and other types of classifications.
 test_files: []
-has_rdoc: true