reclassifier 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +17 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +429 -0
- data/README.md +87 -0
- data/Rakefile +7 -0
- data/lib/gsl/vector.rb +12 -0
- data/lib/reclassifier.rb +19 -0
- data/lib/reclassifier/bayes.rb +129 -0
- data/lib/reclassifier/content_node.rb +66 -0
- data/lib/reclassifier/core_ext/array.rb +11 -0
- data/lib/reclassifier/core_ext/matrix.rb +72 -0
- data/lib/reclassifier/core_ext/object.rb +3 -0
- data/lib/reclassifier/core_ext/string.rb +143 -0
- data/lib/reclassifier/core_ext/vector.rb +20 -0
- data/lib/reclassifier/lsi.rb +300 -0
- data/lib/reclassifier/version.rb +3 -0
- data/lib/reclassifier/word_list.rb +32 -0
- data/reclassifier.gemspec +27 -0
- data/test/bayes_test.rb +34 -0
- data/test/core_ext/array_test.rb +15 -0
- data/test/core_ext/string_test.rb +13 -0
- data/test/lsi_test.rb +123 -0
- data/test/test_helper.rb +4 -0
- metadata +154 -0
data/README.md
ADDED
@@ -0,0 +1,87 @@
|
|
1
|
+
# Reclassifier
|
2
|
+
|
3
|
+
Reclassifier is a gem that provides [classification](http://en.wikipedia.org/wiki/Statistical_classification) of strings.
|
4
|
+
|
5
|
+
Classification can be done via [Naïve Bayes](https://en.wikipedia.org/wiki/Naive_Bayes_classifier) or [Latent Semantic Indexing](http://en.wikipedia.org/wiki/Latent_semantic_indexing).
|
6
|
+
|
7
|
+
It is a fork of the original [Classifier](https://github.com/cardmagic/classifier) gem, which appears to be unmaintained as of a couple of years ago.
|
8
|
+
|
9
|
+
## Installation
|
10
|
+
|
11
|
+
Add this line to your application's Gemfile:
|
12
|
+
|
13
|
+
gem 'reclassifier'
|
14
|
+
|
15
|
+
And then execute:
|
16
|
+
|
17
|
+
$ bundle
|
18
|
+
|
19
|
+
Or install it yourself as:
|
20
|
+
|
21
|
+
$ gem install reclassifier
|
22
|
+
|
23
|
+
## Dependencies
|
24
|
+
|
25
|
+
Currently you need to install the GNU GSL library in order to use Reclassifier: http://www.gnu.org/software/gsl
|
26
|
+
|
27
|
+
## Usage
|
28
|
+
|
29
|
+
### Bayes
|
30
|
+
Bayesian Classifiers are accurate, fast, and have modest memory requirements.
|
31
|
+
|
32
|
+
#### Usage
|
33
|
+
require 'reclassifier'
|
34
|
+
b = Reclassifier::Bayes.new 'Interesting', 'Uninteresting'
|
35
|
+
b.train_interesting "here are some good words. I hope you love them"
|
36
|
+
b.train_uninteresting "here are some bad words, I hate you"
|
37
|
+
b.classify "I hate bad words and you" # returns 'Uninteresting'
|
38
|
+
|
39
|
+
require 'madeleine'
|
40
|
+
m = SnapshotMadeleine.new("bayes_data") {
|
41
|
+
Reclassifier::Bayes.new 'Interesting', 'Uninteresting'
|
42
|
+
}
|
43
|
+
m.system.train_interesting "here are some good words. I hope you love them"
|
44
|
+
m.system.train_uninteresting "here are some bad words, I hate you"
|
45
|
+
m.take_snapshot
|
46
|
+
m.system.classify "I love you" # returns 'Interesting'
|
47
|
+
|
48
|
+
Using Madeleine, your application can persist the learned data over time.
|
49
|
+
|
50
|
+
### LSI
|
51
|
+
Latent Semantic Indexing engines are not as fast or as small as Bayesian classifiers, but are more flexible, providing
|
52
|
+
fast search and clustering detection as well as semantic analysis of the text that theoretically simulates human learning.
|
53
|
+
|
54
|
+
#### Usage
|
55
|
+
require 'reclassifier'
|
56
|
+
lsi = Reclassifier::LSI.new
|
57
|
+
strings = [ ["This text deals with dogs. Dogs.", :dog],
|
58
|
+
["This text involves dogs too. Dogs! ", :dog],
|
59
|
+
["This text revolves around cats. Cats.", :cat],
|
60
|
+
["This text also involves cats. Cats!", :cat],
|
61
|
+
["This text involves birds. Birds.",:bird ]]
|
62
|
+
strings.each {|x| lsi.add_item x.first, x.last}
|
63
|
+
|
64
|
+
lsi.search("dog", 3)
|
65
|
+
# returns => ["This text deals with dogs. Dogs.", "This text involves dogs too. Dogs! ",
|
66
|
+
# "This text also involves cats. Cats!"]
|
67
|
+
|
68
|
+
lsi.find_related(strings[2], 2)
|
69
|
+
# returns => ["This text revolves around cats. Cats.", "This text also involves cats. Cats!"]
|
70
|
+
|
71
|
+
lsi.classify "This text is also about dogs!"
|
72
|
+
# returns => :dog
|
73
|
+
|
74
|
+
Please see the Reclassifier::LSI documentation for more information. It is possible to index, search and classify
|
75
|
+
with more than just simple strings.
|
76
|
+
|
77
|
+
## Contributing
|
78
|
+
|
79
|
+
1. Fork it
|
80
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
81
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
82
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
83
|
+
5. Create new Pull Request
|
84
|
+
|
85
|
+
## License
|
86
|
+
|
87
|
+
This library is released under the terms of the GNU LGPL. See LICENSE.txt for more details.
|
data/Rakefile
ADDED
data/lib/gsl/vector.rb
ADDED
data/lib/reclassifier.rb
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
# gems
|
2
|
+
require 'matrix'
|
3
|
+
require 'fast-stemmer'
|
4
|
+
require 'gsl'
|
5
|
+
|
6
|
+
# files
|
7
|
+
require 'reclassifier/version'
|
8
|
+
require 'reclassifier/core_ext/array'
|
9
|
+
require 'reclassifier/core_ext/matrix'
|
10
|
+
require 'reclassifier/core_ext/object'
|
11
|
+
require 'reclassifier/core_ext/string'
|
12
|
+
require 'gsl/vector'
|
13
|
+
|
14
|
+
module Reclassifier
|
15
|
+
autoload :Bayes, 'reclassifier/bayes'
|
16
|
+
autoload :LSI, 'reclassifier/lsi'
|
17
|
+
autoload :ContentNode, 'reclassifier/content_node'
|
18
|
+
autoload :WordList, 'reclassifier/word_list'
|
19
|
+
end
|
@@ -0,0 +1,129 @@
|
|
1
|
+
module Reclassifier
|
2
|
+
class Bayes
|
3
|
+
# The class can be created with one or more categories, each of which will be
|
4
|
+
# initialized and given a training method. E.g.,
|
5
|
+
# b = Reclassifier::Bayes.new 'Interesting', 'Uninteresting', 'Spam'
|
6
|
+
def initialize(*categories)
|
7
|
+
@categories = Hash.new
|
8
|
+
categories.each { |category| @categories[category.prepare_category_name] = Hash.new }
|
9
|
+
@total_words = 0
|
10
|
+
@category_counts = Hash.new(0)
|
11
|
+
end
|
12
|
+
|
13
|
+
#
|
14
|
+
# Provides a general training method for all categories specified in Bayes#new
|
15
|
+
# For example:
|
16
|
+
# b = Reclassifier::Bayes.new 'This', 'That', 'the_other'
|
17
|
+
# b.train :this, "This text"
|
18
|
+
# b.train "that", "That text"
|
19
|
+
# b.train "The other", "The other text"
|
20
|
+
def train(category, text)
|
21
|
+
category = category.prepare_category_name
|
22
|
+
@category_counts[category] += 1
|
23
|
+
text.word_hash.each do |word, count|
|
24
|
+
@categories[category][word] ||= 0
|
25
|
+
@categories[category][word] += count
|
26
|
+
@total_words += count
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
30
|
+
#
|
31
|
+
# Provides an untraining method for all categories specified in Bayes#new
|
32
|
+
# Be very careful with this method.
|
33
|
+
#
|
34
|
+
# For example:
|
35
|
+
# b = Reclassifier::Bayes.new 'This', 'That', 'the_other'
|
36
|
+
# b.train :this, "This text"
|
37
|
+
# b.untrain :this, "This text"
|
38
|
+
def untrain(category, text)
|
39
|
+
category = category.prepare_category_name
|
40
|
+
@category_counts[category] -= 1
|
41
|
+
text.word_hash.each do |word, count|
|
42
|
+
if @total_words >= 0
|
43
|
+
orig = @categories[category][word]
|
44
|
+
@categories[category][word] ||= 0
|
45
|
+
@categories[category][word] -= count
|
46
|
+
if @categories[category][word] <= 0
|
47
|
+
@categories[category].delete(word)
|
48
|
+
count = orig
|
49
|
+
end
|
50
|
+
@total_words -= count
|
51
|
+
end
|
52
|
+
end
|
53
|
+
end
|
54
|
+
|
55
|
+
#
|
56
|
+
# Returns the scores in each category for the provided +text+. E.g.,
|
57
|
+
# b.classifications "I hate bad words and you"
|
58
|
+
# => {"Uninteresting"=>-12.6997928013932, "Interesting"=>-18.4206807439524}
|
59
|
+
# The largest of these scores (the one closest to 0) is the one picked out by #classify
|
60
|
+
def classifications(text)
|
61
|
+
score = Hash.new
|
62
|
+
training_count = @category_counts.values.inject { |x,y| x+y }.to_f
|
63
|
+
@categories.each do |category, category_words|
|
64
|
+
score[category.to_s] = 0
|
65
|
+
total = category_words.values.inject(0) {|sum, element| sum+element}
|
66
|
+
text.word_hash.each do |word, count|
|
67
|
+
s = category_words.has_key?(word) ? category_words[word] : 0.1
|
68
|
+
score[category.to_s] += Math.log(s/total.to_f)
|
69
|
+
end
|
70
|
+
# now add prior probability for the category
|
71
|
+
s = @category_counts.has_key?(category) ? @category_counts[category] : 0.1
|
72
|
+
score[category.to_s] += Math.log(s / training_count)
|
73
|
+
end
|
74
|
+
return score
|
75
|
+
end
|
76
|
+
|
77
|
+
#
|
78
|
+
# Returns the classification of the provided +text+, which is one of the
|
79
|
+
# categories given in the initializer. E.g.,
|
80
|
+
# b.classify "I hate bad words and you"
|
81
|
+
# => 'Uninteresting'
|
82
|
+
def classify(text)
|
83
|
+
(classifications(text).sort_by { |a| -a[1] })[0][0]
|
84
|
+
end
|
85
|
+
|
86
|
+
#
|
87
|
+
# Provides training and untraining methods for the categories specified in Bayes#new
|
88
|
+
# For example:
|
89
|
+
# b = Reclassifier::Bayes.new 'This', 'That', 'the_other'
|
90
|
+
# b.train_this "This text"
|
91
|
+
# b.train_that "That text"
|
92
|
+
# b.untrain_that "That text"
|
93
|
+
# b.train_the_other "The other text"
|
94
|
+
def method_missing(name, *args)
|
95
|
+
category = name.to_s.gsub(/(un)?train_([\w]+)/, '\2').prepare_category_name
|
96
|
+
if @categories.has_key? category
|
97
|
+
args.each { |text| eval("#{$1}train(category, text)") }
|
98
|
+
elsif name.to_s =~ /(un)?train_([\w]+)/
|
99
|
+
raise StandardError, "No such category: #{category}"
|
100
|
+
else
|
101
|
+
super #raise StandardError, "No such method: #{name}"
|
102
|
+
end
|
103
|
+
end
|
104
|
+
|
105
|
+
#
|
106
|
+
# Provides a list of category names
|
107
|
+
# For example:
|
108
|
+
# b.categories
|
109
|
+
# => ['This', 'That', 'the_other']
|
110
|
+
def categories # :nodoc:
|
111
|
+
@categories.keys.collect {|c| c.to_s}
|
112
|
+
end
|
113
|
+
|
114
|
+
#
|
115
|
+
# Allows you to add categories to the classifier.
|
116
|
+
# For example:
|
117
|
+
# b.add_category "Not spam"
|
118
|
+
#
|
119
|
+
# WARNING: Adding categories to a trained classifier will
|
120
|
+
# result in an undertrained category that will tend to match
|
121
|
+
# more criteria than the trained selective categories. In short,
|
122
|
+
# try to initialize your categories at initialization.
|
123
|
+
def add_category(category)
|
124
|
+
@categories[category.prepare_category_name] = Hash.new
|
125
|
+
end
|
126
|
+
|
127
|
+
alias append_category add_category
|
128
|
+
end
|
129
|
+
end
|
@@ -0,0 +1,66 @@
|
|
1
|
+
module Reclassifier
|
2
|
+
|
3
|
+
# This is an internal data structure class for the LSI node. Save for
|
4
|
+
# raw_vector_with, it should be fairly straightforward to understand.
|
5
|
+
# You should never have to use it directly.
|
6
|
+
class ContentNode
|
7
|
+
attr_accessor :raw_vector, :raw_norm,
|
8
|
+
:lsi_vector, :lsi_norm,
|
9
|
+
:categories
|
10
|
+
|
11
|
+
attr_reader :word_hash
|
12
|
+
# If text_proc is not specified, the source will be duck-typed
|
13
|
+
# via source.to_s
|
14
|
+
def initialize( word_hash, *categories )
|
15
|
+
@categories = categories || []
|
16
|
+
@word_hash = word_hash
|
17
|
+
end
|
18
|
+
|
19
|
+
# Use this to fetch the appropriate search vector.
|
20
|
+
def search_vector
|
21
|
+
@lsi_vector || @raw_vector
|
22
|
+
end
|
23
|
+
|
24
|
+
# Use this to fetch the appropriate search vector in normalized form.
|
25
|
+
def search_norm
|
26
|
+
@lsi_norm || @raw_norm
|
27
|
+
end
|
28
|
+
|
29
|
+
# Creates the raw vector out of word_hash using word_list as the
|
30
|
+
# key for mapping the vector space.
|
31
|
+
def raw_vector_with( word_list )
|
32
|
+
if $GSL
|
33
|
+
vec = GSL::Vector.alloc(word_list.size)
|
34
|
+
else
|
35
|
+
vec = Array.new(word_list.size, 0)
|
36
|
+
end
|
37
|
+
|
38
|
+
@word_hash.each_key do |word|
|
39
|
+
vec[word_list[word]] = @word_hash[word] if word_list[word]
|
40
|
+
end
|
41
|
+
|
42
|
+
# Perform the scaling transform
|
43
|
+
total_words = $GSL ? vec.sum : vec.sum_with_identity
|
44
|
+
|
45
|
+
# Perform first-order association transform if this vector has more
|
46
|
+
# than one word in it.
|
47
|
+
if total_words > 1.0
|
48
|
+
weighted_total = 0.0
|
49
|
+
vec.each do |term|
|
50
|
+
if ( term > 0 )
|
51
|
+
weighted_total += (( term / total_words ) * Math.log( term / total_words ))
|
52
|
+
end
|
53
|
+
end
|
54
|
+
vec = vec.collect { |val| Math.log( val + 1 ) / -weighted_total }
|
55
|
+
end
|
56
|
+
|
57
|
+
if $GSL
|
58
|
+
@raw_norm = vec.normalize
|
59
|
+
@raw_vector = vec
|
60
|
+
else
|
61
|
+
@raw_norm = Vector[*vec].normalize
|
62
|
+
@raw_vector = Vector[*vec]
|
63
|
+
end
|
64
|
+
end
|
65
|
+
end
|
66
|
+
end
|
@@ -0,0 +1,72 @@
|
|
1
|
+
class Matrix
|
2
|
+
def Matrix.diag(s)
|
3
|
+
Matrix.diagonal(*s)
|
4
|
+
end
|
5
|
+
|
6
|
+
alias :trans :transpose
|
7
|
+
|
8
|
+
def SV_decomp(maxSweeps = 20)
|
9
|
+
if self.row_size >= self.column_size
|
10
|
+
q = self.trans * self
|
11
|
+
else
|
12
|
+
q = self * self.trans
|
13
|
+
end
|
14
|
+
|
15
|
+
qrot = q.dup
|
16
|
+
v = Matrix.identity(q.row_size)
|
17
|
+
azrot = nil
|
18
|
+
mzrot = nil
|
19
|
+
cnt = 0
|
20
|
+
s_old = nil
|
21
|
+
mu = nil
|
22
|
+
|
23
|
+
while true do
|
24
|
+
cnt += 1
|
25
|
+
for row in (0...qrot.row_size-1) do
|
26
|
+
for col in (1..qrot.row_size-1) do
|
27
|
+
next if row == col
|
28
|
+
h = Math.atan((2 * qrot[row,col])/(qrot[row,row]-qrot[col,col]))/2.0
|
29
|
+
hcos = Math.cos(h)
|
30
|
+
hsin = Math.sin(h)
|
31
|
+
mzrot = Matrix.identity(qrot.row_size)
|
32
|
+
mzrot[row,row] = hcos
|
33
|
+
mzrot[row,col] = -hsin
|
34
|
+
mzrot[col,row] = hsin
|
35
|
+
mzrot[col,col] = hcos
|
36
|
+
qrot = mzrot.trans * qrot * mzrot
|
37
|
+
v = v * mzrot
|
38
|
+
end
|
39
|
+
end
|
40
|
+
s_old = qrot.dup if cnt == 1
|
41
|
+
sum_qrot = 0.0
|
42
|
+
if cnt > 1
|
43
|
+
qrot.row_size.times do |r|
|
44
|
+
sum_qrot += (qrot[r,r]-s_old[r,r]).abs if (qrot[r,r]-s_old[r,r]).abs > 0.001
|
45
|
+
end
|
46
|
+
s_old = qrot.dup
|
47
|
+
end
|
48
|
+
break if (sum_qrot <= 0.001 and cnt > 1) or cnt >= maxSweeps
|
49
|
+
end # of do while true
|
50
|
+
s = []
|
51
|
+
qrot.row_size.times do |r|
|
52
|
+
s << Math.sqrt(qrot[r,r])
|
53
|
+
end
|
54
|
+
#puts "cnt = #{cnt}"
|
55
|
+
if self.row_size >= self.column_size
|
56
|
+
mu = self * v * Matrix.diagonal(*s).inverse
|
57
|
+
return [mu, v, s]
|
58
|
+
else
|
59
|
+
puts v.row_size
|
60
|
+
puts v.column_size
|
61
|
+
puts self.row_size
|
62
|
+
puts self.column_size
|
63
|
+
puts s.size
|
64
|
+
|
65
|
+
mu = (self.trans * v * Matrix.diagonal(*s).inverse)
|
66
|
+
return [mu, v, s]
|
67
|
+
end
|
68
|
+
end
|
69
|
+
def []=(i,j,val)
|
70
|
+
@rows[i][j] = val
|
71
|
+
end
|
72
|
+
end
|
@@ -0,0 +1,143 @@
|
|
1
|
+
class String
|
2
|
+
|
3
|
+
# Removes common punctuation symbols, returning a new string.
|
4
|
+
# E.g.,
|
5
|
+
# "Hello (greeting's), with {braces} < >...?".without_punctuation
|
6
|
+
# => "Hello greetings with braces "
|
7
|
+
def without_punctuation
|
8
|
+
tr( ',?.!;:"@#$%^&*()_=+[]{}\|<>/`~', " " ) .tr( "'\-", "")
|
9
|
+
end
|
10
|
+
|
11
|
+
# Return a Hash of strings => ints. Each word in the string is stemmed,
|
12
|
+
# symbolized, and indexed to its frequency in the document.
|
13
|
+
def word_hash
|
14
|
+
word_hash_for_words(gsub(/[^\w\s]/,"").split + gsub(/[\w]/," ").split)
|
15
|
+
end
|
16
|
+
|
17
|
+
# Return a word hash without extra punctuation or short symbols, just stemmed words
|
18
|
+
def clean_word_hash
|
19
|
+
word_hash_for_words gsub(/[^\w\s]/,"").split
|
20
|
+
end
|
21
|
+
|
22
|
+
def word_hash_for_words(words)
|
23
|
+
d = Hash.new
|
24
|
+
words.each do |word|
|
25
|
+
word.downcase! if word =~ /[\w]+/
|
26
|
+
key = word.stem.to_sym
|
27
|
+
if word =~ /[^\w]/ || ! CORPUS_SKIP_WORDS.include?(word) && word.length > 2
|
28
|
+
d[key] ||= 0
|
29
|
+
d[key] += 1
|
30
|
+
end
|
31
|
+
end
|
32
|
+
return d
|
33
|
+
end
|
34
|
+
|
35
|
+
CORPUS_SKIP_WORDS = [
|
36
|
+
"a",
|
37
|
+
"again",
|
38
|
+
"all",
|
39
|
+
"along",
|
40
|
+
"are",
|
41
|
+
"also",
|
42
|
+
"an",
|
43
|
+
"and",
|
44
|
+
"as",
|
45
|
+
"at",
|
46
|
+
"but",
|
47
|
+
"by",
|
48
|
+
"came",
|
49
|
+
"can",
|
50
|
+
"cant",
|
51
|
+
"couldnt",
|
52
|
+
"did",
|
53
|
+
"didn",
|
54
|
+
"didnt",
|
55
|
+
"do",
|
56
|
+
"doesnt",
|
57
|
+
"dont",
|
58
|
+
"ever",
|
59
|
+
"first",
|
60
|
+
"from",
|
61
|
+
"have",
|
62
|
+
"her",
|
63
|
+
"here",
|
64
|
+
"him",
|
65
|
+
"how",
|
66
|
+
"i",
|
67
|
+
"if",
|
68
|
+
"in",
|
69
|
+
"into",
|
70
|
+
"is",
|
71
|
+
"isnt",
|
72
|
+
"it",
|
73
|
+
"itll",
|
74
|
+
"just",
|
75
|
+
"last",
|
76
|
+
"least",
|
77
|
+
"like",
|
78
|
+
"most",
|
79
|
+
"my",
|
80
|
+
"new",
|
81
|
+
"no",
|
82
|
+
"not",
|
83
|
+
"now",
|
84
|
+
"of",
|
85
|
+
"on",
|
86
|
+
"or",
|
87
|
+
"should",
|
88
|
+
"sinc",
|
89
|
+
"so",
|
90
|
+
"some",
|
91
|
+
"th",
|
92
|
+
"than",
|
93
|
+
"this",
|
94
|
+
"that",
|
95
|
+
"the",
|
96
|
+
"their",
|
97
|
+
"then",
|
98
|
+
"those",
|
99
|
+
"to",
|
100
|
+
"told",
|
101
|
+
"too",
|
102
|
+
"true",
|
103
|
+
"try",
|
104
|
+
"until",
|
105
|
+
"url",
|
106
|
+
"us",
|
107
|
+
"were",
|
108
|
+
"when",
|
109
|
+
"whether",
|
110
|
+
"while",
|
111
|
+
"with",
|
112
|
+
"within",
|
113
|
+
"yes",
|
114
|
+
"you",
|
115
|
+
"youll",
|
116
|
+
]
|
117
|
+
|
118
|
+
def summary( count=10, separator=" [...] " )
|
119
|
+
perform_lsi split_sentences, count, separator
|
120
|
+
end
|
121
|
+
|
122
|
+
def paragraph_summary( count=1, separator=" [...] " )
|
123
|
+
perform_lsi split_paragraphs, count, separator
|
124
|
+
end
|
125
|
+
|
126
|
+
def split_sentences
|
127
|
+
split /(\.|\!|\?)/ # TODO: make this less primitive
|
128
|
+
end
|
129
|
+
|
130
|
+
def split_paragraphs
|
131
|
+
split /(\n\n|\r\r|\r\n\r\n)/ # TODO: make this less primitive
|
132
|
+
end
|
133
|
+
|
134
|
+
private
|
135
|
+
|
136
|
+
def perform_lsi(chunks, count, separator)
|
137
|
+
lsi = Reclassifier::LSI.new :auto_rebuild => false
|
138
|
+
chunks.each { |chunk| lsi << chunk unless chunk.strip.empty? || chunk.strip.split.size == 1 }
|
139
|
+
lsi.build_index
|
140
|
+
summaries = lsi.highest_relative_content count
|
141
|
+
return summaries.reject { |chunk| !summaries.include? chunk }.map { |x| x.strip }.join(separator)
|
142
|
+
end
|
143
|
+
end
|