classifier-reborn 2.1.0 → 2.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/LICENSE +74 -1
- data/README.markdown +57 -227
- data/data/stopwords/ar +104 -0
- data/data/stopwords/bn +362 -0
- data/data/stopwords/hi +97 -0
- data/data/stopwords/ja +43 -0
- data/data/stopwords/ru +420 -0
- data/data/stopwords/tr +199 -30
- data/data/stopwords/vi +647 -0
- data/data/stopwords/zh +125 -0
- data/lib/classifier-reborn.rb +9 -0
- data/lib/classifier-reborn/backends/bayes_memory_backend.rb +75 -0
- data/lib/classifier-reborn/backends/bayes_redis_backend.rb +107 -0
- data/lib/classifier-reborn/backends/no_redis_error.rb +12 -0
- data/lib/classifier-reborn/bayes.rb +98 -38
- data/lib/classifier-reborn/category_namer.rb +0 -1
- data/lib/classifier-reborn/extensions/hasher.rb +1 -1
- data/lib/classifier-reborn/lsi.rb +5 -1
- data/lib/classifier-reborn/lsi/word_list.rb +2 -4
- data/lib/classifier-reborn/validators/classifier_validator.rb +169 -0
- data/lib/classifier-reborn/version.rb +1 -1
- metadata +30 -8
- data/bin/bayes.rb +0 -36
- data/bin/summarize.rb +0 -16
@@ -21,7 +21,7 @@ module ClassifierReborn
|
|
21
21
|
|
22
22
|
# Return a word hash without extra punctuation or short symbols, just stemmed words
|
23
23
|
def clean_word_hash(str, language = 'en', enable_stemmer = true)
|
24
|
-
word_hash_for_words
|
24
|
+
word_hash_for_words(str.gsub(/[^\p{WORD}\s]/, '').downcase.split, language, enable_stemmer)
|
25
25
|
end
|
26
26
|
|
27
27
|
def word_hash_for_words(words, language = 'en', enable_stemmer = true)
|
@@ -86,7 +86,7 @@ module ClassifierReborn
|
|
86
86
|
add_item(item)
|
87
87
|
end
|
88
88
|
|
89
|
-
# Returns
|
89
|
+
# Returns categories for a given indexed item. You are free to add and remove
|
90
90
|
# items from this as you see fit. It does not invalidate an index to change its categories.
|
91
91
|
def categories_for(item)
|
92
92
|
return [] unless @items[item]
|
@@ -300,6 +300,10 @@ module ClassifierReborn
|
|
300
300
|
top_n.collect { |x| @word_list.word_for_index(content_vector_array.index(x)) }
|
301
301
|
end
|
302
302
|
|
303
|
+
def reset
|
304
|
+
initialize(auto_rebuild: @auto_rebuild, cache_node_vectors: @cache_node_vectors)
|
305
|
+
end
|
306
|
+
|
303
307
|
private
|
304
308
|
|
305
309
|
def build_reduced_matrix(matrix, cutoff = 0.75)
|
@@ -13,14 +13,12 @@ module ClassifierReborn
|
|
13
13
|
|
14
14
|
# Adds a word (if it is new) and assigns it a unique dimension.
|
15
15
|
def add_word(word)
|
16
|
-
|
17
|
-
@location_table[term] = @location_table.size unless @location_table[term]
|
16
|
+
@location_table[word] = @location_table.size unless @location_table[word]
|
18
17
|
end
|
19
18
|
|
20
19
|
# Returns the dimension of the word or nil if the word is not in the space.
|
21
20
|
def [](lookup)
|
22
|
-
|
23
|
-
@location_table[term]
|
21
|
+
@location_table[lookup]
|
24
22
|
end
|
25
23
|
|
26
24
|
def word_for_index(ind)
|
@@ -0,0 +1,169 @@
|
|
1
|
+
module ClassifierReborn
  # Helpers for measuring classifier quality: k-fold cross-validation, a
  # single train/test validation run, and plain-text reports (per-run summary,
  # confusion matrix and per-class derived statistics) written to standard
  # output.
  #
  # A classifier passed to these helpers must respond to +reset+, +train+,
  # +categories+ and +classify+. Records are two-element collections read as
  # +rec.first+ (category) and +rec.last+ (text to train on / classify).
  module ClassifierValidator
    # Labels of the numeric columns in the run report, in print order.
    RUN_REPORT_COLUMNS = %w[Total Correct Incorrect Accuracy].freeze
    # Minimum width of the leading "Run" label column.
    RUN_LABEL_WIDTH = 3
    # Width every numeric run-report column is right-justified to.
    RUN_COLUMN_WIDTH = 9

    module_function

    # Runs +fold+-fold cross-validation and prints the combined report.
    #
    # classifier  - a classifier instance, or the String name of a constant
    #               under ClassifierReborn that is instantiated with +options+.
    # sample_data - Array of records; it is shuffled IN PLACE.
    # fold        - number of partitions (default 10). Records left over after
    #               dividing evenly into +fold+ partitions are not used.
    # options     - arguments forwarded to the constructor when +classifier+
    #               is given by name.
    #
    # Returns whatever generate_report returns.
    # Raises ArgumentError when there are fewer records than folds.
    def cross_validate(classifier, sample_data, fold = 10, *options)
      # Splat the options so the constructor receives them as separate
      # arguments rather than as one wrapped Array.
      classifier = ClassifierReborn.const_get(classifier).new(*options) if classifier.is_a?(String)
      sample_data.shuffle!
      partition_size = sample_data.length / fold
      if partition_size < 1
        raise ArgumentError,
              "cannot split #{sample_data.length} records into #{fold} non-empty partitions"
      end

      # Materialize the partitions once instead of re-slicing on every fold.
      partitions = sample_data.each_slice(partition_size).first(fold)
      conf_mats = Array.new(fold) do |i|
        held_out = partitions[i]
        # Non-destructive flatten: flatten! returns nil when there is nothing
        # to flatten (e.g. a single fold leaves no training partitions).
        training_data = (partitions[0...i] + partitions[(i + 1)..-1]).flatten(1)
        validate(classifier, training_data, held_out)
      end
      classifier.reset
      generate_report(conf_mats)
    end

    # Resets the classifier, trains it on +training_data+ and evaluates it on
    # +test_data+.
    #
    # classifier - instance or String constant name (see cross_validate).
    # options    - constructor arguments used only when a name is given.
    #
    # Returns the confusion matrix produced by evaluate.
    def validate(classifier, training_data, test_data, *options)
      classifier = ClassifierReborn.const_get(classifier).new(*options) if classifier.is_a?(String)
      classifier.reset
      training_data.each { |rec| classifier.train(rec.first, rec.last) }
      evaluate(classifier, test_data)
    end

    # Classifies every record of +test_data+ and tallies the outcomes.
    #
    # The expected category of a record is normalized with
    # tr('_', ' ').capitalize before lookup, so it has to match an entry of
    # classifier.categories after that normalization. Records for which the
    # classifier returns nil are not counted.
    #
    # Returns a Hash of the form { actual => { predicted => count } }.
    def evaluate(classifier, test_data)
      conf_mat = empty_conf_mat(classifier.categories.sort)
      test_data.each do |rec|
        actual = rec.first.tr('_', ' ').capitalize
        predicted = classifier.classify(rec.last)
        conf_mat[actual][predicted] += 1 unless predicted.nil?
      end
      conf_mat
    end

    # Prints the run report, the confusion matrix and the per-class
    # statistics for one or more confusion matrices (given individually or as
    # Arrays). With several matrices each run gets its own row and the
    # remaining output is based on their element-wise sum.
    #
    # Returns the confusion table (see conf_mat_to_tab) that was printed.
    # Raises ArgumentError when no matrix is given.
    def generate_report(*conf_mats)
      conf_mats = conf_mats.flatten
      raise ArgumentError, 'at least one confusion matrix is required' if conf_mats.empty?

      accumulated_conf_mat =
        if conf_mats.length == 1
          conf_mats.first
        else
          # Use every category seen in any run so a run whose matrix has an
          # extra row or column cannot hit a missing accumulator cell.
          categories = conf_mats.flat_map { |mat| mat.keys + mat.values.flat_map(&:keys) }.uniq.sort
          empty_conf_mat(categories)
        end

      header = run_report_header(RUN_LABEL_WIDTH)
      rule = '-' * header.length
      puts
      puts ' Run Report '.center(header.length, '-')
      puts header
      puts rule
      if conf_mats.length > 1
        conf_mats.each_with_index do |conf_mat, i|
          print_run_report(build_run_report(conf_mat), i + 1)
          conf_mat.each do |actual, cols|
            cols.each { |predicted, v| accumulated_conf_mat[actual][predicted] += v }
          end
        end
        puts rule
      end
      print_run_report(build_run_report(accumulated_conf_mat), 'All')
      puts
      print_conf_mat(accumulated_conf_mat)
      puts
      print_conf_tab(conf_mat_to_tab(accumulated_conf_mat))
    end

    # Summarizes a confusion matrix: diagonal cells count as correct, all
    # other cells as incorrect.
    #
    # Returns a Hash with :total, :correct, :incorrect and :accuracy.
    def build_run_report(conf_mat)
      correct = incorrect = 0
      conf_mat.each do |actual, cols|
        cols.each do |predicted, v|
          if actual == predicted
            correct += v
          else
            incorrect += v
          end
        end
      end
      total = correct + incorrect
      { total: total, correct: correct, incorrect: incorrect, accuracy: divide(correct, total) }
    end

    # Builds, for every category treated in turn as the positive class, a
    # table { p: { t:, f: }, n: { t:, f: } } of summed counts.
    #
    # :p / :n - the predicted category is / is not the positive class.
    # :t / :f - the prediction does / does not equal the actual category.
    def conf_mat_to_tab(conf_mat)
      conf_tab = Hash.new { |h, k| h[k] = { p: { t: 0, f: 0 }, n: { t: 0, f: 0 } } }
      conf_mat.each_key do |positive|
        conf_mat.each do |actual, cols|
          cols.each do |predicted, v|
            conf_tab[positive][positive == predicted ? :p : :n][actual == predicted ? :t : :f] += v
          end
        end
      end
      conf_tab
    end

    # Prints one row of the run report.
    #
    # stats        - Hash as returned by build_run_report.
    # prefix       - row label; any object responding to to_s (run numbers
    #                are passed as Integers).
    # print_header - when true, a column header line is printed first.
    def print_run_report(stats, prefix = '', print_header = false)
      label = prefix.to_s # Integer labels have no #length, so normalize first.
      puts run_report_header([RUN_LABEL_WIDTH, label.length].max) if print_header
      cells = [stats[:total], stats[:correct], stats[:incorrect]].map { |v| v.to_s.rjust(RUN_COLUMN_WIDTH) }
      # Pad the rounded accuracy with trailing zeros to a fixed 7 characters.
      cells << stats[:accuracy].round(5).to_s.ljust(7, '0').rjust(RUN_COLUMN_WIDTH)
      puts "#{label.rjust(RUN_LABEL_WIDTH)} #{cells.join(' ')}"
    end

    # Returns the run-report header line, laid out with the same column
    # widths print_run_report uses for its rows so the two line up.
    def run_report_header(label_width)
      columns = RUN_REPORT_COLUMNS.map { |title| title.rjust(RUN_COLUMN_WIDTH) }
      (['Run'.rjust(label_width)] + columns).join(' ')
    end

    # Prints the confusion matrix with per-row totals and recall, per-column
    # totals and precision, and the overall accuracy. Category keys are used
    # with String methods (length/ljust/rjust), so they must be Strings.
    def print_conf_mat(conf_mat)
      titles = ['Predicted ->'] + conf_mat.keys + %w[Total Recall]
      cell_size = titles.map(&:length).max
      header = titles.map { |title| title.rjust(cell_size) }.join(' ')
      rule = '-' * header.length
      puts ' Confusion Matrix '.center(header.length, '-')
      puts header
      puts rule

      predicted_totals = conf_mat.keys.map { |predicted| [predicted, 0] }.to_h
      correct = 0
      conf_mat.each do |k, rec|
        actual_total = rec.values.reduce(0, :+)
        row = [k.ljust(cell_size)] + rec.values.map { |v| v.to_s.rjust(cell_size) }
        row << actual_total.to_s.rjust(cell_size)
        row << divide(rec[k], actual_total).round(5).to_s.rjust(cell_size)
        puts row.join(' ')
        rec.each do |cat, val|
          predicted_totals[cat] += val
          correct += val if cat == k
        end
      end

      # Seeded with 0 so an empty matrix yields a zero total instead of nil.
      total = predicted_totals.values.reduce(0, :+)
      puts rule
      totals_row = ['Total'.ljust(cell_size)] + predicted_totals.values.map { |v| v.to_s.rjust(cell_size) }
      totals_row << total.to_s.rjust(cell_size) << ''.rjust(cell_size)
      puts totals_row.join(' ')
      precision_row = ['Precision'.ljust(cell_size)] + predicted_totals.keys.map do |k|
        divide(conf_mat[k][k], predicted_totals[k]).round(5).to_s.rjust(cell_size)
      end
      precision_row << 'Accuracy ->'.rjust(cell_size) << divide(correct, total).round(5).to_s.rjust(cell_size)
      puts precision_row.join(' ')
    end

    # Prints the derived statistics of every positive class in +conf_tab+.
    def print_conf_tab(conf_tab)
      conf_tab.each do |positive, tab|
        puts "# Positive class: #{positive}"
        print_derivations(conf_tab_derivations(tab))
        puts
      end
    end

    # Derives summary statistics from one positive-class table as built by
    # conf_mat_to_tab. Ratios with a zero denominator are reported as 0.0.
    #
    # NOTE(review): tab[:n][:f] sums every wrong prediction of a
    # non-positive class, whatever the actual category was; with more than
    # two categories that may include records that are not actual positives.
    # Confirm this is the intended "false negative" definition.
    def conf_tab_derivations(tab)
      true_pos = tab[:p][:t]
      false_pos = tab[:p][:f]
      true_neg = tab[:n][:t]
      false_neg = tab[:n][:f]
      positives = true_pos + false_neg
      negatives = true_neg + false_pos
      total = positives + negatives
      {
        total_population: total,
        condition_positive: positives,
        condition_negative: negatives,
        true_positive: true_pos,
        true_negative: true_neg,
        false_positive: false_pos,
        false_negative: false_neg,
        prevalence: divide(positives, total),
        specificity: divide(true_neg, negatives),
        recall: divide(true_pos, positives),
        precision: divide(true_pos, true_pos + false_pos),
        accuracy: divide(true_pos + true_neg, total),
        f1_score: divide(2 * true_pos, 2 * true_pos + false_pos + false_neg)
      }
    end

    # Prints each derivation as "Name : value", with the names humanized
    # (underscores to spaces, capitalized) and padded to a common width.
    def print_derivations(derivations)
      max_len = derivations.keys.map(&:length).max
      derivations.each do |k, v|
        puts "#{k.to_s.tr('_', ' ').capitalize.ljust(max_len)} : #{v}"
      end
    end

    # Returns a square confusion matrix { actual => { predicted => 0 } }
    # covering every pair of the given categories.
    def empty_conf_mat(categories)
      categories.map { |actual| [actual, categories.map { |predicted| [predicted, 0] }.to_h] }.to_h
    end

    # Float division that yields 0.0 instead of failing or returning
    # NaN/Infinity when the divisor is zero.
    def divide(dividend, divisor)
      divisor.zero? ? 0.0 : dividend / divisor.to_f
    end
  end
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: classifier-reborn
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.
|
4
|
+
version: 2.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Lucas Carlson
|
@@ -10,7 +10,7 @@ authors:
|
|
10
10
|
autorequire:
|
11
11
|
bindir: bin
|
12
12
|
cert_chain: []
|
13
|
-
date: 2017-
|
13
|
+
date: 2017-12-15 00:00:00.000000000 Z
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
16
|
name: fast-stemmer
|
@@ -110,14 +110,26 @@ dependencies:
|
|
110
110
|
- - ">="
|
111
111
|
- !ruby/object:Gem::Version
|
112
112
|
version: '0'
|
113
|
+
- !ruby/object:Gem::Dependency
|
114
|
+
name: redis
|
115
|
+
requirement: !ruby/object:Gem::Requirement
|
116
|
+
requirements:
|
117
|
+
- - ">="
|
118
|
+
- !ruby/object:Gem::Version
|
119
|
+
version: '0'
|
120
|
+
type: :development
|
121
|
+
prerelease: false
|
122
|
+
version_requirements: !ruby/object:Gem::Requirement
|
123
|
+
requirements:
|
124
|
+
- - ">="
|
125
|
+
- !ruby/object:Gem::Version
|
126
|
+
version: '0'
|
113
127
|
description:
|
114
128
|
email:
|
115
129
|
- lucas@rufy.com
|
116
130
|
- parkrmoore@gmail.com
|
117
131
|
- chase.gilliam@gmail.com
|
118
|
-
executables:
|
119
|
-
- bayes.rb
|
120
|
-
- summarize.rb
|
132
|
+
executables: []
|
121
133
|
extensions: []
|
122
134
|
extra_rdoc_files:
|
123
135
|
- README.markdown
|
@@ -125,8 +137,8 @@ extra_rdoc_files:
|
|
125
137
|
files:
|
126
138
|
- LICENSE
|
127
139
|
- README.markdown
|
128
|
-
-
|
129
|
-
-
|
140
|
+
- data/stopwords/ar
|
141
|
+
- data/stopwords/bn
|
130
142
|
- data/stopwords/ca
|
131
143
|
- data/stopwords/cs
|
132
144
|
- data/stopwords/da
|
@@ -135,15 +147,23 @@ files:
|
|
135
147
|
- data/stopwords/es
|
136
148
|
- data/stopwords/fi
|
137
149
|
- data/stopwords/fr
|
150
|
+
- data/stopwords/hi
|
138
151
|
- data/stopwords/hu
|
139
152
|
- data/stopwords/it
|
153
|
+
- data/stopwords/ja
|
140
154
|
- data/stopwords/nl
|
141
155
|
- data/stopwords/no
|
142
156
|
- data/stopwords/pl
|
143
157
|
- data/stopwords/pt
|
158
|
+
- data/stopwords/ru
|
144
159
|
- data/stopwords/se
|
145
160
|
- data/stopwords/tr
|
161
|
+
- data/stopwords/vi
|
162
|
+
- data/stopwords/zh
|
146
163
|
- lib/classifier-reborn.rb
|
164
|
+
- lib/classifier-reborn/backends/bayes_memory_backend.rb
|
165
|
+
- lib/classifier-reborn/backends/bayes_redis_backend.rb
|
166
|
+
- lib/classifier-reborn/backends/no_redis_error.rb
|
147
167
|
- lib/classifier-reborn/bayes.rb
|
148
168
|
- lib/classifier-reborn/category_namer.rb
|
149
169
|
- lib/classifier-reborn/extensions/hasher.rb
|
@@ -154,6 +174,7 @@ files:
|
|
154
174
|
- lib/classifier-reborn/lsi/content_node.rb
|
155
175
|
- lib/classifier-reborn/lsi/summarizer.rb
|
156
176
|
- lib/classifier-reborn/lsi/word_list.rb
|
177
|
+
- lib/classifier-reborn/validators/classifier_validator.rb
|
157
178
|
- lib/classifier-reborn/version.rb
|
158
179
|
homepage: https://github.com/jekyll/classifier-reborn
|
159
180
|
licenses:
|
@@ -176,8 +197,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
176
197
|
version: '0'
|
177
198
|
requirements: []
|
178
199
|
rubyforge_project:
|
179
|
-
rubygems_version: 2.
|
200
|
+
rubygems_version: 2.6.14
|
180
201
|
signing_key:
|
181
202
|
specification_version: 2
|
182
203
|
summary: A general classifier module to allow Bayesian and other types of classifications.
|
183
204
|
test_files: []
|
205
|
+
has_rdoc: true
|
data/bin/bayes.rb
DELETED
@@ -1,36 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
|
3
|
-
begin
|
4
|
-
require 'rubygems'
|
5
|
-
require 'classifier'
|
6
|
-
rescue
|
7
|
-
require 'classifier'
|
8
|
-
end
|
9
|
-
|
10
|
-
require 'madeleine'
|
11
|
-
|
12
|
-
m = SnapshotMadeleine.new(File.expand_path('~/.bayes_data')) do
|
13
|
-
ClassifierReborn::Bayes.new 'Interesting', 'Uninteresting'
|
14
|
-
end
|
15
|
-
|
16
|
-
case ARGV[0]
|
17
|
-
when 'add'
|
18
|
-
case ARGV[1].downcase
|
19
|
-
when 'interesting'
|
20
|
-
m.system.train_interesting File.open(ARGV[2]).read
|
21
|
-
puts "#{ARGV[2]} has been classified as interesting"
|
22
|
-
when 'uninteresting'
|
23
|
-
m.system.train_uninteresting File.open(ARGV[2]).read
|
24
|
-
puts "#{ARGV[2]} has been classified as uninteresting"
|
25
|
-
else
|
26
|
-
puts 'Invalid category: choose between interesting and uninteresting'
|
27
|
-
exit(1)
|
28
|
-
end
|
29
|
-
when 'classify'
|
30
|
-
puts m.system.classify(File.open(ARGV[1]).read)
|
31
|
-
else
|
32
|
-
puts 'Invalid option: choose add [category] [file] or clasify [file]'
|
33
|
-
exit(-1)
|
34
|
-
end
|
35
|
-
|
36
|
-
m.take_snapshot
|
data/bin/summarize.rb
DELETED
@@ -1,16 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
|
3
|
-
begin
|
4
|
-
require 'rubygems'
|
5
|
-
require 'classifier'
|
6
|
-
rescue
|
7
|
-
require 'classifier'
|
8
|
-
end
|
9
|
-
|
10
|
-
require 'open-uri'
|
11
|
-
|
12
|
-
num = ARGV[1].to_i
|
13
|
-
num = num < 1 ? 10 : num
|
14
|
-
|
15
|
-
text = open(ARGV.first).read
|
16
|
-
puts text.gsub(/<[^>]+>/, '').gsub(/[\s]+/, ' ').summary(num)
|