classifier-reborn 2.1.0 → 2.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/LICENSE +74 -1
- data/README.markdown +57 -227
- data/data/stopwords/ar +104 -0
- data/data/stopwords/bn +362 -0
- data/data/stopwords/hi +97 -0
- data/data/stopwords/ja +43 -0
- data/data/stopwords/ru +420 -0
- data/data/stopwords/tr +199 -30
- data/data/stopwords/vi +647 -0
- data/data/stopwords/zh +125 -0
- data/lib/classifier-reborn.rb +9 -0
- data/lib/classifier-reborn/backends/bayes_memory_backend.rb +75 -0
- data/lib/classifier-reborn/backends/bayes_redis_backend.rb +107 -0
- data/lib/classifier-reborn/backends/no_redis_error.rb +12 -0
- data/lib/classifier-reborn/bayes.rb +98 -38
- data/lib/classifier-reborn/category_namer.rb +0 -1
- data/lib/classifier-reborn/extensions/hasher.rb +1 -1
- data/lib/classifier-reborn/lsi.rb +5 -1
- data/lib/classifier-reborn/lsi/word_list.rb +2 -4
- data/lib/classifier-reborn/validators/classifier_validator.rb +169 -0
- data/lib/classifier-reborn/version.rb +1 -1
- metadata +30 -8
- data/bin/bayes.rb +0 -36
- data/bin/summarize.rb +0 -16
@@ -21,7 +21,7 @@ module ClassifierReborn
|
|
21
21
|
|
22
22
|
# Return a word hash without extra punctuation or short symbols, just stemmed words
|
23
23
|
def clean_word_hash(str, language = 'en', enable_stemmer = true)
|
24
|
-
word_hash_for_words
|
24
|
+
word_hash_for_words(str.gsub(/[^\p{WORD}\s]/, '').downcase.split, language, enable_stemmer)
|
25
25
|
end
|
26
26
|
|
27
27
|
def word_hash_for_words(words, language = 'en', enable_stemmer = true)
|
@@ -86,7 +86,7 @@ module ClassifierReborn
|
|
86
86
|
add_item(item)
|
87
87
|
end
|
88
88
|
|
89
|
-
# Returns
|
89
|
+
# Returns categories for a given indexed item. You are free to add and remove
|
90
90
|
# items from this as you see fit. It does not invalidate an index to change its categories.
|
91
91
|
def categories_for(item)
|
92
92
|
return [] unless @items[item]
|
@@ -300,6 +300,10 @@ module ClassifierReborn
|
|
300
300
|
top_n.collect { |x| @word_list.word_for_index(content_vector_array.index(x)) }
|
301
301
|
end
|
302
302
|
|
303
|
+
# Re-runs initialize on this instance, passing along its current
# @auto_rebuild and @cache_node_vectors values.
# NOTE(review): presumably this returns the index to its freshly-constructed
# (empty) state — initialize is not visible in this hunk, confirm there.
def reset
|
304
|
+
initialize(auto_rebuild: @auto_rebuild, cache_node_vectors: @cache_node_vectors)
|
305
|
+
end
|
306
|
+
|
303
307
|
private
|
304
308
|
|
305
309
|
def build_reduced_matrix(matrix, cutoff = 0.75)
|
@@ -13,14 +13,12 @@ module ClassifierReborn
|
|
13
13
|
|
14
14
|
# Adds a word (if it is new) and assigns it a unique dimension.
|
15
15
|
def add_word(word)
|
16
|
-
|
17
|
-
@location_table[term] = @location_table.size unless @location_table[term]
|
16
|
+
@location_table[word] = @location_table.size unless @location_table[word]
|
18
17
|
end
|
19
18
|
|
20
19
|
# Returns the dimension of the word or nil if the word is not in the space.
|
21
20
|
def [](lookup)
|
22
|
-
|
23
|
-
@location_table[term]
|
21
|
+
@location_table[lookup]
|
24
22
|
end
|
25
23
|
|
26
24
|
def word_for_index(ind)
|
@@ -0,0 +1,169 @@
|
|
1
|
+
module ClassifierReborn
  # Helpers for measuring how well a classifier performs: k-fold cross
  # validation, single train/test validation, and plain-text reports
  # (per-run accuracy, confusion matrix, per-class derived statistics).
  #
  # A "confusion matrix" here is a nested Hash indexed as
  # conf_mat[actual_category][predicted_category] => count.
  #
  # A classifier passed to these methods must respond to #reset, #train,
  # #classify and #categories.
  module ClassifierValidator
    module_function

    # Runs k-fold cross validation and prints a report.
    #
    # classifier  - a classifier instance, or a String naming a class under
    #               ClassifierReborn (instantiated with +options+).
    # sample_data - Array of records; each record's first element is the
    #               category and its last element is the text.
    # fold        - number of partitions (default 10).
    # options     - constructor arguments used when +classifier+ is a String.
    #
    # The data is shuffled into a private copy, so the caller's array is left
    # untouched. Each partition holds sample_data.length / fold records; when
    # the length is not a multiple of +fold+ the leftover records are unused.
    #
    # Raises ArgumentError when there are fewer records than folds.
    # Returns whatever generate_report returns.
    def cross_validate(classifier, sample_data, fold = 10, *options)
      classifier = ClassifierReborn.const_get(classifier).new(*options) if classifier.is_a?(String)
      shuffled = sample_data.shuffle
      partition_size = shuffled.length / fold
      unless partition_size > 0
        raise ArgumentError, "cannot split #{shuffled.length} records into #{fold} folds"
      end

      partitions = shuffled.each_slice(partition_size).first(fold)
      conf_mats = Array.new(fold) do |i|
        # Everything except partition i is training data. Non-bang flatten is
        # required: flatten! returns nil when nothing changes (e.g. fold == 1).
        training_data = (partitions[0...i] + partitions[(i + 1)..-1]).flatten(1)
        validate(classifier, training_data, partitions[i])
      end
      classifier.reset
      generate_report(conf_mats)
    end

    # Resets the classifier, trains it on +training_data+ and evaluates it
    # against +test_data+.
    #
    # classifier - instance, or String class name under ClassifierReborn
    #              (instantiated with +options+).
    # Records in both data sets are [category, ..., text].
    #
    # Returns the confusion matrix produced by evaluate.
    def validate(classifier, training_data, test_data, *options)
      classifier = ClassifierReborn.const_get(classifier).new(*options) if classifier.is_a?(String)
      classifier.reset
      training_data.each { |rec| classifier.train(rec.first, rec.last) }
      evaluate(classifier, test_data)
    end

    # Classifies every record of +test_data+ and tallies the outcome.
    #
    # The actual category of a record is normalised with
    # tr('_', ' ').capitalize before it is used as a row key; records for
    # which the classifier returns nil are not counted.
    # NOTE(review): a record whose normalised category is not among
    # classifier.categories has no row in the matrix and will raise.
    #
    # Returns the confusion matrix (conf_mat[actual][predicted] => count).
    def evaluate(classifier, test_data)
      conf_mat = empty_conf_mat(classifier.categories.sort)
      test_data.each do |rec|
        actual = rec.first.tr('_', ' ').capitalize
        predicted = classifier.classify(rec.last)
        conf_mat[actual][predicted] += 1 unless predicted.nil?
      end
      conf_mat
    end

    # Prints a full report for one or more confusion matrices (given as
    # separate arguments or as an Array): one accuracy line per run when there
    # are several, an "All" line for the accumulated matrix, the accumulated
    # confusion matrix, and the per-class derivations.
    #
    # Returns the value of print_conf_tab for the accumulated matrix.
    def generate_report(*conf_mats)
      conf_mats = conf_mats.flatten
      # Use every category seen in any run so that runs with differing
      # category sets can still be accumulated.
      categories = conf_mats.flat_map { |mat| mat.keys + mat.values.flat_map(&:keys) }.uniq.sort
      accumulated_conf_mat = conf_mats.length == 1 ? conf_mats.first : empty_conf_mat(categories)
      header = "Run Total Correct Incorrect Accuracy"
      puts
      puts " Run Report ".center(header.length, "-")
      puts header
      puts "-" * header.length
      if conf_mats.length > 1
        conf_mats.each_with_index do |conf_mat, i|
          run_report = build_run_report(conf_mat)
          print_run_report(run_report, i + 1)
          conf_mat.each do |actual, cols|
            cols.each do |predicted, v|
              accumulated_conf_mat[actual][predicted] += v
            end
          end
        end
        puts "-" * header.length
      end
      run_report = build_run_report(accumulated_conf_mat)
      print_run_report(run_report, "All")
      puts
      print_conf_mat(accumulated_conf_mat)
      puts
      conf_tab = conf_mat_to_tab(accumulated_conf_mat)
      print_conf_tab(conf_tab)
    end

    # Summarises a confusion matrix.
    #
    # Returns a Hash with :total, :correct (diagonal cells), :incorrect
    # (off-diagonal cells) and :accuracy (correct / total, 0.0 when empty).
    def build_run_report(conf_mat)
      correct = incorrect = 0
      conf_mat.each do |actual, cols|
        cols.each do |predicted, v|
          if actual == predicted
            correct += v
          else
            incorrect += v
          end
        end
      end
      total = correct + incorrect
      { total: total, correct: correct, incorrect: incorrect, accuracy: divide(correct, total) }
    end

    # Converts a confusion matrix into one 2x2 table per category, treating
    # that category as "positive" and all others as "negative".
    #
    # Each table is { p: { t:, f: }, n: { t:, f: } } where :p / :n is the
    # predicted side and :t / :f says whether that side matches reality:
    #   p.t true positive,  p.f false positive,
    #   n.t true negative,  n.f false negative.
    # A negative prediction is "true" whenever the actual class is also not
    # the positive class, even if the two non-positive classes differ.
    #
    # Returns a Hash of category => table.
    def conf_mat_to_tab(conf_mat)
      conf_tab = Hash.new { |h, k| h[k] = { p: { t: 0, f: 0 }, n: { t: 0, f: 0 } } }
      conf_mat.each_key do |positive|
        conf_mat.each do |actual, cols|
          cols.each do |predicted, v|
            predicted_positive = (predicted == positive)
            actual_positive = (actual == positive)
            side = predicted_positive ? :p : :n
            truth = (actual_positive == predicted_positive) ? :t : :f
            conf_tab[positive][side][truth] += v
          end
        end
      end
      conf_tab
    end

    # Prints one line of the run report, optionally preceded by the header.
    #
    # stats  - Hash as returned by build_run_report.
    # prefix - run label; may be a String or a number.
    def print_run_report(stats, prefix = "", print_header = false)
      label = prefix.to_s
      puts "#{"Run".rjust([3, label.length].max)} Total Correct Incorrect Accuracy" if print_header
      puts "#{label.rjust(3)} #{stats[:total].to_s.rjust(9)} #{stats[:correct].to_s.rjust(9)} #{stats[:incorrect].to_s.rjust(9)} #{stats[:accuracy].round(5).to_s.ljust(7, '0').rjust(9)}"
    end

    # Prints the confusion matrix as a table: one row per actual category with
    # its total and recall, followed by per-column totals, per-column
    # precision and the overall accuracy.
    def print_conf_mat(conf_mat)
      header = ["Predicted ->"] + conf_mat.keys + ["Total", "Recall"]
      cell_size = header.map(&:length).max
      header = header.map { |h| h.rjust(cell_size) }.join(" ")
      puts " Confusion Matrix ".center(header.length, "-")
      puts header
      puts "-" * header.length
      predicted_totals = conf_mat.keys.map { |predicted| [predicted, 0] }.to_h
      correct = 0
      conf_mat.each do |k, rec|
        # reduce with an explicit 0 so an empty row sums to 0 rather than nil.
        actual_total = rec.values.reduce(0, :+)
        row = [k.ljust(cell_size)] +
              rec.values.map { |v| v.to_s.rjust(cell_size) } +
              [actual_total.to_s.rjust(cell_size), divide(rec[k], actual_total).round(5).to_s.rjust(cell_size)]
        puts row.join(" ")
        rec.each do |cat, val|
          predicted_totals[cat] += val
          correct += val if cat == k
        end
      end
      total = predicted_totals.values.reduce(0, :+)
      puts "-" * header.length
      totals_row = ["Total".ljust(cell_size)] +
                   predicted_totals.values.map { |v| v.to_s.rjust(cell_size) } +
                   [total.to_s.rjust(cell_size), "".rjust(cell_size)]
      puts totals_row.join(" ")
      precision_row = ["Precision".ljust(cell_size)] +
                      predicted_totals.keys.map { |k| divide(conf_mat[k][k], predicted_totals[k]).round(5).to_s.rjust(cell_size) } +
                      ["Accuracy ->".rjust(cell_size), divide(correct, total).round(5).to_s.rjust(cell_size)]
      puts precision_row.join(" ")
    end

    # Prints the derived statistics of every per-category 2x2 table.
    def print_conf_tab(conf_tab)
      conf_tab.each do |positive, tab|
        puts "# Positive class: #{positive}"
        derivations = conf_tab_derivations(tab)
        print_derivations(derivations)
        puts
      end
    end

    # Computes the standard statistics of a single 2x2 table (see
    # conf_mat_to_tab for the table layout).
    #
    # Returns a Hash of counts and ratios; every ratio is 0.0 when its
    # denominator is zero.
    def conf_tab_derivations(tab)
      positives = tab[:p][:t] + tab[:n][:f]
      negatives = tab[:n][:t] + tab[:p][:f]
      total = positives + negatives
      {
        total_population: total,
        condition_positive: positives,
        condition_negative: negatives,
        true_positive: tab[:p][:t],
        true_negative: tab[:n][:t],
        false_positive: tab[:p][:f],
        false_negative: tab[:n][:f],
        prevalence: divide(positives, total),
        specificity: divide(tab[:n][:t], negatives),
        recall: divide(tab[:p][:t], positives),
        precision: divide(tab[:p][:t], tab[:p][:t] + tab[:p][:f]),
        accuracy: divide(tab[:p][:t] + tab[:n][:t], total),
        f1_score: divide(2 * tab[:p][:t], 2 * tab[:p][:t] + tab[:p][:f] + tab[:n][:f])
      }
    end

    # Prints each derivation as "Humanised name : value", with the names
    # padded to a common width.
    def print_derivations(derivations)
      max_len = derivations.keys.map(&:length).max
      derivations.each do |k, v|
        puts k.to_s.tr('_', ' ').capitalize.ljust(max_len) + " : " + v.to_s
      end
    end

    # Builds a square confusion matrix for +categories+ with every cell 0.
    def empty_conf_mat(categories)
      categories.map { |actual| [actual, categories.map { |predicted| [predicted, 0] }.to_h] }.to_h
    end

    # Float division that yields 0.0 instead of failing or returning
    # NaN/Infinity when the divisor is zero.
    def divide(dividend, divisor)
      divisor.zero? ? 0.0 : dividend / divisor.to_f
    end
  end
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: classifier-reborn
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.1.0
|
4
|
+
version: 2.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Lucas Carlson
|
@@ -10,7 +10,7 @@ authors:
|
|
10
10
|
autorequire:
|
11
11
|
bindir: bin
|
12
12
|
cert_chain: []
|
13
|
-
date: 2017-
|
13
|
+
date: 2017-12-15 00:00:00.000000000 Z
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
16
|
name: fast-stemmer
|
@@ -110,14 +110,26 @@ dependencies:
|
|
110
110
|
- - ">="
|
111
111
|
- !ruby/object:Gem::Version
|
112
112
|
version: '0'
|
113
|
+
- !ruby/object:Gem::Dependency
|
114
|
+
name: redis
|
115
|
+
requirement: !ruby/object:Gem::Requirement
|
116
|
+
requirements:
|
117
|
+
- - ">="
|
118
|
+
- !ruby/object:Gem::Version
|
119
|
+
version: '0'
|
120
|
+
type: :development
|
121
|
+
prerelease: false
|
122
|
+
version_requirements: !ruby/object:Gem::Requirement
|
123
|
+
requirements:
|
124
|
+
- - ">="
|
125
|
+
- !ruby/object:Gem::Version
|
126
|
+
version: '0'
|
113
127
|
description:
|
114
128
|
email:
|
115
129
|
- lucas@rufy.com
|
116
130
|
- parkrmoore@gmail.com
|
117
131
|
- chase.gilliam@gmail.com
|
118
|
-
executables:
|
119
|
-
- bayes.rb
|
120
|
-
- summarize.rb
|
132
|
+
executables: []
|
121
133
|
extensions: []
|
122
134
|
extra_rdoc_files:
|
123
135
|
- README.markdown
|
@@ -125,8 +137,8 @@ extra_rdoc_files:
|
|
125
137
|
files:
|
126
138
|
- LICENSE
|
127
139
|
- README.markdown
|
128
|
-
-
|
129
|
-
-
|
140
|
+
- data/stopwords/ar
|
141
|
+
- data/stopwords/bn
|
130
142
|
- data/stopwords/ca
|
131
143
|
- data/stopwords/cs
|
132
144
|
- data/stopwords/da
|
@@ -135,15 +147,23 @@ files:
|
|
135
147
|
- data/stopwords/es
|
136
148
|
- data/stopwords/fi
|
137
149
|
- data/stopwords/fr
|
150
|
+
- data/stopwords/hi
|
138
151
|
- data/stopwords/hu
|
139
152
|
- data/stopwords/it
|
153
|
+
- data/stopwords/ja
|
140
154
|
- data/stopwords/nl
|
141
155
|
- data/stopwords/no
|
142
156
|
- data/stopwords/pl
|
143
157
|
- data/stopwords/pt
|
158
|
+
- data/stopwords/ru
|
144
159
|
- data/stopwords/se
|
145
160
|
- data/stopwords/tr
|
161
|
+
- data/stopwords/vi
|
162
|
+
- data/stopwords/zh
|
146
163
|
- lib/classifier-reborn.rb
|
164
|
+
- lib/classifier-reborn/backends/bayes_memory_backend.rb
|
165
|
+
- lib/classifier-reborn/backends/bayes_redis_backend.rb
|
166
|
+
- lib/classifier-reborn/backends/no_redis_error.rb
|
147
167
|
- lib/classifier-reborn/bayes.rb
|
148
168
|
- lib/classifier-reborn/category_namer.rb
|
149
169
|
- lib/classifier-reborn/extensions/hasher.rb
|
@@ -154,6 +174,7 @@ files:
|
|
154
174
|
- lib/classifier-reborn/lsi/content_node.rb
|
155
175
|
- lib/classifier-reborn/lsi/summarizer.rb
|
156
176
|
- lib/classifier-reborn/lsi/word_list.rb
|
177
|
+
- lib/classifier-reborn/validators/classifier_validator.rb
|
157
178
|
- lib/classifier-reborn/version.rb
|
158
179
|
homepage: https://github.com/jekyll/classifier-reborn
|
159
180
|
licenses:
|
@@ -176,8 +197,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
176
197
|
version: '0'
|
177
198
|
requirements: []
|
178
199
|
rubyforge_project:
|
179
|
-
rubygems_version: 2.
|
200
|
+
rubygems_version: 2.6.14
|
180
201
|
signing_key:
|
181
202
|
specification_version: 2
|
182
203
|
summary: A general classifier module to allow Bayesian and other types of classifications.
|
183
204
|
test_files: []
|
205
|
+
has_rdoc: true
|
data/bin/bayes.rb
DELETED
@@ -1,36 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
|
3
|
-
begin
|
4
|
-
require 'rubygems'
|
5
|
-
require 'classifier'
|
6
|
-
rescue
|
7
|
-
require 'classifier'
|
8
|
-
end
|
9
|
-
|
10
|
-
require 'madeleine'
|
11
|
-
|
12
|
-
m = SnapshotMadeleine.new(File.expand_path('~/.bayes_data')) do
|
13
|
-
ClassifierReborn::Bayes.new 'Interesting', 'Uninteresting'
|
14
|
-
end
|
15
|
-
|
16
|
-
case ARGV[0]
|
17
|
-
when 'add'
|
18
|
-
case ARGV[1].downcase
|
19
|
-
when 'interesting'
|
20
|
-
m.system.train_interesting File.open(ARGV[2]).read
|
21
|
-
puts "#{ARGV[2]} has been classified as interesting"
|
22
|
-
when 'uninteresting'
|
23
|
-
m.system.train_uninteresting File.open(ARGV[2]).read
|
24
|
-
puts "#{ARGV[2]} has been classified as uninteresting"
|
25
|
-
else
|
26
|
-
puts 'Invalid category: choose between interesting and uninteresting'
|
27
|
-
exit(1)
|
28
|
-
end
|
29
|
-
when 'classify'
|
30
|
-
puts m.system.classify(File.open(ARGV[1]).read)
|
31
|
-
else
|
32
|
-
puts 'Invalid option: choose add [category] [file] or clasify [file]'
|
33
|
-
exit(-1)
|
34
|
-
end
|
35
|
-
|
36
|
-
m.take_snapshot
|
data/bin/summarize.rb
DELETED
@@ -1,16 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
|
3
|
-
begin
|
4
|
-
require 'rubygems'
|
5
|
-
require 'classifier'
|
6
|
-
rescue
|
7
|
-
require 'classifier'
|
8
|
-
end
|
9
|
-
|
10
|
-
require 'open-uri'
|
11
|
-
|
12
|
-
num = ARGV[1].to_i
|
13
|
-
num = num < 1 ? 10 : num
|
14
|
-
|
15
|
-
text = open(ARGV.first).read
|
16
|
-
puts text.gsub(/<[^>]+>/, '').gsub(/[\s]+/, ' ').summary(num)
|