classifier-reborn 2.1.0 → 2.2.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -2,7 +2,6 @@
2
2
  # Copyright:: Copyright (c) 2005 Lucas Carlson
3
3
  # License:: LGPL
4
4
 
5
- require 'fast_stemmer'
6
5
  require 'classifier-reborn/extensions/hasher'
7
6
 
8
7
  module ClassifierReborn
@@ -21,7 +21,7 @@ module ClassifierReborn
21
21
 
22
22
  # Return a word hash without extra punctuation or short symbols, just stemmed words
23
23
  def clean_word_hash(str, language = 'en', enable_stemmer = true)
24
- word_hash_for_words str.gsub(/[^\p{WORD}\s]/, '').downcase.split, language, enable_stemmer
24
+ word_hash_for_words(str.gsub(/[^\p{WORD}\s]/, '').downcase.split, language, enable_stemmer)
25
25
  end
26
26
 
27
27
  def word_hash_for_words(words, language = 'en', enable_stemmer = true)
@@ -86,7 +86,7 @@ module ClassifierReborn
86
86
  add_item(item)
87
87
  end
88
88
 
89
- # Returns the categories for a given indexed items. You are free to add and remove
89
+ # Returns categories for a given indexed item. You are free to add and remove
90
90
  # items from this as you see fit. It does not invalide an index to change its categories.
91
91
  def categories_for(item)
92
92
  return [] unless @items[item]
@@ -300,6 +300,10 @@ module ClassifierReborn
300
300
  top_n.collect { |x| @word_list.word_for_index(content_vector_array.index(x)) }
301
301
  end
302
302
 
303
+ def reset
304
+ initialize(auto_rebuild: @auto_rebuild, cache_node_vectors: @cache_node_vectors)
305
+ end
306
+
303
307
  private
304
308
 
305
309
  def build_reduced_matrix(matrix, cutoff = 0.75)
@@ -13,14 +13,12 @@ module ClassifierReborn
13
13
 
14
14
  # Adds a word (if it is new) and assigns it a unique dimension.
15
15
  def add_word(word)
16
- term = word
17
- @location_table[term] = @location_table.size unless @location_table[term]
16
+ @location_table[word] = @location_table.size unless @location_table[word]
18
17
  end
19
18
 
20
19
  # Returns the dimension of the word or nil if the word is not in the space.
21
20
  def [](lookup)
22
- term = lookup
23
- @location_table[term]
21
+ @location_table[lookup]
24
22
  end
25
23
 
26
24
  def word_for_index(ind)
@@ -0,0 +1,169 @@
1
+ module ClassifierReborn
2
+ module ClassifierValidator
3
+
4
+ module_function
5
+
6
+ def cross_validate(classifier, sample_data, fold=10, *options)
7
+ classifier = ClassifierReborn::const_get(classifier).new(options) if classifier.is_a?(String)
8
+ sample_data.shuffle!
9
+ partition_size = sample_data.length / fold
10
+ partitioned_data = sample_data.each_slice(partition_size)
11
+ conf_mats = []
12
+ fold.times do |i|
13
+ training_data = partitioned_data.take(fold)
14
+ test_data = training_data.slice!(i)
15
+ conf_mats << validate(classifier, training_data.flatten!(1), test_data)
16
+ end
17
+ classifier.reset()
18
+ generate_report(conf_mats)
19
+ end
20
+
21
+ def validate(classifier, training_data, test_data, *options)
22
+ classifier = ClassifierReborn::const_get(classifier).new(options) if classifier.is_a?(String)
23
+ classifier.reset()
24
+ training_data.each do |rec|
25
+ classifier.train(rec.first, rec.last)
26
+ end
27
+ evaluate(classifier, test_data)
28
+ end
29
+
30
+ def evaluate(classifier, test_data)
31
+ conf_mat = empty_conf_mat(classifier.categories.sort)
32
+ test_data.each do |rec|
33
+ actual = rec.first.tr('_', ' ').capitalize
34
+ predicted = classifier.classify(rec.last)
35
+ conf_mat[actual][predicted] += 1 unless predicted.nil?
36
+ end
37
+ conf_mat
38
+ end
39
+
40
+ def generate_report(*conf_mats)
41
+ conf_mats.flatten!
42
+ accumulated_conf_mat = conf_mats.length == 1 ? conf_mats.first : empty_conf_mat(conf_mats.first.keys.sort)
43
+ header = "Run Total Correct Incorrect Accuracy"
44
+ puts
45
+ puts " Run Report ".center(header.length, "-")
46
+ puts header
47
+ puts "-" * header.length
48
+ if conf_mats.length > 1
49
+ conf_mats.each_with_index do |conf_mat, i|
50
+ run_report = build_run_report(conf_mat)
51
+ print_run_report(run_report, i+1)
52
+ conf_mat.each do |actual, cols|
53
+ cols.each do |predicted, v|
54
+ accumulated_conf_mat[actual][predicted] += v
55
+ end
56
+ end
57
+ end
58
+ puts "-" * header.length
59
+ end
60
+ run_report = build_run_report(accumulated_conf_mat)
61
+ print_run_report(run_report, "All")
62
+ puts
63
+ print_conf_mat(accumulated_conf_mat)
64
+ puts
65
+ conf_tab = conf_mat_to_tab(accumulated_conf_mat)
66
+ print_conf_tab(conf_tab)
67
+ end
68
+
69
+ def build_run_report(conf_mat)
70
+ correct = incorrect = 0
71
+ conf_mat.each do |actual, cols|
72
+ cols.each do |predicted, v|
73
+ if actual == predicted
74
+ correct += v
75
+ else
76
+ incorrect += v
77
+ end
78
+ end
79
+ end
80
+ total = correct + incorrect
81
+ {total: total, correct: correct, incorrect: incorrect, accuracy: divide(correct, total)}
82
+ end
83
+
84
+ def conf_mat_to_tab(conf_mat)
85
+ conf_tab = Hash.new {|h, k| h[k] = {p: {t: 0, f: 0}, n: {t: 0, f: 0}}}
86
+ conf_mat.each_key do |positive|
87
+ conf_mat.each do |actual, cols|
88
+ cols.each do |predicted, v|
89
+ conf_tab[positive][positive == predicted ? :p : :n][actual == predicted ? :t : :f] += v
90
+ end
91
+ end
92
+ end
93
+ conf_tab
94
+ end
95
+
96
+ def print_run_report(stats, prefix="", print_header=false)
97
+ puts "#{"Run".rjust([3, prefix.length].max)} Total Correct Incorrect Accuracy" if print_header
98
+ puts "#{prefix.to_s.rjust(3)} #{stats[:total].to_s.rjust(9)} #{stats[:correct].to_s.rjust(9)} #{stats[:incorrect].to_s.rjust(9)} #{stats[:accuracy].round(5).to_s.ljust(7, '0').rjust(9)}"
99
+ end
100
+
101
+ def print_conf_mat(conf_mat)
102
+ header = ["Predicted ->"] + conf_mat.keys + ["Total", "Recall"]
103
+ cell_size = header.map(&:length).max
104
+ header = header.map{|h| h.rjust(cell_size)}.join(" ")
105
+ puts " Confusion Matrix ".center(header.length, "-")
106
+ puts header
107
+ puts "-" * header.length
108
+ predicted_totals = conf_mat.keys.map{|predicted| [predicted, 0]}.to_h
109
+ correct = 0
110
+ conf_mat.each do |k, rec|
111
+ actual_total = rec.values.reduce(:+)
112
+ puts ([k.ljust(cell_size)] + rec.values.map{|v| v.to_s.rjust(cell_size)} + [actual_total.to_s.rjust(cell_size), divide(rec[k], actual_total).round(5).to_s.rjust(cell_size)]).join(" ")
113
+ rec.each do |cat, val|
114
+ predicted_totals[cat] += val
115
+ correct += val if cat == k
116
+ end
117
+ end
118
+ total = predicted_totals.values.reduce(:+)
119
+ puts "-" * header.length
120
+ puts (["Total".ljust(cell_size)] + predicted_totals.values.map{|v| v.to_s.rjust(cell_size)} + [total.to_s.rjust(cell_size), "".rjust(cell_size)]).join(" ")
121
+ puts (["Precision".ljust(cell_size)] + predicted_totals.keys.map{|k| divide(conf_mat[k][k], predicted_totals[k]).round(5).to_s.rjust(cell_size)} + ["Accuracy ->".rjust(cell_size), divide(correct, total).round(5).to_s.rjust(cell_size)]).join(" ")
122
+ end
123
+
124
+ def print_conf_tab(conf_tab)
125
+ conf_tab.each do |positive, tab|
126
+ puts "# Positive class: #{positive}"
127
+ derivations = conf_tab_derivations(tab)
128
+ print_derivations(derivations)
129
+ puts
130
+ end
131
+ end
132
+
133
+ def conf_tab_derivations(tab)
134
+ positives = tab[:p][:t] + tab[:n][:f]
135
+ negatives = tab[:n][:t] + tab[:p][:f]
136
+ total = positives + negatives
137
+ {
138
+ total_population: positives + negatives,
139
+ condition_positive: positives,
140
+ condition_negative: negatives,
141
+ true_positive: tab[:p][:t],
142
+ true_negative: tab[:n][:t],
143
+ false_positive: tab[:p][:f],
144
+ false_negative: tab[:n][:f],
145
+ prevalence: divide(positives, total),
146
+ specificity: divide(tab[:n][:t], negatives),
147
+ recall: divide(tab[:p][:t], positives),
148
+ precision: divide(tab[:p][:t], tab[:p][:t] + tab[:p][:f]),
149
+ accuracy: divide(tab[:p][:t] + tab[:n][:t], total),
150
+ f1_score: divide(2 * tab[:p][:t], 2 * tab[:p][:t] + tab[:p][:f] + tab[:n][:f])
151
+ }
152
+ end
153
+
154
+ def print_derivations(derivations)
155
+ max_len = derivations.keys.map(&:length).max
156
+ derivations.each do |k, v|
157
+ puts k.to_s.tr('_', ' ').capitalize.ljust(max_len) + " : " + v.to_s
158
+ end
159
+ end
160
+
161
+ def empty_conf_mat(categories)
162
+ categories.map{|actual| [actual, categories.map{|predicted| [predicted, 0]}.to_h]}.to_h
163
+ end
164
+
165
+ def divide(dividend, divisor)
166
+ divisor.zero? ? 0.0 : dividend / divisor.to_f
167
+ end
168
+ end
169
+ end
@@ -1,3 +1,3 @@
1
1
  module ClassifierReborn
2
- VERSION = '2.1.0'
2
+ VERSION = '2.2.0'
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: classifier-reborn
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.1.0
4
+ version: 2.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Lucas Carlson
@@ -10,7 +10,7 @@ authors:
10
10
  autorequire:
11
11
  bindir: bin
12
12
  cert_chain: []
13
- date: 2017-01-01 00:00:00.000000000 Z
13
+ date: 2017-12-15 00:00:00.000000000 Z
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
16
16
  name: fast-stemmer
@@ -110,14 +110,26 @@ dependencies:
110
110
  - - ">="
111
111
  - !ruby/object:Gem::Version
112
112
  version: '0'
113
+ - !ruby/object:Gem::Dependency
114
+ name: redis
115
+ requirement: !ruby/object:Gem::Requirement
116
+ requirements:
117
+ - - ">="
118
+ - !ruby/object:Gem::Version
119
+ version: '0'
120
+ type: :development
121
+ prerelease: false
122
+ version_requirements: !ruby/object:Gem::Requirement
123
+ requirements:
124
+ - - ">="
125
+ - !ruby/object:Gem::Version
126
+ version: '0'
113
127
  description:
114
128
  email:
115
129
  - lucas@rufy.com
116
130
  - parkrmoore@gmail.com
117
131
  - chase.gilliam@gmail.com
118
- executables:
119
- - bayes.rb
120
- - summarize.rb
132
+ executables: []
121
133
  extensions: []
122
134
  extra_rdoc_files:
123
135
  - README.markdown
@@ -125,8 +137,8 @@ extra_rdoc_files:
125
137
  files:
126
138
  - LICENSE
127
139
  - README.markdown
128
- - bin/bayes.rb
129
- - bin/summarize.rb
140
+ - data/stopwords/ar
141
+ - data/stopwords/bn
130
142
  - data/stopwords/ca
131
143
  - data/stopwords/cs
132
144
  - data/stopwords/da
@@ -135,15 +147,23 @@ files:
135
147
  - data/stopwords/es
136
148
  - data/stopwords/fi
137
149
  - data/stopwords/fr
150
+ - data/stopwords/hi
138
151
  - data/stopwords/hu
139
152
  - data/stopwords/it
153
+ - data/stopwords/ja
140
154
  - data/stopwords/nl
141
155
  - data/stopwords/no
142
156
  - data/stopwords/pl
143
157
  - data/stopwords/pt
158
+ - data/stopwords/ru
144
159
  - data/stopwords/se
145
160
  - data/stopwords/tr
161
+ - data/stopwords/vi
162
+ - data/stopwords/zh
146
163
  - lib/classifier-reborn.rb
164
+ - lib/classifier-reborn/backends/bayes_memory_backend.rb
165
+ - lib/classifier-reborn/backends/bayes_redis_backend.rb
166
+ - lib/classifier-reborn/backends/no_redis_error.rb
147
167
  - lib/classifier-reborn/bayes.rb
148
168
  - lib/classifier-reborn/category_namer.rb
149
169
  - lib/classifier-reborn/extensions/hasher.rb
@@ -154,6 +174,7 @@ files:
154
174
  - lib/classifier-reborn/lsi/content_node.rb
155
175
  - lib/classifier-reborn/lsi/summarizer.rb
156
176
  - lib/classifier-reborn/lsi/word_list.rb
177
+ - lib/classifier-reborn/validators/classifier_validator.rb
157
178
  - lib/classifier-reborn/version.rb
158
179
  homepage: https://github.com/jekyll/classifier-reborn
159
180
  licenses:
@@ -176,8 +197,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
176
197
  version: '0'
177
198
  requirements: []
178
199
  rubyforge_project:
179
- rubygems_version: 2.5.2
200
+ rubygems_version: 2.6.14
180
201
  signing_key:
181
202
  specification_version: 2
182
203
  summary: A general classifier module to allow Bayesian and other types of classifications.
183
204
  test_files: []
205
+ has_rdoc: true
@@ -1,36 +0,0 @@
1
- #!/usr/bin/env ruby
2
-
3
- begin
4
- require 'rubygems'
5
- require 'classifier'
6
- rescue
7
- require 'classifier'
8
- end
9
-
10
- require 'madeleine'
11
-
12
- m = SnapshotMadeleine.new(File.expand_path('~/.bayes_data')) do
13
- ClassifierReborn::Bayes.new 'Interesting', 'Uninteresting'
14
- end
15
-
16
- case ARGV[0]
17
- when 'add'
18
- case ARGV[1].downcase
19
- when 'interesting'
20
- m.system.train_interesting File.open(ARGV[2]).read
21
- puts "#{ARGV[2]} has been classified as interesting"
22
- when 'uninteresting'
23
- m.system.train_uninteresting File.open(ARGV[2]).read
24
- puts "#{ARGV[2]} has been classified as uninteresting"
25
- else
26
- puts 'Invalid category: choose between interesting and uninteresting'
27
- exit(1)
28
- end
29
- when 'classify'
30
- puts m.system.classify(File.open(ARGV[1]).read)
31
- else
32
- puts 'Invalid option: choose add [category] [file] or clasify [file]'
33
- exit(-1)
34
- end
35
-
36
- m.take_snapshot
@@ -1,16 +0,0 @@
1
- #!/usr/bin/env ruby
2
-
3
- begin
4
- require 'rubygems'
5
- require 'classifier'
6
- rescue
7
- require 'classifier'
8
- end
9
-
10
- require 'open-uri'
11
-
12
- num = ARGV[1].to_i
13
- num = num < 1 ? 10 : num
14
-
15
- text = open(ARGV.first).read
16
- puts text.gsub(/<[^>]+>/, '').gsub(/[\s]+/, ' ').summary(num)