classifier-reborn 2.1.0 → 2.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,7 +2,6 @@
2
2
  # Copyright:: Copyright (c) 2005 Lucas Carlson
3
3
  # License:: LGPL
4
4
 
5
- require 'fast_stemmer'
6
5
  require 'classifier-reborn/extensions/hasher'
7
6
 
8
7
  module ClassifierReborn
@@ -21,7 +21,7 @@ module ClassifierReborn
21
21
 
22
22
  # Return a word hash without extra punctuation or short symbols, just stemmed words
23
23
  def clean_word_hash(str, language = 'en', enable_stemmer = true)
24
- word_hash_for_words str.gsub(/[^\p{WORD}\s]/, '').downcase.split, language, enable_stemmer
24
+ word_hash_for_words(str.gsub(/[^\p{WORD}\s]/, '').downcase.split, language, enable_stemmer)
25
25
  end
26
26
 
27
27
  def word_hash_for_words(words, language = 'en', enable_stemmer = true)
@@ -86,7 +86,7 @@ module ClassifierReborn
86
86
  add_item(item)
87
87
  end
88
88
 
89
- # Returns the categories for a given indexed items. You are free to add and remove
89
+ # Returns categories for a given indexed item. You are free to add and remove
90
90
  # items from this as you see fit. It does not invalidate an index to change its categories.
91
91
  def categories_for(item)
92
92
  return [] unless @items[item]
@@ -300,6 +300,10 @@ module ClassifierReborn
300
300
  top_n.collect { |x| @word_list.word_for_index(content_vector_array.index(x)) }
301
301
  end
302
302
 
303
+ def reset
304
+ initialize(auto_rebuild: @auto_rebuild, cache_node_vectors: @cache_node_vectors)
305
+ end
306
+
303
307
  private
304
308
 
305
309
  def build_reduced_matrix(matrix, cutoff = 0.75)
@@ -13,14 +13,12 @@ module ClassifierReborn
13
13
 
14
14
  # Adds a word (if it is new) and assigns it a unique dimension.
15
15
  def add_word(word)
16
- term = word
17
- @location_table[term] = @location_table.size unless @location_table[term]
16
+ @location_table[word] = @location_table.size unless @location_table[word]
18
17
  end
19
18
 
20
19
  # Returns the dimension of the word or nil if the word is not in the space.
21
20
  def [](lookup)
22
- term = lookup
23
- @location_table[term]
21
+ @location_table[lookup]
24
22
  end
25
23
 
26
24
  def word_for_index(ind)
@@ -0,0 +1,169 @@
1
module ClassifierReborn
  # Helpers for measuring classifier quality: k-fold cross validation, a single
  # train/test validation run, and plain-text reports (run summary, confusion
  # matrix and per-class confusion table) written with +puts+.
  #
  # A "classifier" here is any object responding to +reset+, +train(category, text)+,
  # +classify(text)+ and +categories+. A "record" is an array whose first element
  # is the category label and whose last element is the text to train on/classify.
  # A "confusion matrix" is a nested hash: conf_mat[actual][predicted] => count.
  module ClassifierValidator
    module_function

    # Runs +fold+-fold cross validation and prints the combined report.
    #
    # classifier::  a classifier instance, or the name (String) of a constant under
    #               ClassifierReborn to instantiate with +options+.
    # sample_data:: array of records. It is shuffled into a copy; the caller's
    #               array is left untouched.
    # fold::        number of partitions. Each partition holds
    #               <tt>sample_data.length / fold</tt> records; records left over
    #               by the integer division are not used.
    # options::     constructor arguments used when +classifier+ is a String.
    #
    # Raises ArgumentError when +fold+ is not a positive integer or when there
    # are fewer records than folds.
    def cross_validate(classifier, sample_data, fold = 10, *options)
      unless fold.is_a?(Integer) && fold > 0
        raise ArgumentError, "fold must be a positive integer, got #{fold.inspect}"
      end

      partition_size = sample_data.length / fold
      if partition_size.zero?
        raise ArgumentError, "cannot split #{sample_data.length} records into #{fold} folds"
      end

      classifier = build_classifier(classifier, options)
      # Materialize the partitions once instead of re-enumerating per fold.
      partitions = sample_data.shuffle.each_slice(partition_size).first(fold)
      conf_mats = Array.new(fold) do |i|
        # Non-destructive flatten: flatten! returns nil when nothing changes
        # (e.g. fold == 1 leaves no training partitions).
        training_data = (partitions[0...i] + partitions[(i + 1)..-1]).flatten(1)
        validate(classifier, training_data, partitions[i])
      end
      classifier.reset
      generate_report(conf_mats)
    end

    # Resets the classifier, trains it on +training_data+ and returns the
    # confusion matrix obtained by classifying +test_data+.
    #
    # +classifier+ may be an instance or the name (String) of a constant under
    # ClassifierReborn, which is then instantiated with +options+.
    def validate(classifier, training_data, test_data, *options)
      classifier = build_classifier(classifier, options)
      classifier.reset
      training_data.each do |rec|
        classifier.train(rec.first, rec.last)
      end
      evaluate(classifier, test_data)
    end

    # Classifies every record of +test_data+ and returns the confusion matrix.
    #
    # The actual label of a record is its first element with underscores turned
    # into spaces and then capitalized. Records for which the classifier returns
    # nil are not counted. The matrix covers the classifier's categories plus any
    # label seen in counted records, so a label the classifier does not report in
    # +categories+ no longer raises NoMethodError.
    def evaluate(classifier, test_data)
      known = classifier.categories.to_a
      outcomes = test_data.map do |rec|
        [rec.first.tr('_', ' ').capitalize, classifier.classify(rec.last)]
      end
      counted = outcomes.reject { |_actual, predicted| predicted.nil? }
      conf_mat = empty_conf_mat((known + counted.flatten(1)).uniq.sort)
      counted.each do |actual, predicted|
        conf_mat[actual][predicted] += 1
      end
      conf_mat
    end

    # Prints a report for one or more confusion matrices (given as separate
    # arguments or as arrays): one summary row per run when there are several,
    # an "All" row for the accumulated matrix, the confusion matrix itself and
    # the per-class derivations.
    #
    # Raises ArgumentError when no confusion matrix is given.
    def generate_report(*conf_mats)
      conf_mats.flatten!
      raise ArgumentError, "at least one confusion matrix is required" if conf_mats.empty?

      accumulated_conf_mat =
        if conf_mats.length == 1
          conf_mats.first
        else
          # Use the union of all labels so runs that saw different label sets
          # can still be accumulated.
          labels = conf_mats.flat_map { |mat| mat.keys + mat.values.flat_map(&:keys) }.uniq.sort
          empty_conf_mat(labels)
        end
      header = "Run Total Correct Incorrect Accuracy"
      puts
      puts " Run Report ".center(header.length, "-")
      puts header
      puts "-" * header.length
      if conf_mats.length > 1
        conf_mats.each_with_index do |conf_mat, i|
          run_report = build_run_report(conf_mat)
          print_run_report(run_report, i + 1)
          conf_mat.each do |actual, cols|
            cols.each do |predicted, v|
              accumulated_conf_mat[actual][predicted] += v
            end
          end
        end
        puts "-" * header.length
      end
      run_report = build_run_report(accumulated_conf_mat)
      print_run_report(run_report, "All")
      puts
      print_conf_mat(accumulated_conf_mat)
      puts
      conf_tab = conf_mat_to_tab(accumulated_conf_mat)
      print_conf_tab(conf_tab)
    end

    # Summarizes a confusion matrix as
    # <tt>{total:, correct:, incorrect:, accuracy:}</tt>. Cells on the diagonal
    # (actual == predicted) count as correct, all others as incorrect.
    def build_run_report(conf_mat)
      correct = incorrect = 0
      conf_mat.each do |actual, cols|
        cols.each do |predicted, v|
          if actual == predicted
            correct += v
          else
            incorrect += v
          end
        end
      end
      total = correct + incorrect
      { total: total, correct: correct, incorrect: incorrect, accuracy: divide(correct, total) }
    end

    # Converts a confusion matrix into one binary (one-vs-rest) table per class:
    #
    #   conf_tab[positive] = { p: { t: TP, f: FP }, n: { t: TN, f: FN } }
    #
    # +:p+/+:n+ is whether the prediction was the positive class; +:t+/+:f+ is
    # whether that positive/negative call was right with respect to the positive
    # class. A wrong prediction between two non-positive classes is therefore a
    # true negative for this class, not a false negative.
    def conf_mat_to_tab(conf_mat)
      conf_tab = Hash.new { |h, k| h[k] = { p: { t: 0, f: 0 }, n: { t: 0, f: 0 } } }
      conf_mat.each_key do |positive|
        conf_mat.each do |actual, cols|
          cols.each do |predicted, v|
            predicted_positive = predicted == positive
            actual_positive = actual == positive
            call = predicted_positive ? :p : :n
            truth = predicted_positive == actual_positive ? :t : :f
            conf_tab[positive][call][truth] += v
          end
        end
      end
      conf_tab
    end

    # Prints one summary row for +stats+ (as returned by build_run_report),
    # labelled with +prefix+ (any object; converted with +to_s+). When
    # +print_header+ is true a header row is printed first.
    def print_run_report(stats, prefix = "", print_header = false)
      label = prefix.to_s # prefix may be an Integer run number
      puts "#{"Run".rjust([3, label.length].max)} Total Correct Incorrect Accuracy" if print_header
      puts "#{label.rjust(3)} #{stats[:total].to_s.rjust(9)} #{stats[:correct].to_s.rjust(9)} #{stats[:incorrect].to_s.rjust(9)} #{stats[:accuracy].round(5).to_s.ljust(7, '0').rjust(9)}"
    end

    # Prints the confusion matrix as a table: one row per actual class with its
    # row total and recall, followed by the column totals and a row holding the
    # per-class precision and the overall accuracy.
    def print_conf_mat(conf_mat)
      header = ["Predicted ->"] + conf_mat.keys.map(&:to_s) + ["Total", "Recall"]
      cell_size = header.map(&:length).max
      header = header.map { |h| h.rjust(cell_size) }.join(" ")
      puts " Confusion Matrix ".center(header.length, "-")
      puts header
      puts "-" * header.length
      predicted_totals = conf_mat.keys.map { |predicted| [predicted, 0] }.to_h
      correct = 0
      conf_mat.each do |k, rec|
        actual_total = rec.values.reduce(0, :+)
        row = [k.to_s.ljust(cell_size)] +
              rec.values.map { |v| v.to_s.rjust(cell_size) } +
              [actual_total.to_s.rjust(cell_size), divide(rec[k], actual_total).round(5).to_s.rjust(cell_size)]
        puts row.join(" ")
        rec.each do |cat, val|
          predicted_totals[cat] += val
          correct += val if cat == k
        end
      end
      total = predicted_totals.values.reduce(0, :+)
      puts "-" * header.length
      totals_row = ["Total".ljust(cell_size)] +
                   predicted_totals.values.map { |v| v.to_s.rjust(cell_size) } +
                   [total.to_s.rjust(cell_size), "".rjust(cell_size)]
      puts totals_row.join(" ")
      precision_row = ["Precision".ljust(cell_size)] +
                      predicted_totals.keys.map { |k| divide(conf_mat[k][k], predicted_totals[k]).round(5).to_s.rjust(cell_size) } +
                      ["Accuracy ->".rjust(cell_size), divide(correct, total).round(5).to_s.rjust(cell_size)]
      puts precision_row.join(" ")
    end

    # Prints the derived statistics of every per-class table in +conf_tab+
    # (as returned by conf_mat_to_tab).
    def print_conf_tab(conf_tab)
      conf_tab.each do |positive, tab|
        puts "# Positive class: #{positive}"
        derivations = conf_tab_derivations(tab)
        print_derivations(derivations)
        puts
      end
    end

    # Derives the usual binary-classification statistics from one per-class
    # table <tt>{p: {t:, f:}, n: {t:, f:}}</tt>. Ratios with a zero denominator
    # are reported as 0.0 (see divide).
    def conf_tab_derivations(tab)
      positives = tab[:p][:t] + tab[:n][:f]
      negatives = tab[:n][:t] + tab[:p][:f]
      total = positives + negatives
      {
        total_population: total,
        condition_positive: positives,
        condition_negative: negatives,
        true_positive: tab[:p][:t],
        true_negative: tab[:n][:t],
        false_positive: tab[:p][:f],
        false_negative: tab[:n][:f],
        prevalence: divide(positives, total),
        specificity: divide(tab[:n][:t], negatives),
        recall: divide(tab[:p][:t], positives),
        precision: divide(tab[:p][:t], tab[:p][:t] + tab[:p][:f]),
        accuracy: divide(tab[:p][:t] + tab[:n][:t], total),
        f1_score: divide(2 * tab[:p][:t], 2 * tab[:p][:t] + tab[:p][:f] + tab[:n][:f])
      }
    end

    # Prints each derivation as "Name : value", with names humanized
    # (underscores to spaces, capitalized) and padded to a common width.
    def print_derivations(derivations)
      max_len = derivations.keys.map(&:length).max
      derivations.each do |k, v|
        puts k.to_s.tr('_', ' ').capitalize.ljust(max_len) + " : " + v.to_s
      end
    end

    # Builds a square confusion matrix for +categories+ with every count at 0.
    def empty_conf_mat(categories)
      categories.map { |actual| [actual, categories.map { |predicted| [predicted, 0] }.to_h] }.to_h
    end

    # Float division that yields 0.0 instead of failing or returning NaN/Infinity
    # when +divisor+ is zero.
    def divide(dividend, divisor)
      divisor.zero? ? 0.0 : dividend / divisor.to_f
    end

    # Returns +classifier+ unchanged unless it is a String, in which case the
    # constant of that name under ClassifierReborn is instantiated with
    # +options+ splatted into its constructor.
    def build_classifier(classifier, options)
      return classifier unless classifier.is_a?(String)

      ClassifierReborn.const_get(classifier).new(*options)
    end
    private_class_method :build_classifier
  end
end
@@ -1,3 +1,3 @@
1
1
  module ClassifierReborn
2
- VERSION = '2.1.0'
2
+ VERSION = '2.2.0'
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: classifier-reborn
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.1.0
4
+ version: 2.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Lucas Carlson
@@ -10,7 +10,7 @@ authors:
10
10
  autorequire:
11
11
  bindir: bin
12
12
  cert_chain: []
13
- date: 2017-01-01 00:00:00.000000000 Z
13
+ date: 2017-12-15 00:00:00.000000000 Z
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
16
16
  name: fast-stemmer
@@ -110,14 +110,26 @@ dependencies:
110
110
  - - ">="
111
111
  - !ruby/object:Gem::Version
112
112
  version: '0'
113
+ - !ruby/object:Gem::Dependency
114
+ name: redis
115
+ requirement: !ruby/object:Gem::Requirement
116
+ requirements:
117
+ - - ">="
118
+ - !ruby/object:Gem::Version
119
+ version: '0'
120
+ type: :development
121
+ prerelease: false
122
+ version_requirements: !ruby/object:Gem::Requirement
123
+ requirements:
124
+ - - ">="
125
+ - !ruby/object:Gem::Version
126
+ version: '0'
113
127
  description:
114
128
  email:
115
129
  - lucas@rufy.com
116
130
  - parkrmoore@gmail.com
117
131
  - chase.gilliam@gmail.com
118
- executables:
119
- - bayes.rb
120
- - summarize.rb
132
+ executables: []
121
133
  extensions: []
122
134
  extra_rdoc_files:
123
135
  - README.markdown
@@ -125,8 +137,8 @@ extra_rdoc_files:
125
137
  files:
126
138
  - LICENSE
127
139
  - README.markdown
128
- - bin/bayes.rb
129
- - bin/summarize.rb
140
+ - data/stopwords/ar
141
+ - data/stopwords/bn
130
142
  - data/stopwords/ca
131
143
  - data/stopwords/cs
132
144
  - data/stopwords/da
@@ -135,15 +147,23 @@ files:
135
147
  - data/stopwords/es
136
148
  - data/stopwords/fi
137
149
  - data/stopwords/fr
150
+ - data/stopwords/hi
138
151
  - data/stopwords/hu
139
152
  - data/stopwords/it
153
+ - data/stopwords/ja
140
154
  - data/stopwords/nl
141
155
  - data/stopwords/no
142
156
  - data/stopwords/pl
143
157
  - data/stopwords/pt
158
+ - data/stopwords/ru
144
159
  - data/stopwords/se
145
160
  - data/stopwords/tr
161
+ - data/stopwords/vi
162
+ - data/stopwords/zh
146
163
  - lib/classifier-reborn.rb
164
+ - lib/classifier-reborn/backends/bayes_memory_backend.rb
165
+ - lib/classifier-reborn/backends/bayes_redis_backend.rb
166
+ - lib/classifier-reborn/backends/no_redis_error.rb
147
167
  - lib/classifier-reborn/bayes.rb
148
168
  - lib/classifier-reborn/category_namer.rb
149
169
  - lib/classifier-reborn/extensions/hasher.rb
@@ -154,6 +174,7 @@ files:
154
174
  - lib/classifier-reborn/lsi/content_node.rb
155
175
  - lib/classifier-reborn/lsi/summarizer.rb
156
176
  - lib/classifier-reborn/lsi/word_list.rb
177
+ - lib/classifier-reborn/validators/classifier_validator.rb
157
178
  - lib/classifier-reborn/version.rb
158
179
  homepage: https://github.com/jekyll/classifier-reborn
159
180
  licenses:
@@ -176,8 +197,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
176
197
  version: '0'
177
198
  requirements: []
178
199
  rubyforge_project:
179
- rubygems_version: 2.5.2
200
+ rubygems_version: 2.6.14
180
201
  signing_key:
181
202
  specification_version: 2
182
203
  summary: A general classifier module to allow Bayesian and other types of classifications.
183
204
  test_files: []
205
+ has_rdoc: true
@@ -1,36 +0,0 @@
1
- #!/usr/bin/env ruby
2
-
3
- begin
4
- require 'rubygems'
5
- require 'classifier'
6
- rescue
7
- require 'classifier'
8
- end
9
-
10
- require 'madeleine'
11
-
12
- m = SnapshotMadeleine.new(File.expand_path('~/.bayes_data')) do
13
- ClassifierReborn::Bayes.new 'Interesting', 'Uninteresting'
14
- end
15
-
16
- case ARGV[0]
17
- when 'add'
18
- case ARGV[1].downcase
19
- when 'interesting'
20
- m.system.train_interesting File.open(ARGV[2]).read
21
- puts "#{ARGV[2]} has been classified as interesting"
22
- when 'uninteresting'
23
- m.system.train_uninteresting File.open(ARGV[2]).read
24
- puts "#{ARGV[2]} has been classified as uninteresting"
25
- else
26
- puts 'Invalid category: choose between interesting and uninteresting'
27
- exit(1)
28
- end
29
- when 'classify'
30
- puts m.system.classify(File.open(ARGV[1]).read)
31
- else
32
- puts 'Invalid option: choose add [category] [file] or clasify [file]'
33
- exit(-1)
34
- end
35
-
36
- m.take_snapshot
@@ -1,16 +0,0 @@
1
- #!/usr/bin/env ruby
2
-
3
- begin
4
- require 'rubygems'
5
- require 'classifier'
6
- rescue
7
- require 'classifier'
8
- end
9
-
10
- require 'open-uri'
11
-
12
- num = ARGV[1].to_i
13
- num = num < 1 ? 10 : num
14
-
15
- text = open(ARGV.first).read
16
- puts text.gsub(/<[^>]+>/, '').gsub(/[\s]+/, ' ').summary(num)