classifier-reborn 2.2.0 → 2.3.0
- checksums.yaml +5 -5
- data/README.markdown +6 -8
- data/lib/classifier-reborn/backends/bayes_memory_backend.rb +3 -1
- data/lib/classifier-reborn/backends/bayes_redis_backend.rb +2 -0
- data/lib/classifier-reborn/backends/no_redis_error.rb +5 -3
- data/lib/classifier-reborn/bayes.rb +27 -15
- data/lib/classifier-reborn/category_namer.rb +2 -0
- data/lib/classifier-reborn/extensions/hasher.rb +20 -45
- data/lib/classifier-reborn/extensions/token_filter/stemmer.rb +24 -0
- data/lib/classifier-reborn/extensions/token_filter/stopword.rb +48 -0
- data/lib/classifier-reborn/extensions/token_filter/symbol.rb +20 -0
- data/lib/classifier-reborn/extensions/tokenizer/token.rb +36 -0
- data/lib/classifier-reborn/extensions/tokenizer/whitespace.rb +28 -0
- data/lib/classifier-reborn/extensions/vector.rb +13 -7
- data/lib/classifier-reborn/extensions/vector_serialize.rb +3 -1
- data/lib/classifier-reborn/extensions/zero_vector.rb +7 -0
- data/lib/classifier-reborn/lsi/cached_content_node.rb +2 -0
- data/lib/classifier-reborn/lsi/content_node.rb +23 -10
- data/lib/classifier-reborn/lsi/summarizer.rb +3 -1
- data/lib/classifier-reborn/lsi/word_list.rb +2 -0
- data/lib/classifier-reborn/lsi.rb +79 -23
- data/lib/classifier-reborn/validators/classifier_validator.rb +39 -38
- data/lib/classifier-reborn/version.rb +3 -1
- data/lib/classifier-reborn.rb +3 -1
- metadata +36 -18
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
-
-  metadata.gz:
-  data.tar.gz:
+SHA256:
+  metadata.gz: '0100803f158326f660f53694ff5d0d400440792bb5174a10d80ae7eb780c5b6b'
+  data.tar.gz: 1f5a249471e67beb8796a0a61f47ea18fa2f0a252e832f03cb7e7b1937921fa5
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: e63b40492f9d35092353c198822f2ce444d05dec7613572048c3f420eecda4040c84026fe621ccb6c316e9862bc25258d47e32663168eb8f67c2b29b41733c57
+  data.tar.gz: abad42c42694cea59acf4bb59184a8f2aaa1d909826b126b4917b67b350c3ca9a14a3b688bd648e7ff8bba241a72e7846c749b91092e7ea91b5bc373c793b24f
data/README.markdown
CHANGED
@@ -2,11 +2,9 @@
 
 [![Gem Version](https://badge.fury.io/rb/classifier-reborn.svg)](https://rubygems.org/gems/classifier-reborn)
 [![Build Status](https://img.shields.io/travis/jekyll/classifier-reborn/master.svg)](https://travis-ci.org/jekyll/classifier-reborn)
-[![Dependency Status](https://img.shields.io/gemnasium/jekyll/classifier-reborn.svg)](https://gemnasium.com/jekyll/classifier-reborn)
-
 ---
 
-## [Read the Docs](
+## [Read the Docs](https://jekyll.github.io/classifier-reborn/)
 
 ## Getting Started
 
@@ -45,11 +43,11 @@ irb(main):013:0> lsi.find_related("This text is around cats!", 2)
 There is much more that can be done using Bayes and LSI beyond these quick examples.
 For more information read the following documentation topics.
 
-* [Installation and Dependencies](
-* [Bayesian Classifier](
-* [Latent Semantic Indexer (LSI)](
-* [Classifier Validation](
-* [Development and Contributions](
+* [Installation and Dependencies](https://jekyll.github.io/classifier-reborn/)
+* [Bayesian Classifier](https://jekyll.github.io/classifier-reborn/bayes)
+* [Latent Semantic Indexer (LSI)](https://jekyll.github.io/classifier-reborn/lsi)
+* [Classifier Validation](https://jekyll.github.io/classifier-reborn/validation)
+* [Development and Contributions](https://jekyll.github.io/classifier-reborn/development) (*Optional Docker instructions included*)
 
 
 ### Notes on JRuby support
data/lib/classifier-reborn/backends/bayes_memory_backend.rb
CHANGED
@@ -1,3 +1,5 @@
+# frozen_string_literal: true
+
 module ClassifierReborn
   class BayesMemoryBackend
     attr_reader :total_words, :total_trainings
@@ -69,7 +71,7 @@ module ClassifierReborn
     private
 
     def category_counts(category)
-      @category_counts[category] ||= {training: 0, word: 0}
+      @category_counts[category] ||= { training: 0, word: 0 }
     end
   end
 end
data/lib/classifier-reborn/backends/no_redis_error.rb
CHANGED
@@ -1,12 +1,14 @@
-
+# frozen_string_literal: true
+
+class NoRedisError < RuntimeError
   def initialize
     msg =
-      %q
+      %q(The Redis Backend can only be used if Redis is installed.
 This error is raised from 'lib/classifier-reborn/backends/bayes_redis_backend.rb'.
 If you have encountered this error and would like to use the Redis Backend,
 please run 'gem install redis' or include 'gem "redis"' in
 your gemfile. For more info see https://github.com/jekyll/classifier-reborn#usage.
-
+)
     super(msg)
   end
 end
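This error only fires when the Redis backend is requested without the redis gem present. A minimal sketch of opting into that backend, based on the backend comment in bayes.rb and the message text above; the training line is illustrative and Redis connection options are assumed to follow redis-rb defaults:

  # Gemfile -- redis is optional and not a runtime dependency of classifier-reborn
  gem 'redis'

  require 'classifier-reborn'

  backend    = ClassifierReborn::BayesRedisBackend.new   # raises NoRedisError if 'redis' cannot be loaded
  classifier = ClassifierReborn::Bayes.new 'Interesting', 'Uninteresting', backend: backend
  classifier.train 'Interesting', 'Here are some good words. I hope you love them.'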
data/lib/classifier-reborn/bayes.rb
CHANGED
@@ -1,9 +1,14 @@
+# frozen_string_literal: true
+
 # Author:: Lucas Carlson (mailto:lucas@rufy.com)
 # Copyright:: Copyright (c) 2005 Lucas Carlson
 # License:: LGPL
 
 require 'set'
 
+require_relative 'extensions/tokenizer/whitespace'
+require_relative 'extensions/token_filter/stopword'
+require_relative 'extensions/token_filter/stemmer'
 require_relative 'category_namer'
 require_relative 'backends/bayes_memory_backend'
 require_relative 'backends/bayes_redis_backend'
@@ -26,12 +31,11 @@ module ClassifierReborn
     #      backend: BayesMemoryBackend.new  Alternatively, BayesRedisBackend.new for persistent storage
     def initialize(*args)
       @initial_categories = []
-      options = { language:
+      options = { language: 'en',
                   enable_threshold: false,
-                  threshold:
-                  enable_stemmer:
-                  backend:
-                }
+                  threshold: 0.0,
+                  enable_stemmer: true,
+                  backend: BayesMemoryBackend.new }
       args.flatten.each do |arg|
         if arg.is_a?(Hash)
           options.merge!(arg)
@@ -50,12 +54,14 @@ module ClassifierReborn
       @threshold = options[:threshold]
       @enable_stemmer = options[:enable_stemmer]
       @backend = options[:backend]
+      @tokenizer = options[:tokenizer] || Tokenizer::Whitespace
+      @token_filters = options[:token_filters] || [TokenFilter::Stopword]
+      @token_filters << TokenFilter::Stemmer if @enable_stemmer && !@token_filters.include?(TokenFilter::Stemmer)
+      TokenFilter::Stopword.language = @language if @token_filters.include?(TokenFilter::Stopword)
 
       populate_initial_categories
 
-      if options.key?(:stopwords)
-        custom_stopwords options[:stopwords]
-      end
+      custom_stopwords options[:stopwords] if options.key?(:stopwords)
     end
 
     # Provides a general training method for all categories specified in Bayes#new
@@ -65,8 +71,10 @@ module ClassifierReborn
     #     b.train "that", "That text"
     #     b.train "The other", "The other text"
     def train(category, text)
-      word_hash = Hasher.word_hash(text, @
+      word_hash = Hasher.word_hash(text, @enable_stemmer,
+                                   tokenizer: @tokenizer, token_filters: @token_filters)
       return if word_hash.empty?
+
       category = CategoryNamer.prepare_name(category)
 
       # Add the category dynamically or raise an error
@@ -95,11 +103,14 @@ module ClassifierReborn
     #     b.train :this, "This text"
     #     b.untrain :this, "This text"
     def untrain(category, text)
-      word_hash = Hasher.word_hash(text, @
+      word_hash = Hasher.word_hash(text, @enable_stemmer,
+                                   tokenizer: @tokenizer, token_filters: @token_filters)
       return if word_hash.empty?
+
       category = CategoryNamer.prepare_name(category)
       word_hash.each do |word, count|
         next if @backend.total_words < 0
+
         orig = @backend.category_word_frequency(category, word) || 0
         @backend.update_category_word_frequency(category, word, -count)
         if @backend.category_word_frequency(category, word) <= 0
@@ -120,7 +131,8 @@ module ClassifierReborn
     # The largest of these scores (the one closest to 0) is the one picked out by #classify
     def classifications(text)
       score = {}
-      word_hash = Hasher.word_hash(text, @
+      word_hash = Hasher.word_hash(text, @enable_stemmer,
+                                   tokenizer: @tokenizer, token_filters: @token_filters)
       if word_hash.empty?
         category_keys.each do |category|
           score[category.to_s] = Float::INFINITY
@@ -152,7 +164,7 @@ module ClassifierReborn
     # Return the classification without the score
     def classify(text)
       result, score = classify_with_score(text)
-      result = nil if score < @threshold || score == Float::INFINITY
+      result = nil if threshold_enabled? && (score < @threshold || score == Float::INFINITY)
       result
     end
 
@@ -240,7 +252,7 @@ module ClassifierReborn
       @backend.add_category(category)
     end
 
-
+    alias append_category add_category
 
     def reset
       @backend.reset
@@ -261,12 +273,12 @@ module ClassifierReborn
       if stopwords.strip.empty?
         stopwords = []
       elsif File.exist?(stopwords)
-        stopwords = File.read(stopwords).force_encoding(
+        stopwords = File.read(stopwords).force_encoding('utf-8').split
       else
         return # Do not overwrite the default
       end
     end
-
+      TokenFilter::Stopword::STOPWORDS[@language] = Set.new stopwords
     end
   end
 end
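The constructor now accepts tokenizer: and token_filters: next to the existing options, defaulting to Tokenizer::Whitespace and [TokenFilter::Stopword], with TokenFilter::Stemmer appended automatically while enable_stemmer is true. A short sketch of the new options (category names and training text are illustrative):

  require 'classifier-reborn'

  # Default pipeline: whitespace tokenizer, stopword filter, stemmer appended automatically.
  classifier = ClassifierReborn::Bayes.new 'Ham', 'Spam'

  # Explicit pipeline; anything responding to #call can stand in for the tokenizer or a filter.
  custom = ClassifierReborn::Bayes.new 'Ham', 'Spam',
                                       tokenizer: ClassifierReborn::Tokenizer::Whitespace,
                                       token_filters: [ClassifierReborn::TokenFilter::Stopword,
                                                       ClassifierReborn::TokenFilter::Symbol]

  custom.train 'Ham', 'A perfectly ordinary message about lunch plans'
  custom.classify 'Another ordinary message'   # => "Ham", given enough training data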
data/lib/classifier-reborn/extensions/hasher.rb
CHANGED
@@ -1,67 +1,42 @@
-#
+# frozen_string_literal: true
+
 # Author:: Lucas Carlson (mailto:lucas@rufy.com)
 # Copyright:: Copyright (c) 2005 Lucas Carlson
 # License:: LGPL
 
 require 'set'
 
+require_relative 'tokenizer/whitespace'
+require_relative 'token_filter/stopword'
+require_relative 'token_filter/stemmer'
+
 module ClassifierReborn
   module Hasher
-    STOPWORDS_PATH = [File.expand_path(File.dirname(__FILE__) + '/../../../data/stopwords')]
-
     module_function
 
     # Return a Hash of strings => ints. Each word in the string is stemmed,
     # interned, and indexes to its frequency in the document.
-    def word_hash(str,
-
-
-
-
-
-
-
-      word_hash_for_words(str.gsub(/[^\p{WORD}\s]/, '').downcase.split, language, enable_stemmer)
-    end
-
-    def word_hash_for_words(words, language = 'en', enable_stemmer = true)
-      d = Hash.new(0)
-      words.each do |word|
-        next unless word.length > 2 && !STOPWORDS[language].include?(word)
-        if enable_stemmer
-          d[word.stem.intern] += 1
-        else
-          d[word.intern] += 1
+    def word_hash(str, enable_stemmer = true,
+                  tokenizer: Tokenizer::Whitespace,
+                  token_filters: [TokenFilter::Stopword])
+      if token_filters.include?(TokenFilter::Stemmer)
+        unless enable_stemmer
+          token_filters.reject! do |token_filter|
+            token_filter == TokenFilter::Stemmer
+          end
         end
+      else
+        token_filters << TokenFilter::Stemmer if enable_stemmer
+      end
+      words = tokenizer.call(str)
+      token_filters.each do |token_filter|
+        words = token_filter.call(words)
       end
-      d
-    end
-
-    # Add custom path to a new stopword file created by user
-    def add_custom_stopword_path(path)
-      STOPWORDS_PATH.unshift(path)
-    end
-
-    def word_hash_for_symbols(words)
       d = Hash.new(0)
       words.each do |word|
         d[word.intern] += 1
       end
       d
     end
-
-    # Create a lazily-loaded hash of stopword data
-    STOPWORDS = Hash.new do |hash, language|
-      hash[language] = []
-
-      STOPWORDS_PATH.each do |path|
-        if File.exist?(File.join(path, language))
-          hash[language] = Set.new File.read(File.join(path, language.to_s)).force_encoding("utf-8").split
-          break
-        end
-      end
-
-      hash[language]
-    end
   end
 end
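Hasher.word_hash drops its language positional argument and the built-in stopword table; it now takes enable_stemmer plus tokenizer: and token_filters: keywords and simply runs the tokens through each filter in order. A sketch of calling it directly under the new signature (sample text is illustrative, and the exact keys depend on the stemmer and the stopword list):

  require 'classifier-reborn'
  include ClassifierReborn

  Hasher.word_hash('Cats love other cats',
                   true,
                   tokenizer: Tokenizer::Whitespace,
                   token_filters: [TokenFilter::Stopword, TokenFilter::Stemmer])
  # => a Hash of stemmed symbols to counts, e.g. { cat: 2, love: 1 }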
data/lib/classifier-reborn/extensions/token_filter/stemmer.rb
ADDED
@@ -0,0 +1,24 @@
+# frozen_string_literal: true
+
+# Author:: Lucas Carlson (mailto:lucas@rufy.com)
+# Copyright:: Copyright (c) 2005 Lucas Carlson
+# License:: LGPL
+
+module ClassifierReborn
+  module TokenFilter
+    # This filter converts given tokens to their stemmed versions.
+    module Stemmer
+      module_function
+
+      def call(tokens)
+        tokens.collect do |token|
+          if token.stemmable?
+            token.stem
+          else
+            token
+          end
+        end
+      end
+    end
+  end
+end
data/lib/classifier-reborn/extensions/token_filter/stopword.rb
ADDED
@@ -0,0 +1,48 @@
+# frozen_string_literal: true
+
+# Author:: Lucas Carlson (mailto:lucas@rufy.com)
+# Copyright:: Copyright (c) 2005 Lucas Carlson
+# License:: LGPL
+
+module ClassifierReborn
+  module TokenFilter
+    # This filter removes stopwords in the language, from given tokens.
+    module Stopword
+      STOPWORDS_PATH = [File.expand_path(File.dirname(__FILE__) + '/../../../../data/stopwords')]
+      @language = 'en'
+
+      module_function
+
+      def call(tokens)
+        tokens.reject do |token|
+          token.maybe_stopword? &&
+            (token.length <= 2 || STOPWORDS[@language].include?(token))
+        end
+      end
+
+      # Add custom path to a new stopword file created by user
+      def add_custom_stopword_path(path)
+        STOPWORDS_PATH.unshift(path)
+      end
+
+      # Create a lazily-loaded hash of stopword data
+      STOPWORDS = Hash.new do |hash, language|
+        hash[language] = []
+
+        STOPWORDS_PATH.each do |path|
+          if File.exist?(File.join(path, language))
+            hash[language] = Set.new File.read(File.join(path, language.to_s)).force_encoding('utf-8').split
+            break
+          end
+        end
+
+        hash[language]
+      end
+
+      # Changes the language of stopwords
+      def language=(language)
+        @language = language
+      end
+    end
+  end
+end
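The stopword table and the custom-path hook move here from Hasher, and the active language is module state set through Stopword.language= (Bayes#initialize now does this when the filter is in use). A small sketch of driving the filter directly; the extra stopword directory is an assumed example path:

  require 'classifier-reborn'
  include ClassifierReborn

  TokenFilter::Stopword.add_custom_stopword_path('/path/to/my/stopwords')  # one file per language code, e.g. 'en'
  TokenFilter::Stopword.language = 'en'

  tokens = Tokenizer::Whitespace.call('the quick brown fox')
  TokenFilter::Stopword.call(tokens)  # => tokens minus stopwords and words of two characters or fewer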
data/lib/classifier-reborn/extensions/token_filter/symbol.rb
ADDED
@@ -0,0 +1,20 @@
+# frozen_string_literal: true
+
+# Author:: Lucas Carlson (mailto:lucas@rufy.com)
+# Copyright:: Copyright (c) 2005 Lucas Carlson
+# License:: LGPL
+
+module ClassifierReborn
+  module TokenFilter
+    # This filter removes symbol-only terms, from given tokens.
+    module Symbol
+      module_function
+
+      def call(tokens)
+        tokens.reject do |token|
+          /[^\s\p{WORD}]/ === token
+        end
+      end
+    end
+  end
+end
data/lib/classifier-reborn/extensions/tokenizer/token.rb
ADDED
@@ -0,0 +1,36 @@
+# frozen_string_literal: true
+
+# Author:: Lucas Carlson (mailto:lucas@rufy.com)
+# Copyright:: Copyright (c) 2005 Lucas Carlson
+# License:: LGPL
+
+module ClassifierReborn
+  module Tokenizer
+    class Token < String
+      # The class can be created with one token string and extra attributes. E.g.,
+      #   t = ClassifierReborn::Tokenizer::Token.new 'Tokenize', stemmable: true, maybe_stopword: false
+      #
+      # Attributes available are:
+      #   stemmable: true       Possibility that the token can be stemmed. This must be false for un-stemmable terms, otherwise this should be true.
+      #   maybe_stopword: true  Possibility that the token is a stopword. This must be false for terms which never been stopword, otherwise this should be true.
+      def initialize(string, stemmable: true, maybe_stopword: true)
+        super(string)
+        @stemmable = stemmable
+        @maybe_stopword = maybe_stopword
+      end
+
+      def stemmable?
+        @stemmable
+      end
+
+      def maybe_stopword?
+        @maybe_stopword
+      end
+
+      def stem
+        stemmed = super
+        self.class.new(stemmed, stemmable: @stemmable, maybe_stopword: @maybe_stopword)
+      end
+    end
+  end
+end
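Token subclasses String, so code that treated terms as plain strings keeps working, while the two flags tell the filters what they may do with each term. A short sketch based on the constructor above (the stemmed form comes from fast-stemmer and is only indicative):

  require 'classifier-reborn'

  word   = ClassifierReborn::Tokenizer::Token.new('tokenize', stemmable: true, maybe_stopword: true)
  symbol = ClassifierReborn::Tokenizer::Token.new('$', stemmable: false, maybe_stopword: false)

  word.stemmable?        # => true
  word.stem              # => a new Token holding the stemmed form, flags preserved
  symbol.maybe_stopword? # => false, so the Stopword filter never drops it for being short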
data/lib/classifier-reborn/extensions/tokenizer/whitespace.rb
ADDED
@@ -0,0 +1,28 @@
+# frozen_string_literal: true
+
+# Author:: Lucas Carlson (mailto:lucas@rufy.com)
+# Copyright:: Copyright (c) 2005 Lucas Carlson
+# License:: LGPL
+
+require_relative 'token'
+
+module ClassifierReborn
+  module Tokenizer
+    # This tokenizes given input as white-space separated terms.
+    # It mainly aims to tokenize sentences written with a space between words, like English, French, and others.
+    module Whitespace
+      module_function
+
+      def call(str)
+        tokens = str.gsub(/[^\p{WORD}\s]/, '').downcase.split.collect do |word|
+          Token.new(word, stemmable: true, maybe_stopword: true)
+        end
+        symbol_tokens = str.scan(/[^\s\p{WORD}]/).collect do |word|
+          Token.new(word, stemmable: false, maybe_stopword: false)
+        end
+        tokens += symbol_tokens
+        tokens
+      end
+    end
+  end
+end
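The tokenizer emits word tokens (stemmable, possibly stopwords) and symbol tokens (neither), which is what let the separate word_hash_for_symbols path in Hasher go away. A sketch of the tokenize-then-filter pipeline as Hasher#word_hash now composes it (sample text is illustrative):

  require 'classifier-reborn'
  include ClassifierReborn

  tokens = Tokenizer::Whitespace.call('Good dogs, bad cats!')
  # => word tokens "good", "dogs", "bad", "cats" plus symbol tokens "," and "!"

  filters  = [TokenFilter::Stopword, TokenFilter::Stemmer]
  filtered = filters.reduce(tokens) { |ts, filter| filter.call(ts) }
  # Stopword drops short or common word tokens (symbol tokens are exempt via maybe_stopword?),
  # then Stemmer rewrites only the tokens marked stemmable?.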
data/lib/classifier-reborn/extensions/vector.rb
CHANGED
@@ -1,3 +1,5 @@
+# frozen_string_literal: true
+
 # Author:: Ernest Ellingson
 # Copyright:: Copyright (c) 2005
 # License:: LGPL
@@ -10,14 +12,14 @@ class Matrix
     Matrix.diagonal(*s)
   end
 
-
+  alias trans transpose
 
   def SV_decomp(maxSweeps = 20)
-    if row_size >= column_size
-
-
-
-
+    q = if row_size >= column_size
+          trans * self
+        else
+          self * trans
+        end
 
     qrot = q.dup
     v = Matrix.identity(q.row_size)
@@ -31,7 +33,11 @@ class Matrix
     (1..qrot.row_size - 1).each do |col|
       next if row == col
 
-      h =
+      h = if (2.0 * qrot[row, col]) == (qrot[row, row] - qrot[col, col])
+            Math.atan(1) / 2.0
+          else
+            Math.atan((2.0 * qrot[row, col]) / (qrot[row, row] - qrot[col, col])) / 2.0
+          end
       hcos = Math.cos(h)
       hsin = Math.sin(h)
       mzrot = Matrix.identity(qrot.row_size)
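The SV_decomp change replaces the single-expression rotation angle (the else branch above) with an explicit guard. One case the guard protects against, runnable in irb: when the off-diagonal term is zero and the two diagonal entries are equal, the single-expression form divides 0.0 by 0.0 and the angle becomes NaN, which would then propagate through the remaining rotations:

  Math.atan((2.0 * 0.0) / (1.0 - 1.0)) / 2.0  # => NaN
  Math.atan(1) / 2.0                          # => ~0.3927, the pinned angle the new branch uses instead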
data/lib/classifier-reborn/lsi/content_node.rb
CHANGED
@@ -1,3 +1,5 @@
+# frozen_string_literal: true
+
 # Author:: David Fayram (mailto:dfayram@lensmen.net)
 # Copyright:: Copyright (c) 2005 David Fayram II
 # License:: LGPL
@@ -27,7 +29,11 @@ module ClassifierReborn
 
     # Method to access the transposed search vector
     def transposed_search_vector
-
+      if $SVD == :numo
+        search_vector
+      else
+        search_vector.col
+      end
     end
 
     # Use this to fetch the appropriate search vector in normalized form.
@@ -38,18 +44,22 @@ module ClassifierReborn
     # Creates the raw vector out of word_hash using word_list as the
     # key for mapping the vector space.
     def raw_vector_with(word_list)
-      if $
-
-
-
-
+      vec = if $SVD == :numo
+              Numo::DFloat.zeros(word_list.size)
+            elsif $SVD == :gsl
+              GSL::Vector.alloc(word_list.size)
+            else
+              Array.new(word_list.size, 0)
+            end
 
       @word_hash.each_key do |word|
         vec[word_list[word]] = @word_hash[word] if word_list[word]
       end
 
       # Perform the scaling transform and force floating point arithmetic
-      if $
+      if $SVD == :numo
+        total_words = vec.sum.to_f
+      elsif $SVD == :gsl
        sum = 0.0
        vec.each { |v| sum += v }
        total_words = sum
@@ -59,7 +69,7 @@ module ClassifierReborn
 
      total_unique_words = 0
 
-      if $
+      if [:numo, :gsl].include?($SVD)
        vec.each { |word| total_unique_words += 1 if word != 0.0 }
      else
        total_unique_words = vec.count { |word| word != 0 }
@@ -83,12 +93,15 @@ module ClassifierReborn
          hash[val] = Math.log(val + 1) / -weighted_total
        end
 
-        vec.
+        vec = vec.map do |val|
          cached_calcs[val]
        end
      end
 
-      if $
+      if $SVD == :numo
+        @raw_norm = vec / Numo::Linalg.norm(vec)
+        @raw_vector = vec
+      elsif $SVD == :gsl
        @raw_norm = vec.normalize
        @raw_vector = vec
      else
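raw_vector_with now branches three ways on $SVD, but the new :numo branch ends the same way the :gsl one does: the raw vector is kept, and the norm is the vector divided by its Euclidean length, here via Numo::Linalg.norm. A tiny sketch of that normalization step, assuming the optional numo-narray and numo-linalg gems are installed:

  require 'numo/narray'
  require 'numo/linalg'

  vec = Numo::DFloat[1.0, 2.0, 2.0]
  vec / Numo::Linalg.norm(vec)   # => Numo::DFloat[0.333..., 0.666..., 0.666...], the @raw_norm of this vector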
data/lib/classifier-reborn/lsi/summarizer.rb
CHANGED
@@ -1,3 +1,5 @@
+# frozen_string_literal: true
+
 # Author:: Lucas Carlson (mailto:lucas@rufy.com)
 # Copyright:: Copyright (c) 2005 Lucas Carlson
 # License:: LGPL
@@ -27,7 +29,7 @@ module ClassifierReborn
       chunks.each { |chunk| lsi << chunk unless chunk.strip.empty? || chunk.strip.split.size == 1 }
       lsi.build_index
       summaries = lsi.highest_relative_content count
-      summaries.
+      summaries.select { |chunk| summaries.include? chunk }.map(&:strip).join(separator)
     end
   end
 end
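The summarizer pushes text chunks into an LSI instance and keeps whatever highest_relative_content ranks highest. A rough equivalent using the public LSI API shown in this diff (the text and the chunking are illustrative):

  require 'classifier-reborn'

  text   = 'Dogs are loyal. Cats are independent. Birds can sing. Fish are quiet.'
  chunks = text.split('.').map(&:strip).reject(&:empty?)

  lsi = ClassifierReborn::LSI.new
  chunks.each { |chunk| lsi << chunk unless chunk.split.size == 1 }
  lsi.build_index

  lsi.highest_relative_content(2)   # => the two chunks the index ranks as most central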
data/lib/classifier-reborn/lsi.rb
CHANGED
@@ -1,23 +1,39 @@
+# frozen_string_literal: true
+
 # Author:: David Fayram (mailto:dfayram@lensmen.net)
 # Copyright:: Copyright (c) 2005 David Fayram II
 # License:: LGPL
 
+# Try to load Numo first - it's the most current and the most well-supported.
+# Fall back to GSL.
+# Fall back to native vector.
 begin
   raise LoadError if ENV['NATIVE_VECTOR'] == 'true' # to test the native vector class, try `rake test NATIVE_VECTOR=true`
+  raise LoadError if ENV['GSL'] == 'true' # to test with gsl, try `rake test GSL=true`
 
-  require '
-
-  $
-
+  require 'numo/narray' # https://ruby-numo.github.io/narray/
+  require 'numo/linalg' # https://ruby-numo.github.io/linalg/
+  $SVD = :numo
 rescue LoadError
-
-
+  begin
+    raise LoadError if ENV['NATIVE_VECTOR'] == 'true' # to test the native vector class, try `rake test NATIVE_VECTOR=true`
+
+    require 'gsl' # requires https://github.com/SciRuby/rb-gsl
+    require_relative 'extensions/vector_serialize'
+    $SVD = :gsl
+  rescue LoadError
+    $SVD = :ruby
+    require_relative 'extensions/vector'
+    require_relative 'extensions/zero_vector'
+  end
 end
 
 require_relative 'lsi/word_list'
 require_relative 'lsi/content_node'
 require_relative 'lsi/cached_content_node'
 require_relative 'lsi/summarizer'
+require_relative 'extensions/token_filter/stopword'
+require_relative 'extensions/token_filter/symbol'
 
 module ClassifierReborn
   # This class implements a Latent Semantic Indexer, which can search, classify and cluster
@@ -40,6 +56,11 @@ module ClassifierReborn
       @version = 0
       @built_at_version = -1
       @language = options[:language] || 'en'
+      @token_filters = [
+        TokenFilter::Stopword,
+        TokenFilter::Symbol
+      ]
+      TokenFilter::Stopword.language = @language
       extend CachedContentNode::InstanceMethods if @cache_node_vectors = options[:cache_node_vectors]
     end
 
@@ -64,7 +85,8 @@ module ClassifierReborn
     #   lsi.add_item ar, *ar.categories { |x| ar.content }
     #
     def add_item(item, *categories, &block)
-      clean_word_hash = Hasher.
+      clean_word_hash = Hasher.word_hash((block ? yield(item) : item.to_s),
+                                         token_filters: @token_filters)
       if clean_word_hash.empty?
         puts "Input: '#{item}' is entirely stopwords or words with 2 or fewer characters. Classifier-Reborn cannot handle this document properly."
       else
@@ -124,12 +146,21 @@ module ClassifierReborn
     # turning the LSI class into a simple vector search engine.
     def build_index(cutoff = 0.75)
       return unless needs_rebuild?
+
       make_word_list
 
       doc_list = @items.values
       tda = doc_list.collect { |node| node.raw_vector_with(@word_list) }
 
-      if $
+      if $SVD == :numo
+        tdm = Numo::NArray.asarray(tda.map(&:to_a)).transpose
+        ntdm = numo_build_reduced_matrix(tdm, cutoff)
+
+        ntdm.each_over_axis(1).with_index do |col_vec, i|
+          doc_list[i].lsi_vector = col_vec
+          doc_list[i].lsi_norm = col_vec / Numo::Linalg.norm(col_vec)
+        end
+      elsif $SVD == :gsl
         tdm = GSL::Matrix.alloc(*tda).trans
         ntdm = build_reduced_matrix(tdm, cutoff)
 
@@ -142,9 +173,13 @@ module ClassifierReborn
         tdm = Matrix.rows(tda).trans
         ntdm = build_reduced_matrix(tdm, cutoff)
 
-        ntdm.
+        ntdm.column_size.times do |col|
           doc_list[col].lsi_vector = ntdm.column(col) if doc_list[col]
-
+          if ntdm.column(col).zero?
+            doc_list[col].lsi_norm = ntdm.column(col) if doc_list[col]
+          else
+            doc_list[col].lsi_norm = ntdm.column(col).normalize if doc_list[col]
+          end
         end
       end
 
@@ -186,11 +221,13 @@ module ClassifierReborn
       content_node = node_for_content(doc, &block)
       result =
         @items.keys.collect do |item|
-          if $
-
-
-
-
+          val = if $SVD == :numo
+                  content_node.search_vector.dot(@items[item].transposed_search_vector)
+                elsif $SVD == :gsl
+                  content_node.search_vector * @items[item].transposed_search_vector
+                else
+                  (Matrix[content_node.search_vector] * @items[item].search_vector)[0]
+                end
           [item, val]
         end
       result.sort_by { |x| x[1] }.reverse
@@ -205,7 +242,8 @@ module ClassifierReborn
       return [] if needs_rebuild?
 
       content_node = node_for_content(doc, &block)
-      if $
+      if ($SVD == :gsl && content_node.raw_norm.isnan?.all?) ||
+         ($SVD == :numo && content_node.raw_norm.isnan.all?)
         puts "There are no documents that are similar to #{doc}"
       else
         content_node_norms(content_node)
@@ -215,11 +253,13 @@ module ClassifierReborn
     def content_node_norms(content_node)
       result =
         @items.keys.collect do |item|
-          if $
-
-
-
-
+          val = if $SVD == :numo
+                  content_node.search_norm.dot(@items[item].search_norm)
+                elsif $SVD == :gsl
+                  content_node.search_norm * @items[item].search_norm.col
+                else
+                  (Matrix[content_node.search_norm] * @items[item].search_norm)[0]
+                end
           [item, val]
         end
       result.sort_by { |x| x[1] }.reverse
@@ -234,6 +274,7 @@ module ClassifierReborn
     # it is actually the same algorithm, just applied on a smaller document.
     def search(string, max_nearest = 3)
       return [] if needs_rebuild?
+
       carry = proximity_norms_for_content(string)
       unless carry.nil?
         result = carry.collect { |x| x[0] }
@@ -295,6 +336,7 @@ module ClassifierReborn
     # it's supposed to.
     def highest_ranked_stems(doc, count = 3)
       raise 'Requested stem ranking on non-indexed content!' unless @items[doc]
+
       content_vector_array = node_for_content(doc).lsi_vector.to_a
       top_n = content_vector_array.sort.reverse[0..count - 1]
       top_n.collect { |x| @word_list.word_for_index(content_vector_array.index(x)) }
@@ -315,14 +357,28 @@ module ClassifierReborn
         s[ord] = 0.0 if s[ord] < s_cutoff
       end
       # Reconstruct the term document matrix, only with reduced rank
-      u * ($
+      u * ($SVD == :gsl ? GSL::Matrix : ::Matrix).diag(s) * v.trans
+    end
+
+    def numo_build_reduced_matrix(matrix, cutoff = 0.75)
+      s, u, vt = Numo::Linalg.svd(matrix, driver: 'svd', job: 'S')
+
+      # TODO: Better than 75% term (as above)
+      s_cutoff = s.sort.reverse[(s.size * cutoff).round - 1]
+      s.size.times do |ord|
+        s[ord] = 0.0 if s[ord] < s_cutoff
+      end
+
+      # Reconstruct the term document matrix, only with reduced rank
+      u.dot(::Numo::DFloat.eye(s.size) * s).dot(vt)
     end
 
     def node_for_content(item, &block)
       if @items[item]
         return @items[item]
       else
-        clean_word_hash = Hasher.
+        clean_word_hash = Hasher.word_hash((block ? yield(item) : item.to_s),
                                           token_filters: @token_filters)
 
        content_node = ContentNode.new(clean_word_hash, &block) # make the node and extract the data
 
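Backend selection happens once at require time - Numo, then GSL, then the pure-Ruby matrix path - so choosing the fast path is a Gemfile concern rather than an API change. A sketch assuming you want the new Numo path (gem names taken from the requires above; numo-linalg also expects a BLAS/LAPACK library on the system):

  # Gemfile -- both gems are optional; without them the library falls back to GSL and then to pure Ruby
  gem 'numo-narray'
  gem 'numo-linalg'

  require 'classifier-reborn'   # $SVD becomes :numo when numo/narray and numo/linalg load

  lsi = ClassifierReborn::LSI.new
  lsi.add_item 'This text deals with dogs. Dogs.', :dog
  lsi.add_item 'This text involves another dog.', :dog
  lsi.add_item 'This text revolves around cats.', :cat
  lsi.build_index
  lsi.classify 'This text is about dogs!'            # => :dog
  lsi.find_related('This text is around cats!', 2)   # => the two most closely related items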
data/lib/classifier-reborn/validators/classifier_validator.rb
CHANGED
@@ -1,10 +1,11 @@
+# frozen_string_literal: true
+
 module ClassifierReborn
   module ClassifierValidator
-
     module_function
 
-    def cross_validate(classifier, sample_data, fold=10, *options)
-      classifier = ClassifierReborn
+    def cross_validate(classifier, sample_data, fold = 10, *options)
+      classifier = ClassifierReborn.const_get(classifier).new(options) if classifier.is_a?(String)
       sample_data.shuffle!
       partition_size = sample_data.length / fold
       partitioned_data = sample_data.each_slice(partition_size)
@@ -14,13 +15,13 @@ module ClassifierReborn
         test_data = training_data.slice!(i)
         conf_mats << validate(classifier, training_data.flatten!(1), test_data)
       end
-      classifier.reset
+      classifier.reset
       generate_report(conf_mats)
     end
 
     def validate(classifier, training_data, test_data, *options)
-      classifier = ClassifierReborn
-      classifier.reset
+      classifier = ClassifierReborn.const_get(classifier).new(options) if classifier.is_a?(String)
+      classifier.reset
       training_data.each do |rec|
         classifier.train(rec.first, rec.last)
       end
@@ -40,25 +41,25 @@ module ClassifierReborn
     def generate_report(*conf_mats)
       conf_mats.flatten!
       accumulated_conf_mat = conf_mats.length == 1 ? conf_mats.first : empty_conf_mat(conf_mats.first.keys.sort)
-      header =
+      header = 'Run Total Correct Incorrect Accuracy'
       puts
-      puts
+      puts ' Run Report '.center(header.length, '-')
       puts header
-      puts
+      puts '-' * header.length
       if conf_mats.length > 1
         conf_mats.each_with_index do |conf_mat, i|
           run_report = build_run_report(conf_mat)
-          print_run_report(run_report, i+1)
+          print_run_report(run_report, i + 1)
           conf_mat.each do |actual, cols|
             cols.each do |predicted, v|
               accumulated_conf_mat[actual][predicted] += v
             end
           end
         end
-        puts
+        puts '-' * header.length
       end
       run_report = build_run_report(accumulated_conf_mat)
-      print_run_report(run_report,
+      print_run_report(run_report, 'All')
       puts
       print_conf_mat(accumulated_conf_mat)
       puts
@@ -78,11 +79,11 @@ module ClassifierReborn
         end
       end
       total = correct + incorrect
-      {total: total, correct: correct, incorrect: incorrect, accuracy: divide(correct, total)}
+      { total: total, correct: correct, incorrect: incorrect, accuracy: divide(correct, total) }
     end
 
     def conf_mat_to_tab(conf_mat)
-      conf_tab = Hash.new {|h, k| h[k] = {p: {t: 0, f: 0}, n: {t: 0, f: 0}}}
+      conf_tab = Hash.new { |h, k| h[k] = { p: { t: 0, f: 0 }, n: { t: 0, f: 0 } } }
       conf_mat.each_key do |positive|
         conf_mat.each do |actual, cols|
           cols.each do |predicted, v|
@@ -93,32 +94,32 @@ module ClassifierReborn
       conf_tab
     end
 
-    def print_run_report(stats, prefix=
-      puts "#{
+    def print_run_report(stats, prefix = '', print_header = false)
+      puts "#{'Run'.rjust([3, prefix.length].max)} Total Correct Incorrect Accuracy" if print_header
       puts "#{prefix.to_s.rjust(3)} #{stats[:total].to_s.rjust(9)} #{stats[:correct].to_s.rjust(9)} #{stats[:incorrect].to_s.rjust(9)} #{stats[:accuracy].round(5).to_s.ljust(7, '0').rjust(9)}"
     end
 
     def print_conf_mat(conf_mat)
-      header = [
+      header = ['Predicted ->'] + conf_mat.keys + %w[Total Recall]
       cell_size = header.map(&:length).max
-      header = header.map{|h| h.rjust(cell_size)}.join(
-      puts
+      header = header.map { |h| h.rjust(cell_size) }.join(' ')
+      puts ' Confusion Matrix '.center(header.length, '-')
       puts header
-      puts
-      predicted_totals = conf_mat.keys.map{|predicted| [predicted, 0]}.to_h
+      puts '-' * header.length
+      predicted_totals = conf_mat.keys.map { |predicted| [predicted, 0] }.to_h
       correct = 0
       conf_mat.each do |k, rec|
         actual_total = rec.values.reduce(:+)
-        puts ([k.ljust(cell_size)] + rec.values.map{|v| v.to_s.rjust(cell_size)} + [actual_total.to_s.rjust(cell_size), divide(rec[k], actual_total).round(5).to_s.rjust(cell_size)]).join(
+        puts ([k.ljust(cell_size)] + rec.values.map { |v| v.to_s.rjust(cell_size) } + [actual_total.to_s.rjust(cell_size), divide(rec[k], actual_total).round(5).to_s.rjust(cell_size)]).join(' ')
         rec.each do |cat, val|
          predicted_totals[cat] += val
          correct += val if cat == k
        end
      end
      total = predicted_totals.values.reduce(:+)
-      puts
-      puts ([
-      puts ([
+      puts '-' * header.length
+      puts (['Total'.ljust(cell_size)] + predicted_totals.values.map { |v| v.to_s.rjust(cell_size) } + [total.to_s.rjust(cell_size), ''.rjust(cell_size)]).join(' ')
+      puts (['Precision'.ljust(cell_size)] + predicted_totals.keys.map { |k| divide(conf_mat[k][k], predicted_totals[k]).round(5).to_s.rjust(cell_size) } + ['Accuracy ->'.rjust(cell_size), divide(correct, total).round(5).to_s.rjust(cell_size)]).join(' ')
    end
 
    def print_conf_tab(conf_tab)
@@ -135,31 +136,31 @@ module ClassifierReborn
      negatives = tab[:n][:t] + tab[:p][:f]
      total = positives + negatives
      {
-        total_population:
+        total_population: positives + negatives,
        condition_positive: positives,
        condition_negative: negatives,
-        true_positive:
-        true_negative:
-        false_positive:
-        false_negative:
-        prevalence:
-        specificity:
-        recall:
-        precision:
-        accuracy:
-        f1_score:
+        true_positive: tab[:p][:t],
+        true_negative: tab[:n][:t],
+        false_positive: tab[:p][:f],
+        false_negative: tab[:n][:f],
+        prevalence: divide(positives, total),
+        specificity: divide(tab[:n][:t], negatives),
+        recall: divide(tab[:p][:t], positives),
+        precision: divide(tab[:p][:t], tab[:p][:t] + tab[:p][:f]),
+        accuracy: divide(tab[:p][:t] + tab[:n][:t], total),
+        f1_score: divide(2 * tab[:p][:t], 2 * tab[:p][:t] + tab[:p][:f] + tab[:n][:f])
      }
    end
 
    def print_derivations(derivations)
      max_len = derivations.keys.map(&:length).max
      derivations.each do |k, v|
-        puts k.to_s.tr('_', ' ').capitalize.ljust(max_len) +
+        puts k.to_s.tr('_', ' ').capitalize.ljust(max_len) + ' : ' + v.to_s
      end
    end
 
    def empty_conf_mat(categories)
-      categories.map{|actual| [actual, categories.map{|predicted| [predicted, 0]}.to_h]}.to_h
+      categories.map { |actual| [actual, categories.map { |predicted| [predicted, 0] }.to_h] }.to_h
    end
 
    def divide(dividend, divisor)
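The public entry points keep their shape: cross_validate(classifier, sample_data, fold = 10) and validate(classifier, training_data, test_data), where each record is a [category, text] pair because training calls classifier.train(rec.first, rec.last). A small sketch with illustrative data:

  require 'classifier-reborn'

  sample_data = [
    ['Ham',  'I will be home for dinner'],
    ['Ham',  'See you at the meeting tomorrow'],
    ['Spam', 'You have won a free prize, click now'],
    ['Spam', 'Cheap prizes, limited offer, click here']
  ] * 5   # repeated so every fold still contains data

  classifier = ClassifierReborn::Bayes.new 'Ham', 'Spam'
  ClassifierReborn::ClassifierValidator.cross_validate(classifier, sample_data, 4)
  # Prints the per-run report and the accumulated confusion matrix assembled by generate_report above.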
data/lib/classifier-reborn.rb
CHANGED
@@ -1,3 +1,5 @@
+# frozen_string_literal: true
+
 #--
 # Copyright (c) 2005 Lucas Carlson
 #
@@ -36,4 +38,4 @@ end
 require_relative 'classifier-reborn/category_namer'
 require_relative 'classifier-reborn/bayes'
 require_relative 'classifier-reborn/lsi'
-require_relative 'classifier-reborn/validators/classifier_validator'
+require_relative 'classifier-reborn/validators/classifier_validator'
metadata
CHANGED
@@ -1,16 +1,16 @@
 --- !ruby/object:Gem::Specification
 name: classifier-reborn
 version: !ruby/object:Gem::Version
-  version: 2.2.0
+  version: 2.3.0
 platform: ruby
 authors:
 - Lucas Carlson
 - Parker Moore
 - Chase Gilliam
-autorequire:
+autorequire:
 bindir: bin
 cert_chain: []
-date:
+date: 2022-06-21 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: fast-stemmer
@@ -27,7 +27,21 @@ dependencies:
     - !ruby/object:Gem::Version
       version: '1.0'
 - !ruby/object:Gem::Dependency
-  name:
+  name: matrix
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0.4'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0.4'
+- !ruby/object:Gem::Dependency
+  name: minitest
   requirement: !ruby/object:Gem::Requirement
     requirements:
     - - ">="
@@ -41,7 +55,7 @@ dependencies:
     - !ruby/object:Gem::Version
       version: '0'
 - !ruby/object:Gem::Dependency
-  name:
+  name: minitest-reporters
   requirement: !ruby/object:Gem::Requirement
     requirements:
     - - ">="
@@ -55,7 +69,7 @@ dependencies:
     - !ruby/object:Gem::Version
       version: '0'
 - !ruby/object:Gem::Dependency
-  name:
+  name: pry
   requirement: !ruby/object:Gem::Requirement
     requirements:
     - - ">="
@@ -69,7 +83,7 @@ dependencies:
     - !ruby/object:Gem::Version
       version: '0'
 - !ruby/object:Gem::Dependency
-  name:
+  name: rake
   requirement: !ruby/object:Gem::Requirement
     requirements:
     - - ">="
@@ -83,7 +97,7 @@ dependencies:
     - !ruby/object:Gem::Version
       version: '0'
 - !ruby/object:Gem::Dependency
-  name:
+  name: rdoc
   requirement: !ruby/object:Gem::Requirement
     requirements:
     - - ">="
@@ -97,7 +111,7 @@ dependencies:
     - !ruby/object:Gem::Version
       version: '0'
 - !ruby/object:Gem::Dependency
-  name:
+  name: redis
   requirement: !ruby/object:Gem::Requirement
     requirements:
     - - ">="
@@ -111,7 +125,7 @@ dependencies:
     - !ruby/object:Gem::Version
       version: '0'
 - !ruby/object:Gem::Dependency
-  name:
+  name: rubocop
   requirement: !ruby/object:Gem::Requirement
     requirements:
     - - ">="
@@ -124,7 +138,7 @@ dependencies:
     - - ">="
     - !ruby/object:Gem::Version
       version: '0'
-description:
+description:
 email:
 - lucas@rufy.com
 - parkrmoore@gmail.com
@@ -167,8 +181,14 @@ files:
 - lib/classifier-reborn/bayes.rb
 - lib/classifier-reborn/category_namer.rb
 - lib/classifier-reborn/extensions/hasher.rb
+- lib/classifier-reborn/extensions/token_filter/stemmer.rb
+- lib/classifier-reborn/extensions/token_filter/stopword.rb
+- lib/classifier-reborn/extensions/token_filter/symbol.rb
+- lib/classifier-reborn/extensions/tokenizer/token.rb
+- lib/classifier-reborn/extensions/tokenizer/whitespace.rb
 - lib/classifier-reborn/extensions/vector.rb
 - lib/classifier-reborn/extensions/vector_serialize.rb
+- lib/classifier-reborn/extensions/zero_vector.rb
 - lib/classifier-reborn/lsi.rb
 - lib/classifier-reborn/lsi/cached_content_node.rb
 - lib/classifier-reborn/lsi/content_node.rb
@@ -176,11 +196,11 @@ files:
 - lib/classifier-reborn/lsi/word_list.rb
 - lib/classifier-reborn/validators/classifier_validator.rb
 - lib/classifier-reborn/version.rb
-homepage: https://github.
+homepage: https://jekyll.github.io/classifier-reborn/
 licenses:
 - LGPL
 metadata: {}
-post_install_message:
+post_install_message:
 rdoc_options:
 - "--charset=UTF-8"
 require_paths:
@@ -189,17 +209,15 @@ required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
-      version:
+      version: 2.4.0
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-
-
-signing_key:
+rubygems_version: 3.3.7
+signing_key:
 specification_version: 2
 summary: A general classifier module to allow Bayesian and other types of classifications.
 test_files: []
-has_rdoc: true