classifier-reborn 2.2.0 → 2.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/README.markdown +6 -8
- data/lib/classifier-reborn/backends/bayes_memory_backend.rb +3 -1
- data/lib/classifier-reborn/backends/bayes_redis_backend.rb +2 -0
- data/lib/classifier-reborn/backends/no_redis_error.rb +5 -3
- data/lib/classifier-reborn/bayes.rb +27 -15
- data/lib/classifier-reborn/category_namer.rb +2 -0
- data/lib/classifier-reborn/extensions/hasher.rb +20 -45
- data/lib/classifier-reborn/extensions/token_filter/stemmer.rb +24 -0
- data/lib/classifier-reborn/extensions/token_filter/stopword.rb +48 -0
- data/lib/classifier-reborn/extensions/token_filter/symbol.rb +20 -0
- data/lib/classifier-reborn/extensions/tokenizer/token.rb +36 -0
- data/lib/classifier-reborn/extensions/tokenizer/whitespace.rb +28 -0
- data/lib/classifier-reborn/extensions/vector.rb +13 -7
- data/lib/classifier-reborn/extensions/vector_serialize.rb +3 -1
- data/lib/classifier-reborn/extensions/zero_vector.rb +7 -0
- data/lib/classifier-reborn/lsi/cached_content_node.rb +2 -0
- data/lib/classifier-reborn/lsi/content_node.rb +23 -10
- data/lib/classifier-reborn/lsi/summarizer.rb +3 -1
- data/lib/classifier-reborn/lsi/word_list.rb +2 -0
- data/lib/classifier-reborn/lsi.rb +79 -23
- data/lib/classifier-reborn/validators/classifier_validator.rb +39 -38
- data/lib/classifier-reborn/version.rb +3 -1
- data/lib/classifier-reborn.rb +3 -1
- metadata +36 -18
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: '0100803f158326f660f53694ff5d0d400440792bb5174a10d80ae7eb780c5b6b'
|
4
|
+
data.tar.gz: 1f5a249471e67beb8796a0a61f47ea18fa2f0a252e832f03cb7e7b1937921fa5
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e63b40492f9d35092353c198822f2ce444d05dec7613572048c3f420eecda4040c84026fe621ccb6c316e9862bc25258d47e32663168eb8f67c2b29b41733c57
|
7
|
+
data.tar.gz: abad42c42694cea59acf4bb59184a8f2aaa1d909826b126b4917b67b350c3ca9a14a3b688bd648e7ff8bba241a72e7846c749b91092e7ea91b5bc373c793b24f
|
data/README.markdown
CHANGED
@@ -2,11 +2,9 @@
|
|
2
2
|
|
3
3
|
[](https://rubygems.org/gems/classifier-reborn)
|
4
4
|
[](https://travis-ci.org/jekyll/classifier-reborn)
|
5
|
-
[](https://gemnasium.com/jekyll/classifier-reborn)
|
6
|
-
|
7
5
|
---
|
8
6
|
|
9
|
-
## [Read the Docs](
|
7
|
+
## [Read the Docs](https://jekyll.github.io/classifier-reborn/)
|
10
8
|
|
11
9
|
## Getting Started
|
12
10
|
|
@@ -45,11 +43,11 @@ irb(main):013:0> lsi.find_related("This text is around cats!", 2)
|
|
45
43
|
There is much more that can be done using Bayes and LSI beyond these quick examples.
|
46
44
|
For more information read the following documentation topics.
|
47
45
|
|
48
|
-
* [Installation and Dependencies](
|
49
|
-
* [Bayesian Classifier](
|
50
|
-
* [Latent Semantic Indexer (LSI)](
|
51
|
-
* [Classifier Validation](
|
52
|
-
* [Development and Contributions](
|
46
|
+
* [Installation and Dependencies](https://jekyll.github.io/classifier-reborn/)
|
47
|
+
* [Bayesian Classifier](https://jekyll.github.io/classifier-reborn/bayes)
|
48
|
+
* [Latent Semantic Indexer (LSI)](https://jekyll.github.io/classifier-reborn/lsi)
|
49
|
+
* [Classifier Validation](https://jekyll.github.io/classifier-reborn/validation)
|
50
|
+
* [Development and Contributions](https://jekyll.github.io/classifier-reborn/development) (*Optional Docker instructions included*)
|
53
51
|
|
54
52
|
### Notes on JRuby support
|
55
53
|
|
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module ClassifierReborn
|
2
4
|
class BayesMemoryBackend
|
3
5
|
attr_reader :total_words, :total_trainings
|
@@ -69,7 +71,7 @@ module ClassifierReborn
|
|
69
71
|
private
|
70
72
|
|
71
73
|
def category_counts(category)
|
72
|
-
@category_counts[category] ||= {training: 0, word: 0}
|
74
|
+
@category_counts[category] ||= { training: 0, word: 0 }
|
73
75
|
end
|
74
76
|
end
|
75
77
|
end
|
@@ -1,12 +1,14 @@
|
|
1
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
class NoRedisError < RuntimeError
|
2
4
|
def initialize
|
3
5
|
msg =
|
4
|
-
%q
|
6
|
+
%q(The Redis Backend can only be used if Redis is installed.
|
5
7
|
This error is raised from 'lib/classifier-reborn/backends/bayes_redis_backend.rb'.
|
6
8
|
If you have encountered this error and would like to use the Redis Backend,
|
7
9
|
please run 'gem install redis' or include 'gem "redis"' in
|
8
10
|
your gemfile. For more info see https://github.com/jekyll/classifier-reborn#usage.
|
9
|
-
|
11
|
+
)
|
10
12
|
super(msg)
|
11
13
|
end
|
12
14
|
end
|
@@ -1,9 +1,14 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
# Author:: Lucas Carlson (mailto:lucas@rufy.com)
|
2
4
|
# Copyright:: Copyright (c) 2005 Lucas Carlson
|
3
5
|
# License:: LGPL
|
4
6
|
|
5
7
|
require 'set'
|
6
8
|
|
9
|
+
require_relative 'extensions/tokenizer/whitespace'
|
10
|
+
require_relative 'extensions/token_filter/stopword'
|
11
|
+
require_relative 'extensions/token_filter/stemmer'
|
7
12
|
require_relative 'category_namer'
|
8
13
|
require_relative 'backends/bayes_memory_backend'
|
9
14
|
require_relative 'backends/bayes_redis_backend'
|
@@ -26,12 +31,11 @@ module ClassifierReborn
|
|
26
31
|
# backend: BayesMemoryBackend.new Alternatively, BayesRedisBackend.new for persistent storage
|
27
32
|
def initialize(*args)
|
28
33
|
@initial_categories = []
|
29
|
-
options = { language:
|
34
|
+
options = { language: 'en',
|
30
35
|
enable_threshold: false,
|
31
|
-
threshold:
|
32
|
-
enable_stemmer:
|
33
|
-
backend:
|
34
|
-
}
|
36
|
+
threshold: 0.0,
|
37
|
+
enable_stemmer: true,
|
38
|
+
backend: BayesMemoryBackend.new }
|
35
39
|
args.flatten.each do |arg|
|
36
40
|
if arg.is_a?(Hash)
|
37
41
|
options.merge!(arg)
|
@@ -50,12 +54,14 @@ module ClassifierReborn
|
|
50
54
|
@threshold = options[:threshold]
|
51
55
|
@enable_stemmer = options[:enable_stemmer]
|
52
56
|
@backend = options[:backend]
|
57
|
+
@tokenizer = options[:tokenizer] || Tokenizer::Whitespace
|
58
|
+
@token_filters = options[:token_filters] || [TokenFilter::Stopword]
|
59
|
+
@token_filters << TokenFilter::Stemmer if @enable_stemmer && !@token_filters.include?(TokenFilter::Stemmer)
|
60
|
+
TokenFilter::Stopword.language = @language if @token_filters.include?(TokenFilter::Stopword)
|
53
61
|
|
54
62
|
populate_initial_categories
|
55
63
|
|
56
|
-
if options.key?(:stopwords)
|
57
|
-
custom_stopwords options[:stopwords]
|
58
|
-
end
|
64
|
+
custom_stopwords options[:stopwords] if options.key?(:stopwords)
|
59
65
|
end
|
60
66
|
|
61
67
|
# Provides a general training method for all categories specified in Bayes#new
|
@@ -65,8 +71,10 @@ module ClassifierReborn
|
|
65
71
|
# b.train "that", "That text"
|
66
72
|
# b.train "The other", "The other text"
|
67
73
|
def train(category, text)
|
68
|
-
word_hash = Hasher.word_hash(text, @
|
74
|
+
word_hash = Hasher.word_hash(text, @enable_stemmer,
|
75
|
+
tokenizer: @tokenizer, token_filters: @token_filters)
|
69
76
|
return if word_hash.empty?
|
77
|
+
|
70
78
|
category = CategoryNamer.prepare_name(category)
|
71
79
|
|
72
80
|
# Add the category dynamically or raise an error
|
@@ -95,11 +103,14 @@ module ClassifierReborn
|
|
95
103
|
# b.train :this, "This text"
|
96
104
|
# b.untrain :this, "This text"
|
97
105
|
def untrain(category, text)
|
98
|
-
word_hash = Hasher.word_hash(text, @
|
106
|
+
word_hash = Hasher.word_hash(text, @enable_stemmer,
|
107
|
+
tokenizer: @tokenizer, token_filters: @token_filters)
|
99
108
|
return if word_hash.empty?
|
109
|
+
|
100
110
|
category = CategoryNamer.prepare_name(category)
|
101
111
|
word_hash.each do |word, count|
|
102
112
|
next if @backend.total_words < 0
|
113
|
+
|
103
114
|
orig = @backend.category_word_frequency(category, word) || 0
|
104
115
|
@backend.update_category_word_frequency(category, word, -count)
|
105
116
|
if @backend.category_word_frequency(category, word) <= 0
|
@@ -120,7 +131,8 @@ module ClassifierReborn
|
|
120
131
|
# The largest of these scores (the one closest to 0) is the one picked out by #classify
|
121
132
|
def classifications(text)
|
122
133
|
score = {}
|
123
|
-
word_hash = Hasher.word_hash(text, @
|
134
|
+
word_hash = Hasher.word_hash(text, @enable_stemmer,
|
135
|
+
tokenizer: @tokenizer, token_filters: @token_filters)
|
124
136
|
if word_hash.empty?
|
125
137
|
category_keys.each do |category|
|
126
138
|
score[category.to_s] = Float::INFINITY
|
@@ -152,7 +164,7 @@ module ClassifierReborn
|
|
152
164
|
# Return the classification without the score
|
153
165
|
def classify(text)
|
154
166
|
result, score = classify_with_score(text)
|
155
|
-
result = nil if score < @threshold || score == Float::INFINITY
|
167
|
+
result = nil if threshold_enabled? && (score < @threshold || score == Float::INFINITY)
|
156
168
|
result
|
157
169
|
end
|
158
170
|
|
@@ -240,7 +252,7 @@ module ClassifierReborn
|
|
240
252
|
@backend.add_category(category)
|
241
253
|
end
|
242
254
|
|
243
|
-
|
255
|
+
alias append_category add_category
|
244
256
|
|
245
257
|
def reset
|
246
258
|
@backend.reset
|
@@ -261,12 +273,12 @@ module ClassifierReborn
|
|
261
273
|
if stopwords.strip.empty?
|
262
274
|
stopwords = []
|
263
275
|
elsif File.exist?(stopwords)
|
264
|
-
stopwords = File.read(stopwords).force_encoding(
|
276
|
+
stopwords = File.read(stopwords).force_encoding('utf-8').split
|
265
277
|
else
|
266
278
|
return # Do not overwrite the default
|
267
279
|
end
|
268
280
|
end
|
269
|
-
|
281
|
+
TokenFilter::Stopword::STOPWORDS[@language] = Set.new stopwords
|
270
282
|
end
|
271
283
|
end
|
272
284
|
end
|
@@ -1,67 +1,42 @@
|
|
1
|
-
#
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
2
3
|
# Author:: Lucas Carlson (mailto:lucas@rufy.com)
|
3
4
|
# Copyright:: Copyright (c) 2005 Lucas Carlson
|
4
5
|
# License:: LGPL
|
5
6
|
|
6
7
|
require 'set'
|
7
8
|
|
9
|
+
require_relative 'tokenizer/whitespace'
|
10
|
+
require_relative 'token_filter/stopword'
|
11
|
+
require_relative 'token_filter/stemmer'
|
12
|
+
|
8
13
|
module ClassifierReborn
|
9
14
|
module Hasher
|
10
|
-
STOPWORDS_PATH = [File.expand_path(File.dirname(__FILE__) + '/../../../data/stopwords')]
|
11
|
-
|
12
15
|
module_function
|
13
16
|
|
14
17
|
# Return a Hash of strings => ints. Each word in the string is stemmed,
|
15
18
|
# interned, and indexes to its frequency in the document.
|
16
|
-
def word_hash(str,
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
word_hash_for_words(str.gsub(/[^\p{WORD}\s]/, '').downcase.split, language, enable_stemmer)
|
25
|
-
end
|
26
|
-
|
27
|
-
def word_hash_for_words(words, language = 'en', enable_stemmer = true)
|
28
|
-
d = Hash.new(0)
|
29
|
-
words.each do |word|
|
30
|
-
next unless word.length > 2 && !STOPWORDS[language].include?(word)
|
31
|
-
if enable_stemmer
|
32
|
-
d[word.stem.intern] += 1
|
33
|
-
else
|
34
|
-
d[word.intern] += 1
|
19
|
+
# Return a Hash of symbols => ints. The string is tokenized, every token
# filter is applied in order, and each surviving token is interned and
# mapped to its frequency in the document.
#
# str::            the text to hash.
# enable_stemmer:: when true, TokenFilter::Stemmer is applied (appended
#                  last if the caller did not list it); when false, it is
#                  left out even if the caller listed it.
# tokenizer::      object responding to +call(str)+ and returning tokens.
# token_filters::  list of objects responding to +call(tokens)+.
#
# The caller's +token_filters+ list is never modified: the effective filter
# chain is built as a separate list, so a list shared between classifiers
# (or a frozen list) is safe to pass in.
def word_hash(str, enable_stemmer = true,
              tokenizer: Tokenizer::Whitespace,
              token_filters: [TokenFilter::Stopword])
  filters =
    if !enable_stemmer
      token_filters.reject { |token_filter| token_filter == TokenFilter::Stemmer }
    elsif token_filters.include?(TokenFilter::Stemmer)
      token_filters
    else
      token_filters + [TokenFilter::Stemmer]
    end

  words = filters.reduce(tokenizer.call(str)) do |tokens, token_filter|
    token_filter.call(tokens)
  end

  words.each_with_object(Hash.new(0)) do |word, counts|
    counts[word.intern] += 1
  end
end
|
52
|
-
|
53
|
-
# Create a lazily-loaded hash of stopword data
|
54
|
-
STOPWORDS = Hash.new do |hash, language|
|
55
|
-
hash[language] = []
|
56
|
-
|
57
|
-
STOPWORDS_PATH.each do |path|
|
58
|
-
if File.exist?(File.join(path, language))
|
59
|
-
hash[language] = Set.new File.read(File.join(path, language.to_s)).force_encoding("utf-8").split
|
60
|
-
break
|
61
|
-
end
|
62
|
-
end
|
63
|
-
|
64
|
-
hash[language]
|
65
|
-
end
|
66
41
|
end
|
67
42
|
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# Author:: Lucas Carlson (mailto:lucas@rufy.com)
|
4
|
+
# Copyright:: Copyright (c) 2005 Lucas Carlson
|
5
|
+
# License:: LGPL
|
6
|
+
|
7
|
+
module ClassifierReborn
|
8
|
+
module TokenFilter
|
9
|
+
# Token filter that swaps every stemmable token for its stemmed form.
# Tokens reporting +stemmable?+ as false are passed through unchanged.
module Stemmer
  module_function

  # tokens:: enumerable of tokens responding to +stemmable?+ and +stem+.
  # Returns a new array of the same length and order.
  def call(tokens)
    tokens.map { |token| token.stemmable? ? token.stem : token }
  end
end
|
23
|
+
end
|
24
|
+
end
|
@@ -0,0 +1,48 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# Author:: Lucas Carlson (mailto:lucas@rufy.com)
|
4
|
+
# Copyright:: Copyright (c) 2005 Lucas Carlson
|
5
|
+
# License:: LGPL
|
6
|
+
|
7
|
+
module ClassifierReborn
|
8
|
+
module TokenFilter
|
9
|
+
# This filter removes stopwords in the selected language from given tokens.
#
# A token is dropped when it reports +maybe_stopword?+ and is either two
# characters or shorter, or is listed in STOPWORDS for the active language.
# The active language is module-level state (initially 'en') that is changed
# through Stopword.language=.
module Stopword
  # Directories searched, first to last, for a file named after the language.
  STOPWORDS_PATH = [File.expand_path(File.dirname(__FILE__) + '/../../../../data/stopwords')]
  @language = 'en'

  module_function

  # Returns the tokens that are not considered stopwords.
  def call(tokens)
    tokens.reject do |token|
      token.maybe_stopword? &&
        (token.length <= 2 || STOPWORDS[@language].include?(token))
    end
  end

  # Add custom path to a new stopword file created by user.
  # The path is put in front, so it wins over the bundled data directory.
  def add_custom_stopword_path(path)
    STOPWORDS_PATH.unshift(path)
  end

  # Create a lazily-loaded hash of stopword data. The first lookup of a
  # language reads the first matching file found on STOPWORDS_PATH; when no
  # file exists the language maps to an empty list.
  STOPWORDS = Hash.new do |hash, language|
    hash[language] = []

    STOPWORDS_PATH.each do |path|
      # Convert once so Symbol languages work for both the existence check
      # and the read (File.join rejects non-String components).
      stopword_file = File.join(path, language.to_s)
      next unless File.exist?(stopword_file)

      hash[language] = Set.new File.read(stopword_file).force_encoding('utf-8').split
      break
    end

    hash[language]
  end

  # Changes the language of stopwords
  def language=(language)
    @language = language
  end
end
|
47
|
+
end
|
48
|
+
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# Author:: Lucas Carlson (mailto:lucas@rufy.com)
|
4
|
+
# Copyright:: Copyright (c) 2005 Lucas Carlson
|
5
|
+
# License:: LGPL
|
6
|
+
|
7
|
+
module ClassifierReborn
  module TokenFilter
    # This filter removes, from given tokens, every token that contains at
    # least one symbol character, i.e. a character that is neither a word
    # character nor whitespace. Note that this is broader than "symbol-only":
    # a token such as 'a-b' is removed as well.
    module Symbol
      # Matches a single character that is neither whitespace nor a word character.
      SYMBOL_CHARACTER = /[^\s\p{WORD}]/.freeze

      module_function

      # Returns a new array holding only the tokens free of symbol characters.
      def call(tokens)
        # match? answers the yes/no question without building MatchData.
        tokens.reject { |token| SYMBOL_CHARACTER.match?(token) }
      end
    end
  end
end
|
@@ -0,0 +1,36 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# Author:: Lucas Carlson (mailto:lucas@rufy.com)
|
4
|
+
# Copyright:: Copyright (c) 2005 Lucas Carlson
|
5
|
+
# License:: LGPL
|
6
|
+
|
7
|
+
module ClassifierReborn
|
8
|
+
module Tokenizer
|
9
|
+
# A String that additionally carries two flags telling the token filters
# how it may be treated. E.g.,
#   t = ClassifierReborn::Tokenizer::Token.new 'Tokenize', stemmable: true, maybe_stopword: false
#
# Attributes available are:
#   stemmable:      true  Whether the token may be stemmed. Pass false for terms that must not be stemmed.
#   maybe_stopword: true  Whether the token may be a stopword. Pass false for terms that can never be a stopword.
class Token < String
  def initialize(string, stemmable: true, maybe_stopword: true)
    super(string)
    @maybe_stopword = maybe_stopword
    @stemmable = stemmable
  end

  def maybe_stopword?
    @maybe_stopword
  end

  def stemmable?
    @stemmable
  end

  # Stems through the inherited String#stem (not defined in this class;
  # presumably supplied by a stemmer extension loaded elsewhere) and wraps
  # the result in the same class so both flags are carried over.
  def stem
    self.class.new(super, stemmable: @stemmable, maybe_stopword: @maybe_stopword)
  end
end
|
35
|
+
end
|
36
|
+
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# Author:: Lucas Carlson (mailto:lucas@rufy.com)
|
4
|
+
# Copyright:: Copyright (c) 2005 Lucas Carlson
|
5
|
+
# License:: LGPL
|
6
|
+
|
7
|
+
require_relative 'token'
|
8
|
+
|
9
|
+
module ClassifierReborn
|
10
|
+
module Tokenizer
|
11
|
+
# Tokenizer for text whose words are separated by whitespace, such as
# English or French sentences.
#
# Word tokens come first: the input with every non-word, non-space character
# deleted, lowercased and split on whitespace. They are followed by one token
# per symbol character found in the input, flagged as neither stemmable nor
# a possible stopword.
module Whitespace
  module_function

  def call(str)
    words = str.gsub(/[^\p{WORD}\s]/, '').downcase.split
    symbols = str.scan(/[^\s\p{WORD}]/)

    words.map { |word| Token.new(word, stemmable: true, maybe_stopword: true) } +
      symbols.map { |symbol| Token.new(symbol, stemmable: false, maybe_stopword: false) }
  end
end
|
27
|
+
end
|
28
|
+
end
|
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
# Author:: Ernest Ellingson
|
2
4
|
# Copyright:: Copyright (c) 2005
|
3
5
|
|
@@ -10,14 +12,14 @@ class Matrix
|
|
10
12
|
Matrix.diagonal(*s)
|
11
13
|
end
|
12
14
|
|
13
|
-
|
15
|
+
alias trans transpose
|
14
16
|
|
15
17
|
def SV_decomp(maxSweeps = 20)
|
16
|
-
if row_size >= column_size
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
18
|
+
q = if row_size >= column_size
|
19
|
+
trans * self
|
20
|
+
else
|
21
|
+
self * trans
|
22
|
+
end
|
21
23
|
|
22
24
|
qrot = q.dup
|
23
25
|
v = Matrix.identity(q.row_size)
|
@@ -31,7 +33,11 @@ class Matrix
|
|
31
33
|
(1..qrot.row_size - 1).each do |col|
|
32
34
|
next if row == col
|
33
35
|
|
34
|
-
h =
|
36
|
+
h = if (2.0 * qrot[row, col]) == (qrot[row, row] - qrot[col, col])
|
37
|
+
Math.atan(1) / 2.0
|
38
|
+
else
|
39
|
+
Math.atan((2.0 * qrot[row, col]) / (qrot[row, row] - qrot[col, col])) / 2.0
|
40
|
+
end
|
35
41
|
hcos = Math.cos(h)
|
36
42
|
hsin = Math.sin(h)
|
37
43
|
mzrot = Matrix.identity(qrot.row_size)
|
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
# Author:: David Fayram (mailto:dfayram@lensmen.net)
|
2
4
|
# Copyright:: Copyright (c) 2005 David Fayram II
|
3
5
|
# License:: LGPL
|
@@ -27,7 +29,11 @@ module ClassifierReborn
|
|
27
29
|
|
28
30
|
# Method to access the transposed search vector
|
29
31
|
def transposed_search_vector
|
30
|
-
|
32
|
+
if $SVD == :numo
|
33
|
+
search_vector
|
34
|
+
else
|
35
|
+
search_vector.col
|
36
|
+
end
|
31
37
|
end
|
32
38
|
|
33
39
|
# Use this to fetch the appropriate search vector in normalized form.
|
@@ -38,18 +44,22 @@ module ClassifierReborn
|
|
38
44
|
# Creates the raw vector out of word_hash using word_list as the
|
39
45
|
# key for mapping the vector space.
|
40
46
|
def raw_vector_with(word_list)
|
41
|
-
if $
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
47
|
+
vec = if $SVD == :numo
|
48
|
+
Numo::DFloat.zeros(word_list.size)
|
49
|
+
elsif $SVD == :gsl
|
50
|
+
GSL::Vector.alloc(word_list.size)
|
51
|
+
else
|
52
|
+
Array.new(word_list.size, 0)
|
53
|
+
end
|
46
54
|
|
47
55
|
@word_hash.each_key do |word|
|
48
56
|
vec[word_list[word]] = @word_hash[word] if word_list[word]
|
49
57
|
end
|
50
58
|
|
51
59
|
# Perform the scaling transform and force floating point arithmetic
|
52
|
-
if $
|
60
|
+
if $SVD == :numo
|
61
|
+
total_words = vec.sum.to_f
|
62
|
+
elsif $SVD == :gsl
|
53
63
|
sum = 0.0
|
54
64
|
vec.each { |v| sum += v }
|
55
65
|
total_words = sum
|
@@ -59,7 +69,7 @@ module ClassifierReborn
|
|
59
69
|
|
60
70
|
total_unique_words = 0
|
61
71
|
|
62
|
-
if $
|
72
|
+
if [:numo, :gsl].include?($SVD)
|
63
73
|
vec.each { |word| total_unique_words += 1 if word != 0.0 }
|
64
74
|
else
|
65
75
|
total_unique_words = vec.count { |word| word != 0 }
|
@@ -83,12 +93,15 @@ module ClassifierReborn
|
|
83
93
|
hash[val] = Math.log(val + 1) / -weighted_total
|
84
94
|
end
|
85
95
|
|
86
|
-
vec.
|
96
|
+
vec = vec.map do |val|
|
87
97
|
cached_calcs[val]
|
88
98
|
end
|
89
99
|
end
|
90
100
|
|
91
|
-
if $
|
101
|
+
if $SVD == :numo
|
102
|
+
@raw_norm = vec / Numo::Linalg.norm(vec)
|
103
|
+
@raw_vector = vec
|
104
|
+
elsif $SVD == :gsl
|
92
105
|
@raw_norm = vec.normalize
|
93
106
|
@raw_vector = vec
|
94
107
|
else
|
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
# Author:: Lucas Carlson (mailto:lucas@rufy.com)
|
2
4
|
# Copyright:: Copyright (c) 2005 Lucas Carlson
|
3
5
|
# License:: LGPL
|
@@ -27,7 +29,7 @@ module ClassifierReborn
|
|
27
29
|
chunks.each { |chunk| lsi << chunk unless chunk.strip.empty? || chunk.strip.split.size == 1 }
|
28
30
|
lsi.build_index
|
29
31
|
summaries = lsi.highest_relative_content count
|
30
|
-
summaries.
|
32
|
+
summaries.select { |chunk| summaries.include? chunk }.map(&:strip).join(separator)
|
31
33
|
end
|
32
34
|
end
|
33
35
|
end
|
@@ -1,23 +1,39 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
# Author:: David Fayram (mailto:dfayram@lensmen.net)
|
2
4
|
# Copyright:: Copyright (c) 2005 David Fayram II
|
3
5
|
# License:: LGPL
|
4
6
|
|
7
|
+
# Try to load Numo first - it's the most current and the most well-supported.
|
8
|
+
# Fall back to GSL.
|
9
|
+
# Fall back to native vector.
|
5
10
|
begin
|
6
11
|
raise LoadError if ENV['NATIVE_VECTOR'] == 'true' # to test the native vector class, try `rake test NATIVE_VECTOR=true`
|
12
|
+
raise LoadError if ENV['GSL'] == 'true' # to test with gsl, try `rake test GSL=true`
|
7
13
|
|
8
|
-
require '
|
9
|
-
|
10
|
-
$
|
11
|
-
|
14
|
+
require 'numo/narray' # https://ruby-numo.github.io/narray/
|
15
|
+
require 'numo/linalg' # https://ruby-numo.github.io/linalg/
|
16
|
+
$SVD = :numo
|
12
17
|
rescue LoadError
|
13
|
-
|
14
|
-
|
18
|
+
begin
|
19
|
+
raise LoadError if ENV['NATIVE_VECTOR'] == 'true' # to test the native vector class, try `rake test NATIVE_VECTOR=true`
|
20
|
+
|
21
|
+
require 'gsl' # requires https://github.com/SciRuby/rb-gsl
|
22
|
+
require_relative 'extensions/vector_serialize'
|
23
|
+
$SVD = :gsl
|
24
|
+
rescue LoadError
|
25
|
+
$SVD = :ruby
|
26
|
+
require_relative 'extensions/vector'
|
27
|
+
require_relative 'extensions/zero_vector'
|
28
|
+
end
|
15
29
|
end
|
16
30
|
|
17
31
|
require_relative 'lsi/word_list'
|
18
32
|
require_relative 'lsi/content_node'
|
19
33
|
require_relative 'lsi/cached_content_node'
|
20
34
|
require_relative 'lsi/summarizer'
|
35
|
+
require_relative 'extensions/token_filter/stopword'
|
36
|
+
require_relative 'extensions/token_filter/symbol'
|
21
37
|
|
22
38
|
module ClassifierReborn
|
23
39
|
# This class implements a Latent Semantic Indexer, which can search, classify and cluster
|
@@ -40,6 +56,11 @@ module ClassifierReborn
|
|
40
56
|
@version = 0
|
41
57
|
@built_at_version = -1
|
42
58
|
@language = options[:language] || 'en'
|
59
|
+
@token_filters = [
|
60
|
+
TokenFilter::Stopword,
|
61
|
+
TokenFilter::Symbol
|
62
|
+
]
|
63
|
+
TokenFilter::Stopword.language = @language
|
43
64
|
extend CachedContentNode::InstanceMethods if @cache_node_vectors = options[:cache_node_vectors]
|
44
65
|
end
|
45
66
|
|
@@ -64,7 +85,8 @@ module ClassifierReborn
|
|
64
85
|
# lsi.add_item ar, *ar.categories { |x| ar.content }
|
65
86
|
#
|
66
87
|
def add_item(item, *categories, &block)
|
67
|
-
clean_word_hash = Hasher.
|
88
|
+
clean_word_hash = Hasher.word_hash((block ? yield(item) : item.to_s),
|
89
|
+
token_filters: @token_filters)
|
68
90
|
if clean_word_hash.empty?
|
69
91
|
puts "Input: '#{item}' is entirely stopwords or words with 2 or fewer characters. Classifier-Reborn cannot handle this document properly."
|
70
92
|
else
|
@@ -124,12 +146,21 @@ module ClassifierReborn
|
|
124
146
|
# turning the LSI class into a simple vector search engine.
|
125
147
|
def build_index(cutoff = 0.75)
|
126
148
|
return unless needs_rebuild?
|
149
|
+
|
127
150
|
make_word_list
|
128
151
|
|
129
152
|
doc_list = @items.values
|
130
153
|
tda = doc_list.collect { |node| node.raw_vector_with(@word_list) }
|
131
154
|
|
132
|
-
if $
|
155
|
+
if $SVD == :numo
|
156
|
+
tdm = Numo::NArray.asarray(tda.map(&:to_a)).transpose
|
157
|
+
ntdm = numo_build_reduced_matrix(tdm, cutoff)
|
158
|
+
|
159
|
+
ntdm.each_over_axis(1).with_index do |col_vec, i|
|
160
|
+
doc_list[i].lsi_vector = col_vec
|
161
|
+
doc_list[i].lsi_norm = col_vec / Numo::Linalg.norm(col_vec)
|
162
|
+
end
|
163
|
+
elsif $SVD == :gsl
|
133
164
|
tdm = GSL::Matrix.alloc(*tda).trans
|
134
165
|
ntdm = build_reduced_matrix(tdm, cutoff)
|
135
166
|
|
@@ -142,9 +173,13 @@ module ClassifierReborn
|
|
142
173
|
tdm = Matrix.rows(tda).trans
|
143
174
|
ntdm = build_reduced_matrix(tdm, cutoff)
|
144
175
|
|
145
|
-
ntdm.
|
176
|
+
ntdm.column_size.times do |col|
|
146
177
|
doc_list[col].lsi_vector = ntdm.column(col) if doc_list[col]
|
147
|
-
|
178
|
+
if ntdm.column(col).zero?
|
179
|
+
doc_list[col].lsi_norm = ntdm.column(col) if doc_list[col]
|
180
|
+
else
|
181
|
+
doc_list[col].lsi_norm = ntdm.column(col).normalize if doc_list[col]
|
182
|
+
end
|
148
183
|
end
|
149
184
|
end
|
150
185
|
|
@@ -186,11 +221,13 @@ module ClassifierReborn
|
|
186
221
|
content_node = node_for_content(doc, &block)
|
187
222
|
result =
|
188
223
|
@items.keys.collect do |item|
|
189
|
-
if $
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
224
|
+
val = if $SVD == :numo
|
225
|
+
content_node.search_vector.dot(@items[item].transposed_search_vector)
|
226
|
+
elsif $SVD == :gsl
|
227
|
+
content_node.search_vector * @items[item].transposed_search_vector
|
228
|
+
else
|
229
|
+
(Matrix[content_node.search_vector] * @items[item].search_vector)[0]
|
230
|
+
end
|
194
231
|
[item, val]
|
195
232
|
end
|
196
233
|
result.sort_by { |x| x[1] }.reverse
|
@@ -205,7 +242,8 @@ module ClassifierReborn
|
|
205
242
|
return [] if needs_rebuild?
|
206
243
|
|
207
244
|
content_node = node_for_content(doc, &block)
|
208
|
-
if $
|
245
|
+
if ($SVD == :gsl && content_node.raw_norm.isnan?.all?) ||
|
246
|
+
($SVD == :numo && content_node.raw_norm.isnan.all?)
|
209
247
|
puts "There are no documents that are similar to #{doc}"
|
210
248
|
else
|
211
249
|
content_node_norms(content_node)
|
@@ -215,11 +253,13 @@ module ClassifierReborn
|
|
215
253
|
def content_node_norms(content_node)
|
216
254
|
result =
|
217
255
|
@items.keys.collect do |item|
|
218
|
-
if $
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
256
|
+
val = if $SVD == :numo
|
257
|
+
content_node.search_norm.dot(@items[item].search_norm)
|
258
|
+
elsif $SVD == :gsl
|
259
|
+
content_node.search_norm * @items[item].search_norm.col
|
260
|
+
else
|
261
|
+
(Matrix[content_node.search_norm] * @items[item].search_norm)[0]
|
262
|
+
end
|
223
263
|
[item, val]
|
224
264
|
end
|
225
265
|
result.sort_by { |x| x[1] }.reverse
|
@@ -234,6 +274,7 @@ module ClassifierReborn
|
|
234
274
|
# it is actually the same algorithm, just applied on a smaller document.
|
235
275
|
def search(string, max_nearest = 3)
|
236
276
|
return [] if needs_rebuild?
|
277
|
+
|
237
278
|
carry = proximity_norms_for_content(string)
|
238
279
|
unless carry.nil?
|
239
280
|
result = carry.collect { |x| x[0] }
|
@@ -295,6 +336,7 @@ module ClassifierReborn
|
|
295
336
|
# it's supposed to.
|
296
337
|
def highest_ranked_stems(doc, count = 3)
|
297
338
|
raise 'Requested stem ranking on non-indexed content!' unless @items[doc]
|
339
|
+
|
298
340
|
content_vector_array = node_for_content(doc).lsi_vector.to_a
|
299
341
|
top_n = content_vector_array.sort.reverse[0..count - 1]
|
300
342
|
top_n.collect { |x| @word_list.word_for_index(content_vector_array.index(x)) }
|
@@ -315,14 +357,28 @@ module ClassifierReborn
|
|
315
357
|
s[ord] = 0.0 if s[ord] < s_cutoff
|
316
358
|
end
|
317
359
|
# Reconstruct the term document matrix, only with reduced rank
|
318
|
-
u * ($
|
360
|
+
u * ($SVD == :gsl ? GSL::Matrix : ::Matrix).diag(s) * v.trans
|
361
|
+
end
|
362
|
+
|
363
|
+
# Numo variant of the reduced-rank rebuild: decomposes +matrix+ with
# Numo::Linalg.svd, zeroes every singular value below the cutoff threshold
# and multiplies the factors back together.
#
# matrix:: the matrix handed to Numo::Linalg.svd.
# cutoff:: fraction of the singular values (largest first) used to pick the
#          threshold; values strictly below the threshold are zeroed.
def numo_build_reduced_matrix(matrix, cutoff = 0.75)
  singular_values, left, right_t = Numo::Linalg.svd(matrix, driver: 'svd', job: 'S')

  # TODO: Better than 75% term (as above)
  threshold = singular_values.sort.reverse[(singular_values.size * cutoff).round - 1]
  (0...singular_values.size).each do |index|
    singular_values[index] = 0.0 if singular_values[index] < threshold
  end

  # Reconstruct the term document matrix, only with reduced rank
  diagonal = ::Numo::DFloat.eye(singular_values.size) * singular_values
  left.dot(diagonal).dot(right_t)
end
|
320
375
|
|
321
376
|
def node_for_content(item, &block)
|
322
377
|
if @items[item]
|
323
378
|
return @items[item]
|
324
379
|
else
|
325
|
-
clean_word_hash = Hasher.
|
380
|
+
clean_word_hash = Hasher.word_hash((block ? yield(item) : item.to_s),
|
381
|
+
token_filters: @token_filters)
|
326
382
|
|
327
383
|
content_node = ContentNode.new(clean_word_hash, &block) # make the node and extract the data
|
328
384
|
|
@@ -1,10 +1,11 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module ClassifierReborn
|
2
4
|
module ClassifierValidator
|
3
|
-
|
4
5
|
module_function
|
5
6
|
|
6
|
-
def cross_validate(classifier, sample_data, fold=10, *options)
|
7
|
-
classifier = ClassifierReborn
|
7
|
+
def cross_validate(classifier, sample_data, fold = 10, *options)
|
8
|
+
classifier = ClassifierReborn.const_get(classifier).new(options) if classifier.is_a?(String)
|
8
9
|
sample_data.shuffle!
|
9
10
|
partition_size = sample_data.length / fold
|
10
11
|
partitioned_data = sample_data.each_slice(partition_size)
|
@@ -14,13 +15,13 @@ module ClassifierReborn
|
|
14
15
|
test_data = training_data.slice!(i)
|
15
16
|
conf_mats << validate(classifier, training_data.flatten!(1), test_data)
|
16
17
|
end
|
17
|
-
classifier.reset
|
18
|
+
classifier.reset
|
18
19
|
generate_report(conf_mats)
|
19
20
|
end
|
20
21
|
|
21
22
|
def validate(classifier, training_data, test_data, *options)
|
22
|
-
classifier = ClassifierReborn
|
23
|
-
classifier.reset
|
23
|
+
classifier = ClassifierReborn.const_get(classifier).new(options) if classifier.is_a?(String)
|
24
|
+
classifier.reset
|
24
25
|
training_data.each do |rec|
|
25
26
|
classifier.train(rec.first, rec.last)
|
26
27
|
end
|
@@ -40,25 +41,25 @@ module ClassifierReborn
|
|
40
41
|
def generate_report(*conf_mats)
|
41
42
|
conf_mats.flatten!
|
42
43
|
accumulated_conf_mat = conf_mats.length == 1 ? conf_mats.first : empty_conf_mat(conf_mats.first.keys.sort)
|
43
|
-
header =
|
44
|
+
header = 'Run Total Correct Incorrect Accuracy'
|
44
45
|
puts
|
45
|
-
puts
|
46
|
+
puts ' Run Report '.center(header.length, '-')
|
46
47
|
puts header
|
47
|
-
puts
|
48
|
+
puts '-' * header.length
|
48
49
|
if conf_mats.length > 1
|
49
50
|
conf_mats.each_with_index do |conf_mat, i|
|
50
51
|
run_report = build_run_report(conf_mat)
|
51
|
-
print_run_report(run_report, i+1)
|
52
|
+
print_run_report(run_report, i + 1)
|
52
53
|
conf_mat.each do |actual, cols|
|
53
54
|
cols.each do |predicted, v|
|
54
55
|
accumulated_conf_mat[actual][predicted] += v
|
55
56
|
end
|
56
57
|
end
|
57
58
|
end
|
58
|
-
puts
|
59
|
+
puts '-' * header.length
|
59
60
|
end
|
60
61
|
run_report = build_run_report(accumulated_conf_mat)
|
61
|
-
print_run_report(run_report,
|
62
|
+
print_run_report(run_report, 'All')
|
62
63
|
puts
|
63
64
|
print_conf_mat(accumulated_conf_mat)
|
64
65
|
puts
|
@@ -78,11 +79,11 @@ module ClassifierReborn
|
|
78
79
|
end
|
79
80
|
end
|
80
81
|
total = correct + incorrect
|
81
|
-
{total: total, correct: correct, incorrect: incorrect, accuracy: divide(correct, total)}
|
82
|
+
{ total: total, correct: correct, incorrect: incorrect, accuracy: divide(correct, total) }
|
82
83
|
end
|
83
84
|
|
84
85
|
def conf_mat_to_tab(conf_mat)
|
85
|
-
conf_tab = Hash.new {|h, k| h[k] = {p: {t: 0, f: 0}, n: {t: 0, f: 0}}}
|
86
|
+
conf_tab = Hash.new { |h, k| h[k] = { p: { t: 0, f: 0 }, n: { t: 0, f: 0 } } }
|
86
87
|
conf_mat.each_key do |positive|
|
87
88
|
conf_mat.each do |actual, cols|
|
88
89
|
cols.each do |predicted, v|
|
@@ -93,32 +94,32 @@ module ClassifierReborn
|
|
93
94
|
conf_tab
|
94
95
|
end
|
95
96
|
|
96
|
-
def print_run_report(stats, prefix=
|
97
|
-
puts "#{
|
97
|
+
def print_run_report(stats, prefix = '', print_header = false)
|
98
|
+
puts "#{'Run'.rjust([3, prefix.length].max)} Total Correct Incorrect Accuracy" if print_header
|
98
99
|
puts "#{prefix.to_s.rjust(3)} #{stats[:total].to_s.rjust(9)} #{stats[:correct].to_s.rjust(9)} #{stats[:incorrect].to_s.rjust(9)} #{stats[:accuracy].round(5).to_s.ljust(7, '0').rjust(9)}"
|
99
100
|
end
|
100
101
|
|
101
102
|
def print_conf_mat(conf_mat)
|
102
|
-
header = [
|
103
|
+
header = ['Predicted ->'] + conf_mat.keys + %w[Total Recall]
|
103
104
|
cell_size = header.map(&:length).max
|
104
|
-
header = header.map{|h| h.rjust(cell_size)}.join(
|
105
|
-
puts
|
105
|
+
header = header.map { |h| h.rjust(cell_size) }.join(' ')
|
106
|
+
puts ' Confusion Matrix '.center(header.length, '-')
|
106
107
|
puts header
|
107
|
-
puts
|
108
|
-
predicted_totals = conf_mat.keys.map{|predicted| [predicted, 0]}.to_h
|
108
|
+
puts '-' * header.length
|
109
|
+
predicted_totals = conf_mat.keys.map { |predicted| [predicted, 0] }.to_h
|
109
110
|
correct = 0
|
110
111
|
conf_mat.each do |k, rec|
|
111
112
|
actual_total = rec.values.reduce(:+)
|
112
|
-
puts ([k.ljust(cell_size)] + rec.values.map{|v| v.to_s.rjust(cell_size)} + [actual_total.to_s.rjust(cell_size), divide(rec[k], actual_total).round(5).to_s.rjust(cell_size)]).join(
|
113
|
+
puts ([k.ljust(cell_size)] + rec.values.map { |v| v.to_s.rjust(cell_size) } + [actual_total.to_s.rjust(cell_size), divide(rec[k], actual_total).round(5).to_s.rjust(cell_size)]).join(' ')
|
113
114
|
rec.each do |cat, val|
|
114
115
|
predicted_totals[cat] += val
|
115
116
|
correct += val if cat == k
|
116
117
|
end
|
117
118
|
end
|
118
119
|
total = predicted_totals.values.reduce(:+)
|
119
|
-
puts
|
120
|
-
puts ([
|
121
|
-
puts ([
|
120
|
+
puts '-' * header.length
|
121
|
+
puts (['Total'.ljust(cell_size)] + predicted_totals.values.map { |v| v.to_s.rjust(cell_size) } + [total.to_s.rjust(cell_size), ''.rjust(cell_size)]).join(' ')
|
122
|
+
puts (['Precision'.ljust(cell_size)] + predicted_totals.keys.map { |k| divide(conf_mat[k][k], predicted_totals[k]).round(5).to_s.rjust(cell_size) } + ['Accuracy ->'.rjust(cell_size), divide(correct, total).round(5).to_s.rjust(cell_size)]).join(' ')
|
122
123
|
end
|
123
124
|
|
124
125
|
def print_conf_tab(conf_tab)
|
@@ -135,31 +136,31 @@ module ClassifierReborn
|
|
135
136
|
negatives = tab[:n][:t] + tab[:p][:f]
|
136
137
|
total = positives + negatives
|
137
138
|
{
|
138
|
-
total_population:
|
139
|
+
total_population: positives + negatives,
|
139
140
|
condition_positive: positives,
|
140
141
|
condition_negative: negatives,
|
141
|
-
true_positive:
|
142
|
-
true_negative:
|
143
|
-
false_positive:
|
144
|
-
false_negative:
|
145
|
-
prevalence:
|
146
|
-
specificity:
|
147
|
-
recall:
|
148
|
-
precision:
|
149
|
-
accuracy:
|
150
|
-
f1_score:
|
142
|
+
true_positive: tab[:p][:t],
|
143
|
+
true_negative: tab[:n][:t],
|
144
|
+
false_positive: tab[:p][:f],
|
145
|
+
false_negative: tab[:n][:f],
|
146
|
+
prevalence: divide(positives, total),
|
147
|
+
specificity: divide(tab[:n][:t], negatives),
|
148
|
+
recall: divide(tab[:p][:t], positives),
|
149
|
+
precision: divide(tab[:p][:t], tab[:p][:t] + tab[:p][:f]),
|
150
|
+
accuracy: divide(tab[:p][:t] + tab[:n][:t], total),
|
151
|
+
f1_score: divide(2 * tab[:p][:t], 2 * tab[:p][:t] + tab[:p][:f] + tab[:n][:f])
|
151
152
|
}
|
152
153
|
end
|
153
154
|
|
154
155
|
def print_derivations(derivations)
|
155
156
|
max_len = derivations.keys.map(&:length).max
|
156
157
|
derivations.each do |k, v|
|
157
|
-
puts k.to_s.tr('_', ' ').capitalize.ljust(max_len) +
|
158
|
+
puts k.to_s.tr('_', ' ').capitalize.ljust(max_len) + ' : ' + v.to_s
|
158
159
|
end
|
159
160
|
end
|
160
161
|
|
161
162
|
def empty_conf_mat(categories)
|
162
|
-
categories.map{|actual| [actual, categories.map{|predicted| [predicted, 0]}.to_h]}.to_h
|
163
|
+
categories.map { |actual| [actual, categories.map { |predicted| [predicted, 0] }.to_h] }.to_h
|
163
164
|
end
|
164
165
|
|
165
166
|
def divide(dividend, divisor)
|
data/lib/classifier-reborn.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
#--
|
2
4
|
# Copyright (c) 2005 Lucas Carlson
|
3
5
|
#
|
@@ -36,4 +38,4 @@ end
|
|
36
38
|
require_relative 'classifier-reborn/category_namer'
|
37
39
|
require_relative 'classifier-reborn/bayes'
|
38
40
|
require_relative 'classifier-reborn/lsi'
|
39
|
-
require_relative 'classifier-reborn/validators/classifier_validator'
|
41
|
+
require_relative 'classifier-reborn/validators/classifier_validator'
|
metadata
CHANGED
@@ -1,16 +1,16 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: classifier-reborn
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.
|
4
|
+
version: 2.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Lucas Carlson
|
8
8
|
- Parker Moore
|
9
9
|
- Chase Gilliam
|
10
|
-
autorequire:
|
10
|
+
autorequire:
|
11
11
|
bindir: bin
|
12
12
|
cert_chain: []
|
13
|
-
date:
|
13
|
+
date: 2022-06-21 00:00:00.000000000 Z
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
16
|
name: fast-stemmer
|
@@ -27,7 +27,21 @@ dependencies:
|
|
27
27
|
- !ruby/object:Gem::Version
|
28
28
|
version: '1.0'
|
29
29
|
- !ruby/object:Gem::Dependency
|
30
|
-
name:
|
30
|
+
name: matrix
|
31
|
+
requirement: !ruby/object:Gem::Requirement
|
32
|
+
requirements:
|
33
|
+
- - "~>"
|
34
|
+
- !ruby/object:Gem::Version
|
35
|
+
version: '0.4'
|
36
|
+
type: :runtime
|
37
|
+
prerelease: false
|
38
|
+
version_requirements: !ruby/object:Gem::Requirement
|
39
|
+
requirements:
|
40
|
+
- - "~>"
|
41
|
+
- !ruby/object:Gem::Version
|
42
|
+
version: '0.4'
|
43
|
+
- !ruby/object:Gem::Dependency
|
44
|
+
name: minitest
|
31
45
|
requirement: !ruby/object:Gem::Requirement
|
32
46
|
requirements:
|
33
47
|
- - ">="
|
@@ -41,7 +55,7 @@ dependencies:
|
|
41
55
|
- !ruby/object:Gem::Version
|
42
56
|
version: '0'
|
43
57
|
- !ruby/object:Gem::Dependency
|
44
|
-
name:
|
58
|
+
name: minitest-reporters
|
45
59
|
requirement: !ruby/object:Gem::Requirement
|
46
60
|
requirements:
|
47
61
|
- - ">="
|
@@ -55,7 +69,7 @@ dependencies:
|
|
55
69
|
- !ruby/object:Gem::Version
|
56
70
|
version: '0'
|
57
71
|
- !ruby/object:Gem::Dependency
|
58
|
-
name:
|
72
|
+
name: pry
|
59
73
|
requirement: !ruby/object:Gem::Requirement
|
60
74
|
requirements:
|
61
75
|
- - ">="
|
@@ -69,7 +83,7 @@ dependencies:
|
|
69
83
|
- !ruby/object:Gem::Version
|
70
84
|
version: '0'
|
71
85
|
- !ruby/object:Gem::Dependency
|
72
|
-
name:
|
86
|
+
name: rake
|
73
87
|
requirement: !ruby/object:Gem::Requirement
|
74
88
|
requirements:
|
75
89
|
- - ">="
|
@@ -83,7 +97,7 @@ dependencies:
|
|
83
97
|
- !ruby/object:Gem::Version
|
84
98
|
version: '0'
|
85
99
|
- !ruby/object:Gem::Dependency
|
86
|
-
name:
|
100
|
+
name: rdoc
|
87
101
|
requirement: !ruby/object:Gem::Requirement
|
88
102
|
requirements:
|
89
103
|
- - ">="
|
@@ -97,7 +111,7 @@ dependencies:
|
|
97
111
|
- !ruby/object:Gem::Version
|
98
112
|
version: '0'
|
99
113
|
- !ruby/object:Gem::Dependency
|
100
|
-
name:
|
114
|
+
name: redis
|
101
115
|
requirement: !ruby/object:Gem::Requirement
|
102
116
|
requirements:
|
103
117
|
- - ">="
|
@@ -111,7 +125,7 @@ dependencies:
|
|
111
125
|
- !ruby/object:Gem::Version
|
112
126
|
version: '0'
|
113
127
|
- !ruby/object:Gem::Dependency
|
114
|
-
name:
|
128
|
+
name: rubocop
|
115
129
|
requirement: !ruby/object:Gem::Requirement
|
116
130
|
requirements:
|
117
131
|
- - ">="
|
@@ -124,7 +138,7 @@ dependencies:
|
|
124
138
|
- - ">="
|
125
139
|
- !ruby/object:Gem::Version
|
126
140
|
version: '0'
|
127
|
-
description:
|
141
|
+
description:
|
128
142
|
email:
|
129
143
|
- lucas@rufy.com
|
130
144
|
- parkrmoore@gmail.com
|
@@ -167,8 +181,14 @@ files:
|
|
167
181
|
- lib/classifier-reborn/bayes.rb
|
168
182
|
- lib/classifier-reborn/category_namer.rb
|
169
183
|
- lib/classifier-reborn/extensions/hasher.rb
|
184
|
+
- lib/classifier-reborn/extensions/token_filter/stemmer.rb
|
185
|
+
- lib/classifier-reborn/extensions/token_filter/stopword.rb
|
186
|
+
- lib/classifier-reborn/extensions/token_filter/symbol.rb
|
187
|
+
- lib/classifier-reborn/extensions/tokenizer/token.rb
|
188
|
+
- lib/classifier-reborn/extensions/tokenizer/whitespace.rb
|
170
189
|
- lib/classifier-reborn/extensions/vector.rb
|
171
190
|
- lib/classifier-reborn/extensions/vector_serialize.rb
|
191
|
+
- lib/classifier-reborn/extensions/zero_vector.rb
|
172
192
|
- lib/classifier-reborn/lsi.rb
|
173
193
|
- lib/classifier-reborn/lsi/cached_content_node.rb
|
174
194
|
- lib/classifier-reborn/lsi/content_node.rb
|
@@ -176,11 +196,11 @@ files:
|
|
176
196
|
- lib/classifier-reborn/lsi/word_list.rb
|
177
197
|
- lib/classifier-reborn/validators/classifier_validator.rb
|
178
198
|
- lib/classifier-reborn/version.rb
|
179
|
-
homepage: https://github.
|
199
|
+
homepage: https://jekyll.github.io/classifier-reborn/
|
180
200
|
licenses:
|
181
201
|
- LGPL
|
182
202
|
metadata: {}
|
183
|
-
post_install_message:
|
203
|
+
post_install_message:
|
184
204
|
rdoc_options:
|
185
205
|
- "--charset=UTF-8"
|
186
206
|
require_paths:
|
@@ -189,17 +209,15 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
189
209
|
requirements:
|
190
210
|
- - ">="
|
191
211
|
- !ruby/object:Gem::Version
|
192
|
-
version:
|
212
|
+
version: 2.4.0
|
193
213
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
194
214
|
requirements:
|
195
215
|
- - ">="
|
196
216
|
- !ruby/object:Gem::Version
|
197
217
|
version: '0'
|
198
218
|
requirements: []
|
199
|
-
|
200
|
-
|
201
|
-
signing_key:
|
219
|
+
rubygems_version: 3.3.7
|
220
|
+
signing_key:
|
202
221
|
specification_version: 2
|
203
222
|
summary: A general classifier module to allow Bayesian and other types of classifications.
|
204
223
|
test_files: []
|
205
|
-
has_rdoc: true
|