classifier-reborn 2.2.0 → 2.3.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
- SHA1:
- metadata.gz: e626667dbd70c34cda4604852500af1bd9cc8f9f
- data.tar.gz: 36dad105dca4770c1a5b708257c66721522d235e
+ SHA256:
+ metadata.gz: '0100803f158326f660f53694ff5d0d400440792bb5174a10d80ae7eb780c5b6b'
+ data.tar.gz: 1f5a249471e67beb8796a0a61f47ea18fa2f0a252e832f03cb7e7b1937921fa5
  SHA512:
- metadata.gz: aa08b0c7ad09138ac9efb0f55daa3d24a43b1c1388cef4ad2a71a9375620ce1794db66d3eca470c7772a80e4eac8e3c7a6811b20a3b65e2e93cfced86922554b
- data.tar.gz: c137296ec3661043828e99abd11814b44a1d7e7ab6ca3d6f36ae54eaba1044d426847a9782bd106e082a6f7bacc22d72910eb51e6a37516017886a4c7df38a57
+ metadata.gz: e63b40492f9d35092353c198822f2ce444d05dec7613572048c3f420eecda4040c84026fe621ccb6c316e9862bc25258d47e32663168eb8f67c2b29b41733c57
+ data.tar.gz: abad42c42694cea59acf4bb59184a8f2aaa1d909826b126b4917b67b350c3ca9a14a3b688bd648e7ff8bba241a72e7846c749b91092e7ea91b5bc373c793b24f
data/README.markdown CHANGED
@@ -2,11 +2,9 @@
 
  [![Gem Version](https://badge.fury.io/rb/classifier-reborn.svg)](https://rubygems.org/gems/classifier-reborn)
  [![Build Status](https://img.shields.io/travis/jekyll/classifier-reborn/master.svg)](https://travis-ci.org/jekyll/classifier-reborn)
- [![Dependency Status](https://img.shields.io/gemnasium/jekyll/classifier-reborn.svg)](https://gemnasium.com/jekyll/classifier-reborn)
-
  ---
 
- ## [Read the Docs](http://www.classifier-reborn.com/)
+ ## [Read the Docs](https://jekyll.github.io/classifier-reborn/)
 
  ## Getting Started
 
@@ -45,11 +43,11 @@ irb(main):013:0> lsi.find_related("This text is around cats!", 2)
  There is much more that can be done using Bayes and LSI beyond these quick examples.
  For more information read the following documentation topics.
 
- * [Installation and Dependencies](http://www.classifier-reborn.com/)
- * [Bayesian Classifier](http://www.classifier-reborn.com/bayes)
- * [Latent Semantic Indexer (LSI)](http://www.classifier-reborn.com/lsi)
- * [Classifier Validation](http://www.classifier-reborn.com/validation)
- * [Development and Contributions](http://www.classifier-reborn.com/development) (*Optional Docker instructions included*)
+ * [Installation and Dependencies](https://jekyll.github.io/classifier-reborn/)
+ * [Bayesian Classifier](https://jekyll.github.io/classifier-reborn/bayes)
+ * [Latent Semantic Indexer (LSI)](https://jekyll.github.io/classifier-reborn/lsi)
+ * [Classifier Validation](https://jekyll.github.io/classifier-reborn/validation)
+ * [Development and Contributions](https://jekyll.github.io/classifier-reborn/development) (*Optional Docker instructions included*)
 
  ### Notes on JRuby support
 
lib/classifier-reborn/backends/bayes_memory_backend.rb CHANGED
@@ -1,3 +1,5 @@
+ # frozen_string_literal: true
+
  module ClassifierReborn
  class BayesMemoryBackend
  attr_reader :total_words, :total_trainings
@@ -69,7 +71,7 @@ module ClassifierReborn
  private
 
  def category_counts(category)
- @category_counts[category] ||= {training: 0, word: 0}
+ @category_counts[category] ||= { training: 0, word: 0 }
  end
  end
  end
lib/classifier-reborn/backends/bayes_redis_backend.rb CHANGED
@@ -1,3 +1,5 @@
+ # frozen_string_literal: true
+
  require_relative 'no_redis_error'
  # require redis when we run #initialize. This way only people using this backend
  # will need to install and load the backend without having to
lib/classifier-reborn/backends/no_redis_error.rb CHANGED
@@ -1,12 +1,14 @@
+ # frozen_string_literal: true
+
- class NoRedisError < LoadError
+ class NoRedisError < RuntimeError
  def initialize
  msg =
- %q{The Redis Backend can only be used if Redis is installed.
+ %q(The Redis Backend can only be used if Redis is installed.
  This error is raised from 'lib/classifier-reborn/backends/bayes_redis_backend.rb'.
  If you have encountered this error and would like to use the Redis Backend,
  please run 'gem install redis' or include 'gem "redis"' in
  your gemfile. For more info see https://github.com/jekyll/classifier-reborn#usage.
- }
+ )
  super(msg)
  end
  end
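Note on the base-class change above: NoRedisError now descends from RuntimeError rather than LoadError, so a bare rescue (which catches StandardError descendants) now catches it, while 2.2.0-era `rescue LoadError` clauses no longer do. A minimal sketch of how a caller might adapt (the rescue style is illustrative, not from the gem's docs):

    require 'classifier-reborn'

    begin
      # Raises NoRedisError when the redis gem is not installed.
      backend = ClassifierReborn::BayesRedisBackend.new
    rescue RuntimeError => e # was `rescue LoadError` against 2.2.0
      warn e.message
    end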
lib/classifier-reborn/bayes.rb CHANGED
@@ -1,9 +1,14 @@
+ # frozen_string_literal: true
+
  # Author:: Lucas Carlson (mailto:lucas@rufy.com)
  # Copyright:: Copyright (c) 2005 Lucas Carlson
  # License:: LGPL
 
  require 'set'
 
+ require_relative 'extensions/tokenizer/whitespace'
+ require_relative 'extensions/token_filter/stopword'
+ require_relative 'extensions/token_filter/stemmer'
  require_relative 'category_namer'
  require_relative 'backends/bayes_memory_backend'
  require_relative 'backends/bayes_redis_backend'
@@ -26,12 +31,11 @@ module ClassifierReborn
  # backend: BayesMemoryBackend.new Alternatively, BayesRedisBackend.new for persistent storage
  def initialize(*args)
  @initial_categories = []
- options = { language: 'en',
+ options = { language: 'en',
  enable_threshold: false,
- threshold: 0.0,
- enable_stemmer: true,
- backend: BayesMemoryBackend.new
- }
+ threshold: 0.0,
+ enable_stemmer: true,
+ backend: BayesMemoryBackend.new }
  args.flatten.each do |arg|
  if arg.is_a?(Hash)
  options.merge!(arg)
@@ -50,12 +54,14 @@ module ClassifierReborn
  @threshold = options[:threshold]
  @enable_stemmer = options[:enable_stemmer]
  @backend = options[:backend]
+ @tokenizer = options[:tokenizer] || Tokenizer::Whitespace
+ @token_filters = options[:token_filters] || [TokenFilter::Stopword]
+ @token_filters << TokenFilter::Stemmer if @enable_stemmer && !@token_filters.include?(TokenFilter::Stemmer)
+ TokenFilter::Stopword.language = @language if @token_filters.include?(TokenFilter::Stopword)
 
  populate_initial_categories
 
- if options.key?(:stopwords)
- custom_stopwords options[:stopwords]
- end
+ custom_stopwords options[:stopwords] if options.key?(:stopwords)
  end
 
  # Provides a general training method for all categories specified in Bayes#new
@@ -65,8 +71,10 @@ module ClassifierReborn
  # b.train "that", "That text"
  # b.train "The other", "The other text"
  def train(category, text)
- word_hash = Hasher.word_hash(text, @language, @enable_stemmer)
+ word_hash = Hasher.word_hash(text, @enable_stemmer,
+ tokenizer: @tokenizer, token_filters: @token_filters)
  return if word_hash.empty?
+
  category = CategoryNamer.prepare_name(category)
 
  # Add the category dynamically or raise an error
@@ -95,11 +103,14 @@ module ClassifierReborn
  # b.train :this, "This text"
  # b.untrain :this, "This text"
  def untrain(category, text)
- word_hash = Hasher.word_hash(text, @language, @enable_stemmer)
+ word_hash = Hasher.word_hash(text, @enable_stemmer,
+ tokenizer: @tokenizer, token_filters: @token_filters)
  return if word_hash.empty?
+
  category = CategoryNamer.prepare_name(category)
  word_hash.each do |word, count|
  next if @backend.total_words < 0
+
  orig = @backend.category_word_frequency(category, word) || 0
  @backend.update_category_word_frequency(category, word, -count)
  if @backend.category_word_frequency(category, word) <= 0
@@ -120,7 +131,8 @@ module ClassifierReborn
  # The largest of these scores (the one closest to 0) is the one picked out by #classify
  def classifications(text)
  score = {}
- word_hash = Hasher.word_hash(text, @language, @enable_stemmer)
+ word_hash = Hasher.word_hash(text, @enable_stemmer,
+ tokenizer: @tokenizer, token_filters: @token_filters)
  if word_hash.empty?
  category_keys.each do |category|
  score[category.to_s] = Float::INFINITY
@@ -152,7 +164,7 @@ module ClassifierReborn
  # Return the classification without the score
  def classify(text)
  result, score = classify_with_score(text)
- result = nil if score < @threshold || score == Float::INFINITY if threshold_enabled?
+ result = nil if threshold_enabled? && (score < @threshold || score == Float::INFINITY)
  result
  end
 
@@ -240,7 +252,7 @@ module ClassifierReborn
  @backend.add_category(category)
  end
 
- alias_method :append_category, :add_category
+ alias append_category add_category
 
  def reset
  @backend.reset
@@ -261,12 +273,12 @@ module ClassifierReborn
  if stopwords.strip.empty?
  stopwords = []
  elsif File.exist?(stopwords)
- stopwords = File.read(stopwords).force_encoding("utf-8").split
+ stopwords = File.read(stopwords).force_encoding('utf-8').split
  else
  return # Do not overwrite the default
  end
  end
- Hasher::STOPWORDS[@language] = Set.new stopwords
+ TokenFilter::Stopword::STOPWORDS[@language] = Set.new stopwords
  end
  end
  end
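The new `tokenizer:` and `token_filters:` options shown above are threaded through `train`, `untrain`, and `classifications`. A minimal usage sketch (category names and training text are illustrative; the values shown are the defaults):

    require 'classifier-reborn'

    classifier = ClassifierReborn::Bayes.new(
      'Interesting', 'Uninteresting',
      enable_stemmer: true,                                     # appends TokenFilter::Stemmer
      tokenizer: ClassifierReborn::Tokenizer::Whitespace,       # default tokenizer
      token_filters: [ClassifierReborn::TokenFilter::Stopword]  # default filter chain
    )
    classifier.train 'Interesting', 'Here are some interesting things'
    classifier.train 'Uninteresting', 'Here are some dull things'
    classifier.classify 'Is this interesting?' # => "Interesting"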
lib/classifier-reborn/category_namer.rb CHANGED
@@ -1,3 +1,5 @@
+ # frozen_string_literal: true
+
  # Author:: Lucas Carlson (mailto:lucas@rufy.com)
  # Copyright:: Copyright (c) 2005 Lucas Carlson
  # License:: LGPL
lib/classifier-reborn/extensions/hasher.rb CHANGED
@@ -1,67 +1,42 @@
- # encoding: utf-8
+ # frozen_string_literal: true
+
  # Author:: Lucas Carlson (mailto:lucas@rufy.com)
  # Copyright:: Copyright (c) 2005 Lucas Carlson
  # License:: LGPL
 
  require 'set'
 
+ require_relative 'tokenizer/whitespace'
+ require_relative 'token_filter/stopword'
+ require_relative 'token_filter/stemmer'
+
  module ClassifierReborn
  module Hasher
- STOPWORDS_PATH = [File.expand_path(File.dirname(__FILE__) + '/../../../data/stopwords')]
-
  module_function
 
  # Return a Hash of strings => ints. Each word in the string is stemmed,
  # interned, and indexes to its frequency in the document.
- def word_hash(str, language = 'en', enable_stemmer = true)
- cleaned_word_hash = clean_word_hash(str, language, enable_stemmer)
- symbol_hash = word_hash_for_symbols(str.scan(/[^\s\p{WORD}]/))
- cleaned_word_hash.merge(symbol_hash)
- end
-
- # Return a word hash without extra punctuation or short symbols, just stemmed words
- def clean_word_hash(str, language = 'en', enable_stemmer = true)
- word_hash_for_words(str.gsub(/[^\p{WORD}\s]/, '').downcase.split, language, enable_stemmer)
- end
-
- def word_hash_for_words(words, language = 'en', enable_stemmer = true)
- d = Hash.new(0)
- words.each do |word|
- next unless word.length > 2 && !STOPWORDS[language].include?(word)
- if enable_stemmer
- d[word.stem.intern] += 1
- else
- d[word.intern] += 1
+ def word_hash(str, enable_stemmer = true,
+ tokenizer: Tokenizer::Whitespace,
+ token_filters: [TokenFilter::Stopword])
+ if token_filters.include?(TokenFilter::Stemmer)
+ unless enable_stemmer
+ token_filters.reject! do |token_filter|
+ token_filter == TokenFilter::Stemmer
+ end
  end
+ else
+ token_filters << TokenFilter::Stemmer if enable_stemmer
+ end
+ words = tokenizer.call(str)
+ token_filters.each do |token_filter|
+ words = token_filter.call(words)
  end
- d
- end
-
- # Add custom path to a new stopword file created by user
- def add_custom_stopword_path(path)
- STOPWORDS_PATH.unshift(path)
- end
-
- def word_hash_for_symbols(words)
  d = Hash.new(0)
  words.each do |word|
  d[word.intern] += 1
  end
  d
  end
-
- # Create a lazily-loaded hash of stopword data
- STOPWORDS = Hash.new do |hash, language|
- hash[language] = []
-
- STOPWORDS_PATH.each do |path|
- if File.exist?(File.join(path, language))
- hash[language] = Set.new File.read(File.join(path, language.to_s)).force_encoding("utf-8").split
- break
- end
- end
-
- hash[language]
- end
  end
  end
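`Hasher.word_hash` keeps its positional `enable_stemmer` flag but now delegates splitting and filtering to the tokenizer and filter objects; `clean_word_hash`, `word_hash_for_words`, and `word_hash_for_symbols` are gone, and the stopword machinery moves to TokenFilter::Stopword. A sketch of the new call (exact output depends on the stopword data shipped with the gem):

    require 'classifier-reborn'

    ClassifierReborn::Hasher.word_hash('Dogs and cats!')
    # => roughly { dog: 1, cat: 1, :"!" => 1 } - "and" is a stopword,
    #    stemming maps "dogs"/"cats" to their stems, and "!" survives as a
    #    symbol token (symbol tokens are marked as never-stopword).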
lib/classifier-reborn/extensions/token_filter/stemmer.rb ADDED
@@ -0,0 +1,24 @@
+ # frozen_string_literal: true
+
+ # Author:: Lucas Carlson (mailto:lucas@rufy.com)
+ # Copyright:: Copyright (c) 2005 Lucas Carlson
+ # License:: LGPL
+
+ module ClassifierReborn
+ module TokenFilter
+ # This filter converts given tokens to their stemmed versions.
+ module Stemmer
+ module_function
+
+ def call(tokens)
+ tokens.collect do |token|
+ if token.stemmable?
+ token.stem
+ else
+ token
+ end
+ end
+ end
+ end
+ end
+ end
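A sketch of the filter in isolation; it consults `Token#stemmable?` (the Token class is added later in this diff) and relies on `String#stem` from the fast-stemmer dependency:

    require 'classifier-reborn'

    tokens = [ClassifierReborn::Tokenizer::Token.new('dogs', stemmable: true),
              ClassifierReborn::Tokenizer::Token.new('!', stemmable: false)]
    ClassifierReborn::TokenFilter::Stemmer.call(tokens) # => ["dog", "!"]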
lib/classifier-reborn/extensions/token_filter/stopword.rb ADDED
@@ -0,0 +1,48 @@
+ # frozen_string_literal: true
+
+ # Author:: Lucas Carlson (mailto:lucas@rufy.com)
+ # Copyright:: Copyright (c) 2005 Lucas Carlson
+ # License:: LGPL
+
+ module ClassifierReborn
+ module TokenFilter
+ # This filter removes stopwords in the language, from given tokens.
+ module Stopword
+ STOPWORDS_PATH = [File.expand_path(File.dirname(__FILE__) + '/../../../../data/stopwords')]
+ @language = 'en'
+
+ module_function
+
+ def call(tokens)
+ tokens.reject do |token|
+ token.maybe_stopword? &&
+ (token.length <= 2 || STOPWORDS[@language].include?(token))
+ end
+ end
+
+ # Add custom path to a new stopword file created by user
+ def add_custom_stopword_path(path)
+ STOPWORDS_PATH.unshift(path)
+ end
+
+ # Create a lazily-loaded hash of stopword data
+ STOPWORDS = Hash.new do |hash, language|
+ hash[language] = []
+
+ STOPWORDS_PATH.each do |path|
+ if File.exist?(File.join(path, language))
+ hash[language] = Set.new File.read(File.join(path, language.to_s)).force_encoding('utf-8').split
+ break
+ end
+ end
+
+ hash[language]
+ end
+
+ # Changes the language of stopwords
+ def language=(language)
+ @language = language
+ end
+ end
+ end
+ end
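The stopword machinery that used to live in `Hasher` moves here wholesale, plus a `language=` writer so `Bayes` and `LSI` can point the filter at the right word list. A usage sketch ('/my/stopwords' is a hypothetical directory holding one list per language code):

    require 'classifier-reborn'

    ClassifierReborn::TokenFilter::Stopword.add_custom_stopword_path('/my/stopwords')
    ClassifierReborn::TokenFilter::Stopword.language = 'fr'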
lib/classifier-reborn/extensions/token_filter/symbol.rb ADDED
@@ -0,0 +1,20 @@
+ # frozen_string_literal: true
+
+ # Author:: Lucas Carlson (mailto:lucas@rufy.com)
+ # Copyright:: Copyright (c) 2005 Lucas Carlson
+ # License:: LGPL
+
+ module ClassifierReborn
+ module TokenFilter
+ # This filter removes symbol-only terms, from given tokens.
+ module Symbol
+ module_function
+
+ def call(tokens)
+ tokens.reject do |token|
+ /[^\s\p{WORD}]/ === token
+ end
+ end
+ end
+ end
+ end
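This filter is the counterpart of the symbol tokens emitted by the whitespace tokenizer; LSI applies it (see the `@token_filters` wiring later in this diff) so punctuation never reaches the index. A small sketch:

    require 'classifier-reborn'

    tokens = ClassifierReborn::Tokenizer::Whitespace.call('Hello, world!')
    ClassifierReborn::TokenFilter::Symbol.call(tokens) # drops "," and "!"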
lib/classifier-reborn/extensions/tokenizer/token.rb ADDED
@@ -0,0 +1,36 @@
+ # frozen_string_literal: true
+
+ # Author:: Lucas Carlson (mailto:lucas@rufy.com)
+ # Copyright:: Copyright (c) 2005 Lucas Carlson
+ # License:: LGPL
+
+ module ClassifierReborn
+ module Tokenizer
+ class Token < String
+ # The class can be created with one token string and extra attributes. E.g.,
+ # t = ClassifierReborn::Tokenizer::Token.new 'Tokenize', stemmable: true, maybe_stopword: false
+ #
+ # Attributes available are:
+ # stemmable: true Possibility that the token can be stemmed. This must be false for un-stemmable terms, otherwise this should be true.
+ # maybe_stopword: true Possibility that the token is a stopword. This must be false for terms which can never be a stopword, otherwise this should be true.
+ def initialize(string, stemmable: true, maybe_stopword: true)
+ super(string)
+ @stemmable = stemmable
+ @maybe_stopword = maybe_stopword
+ end
+
+ def stemmable?
+ @stemmable
+ end
+
+ def maybe_stopword?
+ @maybe_stopword
+ end
+
+ def stem
+ stemmed = super
+ self.class.new(stemmed, stemmable: @stemmable, maybe_stopword: @maybe_stopword)
+ end
+ end
+ end
+ end
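A quick sketch of the Token class: it subclasses String, so it drops into existing string-handling code unchanged, while carrying the two flags the filters consult:

    require 'classifier-reborn'

    t = ClassifierReborn::Tokenizer::Token.new('dogs', stemmable: true, maybe_stopword: true)
    t.stemmable? # => true
    t.stem       # => "dog", still a Token with the same flags
    t.length     # => 4, plain String behavior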
lib/classifier-reborn/extensions/tokenizer/whitespace.rb ADDED
@@ -0,0 +1,28 @@
+ # frozen_string_literal: true
+
+ # Author:: Lucas Carlson (mailto:lucas@rufy.com)
+ # Copyright:: Copyright (c) 2005 Lucas Carlson
+ # License:: LGPL
+
+ require_relative 'token'
+
+ module ClassifierReborn
+ module Tokenizer
+ # This tokenizes given input as white-space separated terms.
+ # It mainly aims to tokenize sentences written with a space between words, like English, French, and others.
+ module Whitespace
+ module_function
+
+ def call(str)
+ tokens = str.gsub(/[^\p{WORD}\s]/, '').downcase.split.collect do |word|
+ Token.new(word, stemmable: true, maybe_stopword: true)
+ end
+ symbol_tokens = str.scan(/[^\s\p{WORD}]/).collect do |word|
+ Token.new(word, stemmable: false, maybe_stopword: false)
+ end
+ tokens += symbol_tokens
+ tokens
+ end
+ end
+ end
+ end
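End to end, the tokenizer and filters compose like this (a sketch; each stage takes and returns an array of tokens):

    require 'classifier-reborn'

    str = 'Dogs and cats!'
    tokens = ClassifierReborn::Tokenizer::Whitespace.call(str)
    # => ["dogs", "and", "cats", "!"] - words lowercased, "!" flagged
    #    unstemmable and never-stopword
    tokens = ClassifierReborn::TokenFilter::Stopword.call(tokens) # drops "and"
    tokens = ClassifierReborn::TokenFilter::Stemmer.call(tokens)  # "dogs" -> "dog", "cats" -> "cat"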
lib/classifier-reborn/extensions/vector.rb CHANGED
@@ -1,3 +1,5 @@
+ # frozen_string_literal: true
+
  # Author:: Ernest Ellingson
  # Copyright:: Copyright (c) 2005
 
@@ -10,14 +12,14 @@ class Matrix
  Matrix.diagonal(*s)
  end
 
- alias_method :trans, :transpose
+ alias trans transpose
 
  def SV_decomp(maxSweeps = 20)
- if row_size >= column_size
- q = trans * self
- else
- q = self * trans
- end
+ q = if row_size >= column_size
+ trans * self
+ else
+ self * trans
+ end
 
  qrot = q.dup
  v = Matrix.identity(q.row_size)
@@ -31,7 +33,11 @@ class Matrix
  (1..qrot.row_size - 1).each do |col|
  next if row == col
 
- h = Math.atan((2 * qrot[row, col]) / (qrot[row, row] - qrot[col, col])) / 2.0
+ h = if (2.0 * qrot[row, col]) == (qrot[row, row] - qrot[col, col])
+ Math.atan(1) / 2.0
+ else
+ Math.atan((2.0 * qrot[row, col]) / (qrot[row, row] - qrot[col, col])) / 2.0
+ end
  hcos = Math.cos(h)
  hsin = Math.sin(h)
  mzrot = Matrix.identity(qrot.row_size)
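The new branch guards the Jacobi rotation angle: when the off-diagonal term equals the diagonal difference (notably when both are zero), the old expression evaluated `Math.atan(0.0 / 0.0)` and the resulting NaN poisoned the rotation matrix. A sketch of the failure mode the guard avoids:

    0.0 / 0.0            # => NaN
    Math.atan(0.0 / 0.0) # => NaN - propagated into hcos/hsin before this fix
    Math.atan(1) / 2.0   # => ~0.3927, the angle the guard substitutes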
lib/classifier-reborn/extensions/vector_serialize.rb CHANGED
@@ -1,3 +1,5 @@
+ # frozen_string_literal: true
+
  module GSL
  class Vector
  def _dump(_v)
@@ -12,7 +14,7 @@ module GSL
 
  class Matrix
  class <<self
- alias_method :diag, :diagonal
+ alias diag diagonal
  end
  end
  end
lib/classifier-reborn/extensions/zero_vector.rb ADDED
@@ -0,0 +1,7 @@
+ # frozen_string_literal: true
+
+ class Vector
+ def zero?
+ all?(&:zero?)
+ end
+ end
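This tiny extension backs the `ntdm.column(col).zero?` check added to `build_index` later in this diff, so normalization is skipped for all-zero columns (where `normalize` would divide by zero). A sketch of its behavior:

    require 'matrix'
    require 'classifier-reborn/extensions/zero_vector'

    Vector[0.0, 0.0].zero? # => true
    Vector[1.0, 0.0].zero? # => false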
lib/classifier-reborn/lsi/cached_content_node.rb CHANGED
@@ -1,3 +1,5 @@
+ # frozen_string_literal: true
+
  # Author:: Kelley Reynolds (mailto:kelley@insidesystems.net)
  # Copyright:: Copyright (c) 2015 Kelley Reynolds
  # License:: LGPL
lib/classifier-reborn/lsi/content_node.rb CHANGED
@@ -1,3 +1,5 @@
+ # frozen_string_literal: true
+
  # Author:: David Fayram (mailto:dfayram@lensmen.net)
  # Copyright:: Copyright (c) 2005 David Fayram II
  # License:: LGPL
@@ -27,7 +29,11 @@ module ClassifierReborn
 
  # Method to access the transposed search vector
  def transposed_search_vector
- search_vector.col
+ if $SVD == :numo
+ search_vector
+ else
+ search_vector.col
+ end
  end
 
  # Use this to fetch the appropriate search vector in normalized form.
@@ -38,18 +44,22 @@ module ClassifierReborn
  # Creates the raw vector out of word_hash using word_list as the
  # key for mapping the vector space.
  def raw_vector_with(word_list)
- if $GSL
- vec = GSL::Vector.alloc(word_list.size)
- else
- vec = Array.new(word_list.size, 0)
- end
+ vec = if $SVD == :numo
+ Numo::DFloat.zeros(word_list.size)
+ elsif $SVD == :gsl
+ GSL::Vector.alloc(word_list.size)
+ else
+ Array.new(word_list.size, 0)
+ end
 
  @word_hash.each_key do |word|
  vec[word_list[word]] = @word_hash[word] if word_list[word]
  end
 
  # Perform the scaling transform and force floating point arithmetic
- if $GSL
+ if $SVD == :numo
+ total_words = vec.sum.to_f
+ elsif $SVD == :gsl
  sum = 0.0
  vec.each { |v| sum += v }
  total_words = sum
@@ -59,7 +69,7 @@ module ClassifierReborn
 
  total_unique_words = 0
 
- if $GSL
+ if [:numo, :gsl].include?($SVD)
  vec.each { |word| total_unique_words += 1 if word != 0.0 }
  else
  total_unique_words = vec.count { |word| word != 0 }
@@ -83,12 +93,15 @@ module ClassifierReborn
  hash[val] = Math.log(val + 1) / -weighted_total
  end
 
- vec.collect! do |val|
+ vec = vec.map do |val|
  cached_calcs[val]
  end
  end
 
- if $GSL
+ if $SVD == :numo
+ @raw_norm = vec / Numo::Linalg.norm(vec)
+ @raw_vector = vec
+ elsif $SVD == :gsl
  @raw_norm = vec.normalize
  @raw_vector = vec
  else
lib/classifier-reborn/lsi/summarizer.rb CHANGED
@@ -1,3 +1,5 @@
+ # frozen_string_literal: true
+
  # Author:: Lucas Carlson (mailto:lucas@rufy.com)
  # Copyright:: Copyright (c) 2005 Lucas Carlson
  # License:: LGPL
@@ -27,7 +29,7 @@ module ClassifierReborn
  chunks.each { |chunk| lsi << chunk unless chunk.strip.empty? || chunk.strip.split.size == 1 }
  lsi.build_index
  summaries = lsi.highest_relative_content count
- summaries.reject { |chunk| !summaries.include? chunk }.map(&:strip).join(separator)
+ summaries.select { |chunk| summaries.include? chunk }.map(&:strip).join(separator)
  end
  end
  end
lib/classifier-reborn/lsi/word_list.rb CHANGED
@@ -1,3 +1,5 @@
+ # frozen_string_literal: true
+
  # Author:: David Fayram (mailto:dfayram@lensmen.net)
  # Copyright:: Copyright (c) 2005 David Fayram II
  # License:: LGPL
lib/classifier-reborn/lsi.rb CHANGED
@@ -1,23 +1,39 @@
+ # frozen_string_literal: true
+
  # Author:: David Fayram (mailto:dfayram@lensmen.net)
  # Copyright:: Copyright (c) 2005 David Fayram II
  # License:: LGPL
 
+ # Try to load Numo first - it's the most current and the most well-supported.
+ # Fall back to GSL.
+ # Fall back to native vector.
  begin
  raise LoadError if ENV['NATIVE_VECTOR'] == 'true' # to test the native vector class, try `rake test NATIVE_VECTOR=true`
+ raise LoadError if ENV['GSL'] == 'true' # to test with gsl, try `rake test GSL=true`
 
- require 'gsl' # requires https://github.com/SciRuby/rb-gsl
- require_relative 'extensions/vector_serialize'
- $GSL = true
-
+ require 'numo/narray' # https://ruby-numo.github.io/narray/
+ require 'numo/linalg' # https://ruby-numo.github.io/linalg/
+ $SVD = :numo
  rescue LoadError
- $GSL = false
- require_relative 'extensions/vector'
+ begin
+ raise LoadError if ENV['NATIVE_VECTOR'] == 'true' # to test the native vector class, try `rake test NATIVE_VECTOR=true`
+
+ require 'gsl' # requires https://github.com/SciRuby/rb-gsl
+ require_relative 'extensions/vector_serialize'
+ $SVD = :gsl
+ rescue LoadError
+ $SVD = :ruby
+ require_relative 'extensions/vector'
+ require_relative 'extensions/zero_vector'
+ end
  end
 
  require_relative 'lsi/word_list'
  require_relative 'lsi/content_node'
  require_relative 'lsi/cached_content_node'
  require_relative 'lsi/summarizer'
+ require_relative 'extensions/token_filter/stopword'
+ require_relative 'extensions/token_filter/symbol'
 
  module ClassifierReborn
  # This class implements a Latent Semantic Indexer, which can search, classify and cluster
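With this cascade, the chosen linear-algebra backend is recorded in the global `$SVD` (`:numo`, `:gsl`, or `:ruby`) instead of the old `$GSL` boolean, and all the branching below keys off it. A sketch of opting in to the accelerated path (gem names as referenced in the requires above):

    # Gemfile - optional, for the Numo-accelerated SVD:
    # gem 'numo-narray'
    # gem 'numo-linalg'

    require 'classifier-reborn'
    $SVD # => :numo if Numo loaded, :gsl if rb-gsl loaded, :ruby otherwise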
@@ -40,6 +56,11 @@ module ClassifierReborn
  @version = 0
  @built_at_version = -1
  @language = options[:language] || 'en'
+ @token_filters = [
+ TokenFilter::Stopword,
+ TokenFilter::Symbol
+ ]
+ TokenFilter::Stopword.language = @language
  extend CachedContentNode::InstanceMethods if @cache_node_vectors = options[:cache_node_vectors]
  end
 
@@ -64,7 +85,8 @@ module ClassifierReborn
  # lsi.add_item ar, *ar.categories { |x| ar.content }
  #
  def add_item(item, *categories, &block)
- clean_word_hash = Hasher.clean_word_hash((block ? block.call(item) : item.to_s), @language)
+ clean_word_hash = Hasher.word_hash((block ? yield(item) : item.to_s),
+ token_filters: @token_filters)
  if clean_word_hash.empty?
  puts "Input: '#{item}' is entirely stopwords or words with 2 or fewer characters. Classifier-Reborn cannot handle this document properly."
  else
@@ -124,12 +146,21 @@ module ClassifierReborn
  # turning the LSI class into a simple vector search engine.
  def build_index(cutoff = 0.75)
  return unless needs_rebuild?
+
  make_word_list
 
  doc_list = @items.values
  tda = doc_list.collect { |node| node.raw_vector_with(@word_list) }
 
- if $GSL
+ if $SVD == :numo
+ tdm = Numo::NArray.asarray(tda.map(&:to_a)).transpose
+ ntdm = numo_build_reduced_matrix(tdm, cutoff)
+
+ ntdm.each_over_axis(1).with_index do |col_vec, i|
+ doc_list[i].lsi_vector = col_vec
+ doc_list[i].lsi_norm = col_vec / Numo::Linalg.norm(col_vec)
+ end
+ elsif $SVD == :gsl
  tdm = GSL::Matrix.alloc(*tda).trans
  ntdm = build_reduced_matrix(tdm, cutoff)
 
@@ -142,9 +173,13 @@ module ClassifierReborn
  tdm = Matrix.rows(tda).trans
  ntdm = build_reduced_matrix(tdm, cutoff)
 
- ntdm.row_size.times do |col|
+ ntdm.column_size.times do |col|
  doc_list[col].lsi_vector = ntdm.column(col) if doc_list[col]
- doc_list[col].lsi_norm = ntdm.column(col).normalize if doc_list[col]
+ if ntdm.column(col).zero?
+ doc_list[col].lsi_norm = ntdm.column(col) if doc_list[col]
+ else
+ doc_list[col].lsi_norm = ntdm.column(col).normalize if doc_list[col]
+ end
  end
  end
 
@@ -186,11 +221,13 @@ module ClassifierReborn
  content_node = node_for_content(doc, &block)
  result =
  @items.keys.collect do |item|
- if $GSL
- val = content_node.search_vector * @items[item].transposed_search_vector
- else
- val = (Matrix[content_node.search_vector] * @items[item].search_vector)[0]
- end
+ val = if $SVD == :numo
+ content_node.search_vector.dot(@items[item].transposed_search_vector)
+ elsif $SVD == :gsl
+ content_node.search_vector * @items[item].transposed_search_vector
+ else
+ (Matrix[content_node.search_vector] * @items[item].search_vector)[0]
+ end
  [item, val]
  end
  result.sort_by { |x| x[1] }.reverse
@@ -205,7 +242,8 @@ module ClassifierReborn
  return [] if needs_rebuild?
 
  content_node = node_for_content(doc, &block)
- if $GSL && content_node.raw_norm.isnan?.all?
+ if ($SVD == :gsl && content_node.raw_norm.isnan?.all?) ||
+ ($SVD == :numo && content_node.raw_norm.isnan.all?)
  puts "There are no documents that are similar to #{doc}"
  else
  content_node_norms(content_node)
@@ -215,11 +253,13 @@ module ClassifierReborn
  def content_node_norms(content_node)
  result =
  @items.keys.collect do |item|
- if $GSL
- val = content_node.search_norm * @items[item].search_norm.col
- else
- val = (Matrix[content_node.search_norm] * @items[item].search_norm)[0]
- end
+ val = if $SVD == :numo
+ content_node.search_norm.dot(@items[item].search_norm)
+ elsif $SVD == :gsl
+ content_node.search_norm * @items[item].search_norm.col
+ else
+ (Matrix[content_node.search_norm] * @items[item].search_norm)[0]
+ end
  [item, val]
  end
  result.sort_by { |x| x[1] }.reverse
@@ -234,6 +274,7 @@ module ClassifierReborn
  # it is actually the same algorithm, just applied on a smaller document.
  def search(string, max_nearest = 3)
  return [] if needs_rebuild?
+
  carry = proximity_norms_for_content(string)
  unless carry.nil?
  result = carry.collect { |x| x[0] }
@@ -295,6 +336,7 @@ module ClassifierReborn
  # it's supposed to.
  def highest_ranked_stems(doc, count = 3)
  raise 'Requested stem ranking on non-indexed content!' unless @items[doc]
+
  content_vector_array = node_for_content(doc).lsi_vector.to_a
  top_n = content_vector_array.sort.reverse[0..count - 1]
  top_n.collect { |x| @word_list.word_for_index(content_vector_array.index(x)) }
@@ -315,14 +357,28 @@ module ClassifierReborn
  s[ord] = 0.0 if s[ord] < s_cutoff
  end
  # Reconstruct the term document matrix, only with reduced rank
- u * ($GSL ? GSL::Matrix : ::Matrix).diag(s) * v.trans
+ u * ($SVD == :gsl ? GSL::Matrix : ::Matrix).diag(s) * v.trans
+ end
+
+ def numo_build_reduced_matrix(matrix, cutoff = 0.75)
+ s, u, vt = Numo::Linalg.svd(matrix, driver: 'svd', job: 'S')
+
+ # TODO: Better than 75% term (as above)
+ s_cutoff = s.sort.reverse[(s.size * cutoff).round - 1]
+ s.size.times do |ord|
+ s[ord] = 0.0 if s[ord] < s_cutoff
+ end
+
+ # Reconstruct the term document matrix, only with reduced rank
+ u.dot(::Numo::DFloat.eye(s.size) * s).dot(vt)
  end
 
  def node_for_content(item, &block)
  if @items[item]
  return @items[item]
  else
- clean_word_hash = Hasher.clean_word_hash((block ? block.call(item) : item.to_s), @language)
+ clean_word_hash = Hasher.word_hash((block ? yield(item) : item.to_s),
+ token_filters: @token_filters)
 
  content_node = ContentNode.new(clean_word_hash, &block) # make the node and extract the data
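LSI's public API is unchanged by all of this; the token filters and the `$SVD` branching are internal. A usage sketch against the new version (documents and categories are illustrative):

    require 'classifier-reborn'

    lsi = ClassifierReborn::LSI.new(cache_node_vectors: true)
    lsi.add_item 'This text deals with dogs. Dogs.', :dog
    lsi.add_item 'This text involves cats. Cats.', :cat
    lsi.build_index
    lsi.search('dog', 3)                       # ranked related documents
    lsi.classify 'This text is about canines.' # => :dog, ideally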
lib/classifier-reborn/validators/classifier_validator.rb CHANGED
@@ -1,10 +1,11 @@
+ # frozen_string_literal: true
+
  module ClassifierReborn
  module ClassifierValidator
-
  module_function
 
- def cross_validate(classifier, sample_data, fold=10, *options)
- classifier = ClassifierReborn::const_get(classifier).new(options) if classifier.is_a?(String)
+ def cross_validate(classifier, sample_data, fold = 10, *options)
+ classifier = ClassifierReborn.const_get(classifier).new(options) if classifier.is_a?(String)
  sample_data.shuffle!
  partition_size = sample_data.length / fold
  partitioned_data = sample_data.each_slice(partition_size)
@@ -14,13 +15,13 @@ module ClassifierReborn
  test_data = training_data.slice!(i)
  conf_mats << validate(classifier, training_data.flatten!(1), test_data)
  end
- classifier.reset()
+ classifier.reset
  generate_report(conf_mats)
  end
 
  def validate(classifier, training_data, test_data, *options)
- classifier = ClassifierReborn::const_get(classifier).new(options) if classifier.is_a?(String)
- classifier.reset()
+ classifier = ClassifierReborn.const_get(classifier).new(options) if classifier.is_a?(String)
+ classifier.reset
  training_data.each do |rec|
  classifier.train(rec.first, rec.last)
  end
@@ -40,25 +41,25 @@ module ClassifierReborn
  def generate_report(*conf_mats)
  conf_mats.flatten!
  accumulated_conf_mat = conf_mats.length == 1 ? conf_mats.first : empty_conf_mat(conf_mats.first.keys.sort)
- header = "Run Total Correct Incorrect Accuracy"
+ header = 'Run Total Correct Incorrect Accuracy'
  puts
- puts " Run Report ".center(header.length, "-")
+ puts ' Run Report '.center(header.length, '-')
  puts header
- puts "-" * header.length
+ puts '-' * header.length
  if conf_mats.length > 1
  conf_mats.each_with_index do |conf_mat, i|
  run_report = build_run_report(conf_mat)
- print_run_report(run_report, i+1)
+ print_run_report(run_report, i + 1)
  conf_mat.each do |actual, cols|
  cols.each do |predicted, v|
  accumulated_conf_mat[actual][predicted] += v
  end
  end
  end
- puts "-" * header.length
+ puts '-' * header.length
  end
  run_report = build_run_report(accumulated_conf_mat)
- print_run_report(run_report, "All")
+ print_run_report(run_report, 'All')
  puts
  print_conf_mat(accumulated_conf_mat)
  puts
@@ -78,11 +79,11 @@ module ClassifierReborn
  end
  end
  total = correct + incorrect
- {total: total, correct: correct, incorrect: incorrect, accuracy: divide(correct, total)}
+ { total: total, correct: correct, incorrect: incorrect, accuracy: divide(correct, total) }
  end
 
  def conf_mat_to_tab(conf_mat)
- conf_tab = Hash.new {|h, k| h[k] = {p: {t: 0, f: 0}, n: {t: 0, f: 0}}}
+ conf_tab = Hash.new { |h, k| h[k] = { p: { t: 0, f: 0 }, n: { t: 0, f: 0 } } }
  conf_mat.each_key do |positive|
  conf_mat.each do |actual, cols|
  cols.each do |predicted, v|
@@ -93,32 +94,32 @@ module ClassifierReborn
  conf_tab
  end
 
- def print_run_report(stats, prefix="", print_header=false)
- puts "#{"Run".rjust([3, prefix.length].max)} Total Correct Incorrect Accuracy" if print_header
+ def print_run_report(stats, prefix = '', print_header = false)
+ puts "#{'Run'.rjust([3, prefix.length].max)} Total Correct Incorrect Accuracy" if print_header
  puts "#{prefix.to_s.rjust(3)} #{stats[:total].to_s.rjust(9)} #{stats[:correct].to_s.rjust(9)} #{stats[:incorrect].to_s.rjust(9)} #{stats[:accuracy].round(5).to_s.ljust(7, '0').rjust(9)}"
  end
 
  def print_conf_mat(conf_mat)
- header = ["Predicted ->"] + conf_mat.keys + ["Total", "Recall"]
+ header = ['Predicted ->'] + conf_mat.keys + %w[Total Recall]
  cell_size = header.map(&:length).max
- header = header.map{|h| h.rjust(cell_size)}.join(" ")
- puts " Confusion Matrix ".center(header.length, "-")
+ header = header.map { |h| h.rjust(cell_size) }.join(' ')
+ puts ' Confusion Matrix '.center(header.length, '-')
  puts header
- puts "-" * header.length
- predicted_totals = conf_mat.keys.map{|predicted| [predicted, 0]}.to_h
+ puts '-' * header.length
+ predicted_totals = conf_mat.keys.map { |predicted| [predicted, 0] }.to_h
  correct = 0
  conf_mat.each do |k, rec|
  actual_total = rec.values.reduce(:+)
- puts ([k.ljust(cell_size)] + rec.values.map{|v| v.to_s.rjust(cell_size)} + [actual_total.to_s.rjust(cell_size), divide(rec[k], actual_total).round(5).to_s.rjust(cell_size)]).join(" ")
+ puts ([k.ljust(cell_size)] + rec.values.map { |v| v.to_s.rjust(cell_size) } + [actual_total.to_s.rjust(cell_size), divide(rec[k], actual_total).round(5).to_s.rjust(cell_size)]).join(' ')
  rec.each do |cat, val|
  predicted_totals[cat] += val
  correct += val if cat == k
  end
  end
  total = predicted_totals.values.reduce(:+)
- puts "-" * header.length
- puts (["Total".ljust(cell_size)] + predicted_totals.values.map{|v| v.to_s.rjust(cell_size)} + [total.to_s.rjust(cell_size), "".rjust(cell_size)]).join(" ")
- puts (["Precision".ljust(cell_size)] + predicted_totals.keys.map{|k| divide(conf_mat[k][k], predicted_totals[k]).round(5).to_s.rjust(cell_size)} + ["Accuracy ->".rjust(cell_size), divide(correct, total).round(5).to_s.rjust(cell_size)]).join(" ")
+ puts '-' * header.length
+ puts (['Total'.ljust(cell_size)] + predicted_totals.values.map { |v| v.to_s.rjust(cell_size) } + [total.to_s.rjust(cell_size), ''.rjust(cell_size)]).join(' ')
+ puts (['Precision'.ljust(cell_size)] + predicted_totals.keys.map { |k| divide(conf_mat[k][k], predicted_totals[k]).round(5).to_s.rjust(cell_size) } + ['Accuracy ->'.rjust(cell_size), divide(correct, total).round(5).to_s.rjust(cell_size)]).join(' ')
  end
 
  def print_conf_tab(conf_tab)
@@ -135,31 +136,31 @@ module ClassifierReborn
  negatives = tab[:n][:t] + tab[:p][:f]
  total = positives + negatives
  {
- total_population: positives + negatives,
+ total_population: positives + negatives,
  condition_positive: positives,
  condition_negative: negatives,
- true_positive: tab[:p][:t],
- true_negative: tab[:n][:t],
- false_positive: tab[:p][:f],
- false_negative: tab[:n][:f],
- prevalence: divide(positives, total),
- specificity: divide(tab[:n][:t], negatives),
- recall: divide(tab[:p][:t], positives),
- precision: divide(tab[:p][:t], tab[:p][:t] + tab[:p][:f]),
- accuracy: divide(tab[:p][:t] + tab[:n][:t], total),
- f1_score: divide(2 * tab[:p][:t], 2 * tab[:p][:t] + tab[:p][:f] + tab[:n][:f])
+ true_positive: tab[:p][:t],
+ true_negative: tab[:n][:t],
+ false_positive: tab[:p][:f],
+ false_negative: tab[:n][:f],
+ prevalence: divide(positives, total),
+ specificity: divide(tab[:n][:t], negatives),
+ recall: divide(tab[:p][:t], positives),
+ precision: divide(tab[:p][:t], tab[:p][:t] + tab[:p][:f]),
+ accuracy: divide(tab[:p][:t] + tab[:n][:t], total),
+ f1_score: divide(2 * tab[:p][:t], 2 * tab[:p][:t] + tab[:p][:f] + tab[:n][:f])
  }
  end
 
  def print_derivations(derivations)
  max_len = derivations.keys.map(&:length).max
  derivations.each do |k, v|
- puts k.to_s.tr('_', ' ').capitalize.ljust(max_len) + " : " + v.to_s
+ puts k.to_s.tr('_', ' ').capitalize.ljust(max_len) + ' : ' + v.to_s
  end
  end
 
  def empty_conf_mat(categories)
- categories.map{|actual| [actual, categories.map{|predicted| [predicted, 0]}.to_h]}.to_h
+ categories.map { |actual| [actual, categories.map { |predicted| [predicted, 0] }.to_h] }.to_h
  end
 
  def divide(dividend, divisor)
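Functionally the validator is untouched (quoting, spacing, and `const_get` style only), so existing cross-validation scripts keep working. A sketch, with toy sample data shaped as [category, text] pairs as `validate` expects; `auto_categorize: true` is passed through to `Bayes.new` so training can introduce categories on the fly:

    require 'classifier-reborn'

    sample_data = [
      ['Interesting',   'Here are some interesting things'],
      ['Uninteresting', 'Here are some dull things']
      # ... more [category, text] pairs; each fold needs enough rows to train on
    ] * 20

    ClassifierReborn::ClassifierValidator.cross_validate(
      'Bayes', sample_data, 5, auto_categorize: true
    )
    # prints a per-run report and an accumulated confusion matrix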
lib/classifier-reborn/version.rb CHANGED
@@ -1,3 +1,5 @@
+ # frozen_string_literal: true
+
  module ClassifierReborn
- VERSION = '2.2.0'
+ VERSION = '2.3.0'
  end
lib/classifier-reborn.rb CHANGED
@@ -1,3 +1,5 @@
+ # frozen_string_literal: true
+
  #--
  # Copyright (c) 2005 Lucas Carlson
  #
@@ -36,4 +38,4 @@ end
  require_relative 'classifier-reborn/category_namer'
  require_relative 'classifier-reborn/bayes'
  require_relative 'classifier-reborn/lsi'
- require_relative 'classifier-reborn/validators/classifier_validator'
+ require_relative 'classifier-reborn/validators/classifier_validator'
metadata CHANGED
@@ -1,16 +1,16 @@
  --- !ruby/object:Gem::Specification
  name: classifier-reborn
  version: !ruby/object:Gem::Version
- version: 2.2.0
+ version: 2.3.0
  platform: ruby
  authors:
  - Lucas Carlson
  - Parker Moore
  - Chase Gilliam
- autorequire:
+ autorequire:
  bindir: bin
  cert_chain: []
- date: 2017-12-15 00:00:00.000000000 Z
+ date: 2022-06-21 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: fast-stemmer
@@ -27,7 +27,21 @@ dependencies:
  - !ruby/object:Gem::Version
  version: '1.0'
  - !ruby/object:Gem::Dependency
- name: rake
+ name: matrix
+ requirement: !ruby/object:Gem::Requirement
+ requirements:
+ - - "~>"
+ - !ruby/object:Gem::Version
+ version: '0.4'
+ type: :runtime
+ prerelease: false
+ version_requirements: !ruby/object:Gem::Requirement
+ requirements:
+ - - "~>"
+ - !ruby/object:Gem::Version
+ version: '0.4'
+ - !ruby/object:Gem::Dependency
+ name: minitest
  requirement: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
@@ -41,7 +55,7 @@ dependencies:
  - !ruby/object:Gem::Version
  version: '0'
  - !ruby/object:Gem::Dependency
- name: rdoc
+ name: minitest-reporters
  requirement: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
@@ -55,7 +69,7 @@ dependencies:
  - !ruby/object:Gem::Version
  version: '0'
  - !ruby/object:Gem::Dependency
- name: minitest
+ name: pry
  requirement: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
@@ -69,7 +83,7 @@ dependencies:
  - !ruby/object:Gem::Version
  version: '0'
  - !ruby/object:Gem::Dependency
- name: minitest-reporters
+ name: rake
  requirement: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
@@ -83,7 +97,7 @@ dependencies:
  - !ruby/object:Gem::Version
  version: '0'
  - !ruby/object:Gem::Dependency
- name: rubocop
+ name: rdoc
  requirement: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
@@ -97,7 +111,7 @@ dependencies:
  - !ruby/object:Gem::Version
  version: '0'
  - !ruby/object:Gem::Dependency
- name: pry
+ name: redis
  requirement: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
@@ -111,7 +125,7 @@ dependencies:
  - !ruby/object:Gem::Version
  version: '0'
  - !ruby/object:Gem::Dependency
- name: redis
+ name: rubocop
  requirement: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
@@ -124,7 +138,7 @@ dependencies:
  - - ">="
  - !ruby/object:Gem::Version
  version: '0'
- description:
+ description:
  email:
  - lucas@rufy.com
  - parkrmoore@gmail.com
@@ -167,8 +181,14 @@ files:
  - lib/classifier-reborn/bayes.rb
  - lib/classifier-reborn/category_namer.rb
  - lib/classifier-reborn/extensions/hasher.rb
+ - lib/classifier-reborn/extensions/token_filter/stemmer.rb
+ - lib/classifier-reborn/extensions/token_filter/stopword.rb
+ - lib/classifier-reborn/extensions/token_filter/symbol.rb
+ - lib/classifier-reborn/extensions/tokenizer/token.rb
+ - lib/classifier-reborn/extensions/tokenizer/whitespace.rb
  - lib/classifier-reborn/extensions/vector.rb
  - lib/classifier-reborn/extensions/vector_serialize.rb
+ - lib/classifier-reborn/extensions/zero_vector.rb
  - lib/classifier-reborn/lsi.rb
  - lib/classifier-reborn/lsi/cached_content_node.rb
  - lib/classifier-reborn/lsi/content_node.rb
@@ -176,11 +196,11 @@ files:
  - lib/classifier-reborn/lsi/word_list.rb
  - lib/classifier-reborn/validators/classifier_validator.rb
  - lib/classifier-reborn/version.rb
- homepage: https://github.com/jekyll/classifier-reborn
+ homepage: https://jekyll.github.io/classifier-reborn/
  licenses:
  - LGPL
  metadata: {}
- post_install_message:
+ post_install_message:
  rdoc_options:
  - "--charset=UTF-8"
  require_paths:
@@ -189,17 +209,15 @@ required_ruby_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
  - !ruby/object:Gem::Version
- version: 1.9.3
+ version: 2.4.0
  required_rubygems_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
  - !ruby/object:Gem::Version
  version: '0'
  requirements: []
- rubyforge_project:
- rubygems_version: 2.6.14
- signing_key:
+ rubygems_version: 3.3.7
+ signing_key:
  specification_version: 2
  summary: A general classifier module to allow Bayesian and other types of classifications.
  test_files: []
- has_rdoc: true