classifier-reborn 2.0.4 → 2.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. checksums.yaml +5 -5
  2. data/LICENSE +74 -1
  3. data/README.markdown +57 -207
  4. data/data/stopwords/ar +104 -0
  5. data/data/stopwords/bn +362 -0
  6. data/data/stopwords/hi +97 -0
  7. data/data/stopwords/ja +43 -0
  8. data/data/stopwords/ru +420 -0
  9. data/data/stopwords/tr +199 -30
  10. data/data/stopwords/vi +647 -0
  11. data/data/stopwords/zh +125 -0
  12. data/lib/classifier-reborn/backends/bayes_memory_backend.rb +77 -0
  13. data/lib/classifier-reborn/backends/bayes_redis_backend.rb +109 -0
  14. data/lib/classifier-reborn/backends/no_redis_error.rb +14 -0
  15. data/lib/classifier-reborn/bayes.rb +141 -65
  16. data/lib/classifier-reborn/category_namer.rb +6 -4
  17. data/lib/classifier-reborn/extensions/hasher.rb +22 -39
  18. data/lib/classifier-reborn/extensions/token_filter/stemmer.rb +24 -0
  19. data/lib/classifier-reborn/extensions/token_filter/stopword.rb +48 -0
  20. data/lib/classifier-reborn/extensions/token_filter/symbol.rb +20 -0
  21. data/lib/classifier-reborn/extensions/tokenizer/token.rb +36 -0
  22. data/lib/classifier-reborn/extensions/tokenizer/whitespace.rb +28 -0
  23. data/lib/classifier-reborn/extensions/vector.rb +35 -28
  24. data/lib/classifier-reborn/extensions/vector_serialize.rb +10 -10
  25. data/lib/classifier-reborn/extensions/zero_vector.rb +7 -0
  26. data/lib/classifier-reborn/lsi/cached_content_node.rb +6 -5
  27. data/lib/classifier-reborn/lsi/content_node.rb +35 -25
  28. data/lib/classifier-reborn/lsi/summarizer.rb +7 -5
  29. data/lib/classifier-reborn/lsi/word_list.rb +5 -6
  30. data/lib/classifier-reborn/lsi.rb +166 -94
  31. data/lib/classifier-reborn/validators/classifier_validator.rb +170 -0
  32. data/lib/classifier-reborn/version.rb +3 -1
  33. data/lib/classifier-reborn.rb +12 -1
  34. metadata +98 -17
  35. data/bin/bayes.rb +0 -36
  36. data/bin/summarize.rb +0 -16
# frozen_string_literal: true

# Author:: Lucas Carlson (mailto:lucas@rufy.com)
# Copyright:: Copyright (c) 2005 Lucas Carlson
# License:: LGPL

module ClassifierReborn
  module TokenFilter
    # This filter removes stopwords in the language, from given tokens.
    # Tokens of two characters or fewer are also treated as stopwords.
    module Stopword
      # Search path(s) for per-language stopword files. Custom paths added
      # via add_custom_stopword_path take precedence (they are prepended).
      STOPWORDS_PATH = [File.expand_path("#{File.dirname(__FILE__)}/../../../../data/stopwords")]
      @language = 'en'

      module_function

      # Rejects tokens that may be stopwords (per token.maybe_stopword?)
      # and are either very short or listed in the current language's set.
      def call(tokens)
        tokens.reject do |token|
          token.maybe_stopword? &&
            (token.length <= 2 || STOPWORDS[@language].include?(token))
        end
      end

      # Add custom path to a new stopword file created by user
      def add_custom_stopword_path(path)
        STOPWORDS_PATH.unshift(path)
      end

      # Create a lazily-loaded hash of stopword data, keyed by language code.
      # The first matching file on STOPWORDS_PATH wins; languages with no
      # stopword file resolve to an empty collection.
      STOPWORDS = Hash.new do |hash, language|
        hash[language] = []

        STOPWORDS_PATH.each do |path|
          # Fix: normalize the language code to a String once. The original
          # passed `language` (possibly a Symbol) to File.join in the
          # existence check but `language.to_s` in the read, so a Symbol
          # language raised TypeError before the file could be loaded.
          file = File.join(path, language.to_s)
          next unless File.exist?(file)

          hash[language] = Set.new(File.read(file).force_encoding('utf-8').split)
          break
        end

        hash[language]
      end

      # Changes the language of stopwords
      def language=(language)
        @language = language
      end
    end
  end
end
# frozen_string_literal: true

# Author:: Lucas Carlson (mailto:lucas@rufy.com)
# Copyright:: Copyright (c) 2005 Lucas Carlson
# License:: LGPL

module ClassifierReborn
  module TokenFilter
    # This filter removes symbol-only terms, from given tokens.
    module Symbol
      # Matches any character that is neither whitespace nor a word character.
      SYMBOL_CHAR = /[^\s\p{WORD}]/.freeze

      module_function

      # Returns the tokens with terms containing symbol characters removed.
      def call(tokens)
        tokens.reject { |token| token.match?(SYMBOL_CHAR) }
      end
    end
  end
end
# frozen_string_literal: true

# Author:: Lucas Carlson (mailto:lucas@rufy.com)
# Copyright:: Copyright (c) 2005 Lucas Carlson
# License:: LGPL

module ClassifierReborn
  module Tokenizer
    # A single token: a String augmented with tokenizer metadata. E.g.,
    #   t = ClassifierReborn::Tokenizer::Token.new 'Tokenize', stemmable: true, maybe_stopword: false
    #
    # Attributes available are:
    #   stemmable:      possibility that the token can be stemmed; must be
    #                   false for un-stemmable terms, otherwise true.
    #   maybe_stopword: possibility that the token is a stopword; must be
    #                   false for terms that can never be stopwords,
    #                   otherwise true.
    class Token < String
      def initialize(string, stemmable: true, maybe_stopword: true)
        super(string)
        @stemmable = stemmable
        @maybe_stopword = maybe_stopword
      end

      # True when the token is a candidate for stemming.
      def stemmable?
        @stemmable
      end

      # True when the token could be a stopword.
      def maybe_stopword?
        @maybe_stopword
      end

      # Returns a new Token wrapping the stemmed form, carrying the original
      # metadata over. NOTE(review): relies on String#stem being provided by
      # a stemmer library loaded elsewhere in the gem.
      def stem
        self.class.new(super, stemmable: @stemmable, maybe_stopword: @maybe_stopword)
      end
    end
  end
end
# frozen_string_literal: true

# Author:: Lucas Carlson (mailto:lucas@rufy.com)
# Copyright:: Copyright (c) 2005 Lucas Carlson
# License:: LGPL

require_relative 'token'

module ClassifierReborn
  module Tokenizer
    # This tokenizes given input as white-space separated terms.
    # It mainly aims to tokenize sentences written with a space between
    # words, like English, French, and others. Symbol characters are emitted
    # as separate one-character tokens that are neither stemmable nor
    # stopword candidates.
    module Whitespace
      module_function

      def call(str)
        # Word terms: strip symbol characters, downcase, split on whitespace.
        word_tokens = str.gsub(/[^\p{WORD}\s]/, '').downcase.split.map do |term|
          Token.new(term, stemmable: true, maybe_stopword: true)
        end
        # Symbol terms: every non-word, non-space character on its own.
        symbol_tokens = str.scan(/[^\s\p{WORD}]/).map do |char|
          Token.new(char, stemmable: false, maybe_stopword: false)
        end
        word_tokens + symbol_tokens
      end
    end
  end
end
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  # Author:: Ernest Ellingson
2
4
  # Copyright:: Copyright (c) 2005
3
5
 
@@ -6,73 +8,78 @@
6
8
  require 'matrix'
7
9
 
8
10
  class Matrix
9
- def Matrix.diag(s)
10
- Matrix.diagonal(*s)
11
+ def self.diag(s)
12
+ Matrix.diagonal(*s)
11
13
  end
12
14
 
13
- alias :trans :transpose
15
+ alias trans transpose
14
16
 
15
17
  def SV_decomp(maxSweeps = 20)
16
- if self.row_size >= self.column_size
17
- q = self.trans * self
18
- else
19
- q = self * self.trans
20
- end
18
+ q = if row_size >= column_size
19
+ trans * self
20
+ else
21
+ self * trans
22
+ end
21
23
 
22
24
  qrot = q.dup
23
25
  v = Matrix.identity(q.row_size)
24
26
  mzrot = nil
25
27
  cnt = 0
26
28
  s_old = nil
27
- mu = nil
28
29
 
29
- while true do
30
+ loop do
30
31
  cnt += 1
31
- for row in (0...qrot.row_size-1) do
32
- for col in (1..qrot.row_size-1) do
32
+ (0...qrot.row_size - 1).each do |row|
33
+ (1..qrot.row_size - 1).each do |col|
33
34
  next if row == col
34
- h = Math.atan((2 * qrot[row,col])/(qrot[row,row]-qrot[col,col]))/2.0
35
+
36
+ h = if (2.0 * qrot[row, col]) == (qrot[row, row] - qrot[col, col])
37
+ Math.atan(1) / 2.0
38
+ else
39
+ Math.atan((2.0 * qrot[row, col]) / (qrot[row, row] - qrot[col, col])) / 2.0
40
+ end
35
41
  hcos = Math.cos(h)
36
42
  hsin = Math.sin(h)
37
43
  mzrot = Matrix.identity(qrot.row_size)
38
- mzrot[row,row] = hcos
39
- mzrot[row,col] = -hsin
40
- mzrot[col,row] = hsin
41
- mzrot[col,col] = hcos
44
+ mzrot[row, row] = hcos
45
+ mzrot[row, col] = -hsin
46
+ mzrot[col, row] = hsin
47
+ mzrot[col, col] = hcos
42
48
  qrot = mzrot.trans * qrot * mzrot
43
- v = v * mzrot
49
+ v *= mzrot
44
50
  end
45
51
  end
46
52
  s_old = qrot.dup if cnt == 1
47
53
  sum_qrot = 0.0
48
54
  if cnt > 1
49
55
  qrot.row_size.times do |r|
50
- sum_qrot += (qrot[r,r]-s_old[r,r]).abs if (qrot[r,r]-s_old[r,r]).abs > 0.001
56
+ sum_qrot += (qrot[r, r] - s_old[r, r]).abs if (qrot[r, r] - s_old[r, r]).abs > 0.001
51
57
  end
52
58
  s_old = qrot.dup
53
59
  end
54
- break if (sum_qrot <= 0.001 and cnt > 1) or cnt >= maxSweeps
60
+ break if (sum_qrot <= 0.001 && cnt > 1) || cnt >= maxSweeps
55
61
  end # of do while true
56
62
  s = []
57
63
  qrot.row_size.times do |r|
58
- s << Math.sqrt(qrot[r,r])
64
+ s << Math.sqrt(qrot[r, r])
59
65
  end
60
- #puts "cnt = #{cnt}"
61
- if self.row_size >= self.column_size
62
- mu = self * v * Matrix.diagonal(*s).inverse
66
+ # puts "cnt = #{cnt}"
67
+ if row_size >= column_size
68
+ mu = self * v * Matrix.diagonal(*s).inverse
63
69
  return [mu, v, s]
64
70
  else
65
71
  puts v.row_size
66
72
  puts v.column_size
67
- puts self.row_size
68
- puts self.column_size
73
+ puts row_size
74
+ puts column_size
69
75
  puts s.size
70
76
 
71
- mu = (self.trans * v * Matrix.diagonal(*s).inverse)
77
+ mu = (trans * v * Matrix.diagonal(*s).inverse)
72
78
  return [mu, v, s]
73
79
  end
74
80
  end
75
- def []=(i,j,val)
81
+
82
+ def []=(i, j, val)
76
83
  @rows[i][j] = val
77
84
  end
78
85
  end
@@ -1,20 +1,20 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module GSL
2
-
3
4
  class Vector
4
- def _dump(v)
5
- Marshal.dump( self.to_a )
5
+ def _dump(_v)
6
+ Marshal.dump(to_a)
6
7
  end
7
-
8
+
8
9
  def self._load(arr)
9
10
  arry = Marshal.load(arr)
10
- return GSL::Vector.alloc(arry)
11
+ GSL::Vector.alloc(arry)
11
12
  end
12
-
13
13
  end
14
-
14
+
15
15
  class Matrix
16
- class <<self
17
- alias :diag :diagonal
18
- end
16
+ class <<self
17
+ alias diag diagonal
18
+ end
19
19
  end
20
20
  end
# frozen_string_literal: true

# Adds a zero-vector predicate to the stdlib Vector class (require 'matrix').
class Vector
  # True when every component of the vector is zero.
  def zero?
    to_a.all? { |component| component.zero? }
  end
end
@@ -1,9 +1,10 @@
1
+ # frozen_string_literal: true
2
+
1
3
  # Author:: Kelley Reynolds (mailto:kelley@insidesystems.net)
2
4
  # Copyright:: Copyright (c) 2015 Kelley Reynolds
3
5
  # License:: LGPL
4
6
 
5
7
  module ClassifierReborn
6
-
7
8
  # Subclass of ContentNode which caches the search_vector transpositions.
8
9
  # Its great because its much faster for large indexes, but at the cost of more ram. Additionally,
9
10
  # if you Marshal your classifier and want to keep the size down, you'll need to manually
@@ -16,7 +17,7 @@ module ClassifierReborn
16
17
  end
17
18
  end
18
19
 
19
- def initialize( word_hash, *categories )
20
+ def initialize(word_hash, *categories)
20
21
  clear_cache!
21
22
  super
22
23
  end
@@ -29,13 +30,13 @@ module ClassifierReborn
29
30
  def transposed_search_vector
30
31
  @transposed_search_vector ||= super
31
32
  end
32
-
33
+
33
34
  # Clear the cache before we continue on
34
- def raw_vector_with( word_list )
35
+ def raw_vector_with(word_list)
35
36
  clear_cache!
36
37
  super
37
38
  end
38
-
39
+
39
40
  # We don't want the cached_data here
40
41
  def marshal_dump
41
42
  [@lsi_vector, @lsi_norm, @raw_vector, @raw_norm, @categories, @word_hash]
@@ -1,12 +1,13 @@
1
+ # frozen_string_literal: true
2
+
1
3
  # Author:: David Fayram (mailto:dfayram@lensmen.net)
2
4
  # Copyright:: Copyright (c) 2005 David Fayram II
3
5
  # License:: LGPL
4
6
 
5
7
  module ClassifierReborn
6
-
7
- # This is an internal data structure class for the LSI node. Save for
8
- # raw_vector_with, it should be fairly straightforward to understand.
9
- # You should never have to use it directly.
8
+ # This is an internal data structure class for the LSI node. Save for
9
+ # raw_vector_with, it should be fairly straightforward to understand.
10
+ # You should never have to use it directly.
10
11
  class ContentNode
11
12
  attr_accessor :raw_vector, :raw_norm,
12
13
  :lsi_vector, :lsi_norm,
@@ -15,7 +16,7 @@ module ClassifierReborn
15
16
  attr_reader :word_hash
16
17
  # If text_proc is not specified, the source will be duck-typed
17
18
  # via source.to_s
18
- def initialize( word_hash, *categories )
19
+ def initialize(word_hash, *categories)
19
20
  @categories = categories || []
20
21
  @word_hash = word_hash
21
22
  @lsi_norm, @lsi_vector = nil
@@ -28,7 +29,11 @@ module ClassifierReborn
28
29
 
29
30
  # Method to access the transposed search vector
30
31
  def transposed_search_vector
31
- search_vector.col
32
+ if $SVD == :numo
33
+ search_vector
34
+ else
35
+ search_vector.col
36
+ end
32
37
  end
33
38
 
34
39
  # Use this to fetch the appropriate search vector in normalized form.
@@ -38,21 +43,25 @@ module ClassifierReborn
38
43
 
39
44
  # Creates the raw vector out of word_hash using word_list as the
40
45
  # key for mapping the vector space.
41
- def raw_vector_with( word_list )
42
- if $GSL
43
- vec = GSL::Vector.alloc(word_list.size)
44
- else
45
- vec = Array.new(word_list.size, 0)
46
- end
46
+ def raw_vector_with(word_list)
47
+ vec = if $SVD == :numo
48
+ Numo::DFloat.zeros(word_list.size)
49
+ elsif $SVD == :gsl
50
+ GSL::Vector.alloc(word_list.size)
51
+ else
52
+ Array.new(word_list.size, 0)
53
+ end
47
54
 
48
55
  @word_hash.each_key do |word|
49
56
  vec[word_list[word]] = @word_hash[word] if word_list[word]
50
57
  end
51
58
 
52
59
  # Perform the scaling transform and force floating point arithmetic
53
- if $GSL
60
+ if $SVD == :numo
61
+ total_words = vec.sum.to_f
62
+ elsif $SVD == :gsl
54
63
  sum = 0.0
55
- vec.each {|v| sum += v }
64
+ vec.each { |v| sum += v }
56
65
  total_words = sum
57
66
  else
58
67
  total_words = vec.reduce(0, :+).to_f
@@ -60,10 +69,10 @@ module ClassifierReborn
60
69
 
61
70
  total_unique_words = 0
62
71
 
63
- if $GSL
72
+ if [:numo, :gsl].include?($SVD)
64
73
  vec.each { |word| total_unique_words += 1 if word != 0.0 }
65
74
  else
66
- total_unique_words = vec.count{ |word| word != 0 }
75
+ total_unique_words = vec.count { |word| word != 0 }
67
76
  end
68
77
 
69
78
  # Perform first-order association transform if this vector has more
@@ -71,9 +80,9 @@ module ClassifierReborn
71
80
  if total_words > 1.0 && total_unique_words > 1
72
81
  weighted_total = 0.0
73
82
  # Cache calculations, this takes too long on large indexes
74
- cached_calcs = Hash.new { |hash, term|
75
- hash[term] = (( term / total_words ) * Math.log( term / total_words ))
76
- }
83
+ cached_calcs = Hash.new do |hash, term|
84
+ hash[term] = ((term / total_words) * Math.log(term / total_words))
85
+ end
77
86
 
78
87
  vec.each do |term|
79
88
  weighted_total += cached_calcs[term] if term > 0.0
@@ -81,15 +90,18 @@ module ClassifierReborn
81
90
 
82
91
  # Cache calculations, this takes too long on large indexes
83
92
  cached_calcs = Hash.new do |hash, val|
84
- hash[val] = Math.log( val + 1 ) / -weighted_total
93
+ hash[val] = Math.log(val + 1) / -weighted_total
85
94
  end
86
95
 
87
- vec.collect! { |val|
96
+ vec = vec.map do |val|
88
97
  cached_calcs[val]
89
- }
98
+ end
90
99
  end
91
100
 
92
- if $GSL
101
+ if $SVD == :numo
102
+ @raw_norm = vec / Numo::Linalg.norm(vec)
103
+ @raw_vector = vec
104
+ elsif $SVD == :gsl
93
105
  @raw_norm = vec.normalize
94
106
  @raw_vector = vec
95
107
  else
@@ -97,7 +109,5 @@ module ClassifierReborn
97
109
  @raw_vector = Vector[*vec]
98
110
  end
99
111
  end
100
-
101
112
  end
102
-
103
113
  end
@@ -1,16 +1,18 @@
1
+ # frozen_string_literal: true
2
+
1
3
  # Author:: Lucas Carlson (mailto:lucas@rufy.com)
2
4
  # Copyright:: Copyright (c) 2005 Lucas Carlson
3
5
  # License:: LGPL
4
6
 
5
7
  module ClassifierReborn
6
8
  module Summarizer
7
- extend self
9
+ module_function
8
10
 
9
- def summary( str, count=10, separator=" [...] " )
11
+ def summary(str, count = 10, separator = ' [...] ')
10
12
  perform_lsi split_sentences(str), count, separator
11
13
  end
12
14
 
13
- def paragraph_summary( str, count=1, separator=" [...] " )
15
+ def paragraph_summary(str, count = 1, separator = ' [...] ')
14
16
  perform_lsi split_paragraphs(str), count, separator
15
17
  end
16
18
 
@@ -23,11 +25,11 @@ module ClassifierReborn
23
25
  end
24
26
 
25
27
  def perform_lsi(chunks, count, separator)
26
- lsi = ClassifierReborn::LSI.new :auto_rebuild => false
28
+ lsi = ClassifierReborn::LSI.new auto_rebuild: false
27
29
  chunks.each { |chunk| lsi << chunk unless chunk.strip.empty? || chunk.strip.split.size == 1 }
28
30
  lsi.build_index
29
31
  summaries = lsi.highest_relative_content count
30
- return summaries.reject { |chunk| !summaries.include? chunk }.map { |x| x.strip }.join(separator)
32
+ summaries.select { |chunk| summaries.include? chunk }.map(&:strip).join(separator)
31
33
  end
32
34
  end
33
35
  end
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  # Author:: David Fayram (mailto:dfayram@lensmen.net)
2
4
  # Copyright:: Copyright (c) 2005 David Fayram II
3
5
  # License:: LGPL
@@ -8,19 +10,17 @@ module ClassifierReborn
8
10
 
9
11
  class WordList
10
12
  def initialize
11
- @location_table = Hash.new
13
+ @location_table = {}
12
14
  end
13
15
 
14
16
  # Adds a word (if it is new) and assigns it a unique dimension.
15
17
  def add_word(word)
16
- term = word
17
- @location_table[term] = @location_table.size unless @location_table[term]
18
+ @location_table[word] = @location_table.size unless @location_table[word]
18
19
  end
19
20
 
20
21
  # Returns the dimension of the word or nil if the word is not in the space.
21
22
  def [](lookup)
22
- term = lookup
23
- @location_table[term]
23
+ @location_table[lookup]
24
24
  end
25
25
 
26
26
  def word_for_index(ind)
@@ -31,6 +31,5 @@ module ClassifierReborn
31
31
  def size
32
32
  @location_table.size
33
33
  end
34
-
35
34
  end
36
35
  end