classifier 2.1.0 → 2.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,166 @@
1
+ # rbs_inline: enabled
2
+
3
+ # rubocop:disable Naming/MethodParameterName, Metrics/ParameterLists
4
+
5
+ require 'matrix'
6
+
7
+ module Classifier
8
+ class LSI
9
+ # Brand's Incremental SVD Algorithm for LSI
10
+ #
11
+ # Implements the algorithm from Brand (2006) "Fast low-rank modifications
12
+ # of the thin singular value decomposition" for adding documents to LSI
13
+ # without full SVD recomputation.
14
+ #
15
+ # Given existing thin SVD: A ≈ U * S * V^T (with k components)
16
+ # When adding a new column c:
17
+ #
18
+ # 1. Project: m = U^T * c (project onto existing column space)
19
+ # 2. Residual: p = c - U * m (component orthogonal to U)
20
+ # 3. Orthonormalize: If ||p|| > ε: p̂ = p / ||p||
21
+ # 4. Form K matrix:
22
+ # - If ||p|| > ε: K = [diag(s), m; 0, ||p||] (rank grows by 1)
23
+ # - If ||p|| ≈ 0: K = [diag(s), m] (k × (k+1); no new direction, rank does not grow)
24
+ # 5. Small SVD: Compute SVD of K (only (k+1) × (k+1) matrix!)
25
+ # 6. Update:
26
+ # - U_new = [U, p̂] * U'
27
+ # - S_new = S'
28
+ #
29
+ module IncrementalSVD
30
+ EPSILON = 1e-10
31
+
32
+ class << self
33
# Folds one new document vector into an existing thin SVD (Brand's update).
#
# The vector is split into the part reproduced by +u+ and the residual left
# over; the residual's norm decides which update path is taken.
#
# @param u [Matrix] current left singular vectors (m × k)
# @param s [Array<Float>] current singular values (k values)
# @param c [Vector] new document vector (m × 1)
# @param max_rank [Integer] maximum rank to maintain
# @param epsilon [Float] threshold for zero detection
# @return [Array<Matrix, Array<Float>>] updated [u, s]
#
# @rbs (Matrix, Array[Float], Vector, max_rank: Integer, ?epsilon: Float) -> [Matrix, Array[Float]]
def update(u, s, c, max_rank:, epsilon: EPSILON)
  coefficients = project(u, c)
  reconstruction = u * coefficients
  # Coerce the product to a Vector when it is not one already, so it can be subtracted from c.
  reconstruction = Vector[*reconstruction.to_a.flatten] unless reconstruction.is_a?(Vector)
  residual = c - reconstruction
  residual_norm = magnitude(residual)

  # No usable residual: the document lies (numerically) inside the span of u.
  return update_in_span(u, s, coefficients, max_rank, epsilon) unless residual_norm > epsilon

  update_with_new_direction(u, s, coefficients, residual, residual_norm, max_rank, epsilon)
end
55
+
56
# Maps a term-space document vector into the semantic space spanned by U,
# i.e. computes U^T * raw_vec.
#
# @param u [Matrix] left singular vectors (m × k)
# @param raw_vec [Vector] document vector in term space (m × 1)
# @return [Vector] document in semantic space (k × 1)
#
# @rbs (Matrix, Vector) -> Vector
def project(u, raw_vec)
  basis_transposed = u.transpose
  basis_transposed * raw_vec
end
67
+
68
+ private
69
+
70
# Rank-growing update: the new document has a residual outside the span of U.
#
# The normalized residual is appended to U as an extra column, the
# (k+1) × (k+1) K matrix is decomposed, and the extended basis is rotated by
# the resulting left vectors. The result is cut back to max_rank when the
# number of singular values exceeds it.
#
# @rbs (Matrix, Array[Float], Vector, Vector, Float, Integer, Float) -> [Matrix, Array[Float]]
def update_with_new_direction(u, s, m_vec, p_vec, p_norm, max_rank, epsilon)
  unit_residual = p_vec * (1.0 / p_norm)
  rotation, singular_values = small_svd(build_k_matrix_with_growth(s, m_vec, p_norm), epsilon)
  rotated_basis = extend_matrix_with_column(u, unit_residual) * rotation

  return [rotated_basis, singular_values] unless singular_values.size > max_rank

  basis, values = truncate(rotated_basis, singular_values, max_rank)
  [basis, values]
end
83
+
84
# Update when new document is entirely within span of existing U.
#
# With no residual, Brand's K matrix is the k × (k+1) block [diag(s), m].
# Its left singular vectors and singular values are taken here from the
# symmetric k × k Gram matrix K * K^T = diag(s)^2 + m * m^T: the eigenvectors
# are the rotation applied to U, and the square roots of the eigenvalues are
# the updated singular values. Decomposing diag(s) alone would ignore m_vec
# and leave the decomposition untouched by the new document.
#
# @param u [Matrix] current left singular vectors (m × k)
# @param s [Array<Float>] current singular values (k values)
# @param m_vec [Vector] projection of the new document onto U (k values)
# @param max_rank [Integer] maximum rank to maintain
# @param epsilon [Float] singular values at or below this are dropped
# @return [Array<Matrix, Array<Float>>] updated [u, s], singular values descending
#
# @rbs (Matrix, Array[Float], Vector, Integer, Float) -> [Matrix, Array[Float]]
def update_in_span(u, s, m_vec, max_rank, epsilon)
  k = s.size
  # Nothing to rotate when there are no components yet.
  return [u, s] if k.zero?

  # m[i] * m[j] is computed identically for (i, j) and (j, i), so the matrix is
  # exactly symmetric and Matrix#eigen takes its symmetric code path.
  gram = Matrix.build(k, k) do |i, j|
    cross = m_vec[i].to_f * m_vec[j].to_f
    i == j ? (s[i].to_f * s[i].to_f) + cross : cross
  end

  decomposition = gram.eigen
  ranked = decomposition.eigenvalues.zip(decomposition.eigenvectors)
                        # Clamp at zero: rounding can push a tiny eigenvalue slightly negative.
                        .map { |value, vector| [Math.sqrt([value.real.to_f, 0.0].max), vector] }
                        .select { |sv, _| sv > epsilon }
                        .sort_by { |sv, _| -sv }

  s_prime = ranked.map(&:first)
  u_prime = ranked.empty? ? Matrix.empty(k, 0) : Matrix.columns(ranked.map { |_, vector| vector.to_a })
  u_new = u * u_prime

  u_new, s_prime = truncate(u_new, s_prime, max_rank) if s_prime.size > max_rank

  [u_new, s_prime]
end
95
+
96
# Assembles the (k+1) × (k+1) K matrix used when the rank grows by one:
#
#   [ diag(s)  m      ]
#   [ 0        p_norm ]
#
# Entries taken from s and m_vec are converted with to_f; p_norm is stored as given.
#
# @rbs (Array[Float], untyped, Float) -> untyped
def build_k_matrix_with_growth(s, m_vec, p_norm)
  k = s.size
  dimension = k + 1

  Matrix.build(dimension, dimension) do |row, col|
    if row == k
      # Bottom row: zeros except the residual norm in the corner.
      col == k ? p_norm : 0.0
    elsif col == row
      s[row].to_f
    elsif col == k
      m_vec[row].to_f
    else
      0.0
    end
  end
end
109
+
110
# Builds the k × k matrix used when the vector is in span (no rank growth).
#
# The result is diag(s): singular values on the diagonal, 0.0 elsewhere.
# The second argument is accepted for signature compatibility but is not
# read, so the returned matrix does not depend on the new document.
#
# @rbs (Array[Float], Vector) -> Matrix
def build_k_matrix_in_span(s, _m_vec)
  size = s.size
  Matrix.build(size, size) { |row, col| row == col ? s[row] : 0.0 }
end
121
+
122
# Decomposes the small K matrix and returns its left singular vectors
# together with the singular values, largest first.
#
# Singular values whose absolute value is not above epsilon are discarded
# along with their columns. When nothing survives, an empty matrix with the
# input's row count and zero columns is returned.
#
# @rbs (Matrix, Float) -> [Matrix, Array[Float]]
def small_svd(matrix, epsilon)
  left_vectors, _right_vectors, singular_values = matrix.SV_decomp

  # Keep (value, original index) pairs so values and columns are ordered together.
  kept = singular_values.each_with_index
                        .select { |value, _| value.abs > epsilon }
                        .sort_by { |value, _| -value }

  s_sorted = kept.map(&:first)
  columns = kept.map { |_, index| left_vectors.column(index).to_a }

  u_reordered =
    if columns.empty?
      Matrix.empty(matrix.row_size, 0)
    else
      Matrix.columns(columns)
    end

  [u_reordered, s_sorted]
end
138
+
139
# Returns a new matrix equal to the input with col_vec appended as the last column.
# Row i of the result is row i of the input followed by col_vec[i].
#
# @rbs (Matrix, Vector) -> Matrix
def extend_matrix_with_column(matrix, col_vec)
  widened = matrix.to_a.each_with_index.map do |row_values, index|
    row_values + [col_vec[index]]
  end
  Matrix.rows(widened)
end
147
+
148
# Truncates a decomposition to at most max_rank components.
#
# Keeps the first max_rank singular values and the matching leading columns
# of u. The requested rank is clamped to 0..s.size, so a rank of zero yields
# an m × 0 matrix (row dimension preserved), a negative rank is treated as
# zero, and a rank beyond the available components returns everything.
#
# @param u [Matrix] left singular vectors, one column per singular value
# @param s [Array<Float>] singular values, in the same order as u's columns
# @param max_rank [Integer] number of components to keep
# @return [Array] [truncated u, truncated s]
#
# @rbs (untyped, Array[Float], Integer) -> [untyped, Array[Float]]
def truncate(u, s, max_rank)
  rank = max_rank.clamp(0, s.size)
  s_truncated = s[0, rank] #: Array[Float]
  # Matrix#minor keeps the row count even when zero columns are selected.
  u_truncated = u.minor(0, u.row_size, 0, rank)
  [u_truncated, s_truncated]
end
156
+
157
# Euclidean length of a vector: the square root of the sum of its squared
# components, each converted with to_f first.
#
# @rbs (untyped) -> Float
def magnitude(vec)
  squares = vec.to_a.map do |component|
    value = component.to_f
    value * value
  end
  Math.sqrt(squares.sum)
end
162
+ end
163
+ end
164
+ end
165
+ end
166
+ # rubocop:enable Naming/MethodParameterName, Metrics/ParameterLists
@@ -3,6 +3,8 @@
3
3
  # License:: LGPL
4
4
 
5
5
  class String
6
+ ABBREVIATIONS = %w[Mr Mrs Ms Dr Prof Jr Sr Inc Ltd Corp Co vs etc al eg ie].freeze
7
+
6
8
  def summary(count = 10, separator = ' [...] ')
7
9
  perform_lsi split_sentences, count, separator
8
10
  end
@@ -12,20 +14,38 @@ class String
12
14
  end
13
15
 
14
16
  def split_sentences
15
- split(/(\.|!|\?)/) # TODO: make this less primitive
17
+ return pragmatic_segment if defined?(PragmaticSegmenter)
18
+
19
+ split_sentences_regex
16
20
  end
17
21
 
18
22
  def split_paragraphs
19
- split(/(\n\n|\r\r|\r\n\r\n)/) # TODO: make this less primitive
23
+ split(/\r?\n\r?\n+/)
20
24
  end
21
25
 
22
26
  private
23
27
 
28
# Builds a PragmaticSegmenter::Segmenter over the receiver's text and returns
# the result of its segment call.
def pragmatic_segment
  segmenter = PragmaticSegmenter::Segmenter.new(text: self)
  segmenter.segment
end
31
+
32
# Regex-based sentence splitter.
#
# Periods that do not end a sentence are temporarily replaced by a sentinel:
# the period after a known abbreviation (matched case-insensitively at a word
# boundary), the periods in the dotted forms "e.g." and "i.e.", and a period
# between two digits. The text is then split after ".", "!" or "?" when
# followed by whitespace or directly by an uppercase ASCII letter, and the
# sentinel is turned back into a period.
#
# @return [Array<String>] sentences, each keeping its terminating punctuation
def split_sentences_regex
  # ABBREVIATIONS lists undotted spellings only ("eg", "ie"); add the dotted
  # spellings so "e.g." and "i.e." are not treated as sentence ends.
  protected_forms = ABBREVIATIONS + %w[e.g i.e]
  abbrev_pattern = /\b(?:#{Regexp.union(protected_forms).source})\./i
  text = gsub(abbrev_pattern) { |m| m.gsub('.', '<<<DOT>>>') }
  text = text.gsub(/(\d)\.(\d)/, '\1<<<DOT>>>\2')
  sentences = text.split(/(?<=[.!?])(?:\s+|(?=[A-Z]))/)
  sentences.map { |s| s.gsub('<<<DOT>>>', '.') }
end
39
+
24
40
  def perform_lsi(chunks, count, separator)
25
41
  lsi = Classifier::LSI.new auto_rebuild: false
26
- chunks.each { |chunk| lsi << chunk unless chunk.strip.empty? || chunk.strip.split.size == 1 }
42
+ chunks.each do |chunk|
43
+ stripped = chunk.strip
44
+ next if stripped.empty? || stripped.split.size == 1
45
+
46
+ lsi << chunk
47
+ end
27
48
  lsi.build_index
28
- summaries = lsi.highest_relative_content count
29
- summaries.select { |chunk| summaries.include?(chunk) }.map(&:strip).join(separator)
49
+ lsi.highest_relative_content(count).map(&:strip).join(separator)
30
50
  end
31
51
  end