classifier 2.1.0 → 2.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +66 -199
- data/ext/classifier/classifier_ext.c +1 -0
- data/ext/classifier/incremental_svd.c +393 -0
- data/ext/classifier/linalg.h +8 -0
- data/lib/classifier/bayes.rb +177 -53
- data/lib/classifier/errors.rb +3 -0
- data/lib/classifier/knn.rb +351 -0
- data/lib/classifier/logistic_regression.rb +571 -0
- data/lib/classifier/lsi/incremental_svd.rb +166 -0
- data/lib/classifier/lsi/summary.rb +25 -5
- data/lib/classifier/lsi.rb +365 -17
- data/lib/classifier/streaming/line_reader.rb +99 -0
- data/lib/classifier/streaming/progress.rb +96 -0
- data/lib/classifier/streaming.rb +122 -0
- data/lib/classifier/tfidf.rb +408 -0
- data/lib/classifier.rb +4 -0
- data/sig/vendor/matrix.rbs +25 -14
- data/sig/vendor/streaming.rbs +14 -0
- metadata +17 -4
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
# rbs_inline: enabled
|
|
2
|
+
|
|
3
|
+
# rubocop:disable Naming/MethodParameterName, Metrics/ParameterLists
|
|
4
|
+
|
|
5
|
+
require 'matrix'
|
|
6
|
+
|
|
7
|
+
module Classifier
  class LSI
    # Brand's Incremental SVD Algorithm for LSI
    #
    # Implements the algorithm from Brand (2006) "Fast low-rank modifications
    # of the thin singular value decomposition" for adding documents to LSI
    # without full SVD recomputation.
    #
    # Given existing thin SVD: A ≈ U * S * V^T (with k components)
    # When adding a new column c:
    #
    # 1. Project:        m = U^T * c   (project onto existing column space)
    # 2. Residual:       p = c - U * m (component orthogonal to U)
    # 3. Orthonormalize: if ||p|| > ε, p̂ = p / ||p||
    # 4. Form K matrix:
    #    - If ||p|| > ε: K = [diag(s), m; 0, ||p||]  (rank grows by 1)
    #    - If ||p|| ≈ 0: K = diag(s)                 (no new direction)
    # 5. Small SVD: compute the SVD of K (at most (k+1) × (k+1))
    # 6. Update:
    #    - U_new = [U, p̂] * U'   (or U * U' when no direction was added)
    #    - S_new = S'
    #
    # Only U and the singular values are returned; V is not tracked here.
    #
    # NOTE(review): in the in-span case the projection m is currently not
    # folded into K (see build_k_matrix_in_span), so such a document leaves
    # the singular values unchanged apart from the epsilon filter. Brand's
    # formulation appends m as an extra column — confirm this simplification
    # is intentional.
    module IncrementalSVD
      # Default threshold below which a norm or singular value counts as zero.
      EPSILON = 1e-10

      class << self
        # Updates SVD with a new document vector using Brand's algorithm.
        #
        # @param u [Matrix] current left singular vectors (m × k)
        # @param s [Array<Float>] current singular values (k values)
        # @param c [Vector] new document vector (m × 1)
        # @param max_rank [Integer] maximum rank to maintain
        # @param epsilon [Float] threshold for zero detection
        # @return [Array<Matrix, Array<Float>>] updated [u, s]
        #
        # @rbs (Matrix, Array[Float], Vector, max_rank: Integer, ?epsilon: Float) -> [Matrix, Array[Float]]
        def update(u, s, c, max_rank:, epsilon: EPSILON)
          m_vec = project(u, c)
          u_times_m = u * m_vec
          # Matrix * Vector may come back as a one-column matrix depending on
          # the operand types, so normalise to a Vector before subtracting.
          reconstruction = u_times_m.is_a?(Vector) ? u_times_m : Vector[*u_times_m.to_a.flatten]
          p_vec = c - reconstruction
          p_norm = magnitude(p_vec)

          if p_norm > epsilon
            update_with_new_direction(u, s, m_vec, p_vec, p_norm, max_rank, epsilon)
          else
            update_in_span(u, s, m_vec, max_rank, epsilon)
          end
        end

        # Projects a document vector onto the semantic space defined by U.
        # Returns the LSI representation: lsi_vec = U^T * raw_vec
        #
        # @param u [Matrix] left singular vectors (m × k)
        # @param raw_vec [Vector] document vector in term space (m × 1)
        # @return [Vector] document in semantic space (k × 1)
        #
        # @rbs (Matrix, Vector) -> Vector
        def project(u, raw_vec)
          u.transpose * raw_vec
        end

        private

        # Update when new document has a component orthogonal to existing U.
        # The normalised residual becomes an extra basis column before the
        # rotation by the small SVD's left vectors is applied.
        #
        # @rbs (Matrix, Array[Float], Vector, Vector, Float, Integer, Float) -> [Matrix, Array[Float]]
        def update_with_new_direction(u, s, m_vec, p_vec, p_norm, max_rank, epsilon)
          p_hat = p_vec * (1.0 / p_norm)
          k_matrix = build_k_matrix_with_growth(s, m_vec, p_norm)
          u_prime, s_prime = small_svd(k_matrix, epsilon)
          u_extended = extend_matrix_with_column(u, p_hat)
          u_new = u_extended * u_prime

          u_new, s_prime = truncate(u_new, s_prime, max_rank) if s_prime.size > max_rank

          [u_new, s_prime]
        end

        # Update when new document is entirely within span of existing U.
        #
        # @rbs (Matrix, Array[Float], Vector, Integer, Float) -> [Matrix, Array[Float]]
        def update_in_span(u, s, m_vec, max_rank, epsilon)
          k_matrix = build_k_matrix_in_span(s, m_vec)
          u_prime, s_prime = small_svd(k_matrix, epsilon)
          u_new = u * u_prime

          u_new, s_prime = truncate(u_new, s_prime, max_rank) if s_prime.size > max_rank

          [u_new, s_prime]
        end

        # Builds the (k+1) × (k+1) K matrix used when rank grows by 1:
        # diag(s) in the top-left, m in the last column, ||p|| in the corner.
        #
        # @rbs (Array[Float], untyped, Float) -> untyped
        def build_k_matrix_with_growth(s, m_vec, p_norm)
          k = s.size
          Matrix.build(k + 1, k + 1) do |row, col|
            if row == k
              col == k ? p_norm : 0.0
            elsif col == row
              s[row].to_f
            elsif col == k
              m_vec[row].to_f
            else
              0.0
            end
          end
        end

        # Builds the k × k K matrix when vector is in span (no rank growth).
        # The projection argument is accepted for signature symmetry but is
        # not used: the result is diag(s) with Float entries.
        #
        # @rbs (Array[Float], Vector) -> Matrix
        def build_k_matrix_in_span(s, _m_vec)
          k = s.size
          Matrix.build(k, k) { |row, col| row == col ? s[row].to_f : 0.0 }
        end

        # Computes SVD of small matrix and extracts singular values.
        #
        # Singular values whose magnitude is at most +epsilon+ are dropped and
        # the remainder is returned in descending order. Values and their U
        # columns are selected and sorted together so they cannot drift apart.
        #
        # @rbs (Matrix, Float) -> [Matrix, Array[Float]]
        def small_svd(matrix, epsilon)
          u, _v, s_array = matrix.SV_decomp

          kept = s_array.each_with_index
                        .select { |sv, _| sv.abs > epsilon }
                        .sort_by { |sv, _| -sv }

          s_sorted = kept.map(&:first)
          u_cols = kept.map { |_, index| u.column(index).to_a }
          # Matrix.columns([]) would yield a 0 × 0 matrix; keep the row count.
          u_reordered = u_cols.empty? ? Matrix.empty(matrix.row_size, 0) : Matrix.columns(u_cols)

          [u_reordered, s_sorted]
        end

        # Extends matrix with a new column appended on the right.
        #
        # @rbs (Matrix, Vector) -> Matrix
        def extend_matrix_with_column(matrix, col_vec)
          rows = matrix.row_size.times.map do |i|
            matrix.row(i).to_a + [col_vec[i]]
          end
          Matrix.rows(rows)
        end

        # Truncates to max_rank: keeps the first +max_rank+ singular values
        # and the matching leading columns of +u+.
        #
        # The rank is clamped to 0..s.size so a zero or negative rank gives an
        # (m × 0) matrix with an empty value list, rather than a 0 × 0 matrix.
        #
        # @rbs (untyped, Array[Float], Integer) -> [untyped, Array[Float]]
        def truncate(u, s, max_rank)
          rank = max_rank.clamp(0, s.size)
          s_truncated = s.first(rank) #: Array[Float]
          cols = (0...rank).map { |i| u.column(i).to_a }
          u_truncated = cols.empty? ? Matrix.empty(u.row_size, 0) : Matrix.columns(cols)
          [u_truncated, s_truncated]
        end

        # Computes magnitude (Euclidean norm) of a vector.
        #
        # @rbs (untyped) -> Float
        def magnitude(vec)
          Math.sqrt(vec.to_a.sum { |x| x.to_f * x.to_f })
        end
      end
    end
  end
end
|
|
166
|
+
# rubocop:enable Naming/MethodParameterName, Metrics/ParameterLists
|
|
@@ -3,6 +3,8 @@
|
|
|
3
3
|
# License:: LGPL
|
|
4
4
|
|
|
5
5
|
class String
|
|
6
|
+
ABBREVIATIONS = %w[Mr Mrs Ms Dr Prof Jr Sr Inc Ltd Corp Co vs etc al eg ie].freeze
|
|
7
|
+
|
|
6
8
|
# Summarises the receiver by splitting it into sentences and handing them,
# together with +count+ and +separator+, to perform_lsi.
def summary(count = 10, separator = ' [...] ')
  sentences = split_sentences
  perform_lsi(sentences, count, separator)
end
|
|
@@ -12,20 +14,38 @@ class String
|
|
|
12
14
|
end
|
|
13
15
|
|
|
14
16
|
# Splits the receiver into sentences. Uses pragmatic_segment when the
# PragmaticSegmenter constant is defined, otherwise the regex fallback.
def split_sentences
  if defined?(PragmaticSegmenter)
    pragmatic_segment
  else
    split_sentences_regex
  end
end
|
|
17
21
|
|
|
18
22
|
# Splits the receiver into paragraphs at runs of two or more line breaks.
#
# Each line break may be LF or CRLF, and the whole run is consumed, so
# several blank CRLF lines in a row do not leave a stray "\r\n" at the
# start of the next paragraph.
def split_paragraphs
  split(/(?:\r?\n){2,}/)
end
|
|
21
25
|
|
|
22
26
|
private
|
|
23
27
|
|
|
28
|
+
# Segments the receiver with PragmaticSegmenter::Segmenter and returns
# whatever its #segment call produces.
def pragmatic_segment
  segmenter = PragmaticSegmenter::Segmenter.new(text: self)
  segmenter.segment
end
|
|
31
|
+
|
|
32
|
+
# Regex-based sentence splitter.
#
# Dots that follow an entry of ABBREVIATIONS (matched case-insensitively)
# and dots between two digits are swapped for a placeholder first. The text
# is then split after '.', '!' or '?' when followed by whitespace or an
# uppercase letter, and the placeholder is turned back into '.'.
def split_sentences_regex
  abbreviations = ABBREVIATIONS.map { |abbr| "#{abbr}\\." }.join('|')

  shielded = gsub(/\b(#{abbreviations})/i) { |hit| hit.gsub('.', '<<<DOT>>>') }
             .gsub(/(\d)\.(\d)/, '\1<<<DOT>>>\2')

  shielded
    .split(/(?<=[.!?])(?:\s+|(?=[A-Z]))/)
    .map { |sentence| sentence.gsub('<<<DOT>>>', '.') }
end
|
|
39
|
+
|
|
24
40
|
# Indexes +chunks+ in a fresh Classifier::LSI (auto_rebuild disabled) and
# joins the stripped results of highest_relative_content(count) with
# +separator+. Chunks that are blank or hold only one whitespace-separated
# word are not indexed.
def perform_lsi(chunks, count, separator)
  lsi = Classifier::LSI.new auto_rebuild: false

  # A blank chunk splits into zero words and a one-word chunk into one,
  # so a single size check covers both skip conditions.
  indexable = chunks.reject { |chunk| chunk.split.size < 2 }
  indexable.each { |chunk| lsi << chunk }

  lsi.build_index
  top_chunks = lsi.highest_relative_content(count)
  top_chunks.map(&:strip).join(separator)
end
|
|
31
51
|
end
|