classifier 2.0.0 → 2.2.0

@@ -0,0 +1,393 @@ incremental_svd.c
/*
 * incremental_svd.c
 * Native C implementation of Brand's incremental SVD operations
 *
 * Provides fast matrix operations for:
 * - Matrix column extension
 * - Vertical stacking (vstack)
 * - Vector subtraction
 * - Batch document projection
 */

#include "linalg.h"

/*
 * Extend a matrix with a new column
 * Returns a new matrix [M | col] with one additional column
 */
CMatrix *cmatrix_extend_column(CMatrix *m, CVector *col)
{
    if (m->rows != col->size) {
        rb_raise(rb_eArgError,
                 "Matrix rows (%ld) must match vector size (%ld)",
                 (long)m->rows, (long)col->size);
    }

    CMatrix *result = cmatrix_alloc(m->rows, m->cols + 1);

    for (size_t i = 0; i < m->rows; i++) {
        memcpy(&MAT_AT(result, i, 0), &MAT_AT(m, i, 0), m->cols * sizeof(double));
        MAT_AT(result, i, m->cols) = col->data[i];
    }

    return result;
}

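/*
 * Example: extending a 2x2 matrix with a column
 *   M = | 1 2 |   col = | 5 |   =>   [M | col] = | 1 2 5 |
 *       | 3 4 |         | 6 |                    | 3 4 6 |
 */
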
/*
 * Vertically stack two matrices
 * Returns a new matrix [top; bottom]
 */
CMatrix *cmatrix_vstack(CMatrix *top, CMatrix *bottom)
{
    if (top->cols != bottom->cols) {
        rb_raise(rb_eArgError,
                 "Matrices must have same column count: %ld vs %ld",
                 (long)top->cols, (long)bottom->cols);
    }

    size_t new_rows = top->rows + bottom->rows;
    CMatrix *result = cmatrix_alloc(new_rows, top->cols);

    memcpy(result->data, top->data, top->rows * top->cols * sizeof(double));
    memcpy(result->data + top->rows * top->cols,
           bottom->data,
           bottom->rows * bottom->cols * sizeof(double));

    return result;
}

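/*
 * Because storage is row-major (see MAT_AT in linalg.h), each operand is
 * already one contiguous block, so the two memcpy calls above suffice:
 *   top = | 1 2 |   bottom = | 3 4 |   =>   [top; bottom] = | 1 2 |
 *                            | 5 6 |                        | 3 4 |
 *                                                           | 5 6 |
 */
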
/*
 * Vector subtraction: a - b
 */
CVector *cvector_subtract(CVector *a, CVector *b)
{
    if (a->size != b->size) {
        rb_raise(rb_eArgError,
                 "Vector sizes must match: %ld vs %ld",
                 (long)a->size, (long)b->size);
    }

    CVector *result = cvector_alloc(a->size);
    for (size_t i = 0; i < a->size; i++) {
        result->data[i] = a->data[i] - b->data[i];
    }
    return result;
}

/*
 * Batch project multiple vectors onto U matrix
 * Computes lsi_vector = U^T * raw_vector for each vector
 * This is the most performance-critical operation for incremental updates
 */
void cbatch_project(CMatrix *u, CVector **raw_vectors, size_t num_vectors,
                    CVector **lsi_vectors_out)
{
    size_t m = u->rows; /* vocabulary size */
    size_t k = u->cols; /* rank */

    /* Validate every input size up front so a mid-loop rb_raise cannot
     * leak the result vectors already allocated for earlier entries */
    for (size_t v = 0; v < num_vectors; v++) {
        if (raw_vectors[v]->size != m) {
            rb_raise(rb_eArgError,
                     "Vector %ld size (%ld) must match matrix rows (%ld)",
                     (long)v, (long)raw_vectors[v]->size, (long)m);
        }
    }

    for (size_t v = 0; v < num_vectors; v++) {
        CVector *raw = raw_vectors[v];
        CVector *lsi = cvector_alloc(k);

        /* Compute U^T * raw (project onto k-dimensional space) */
        for (size_t j = 0; j < k; j++) {
            double sum = 0.0;
            for (size_t i = 0; i < m; i++) {
                sum += MAT_AT(u, i, j) * raw->data[i];
            }
            lsi->data[j] = sum;
        }

        lsi_vectors_out[v] = lsi;
    }
}

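/*
 * Each projection is one pass over U, i.e. O(m * k) multiply-adds:
 *   lsi[j] = sum_i U[i][j] * raw[i]
 * Doing the whole batch in C keeps the inner loops out of Ruby entirely
 * instead of crossing the Ruby/C boundary once per vector.
 */
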
/*
 * Build the K matrix for Brand's algorithm when rank grows
 *   K = | diag(s)  m_vec  |
 *       |    0     p_norm |
 */
static CMatrix *build_k_matrix_with_growth(CVector *s, CVector *m_vec, double p_norm)
{
    size_t k = s->size;
    CMatrix *result = cmatrix_alloc(k + 1, k + 1);

    /* First k rows: diagonal s values and m_vec in last column */
    for (size_t i = 0; i < k; i++) {
        MAT_AT(result, i, i) = s->data[i];
        MAT_AT(result, i, k) = m_vec->data[i];
    }

    /* Last row: zeros except p_norm in last position
     * (off-diagonal entries rely on cmatrix_alloc zero-initializing) */
    MAT_AT(result, k, k) = p_norm;

    return result;
}

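/*
 * For k = 2 the result is the 3x3 matrix
 *   K = | s0  0   m0  |
 *       | 0   s1  m1  |
 *       | 0   0  ||p|| |
 */
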
/*
 * Perform one incremental SVD update using Brand's algorithm
 *
 * @param u        Current U matrix (m x k)
 * @param s        Current singular values (k values)
 * @param c        New document vector (m x 1)
 * @param max_rank Maximum rank to maintain
 * @param epsilon  Threshold for detecting new directions
 * @param u_out    Output: updated U matrix
 * @param s_out    Output: updated singular values
 */
static void incremental_update(CMatrix *u, CVector *s, CVector *c, int max_rank,
                               double epsilon, CMatrix **u_out, CVector **s_out)
{
    size_t m = u->rows;
    size_t k = u->cols;

    /* Step 1: Project c onto column space of U */
    /* m_vec = U^T * c */
    CVector *m_vec = cvector_alloc(k);
    for (size_t j = 0; j < k; j++) {
        double sum = 0.0;
        for (size_t i = 0; i < m; i++) {
            sum += MAT_AT(u, i, j) * c->data[i];
        }
        m_vec->data[j] = sum;
    }

    /* Step 2: Compute residual p = c - U * m_vec */
    CVector *u_times_m = cmatrix_multiply_vector(u, m_vec);
    CVector *p = cvector_subtract(c, u_times_m);
    double p_norm = cvector_magnitude(p);

    cvector_free(u_times_m);

    if (p_norm > epsilon) {
        /* New direction found - rank may increase */

        /* Step 3: Normalize residual */
        CVector *p_hat = cvector_alloc(m);
        double inv_p_norm = 1.0 / p_norm;
        for (size_t i = 0; i < m; i++) {
            p_hat->data[i] = p->data[i] * inv_p_norm;
        }

        /* Step 4: Build K matrix */
        CMatrix *k_mat = build_k_matrix_with_growth(s, m_vec, p_norm);

        /* Step 5: SVD of K matrix */
        CMatrix *u_prime, *v_prime;
        CVector *s_prime;
        jacobi_svd(k_mat, &u_prime, &v_prime, &s_prime);
        cmatrix_free(k_mat);
        cmatrix_free(v_prime);

        /* Step 6: Update U = [U | p_hat] * U' */
        CMatrix *u_extended = cmatrix_extend_column(u, p_hat);
        CMatrix *u_new = cmatrix_multiply(u_extended, u_prime);
        cmatrix_free(u_extended);
        cmatrix_free(u_prime);
        cvector_free(p_hat);

        /* Truncate if needed */
        if (s_prime->size > (size_t)max_rank) {
            /* Create truncated U (keep first max_rank columns) */
            CMatrix *u_trunc = cmatrix_alloc(u_new->rows, (size_t)max_rank);
            for (size_t i = 0; i < u_new->rows; i++) {
                memcpy(&MAT_AT(u_trunc, i, 0), &MAT_AT(u_new, i, 0),
                       (size_t)max_rank * sizeof(double));
            }
            cmatrix_free(u_new);
            u_new = u_trunc;

            /* Truncate singular values */
            CVector *s_trunc = cvector_alloc((size_t)max_rank);
            memcpy(s_trunc->data, s_prime->data, (size_t)max_rank * sizeof(double));
            cvector_free(s_prime);
            s_prime = s_trunc;
        }

        *u_out = u_new;
        *s_out = s_prime;
    } else {
        /* Vector in span - use simpler update */
        /* For now, just return unchanged (projection handles this) */
        *u_out = cmatrix_alloc(u->rows, u->cols);
        memcpy((*u_out)->data, u->data, u->rows * u->cols * sizeof(double));
        *s_out = cvector_alloc(s->size);
        memcpy((*s_out)->data, s->data, s->size * sizeof(double));
    }

    cvector_free(p);
    cvector_free(m_vec);
}

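/*
 * Why this works: with m_vec = U^T c, p = c - U m_vec, p_hat = p / ||p||,
 *
 *   [U | p_hat] * | diag(s)  m_vec | = [U diag(s) | U m_vec + p] = [U diag(s) | c]
 *                 |    0     ||p|| |
 *
 * so the small (k+1)x(k+1) SVD of K rediagonalizes the system after the
 * new column c is appended, and U <- [U | p_hat] * U' restores orthonormal
 * columns. The truncation above assumes jacobi_svd returns singular values
 * in descending order, so keeping the first max_rank columns keeps the
 * dominant directions.
 */
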
/* ========== Ruby Wrappers ========== */

/*
 * Matrix.extend_column(matrix, vector)
 * Returns [matrix | vector]
 */
static VALUE rb_cmatrix_extend_column(VALUE klass, VALUE rb_matrix, VALUE rb_vector)
{
    CMatrix *m;
    CVector *v;

    GET_CMATRIX(rb_matrix, m);
    GET_CVECTOR(rb_vector, v);

    CMatrix *result = cmatrix_extend_column(m, v);
    return TypedData_Wrap_Struct(klass, &cmatrix_type, result);
}

/*
 * Matrix.vstack(top, bottom)
 * Vertically stack two matrices
 */
static VALUE rb_cmatrix_vstack(VALUE klass, VALUE rb_top, VALUE rb_bottom)
{
    CMatrix *top, *bottom;

    GET_CMATRIX(rb_top, top);
    GET_CMATRIX(rb_bottom, bottom);

    CMatrix *result = cmatrix_vstack(top, bottom);
    return TypedData_Wrap_Struct(klass, &cmatrix_type, result);
}

/*
 * Matrix.zeros(rows, cols)
 * Create a zero matrix
 */
static VALUE rb_cmatrix_zeros(VALUE klass, VALUE rb_rows, VALUE rb_cols)
{
    size_t rows = NUM2SIZET(rb_rows);
    size_t cols = NUM2SIZET(rb_cols);

    /* Relies on cmatrix_alloc returning zero-initialized storage */
    CMatrix *result = cmatrix_alloc(rows, cols);
    return TypedData_Wrap_Struct(klass, &cmatrix_type, result);
}

/*
 * Vector#-(other)
 * Vector subtraction
 */
static VALUE rb_cvector_subtract(VALUE self, VALUE other)
{
    CVector *a, *b;

    GET_CVECTOR(self, a);

    if (rb_obj_is_kind_of(other, cClassifierVector)) {
        GET_CVECTOR(other, b);
        CVector *result = cvector_subtract(a, b);
        return TypedData_Wrap_Struct(cClassifierVector, &cvector_type, result);
    }

    rb_raise(rb_eTypeError, "Cannot subtract %s from Vector",
             rb_obj_classname(other));
    return Qnil;
}

/*
 * Matrix#batch_project(vectors_array)
 * Project multiple vectors onto this matrix (as U)
 * Returns array of projected vectors
 *
 * This is the high-performance batch operation for re-projecting documents
 */
static VALUE rb_cmatrix_batch_project(VALUE self, VALUE rb_vectors)
{
    CMatrix *u;
    GET_CMATRIX(self, u);

    Check_Type(rb_vectors, T_ARRAY);
    long num_vectors = RARRAY_LEN(rb_vectors);

    if (num_vectors == 0) {
        return rb_ary_new();
    }

    CVector **raw_vectors = ALLOC_N(CVector *, num_vectors);
    for (long i = 0; i < num_vectors; i++) {
        VALUE rb_vec = rb_ary_entry(rb_vectors, i);
        if (!rb_obj_is_kind_of(rb_vec, cClassifierVector)) {
            xfree(raw_vectors);
            rb_raise(rb_eTypeError, "Expected array of Vectors");
        }
        GET_CVECTOR(rb_vec, raw_vectors[i]);
        /* Check sizes here so an rb_raise inside cbatch_project cannot
         * leak the pointer arrays allocated by this wrapper */
        if (raw_vectors[i]->size != u->rows) {
            xfree(raw_vectors);
            rb_raise(rb_eArgError,
                     "Vector %ld size (%ld) must match matrix rows (%ld)",
                     i, (long)raw_vectors[i]->size, (long)u->rows);
        }
    }

    CVector **lsi_vectors = ALLOC_N(CVector *, num_vectors);
    cbatch_project(u, raw_vectors, (size_t)num_vectors, lsi_vectors);

    VALUE result = rb_ary_new_capa(num_vectors);
    for (long i = 0; i < num_vectors; i++) {
        VALUE rb_lsi = TypedData_Wrap_Struct(cClassifierVector, &cvector_type,
                                             lsi_vectors[i]);
        rb_ary_push(result, rb_lsi);
    }

    xfree(raw_vectors);
    xfree(lsi_vectors);

    return result;
}

/*
 * Matrix#incremental_svd_update(singular_values, new_vector, max_rank, epsilon)
 * Perform one incremental SVD update (Brand's algorithm)
 * Returns [new_u, new_singular_values]
 */
static VALUE rb_cmatrix_incremental_update(VALUE self, VALUE rb_s, VALUE rb_c,
                                           VALUE rb_max_rank, VALUE rb_epsilon)
{
    CMatrix *u;
    CVector *s, *c;

    GET_CMATRIX(self, u);
    GET_CVECTOR(rb_s, s);
    GET_CVECTOR(rb_c, c);

    int max_rank = NUM2INT(rb_max_rank);
    double epsilon = NUM2DBL(rb_epsilon);

    CMatrix *u_new;
    CVector *s_new;

    incremental_update(u, s, c, max_rank, epsilon, &u_new, &s_new);

    VALUE rb_u_new = TypedData_Wrap_Struct(cClassifierMatrix, &cmatrix_type, u_new);
    VALUE rb_s_new = TypedData_Wrap_Struct(cClassifierVector, &cvector_type, s_new);

    return rb_ary_new_from_args(2, rb_u_new, rb_s_new);
}

void Init_incremental_svd(void)
{
    /* Matrix class methods for incremental SVD */
    rb_define_singleton_method(cClassifierMatrix, "extend_column",
                               rb_cmatrix_extend_column, 2);
    rb_define_singleton_method(cClassifierMatrix, "vstack",
                               rb_cmatrix_vstack, 2);
    rb_define_singleton_method(cClassifierMatrix, "zeros",
                               rb_cmatrix_zeros, 2);

    /* Instance methods */
    rb_define_method(cClassifierMatrix, "batch_project",
                     rb_cmatrix_batch_project, 1);
    rb_define_method(cClassifierMatrix, "incremental_svd_update",
                     rb_cmatrix_incremental_update, 4);

    /* Vector subtraction */
    rb_define_method(cClassifierVector, "-", rb_cvector_subtract, 1);
}
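
To make the update loop concrete, here is a hypothetical driver sketch, written as if it sat in this same file (incremental_update is static and not reachable from elsewhere). The name demo_stream_updates is illustrative, and the caller-frees-the-old-factors convention is inferred from the fact that incremental_update allocates fresh outputs on both branches; no such driver appears in the diff itself.

/* Hypothetical driver (not part of the gem): fold a stream of document
 * vectors into the factorization one Brand update at a time. */
static void demo_stream_updates(CMatrix **u, CVector **s,
                                CVector **docs, size_t n_docs,
                                int max_rank, double epsilon)
{
    for (size_t d = 0; d < n_docs; d++) {
        CMatrix *u_next;
        CVector *s_next;
        incremental_update(*u, *s, docs[d], max_rank, epsilon,
                           &u_next, &s_next);

        /* incremental_update always allocates fresh outputs, so the
         * previous factors can be released immediately */
        cmatrix_free(*u);
        cvector_free(*s);
        *u = u_next;
        *s = s_next;
    }
}
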
@@ -0,0 +1,72 @@ linalg.h
#ifndef CLASSIFIER_LINALG_H
#define CLASSIFIER_LINALG_H

#include <ruby.h>
#include <math.h>
#include <stdlib.h>
#include <string.h>

/* Epsilon for numerical comparisons */
#define CLASSIFIER_EPSILON 1e-10

/* Vector structure */
typedef struct {
    size_t size;
    double *data;
    int is_col; /* 0 = row vector, 1 = column vector */
} CVector;

/* Matrix structure */
typedef struct {
    size_t rows;
    size_t cols;
    double *data; /* Row-major storage */
} CMatrix;

/* Ruby class references */
extern VALUE cClassifierVector;
extern VALUE cClassifierMatrix;
extern VALUE mClassifierLinalg;

/* Vector functions */
void Init_vector(void);
CVector *cvector_alloc(size_t size);
void cvector_free(void *ptr);
double cvector_magnitude(CVector *v);
CVector *cvector_normalize(CVector *v);
double cvector_sum(CVector *v);
double cvector_dot(CVector *a, CVector *b);

/* Matrix functions */
void Init_matrix(void);
CMatrix *cmatrix_alloc(size_t rows, size_t cols);
void cmatrix_free(void *ptr);
CMatrix *cmatrix_transpose(CMatrix *m);
CMatrix *cmatrix_multiply(CMatrix *a, CMatrix *b);
CVector *cmatrix_multiply_vector(CMatrix *m, CVector *v);
CMatrix *cmatrix_diagonal(CVector *v);

/* SVD functions */
void Init_svd(void);
void jacobi_svd(CMatrix *a, CMatrix **u, CMatrix **v, CVector **s);

/* Incremental SVD functions */
void Init_incremental_svd(void);
CMatrix *cmatrix_extend_column(CMatrix *m, CVector *col);
CMatrix *cmatrix_vstack(CMatrix *top, CMatrix *bottom);
CVector *cvector_subtract(CVector *a, CVector *b);
void cbatch_project(CMatrix *u, CVector **raw_vectors, size_t num_vectors,
                    CVector **lsi_vectors_out);

/* TypedData definitions */
extern const rb_data_type_t cvector_type;
extern const rb_data_type_t cmatrix_type;

/* Helper macros */
#define GET_CVECTOR(obj, ptr) TypedData_Get_Struct(obj, CVector, &cvector_type, ptr)
#define GET_CMATRIX(obj, ptr) TypedData_Get_Struct(obj, CMatrix, &cmatrix_type, ptr)

/* Matrix element access (row-major) */
#define MAT_AT(m, i, j) ((m)->data[(i) * (m)->cols + (j)])

#endif /* CLASSIFIER_LINALG_H */
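
For orientation, a minimal usage sketch of the C API declared above. It assumes cmatrix_alloc returns zero-initialized storage (the Matrix.zeros wrapper in incremental_svd.c depends on the same behavior) and that the code runs inside a loaded extension where Ruby's allocator is available; none of this appears in the diff itself.

/* Minimal usage sketch (not part of the gem):
 *   M = | 1 2 |   v = | 5 |   M * v = | 17 |
 *       | 3 4 |       | 6 |           | 39 |
 */
CMatrix *m = cmatrix_alloc(2, 2);            /* assumed zero-initialized */
MAT_AT(m, 0, 0) = 1.0; MAT_AT(m, 0, 1) = 2.0;
MAT_AT(m, 1, 0) = 3.0; MAT_AT(m, 1, 1) = 4.0;

CVector *v = cvector_alloc(2);
v->data[0] = 5.0;
v->data[1] = 6.0;

CVector *r = cmatrix_multiply_vector(m, v);  /* r->data == {17.0, 39.0} */

cvector_free(r);
cvector_free(v);
cmatrix_free(m);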