classifier 1.4.4 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CLAUDE.md +77 -0
- data/README.md +274 -0
- data/ext/classifier/classifier_ext.c +25 -0
- data/ext/classifier/extconf.rb +15 -0
- data/ext/classifier/linalg.h +64 -0
- data/ext/classifier/matrix.c +387 -0
- data/ext/classifier/svd.c +208 -0
- data/ext/classifier/vector.c +319 -0
- data/lib/classifier/bayes.rb +294 -60
- data/lib/classifier/errors.rb +16 -0
- data/lib/classifier/extensions/vector.rb +42 -26
- data/lib/classifier/extensions/word_hash.rb +8 -1
- data/lib/classifier/lsi/content_node.rb +30 -9
- data/lib/classifier/lsi/word_list.rb +12 -1
- data/lib/classifier/lsi.rb +479 -125
- data/lib/classifier/storage/base.rb +50 -0
- data/lib/classifier/storage/file.rb +51 -0
- data/lib/classifier/storage/memory.rb +49 -0
- data/lib/classifier/storage.rb +9 -0
- data/lib/classifier.rb +2 -0
- data/sig/vendor/fast_stemmer.rbs +9 -0
- data/sig/vendor/gsl.rbs +27 -0
- data/sig/vendor/json.rbs +4 -0
- data/sig/vendor/matrix.rbs +26 -0
- data/sig/vendor/mutex_m.rbs +16 -0
- data/test/test_helper.rb +13 -1
- metadata +71 -10
- data/lib/classifier/extensions/vector_serialize.rb +0 -18
|
@@ -0,0 +1,319 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* vector.c
|
|
3
|
+
* Vector implementation for Classifier native linear algebra
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
#include "linalg.h"
|
|
7
|
+
|
|
8
|
+
const rb_data_type_t cvector_type = {
|
|
9
|
+
.wrap_struct_name = "Classifier::Linalg::Vector",
|
|
10
|
+
.function = {
|
|
11
|
+
.dmark = NULL,
|
|
12
|
+
.dfree = cvector_free,
|
|
13
|
+
.dsize = NULL,
|
|
14
|
+
},
|
|
15
|
+
.flags = RUBY_TYPED_FREE_IMMEDIATELY
|
|
16
|
+
};
|
|
17
|
+
|
|
18
|
+
/* Allocate a new CVector */
|
|
19
|
+
CVector *cvector_alloc(size_t size)
|
|
20
|
+
{
|
|
21
|
+
CVector *v = ALLOC(CVector);
|
|
22
|
+
v->size = size;
|
|
23
|
+
v->data = ALLOC_N(double, size);
|
|
24
|
+
v->is_col = 0; /* Default to row vector */
|
|
25
|
+
memset(v->data, 0, size * sizeof(double));
|
|
26
|
+
return v;
|
|
27
|
+
}
|
|
28
|
+
|
|
29
|
+
/* Free a CVector */
|
|
30
|
+
void cvector_free(void *ptr)
|
|
31
|
+
{
|
|
32
|
+
CVector *v = (CVector *)ptr;
|
|
33
|
+
if (v) {
|
|
34
|
+
if (v->data) xfree(v->data);
|
|
35
|
+
xfree(v);
|
|
36
|
+
}
|
|
37
|
+
}
|
|
38
|
+
|
|
39
|
+
/* Calculate magnitude (Euclidean norm) */
|
|
40
|
+
double cvector_magnitude(CVector *v)
|
|
41
|
+
{
|
|
42
|
+
double sum = 0.0;
|
|
43
|
+
for (size_t i = 0; i < v->size; i++) {
|
|
44
|
+
sum += v->data[i] * v->data[i];
|
|
45
|
+
}
|
|
46
|
+
return sqrt(sum);
|
|
47
|
+
}
|
|
48
|
+
|
|
49
|
+
/* Return normalized copy */
|
|
50
|
+
CVector *cvector_normalize(CVector *v)
|
|
51
|
+
{
|
|
52
|
+
CVector *result = cvector_alloc(v->size);
|
|
53
|
+
result->is_col = v->is_col;
|
|
54
|
+
double mag = cvector_magnitude(v);
|
|
55
|
+
|
|
56
|
+
if (mag <= CLASSIFIER_EPSILON) {
|
|
57
|
+
/* Return zero vector if magnitude is too small */
|
|
58
|
+
return result;
|
|
59
|
+
}
|
|
60
|
+
|
|
61
|
+
for (size_t i = 0; i < v->size; i++) {
|
|
62
|
+
result->data[i] = v->data[i] / mag;
|
|
63
|
+
}
|
|
64
|
+
return result;
|
|
65
|
+
}
|
|
66
|
+
|
|
67
|
+
/* Sum all elements */
|
|
68
|
+
double cvector_sum(CVector *v)
|
|
69
|
+
{
|
|
70
|
+
double sum = 0.0;
|
|
71
|
+
for (size_t i = 0; i < v->size; i++) {
|
|
72
|
+
sum += v->data[i];
|
|
73
|
+
}
|
|
74
|
+
return sum;
|
|
75
|
+
}
|
|
76
|
+
|
|
77
|
+
/* Dot product */
|
|
78
|
+
double cvector_dot(CVector *a, CVector *b)
|
|
79
|
+
{
|
|
80
|
+
if (a->size != b->size) {
|
|
81
|
+
rb_raise(rb_eArgError, "Vector sizes must match for dot product");
|
|
82
|
+
}
|
|
83
|
+
double sum = 0.0;
|
|
84
|
+
for (size_t i = 0; i < a->size; i++) {
|
|
85
|
+
sum += a->data[i] * b->data[i];
|
|
86
|
+
}
|
|
87
|
+
return sum;
|
|
88
|
+
}
|
|
89
|
+
|
|
90
|
+
/* Ruby allocation function */
|
|
91
|
+
static VALUE rb_cvector_alloc(VALUE klass)
|
|
92
|
+
{
|
|
93
|
+
CVector *v = cvector_alloc(0);
|
|
94
|
+
return TypedData_Wrap_Struct(klass, &cvector_type, v);
|
|
95
|
+
}
|
|
96
|
+
|
|
97
|
+
/*
|
|
98
|
+
* Vector.alloc(size_or_array)
|
|
99
|
+
* Create a new vector from size (zero-filled) or array of values
|
|
100
|
+
*/
|
|
101
|
+
static VALUE rb_cvector_s_alloc(VALUE klass, VALUE arg)
|
|
102
|
+
{
|
|
103
|
+
CVector *v;
|
|
104
|
+
VALUE result;
|
|
105
|
+
|
|
106
|
+
if (RB_TYPE_P(arg, T_ARRAY)) {
|
|
107
|
+
long len = RARRAY_LEN(arg);
|
|
108
|
+
v = cvector_alloc((size_t)len);
|
|
109
|
+
for (long i = 0; i < len; i++) {
|
|
110
|
+
v->data[i] = NUM2DBL(rb_ary_entry(arg, i));
|
|
111
|
+
}
|
|
112
|
+
} else {
|
|
113
|
+
size_t size = NUM2SIZET(arg);
|
|
114
|
+
v = cvector_alloc(size);
|
|
115
|
+
}
|
|
116
|
+
|
|
117
|
+
result = TypedData_Wrap_Struct(klass, &cvector_type, v);
|
|
118
|
+
return result;
|
|
119
|
+
}
|
|
120
|
+
|
|
121
|
+
/* Vector#size */
|
|
122
|
+
static VALUE rb_cvector_size(VALUE self)
|
|
123
|
+
{
|
|
124
|
+
CVector *v;
|
|
125
|
+
GET_CVECTOR(self, v);
|
|
126
|
+
return SIZET2NUM(v->size);
|
|
127
|
+
}
|
|
128
|
+
|
|
129
|
+
/* Vector#[] */
|
|
130
|
+
static VALUE rb_cvector_aref(VALUE self, VALUE idx)
|
|
131
|
+
{
|
|
132
|
+
CVector *v;
|
|
133
|
+
GET_CVECTOR(self, v);
|
|
134
|
+
long i = NUM2LONG(idx);
|
|
135
|
+
|
|
136
|
+
if (i < 0) i += v->size;
|
|
137
|
+
if (i < 0 || (size_t)i >= v->size) {
|
|
138
|
+
rb_raise(rb_eIndexError, "index %ld out of bounds", i);
|
|
139
|
+
}
|
|
140
|
+
|
|
141
|
+
return DBL2NUM(v->data[i]);
|
|
142
|
+
}
|
|
143
|
+
|
|
144
|
+
/* Vector#[]= */
|
|
145
|
+
static VALUE rb_cvector_aset(VALUE self, VALUE idx, VALUE val)
|
|
146
|
+
{
|
|
147
|
+
CVector *v;
|
|
148
|
+
GET_CVECTOR(self, v);
|
|
149
|
+
long i = NUM2LONG(idx);
|
|
150
|
+
|
|
151
|
+
if (i < 0) i += v->size;
|
|
152
|
+
if (i < 0 || (size_t)i >= v->size) {
|
|
153
|
+
rb_raise(rb_eIndexError, "index %ld out of bounds", i);
|
|
154
|
+
}
|
|
155
|
+
|
|
156
|
+
v->data[i] = NUM2DBL(val);
|
|
157
|
+
return val;
|
|
158
|
+
}
|
|
159
|
+
|
|
160
|
+
/* Vector#to_a */
|
|
161
|
+
static VALUE rb_cvector_to_a(VALUE self)
|
|
162
|
+
{
|
|
163
|
+
CVector *v;
|
|
164
|
+
GET_CVECTOR(self, v);
|
|
165
|
+
VALUE ary = rb_ary_new_capa((long)v->size);
|
|
166
|
+
|
|
167
|
+
for (size_t i = 0; i < v->size; i++) {
|
|
168
|
+
rb_ary_push(ary, DBL2NUM(v->data[i]));
|
|
169
|
+
}
|
|
170
|
+
return ary;
|
|
171
|
+
}
|
|
172
|
+
|
|
173
|
+
/* Vector#sum */
|
|
174
|
+
static VALUE rb_cvector_sum(VALUE self)
|
|
175
|
+
{
|
|
176
|
+
CVector *v;
|
|
177
|
+
GET_CVECTOR(self, v);
|
|
178
|
+
return DBL2NUM(cvector_sum(v));
|
|
179
|
+
}
|
|
180
|
+
|
|
181
|
+
/* Vector#each */
|
|
182
|
+
static VALUE rb_cvector_each(VALUE self)
|
|
183
|
+
{
|
|
184
|
+
CVector *v;
|
|
185
|
+
GET_CVECTOR(self, v);
|
|
186
|
+
|
|
187
|
+
RETURN_ENUMERATOR(self, 0, 0);
|
|
188
|
+
|
|
189
|
+
for (size_t i = 0; i < v->size; i++) {
|
|
190
|
+
rb_yield(DBL2NUM(v->data[i]));
|
|
191
|
+
}
|
|
192
|
+
return self;
|
|
193
|
+
}
|
|
194
|
+
|
|
195
|
+
/* Vector#collect (map) */
|
|
196
|
+
static VALUE rb_cvector_collect(VALUE self)
|
|
197
|
+
{
|
|
198
|
+
CVector *v;
|
|
199
|
+
GET_CVECTOR(self, v);
|
|
200
|
+
|
|
201
|
+
RETURN_ENUMERATOR(self, 0, 0);
|
|
202
|
+
|
|
203
|
+
CVector *result = cvector_alloc(v->size);
|
|
204
|
+
result->is_col = v->is_col;
|
|
205
|
+
|
|
206
|
+
for (size_t i = 0; i < v->size; i++) {
|
|
207
|
+
VALUE val = rb_yield(DBL2NUM(v->data[i]));
|
|
208
|
+
result->data[i] = NUM2DBL(val);
|
|
209
|
+
}
|
|
210
|
+
|
|
211
|
+
return TypedData_Wrap_Struct(cClassifierVector, &cvector_type, result);
|
|
212
|
+
}
|
|
213
|
+
|
|
214
|
+
/* Vector#normalize */
|
|
215
|
+
static VALUE rb_cvector_normalize(VALUE self)
|
|
216
|
+
{
|
|
217
|
+
CVector *v;
|
|
218
|
+
GET_CVECTOR(self, v);
|
|
219
|
+
CVector *result = cvector_normalize(v);
|
|
220
|
+
return TypedData_Wrap_Struct(cClassifierVector, &cvector_type, result);
|
|
221
|
+
}
|
|
222
|
+
|
|
223
|
+
/* Vector#row - return self as row vector */
|
|
224
|
+
static VALUE rb_cvector_row(VALUE self)
|
|
225
|
+
{
|
|
226
|
+
CVector *v;
|
|
227
|
+
GET_CVECTOR(self, v);
|
|
228
|
+
|
|
229
|
+
CVector *result = cvector_alloc(v->size);
|
|
230
|
+
memcpy(result->data, v->data, v->size * sizeof(double));
|
|
231
|
+
result->is_col = 0;
|
|
232
|
+
|
|
233
|
+
return TypedData_Wrap_Struct(cClassifierVector, &cvector_type, result);
|
|
234
|
+
}
|
|
235
|
+
|
|
236
|
+
/* Vector#col - return self as column vector */
|
|
237
|
+
static VALUE rb_cvector_col(VALUE self)
|
|
238
|
+
{
|
|
239
|
+
CVector *v;
|
|
240
|
+
GET_CVECTOR(self, v);
|
|
241
|
+
|
|
242
|
+
CVector *result = cvector_alloc(v->size);
|
|
243
|
+
memcpy(result->data, v->data, v->size * sizeof(double));
|
|
244
|
+
result->is_col = 1;
|
|
245
|
+
|
|
246
|
+
return TypedData_Wrap_Struct(cClassifierVector, &cvector_type, result);
|
|
247
|
+
}
|
|
248
|
+
|
|
249
|
+
/* Vector#* - dot product with vector, or matrix multiplication */
|
|
250
|
+
static VALUE rb_cvector_mul(VALUE self, VALUE other)
|
|
251
|
+
{
|
|
252
|
+
CVector *v;
|
|
253
|
+
GET_CVECTOR(self, v);
|
|
254
|
+
|
|
255
|
+
if (rb_obj_is_kind_of(other, cClassifierVector)) {
|
|
256
|
+
CVector *w;
|
|
257
|
+
GET_CVECTOR(other, w);
|
|
258
|
+
return DBL2NUM(cvector_dot(v, w));
|
|
259
|
+
} else if (RB_TYPE_P(other, T_FLOAT) || RB_TYPE_P(other, T_FIXNUM)) {
|
|
260
|
+
/* Scalar multiplication */
|
|
261
|
+
double scalar = NUM2DBL(other);
|
|
262
|
+
CVector *result = cvector_alloc(v->size);
|
|
263
|
+
result->is_col = v->is_col;
|
|
264
|
+
for (size_t i = 0; i < v->size; i++) {
|
|
265
|
+
result->data[i] = v->data[i] * scalar;
|
|
266
|
+
}
|
|
267
|
+
return TypedData_Wrap_Struct(cClassifierVector, &cvector_type, result);
|
|
268
|
+
}
|
|
269
|
+
|
|
270
|
+
rb_raise(rb_eTypeError, "Cannot multiply Vector with %s", rb_obj_classname(other));
|
|
271
|
+
return Qnil;
|
|
272
|
+
}
|
|
273
|
+
|
|
274
|
+
/* Vector#_dump for Marshal */
|
|
275
|
+
static VALUE rb_cvector_dump(VALUE self, VALUE depth)
|
|
276
|
+
{
|
|
277
|
+
CVector *v;
|
|
278
|
+
GET_CVECTOR(self, v);
|
|
279
|
+
VALUE ary = rb_cvector_to_a(self);
|
|
280
|
+
rb_ary_push(ary, v->is_col ? Qtrue : Qfalse);
|
|
281
|
+
return rb_marshal_dump(ary, Qnil);
|
|
282
|
+
}
|
|
283
|
+
|
|
284
|
+
/* Vector._load for Marshal */
|
|
285
|
+
static VALUE rb_cvector_s_load(VALUE klass, VALUE str)
|
|
286
|
+
{
|
|
287
|
+
VALUE ary = rb_marshal_load(str);
|
|
288
|
+
VALUE is_col = rb_ary_pop(ary);
|
|
289
|
+
VALUE result = rb_cvector_s_alloc(klass, ary);
|
|
290
|
+
CVector *v;
|
|
291
|
+
GET_CVECTOR(result, v);
|
|
292
|
+
v->is_col = RTEST(is_col) ? 1 : 0;
|
|
293
|
+
return result;
|
|
294
|
+
}
|
|
295
|
+
|
|
296
|
+
void Init_vector(void)
|
|
297
|
+
{
|
|
298
|
+
cClassifierVector = rb_define_class_under(mClassifierLinalg, "Vector", rb_cObject);
|
|
299
|
+
|
|
300
|
+
rb_define_alloc_func(cClassifierVector, rb_cvector_alloc);
|
|
301
|
+
rb_define_singleton_method(cClassifierVector, "alloc", rb_cvector_s_alloc, 1);
|
|
302
|
+
rb_define_singleton_method(cClassifierVector, "_load", rb_cvector_s_load, 1);
|
|
303
|
+
|
|
304
|
+
rb_define_method(cClassifierVector, "size", rb_cvector_size, 0);
|
|
305
|
+
rb_define_method(cClassifierVector, "[]", rb_cvector_aref, 1);
|
|
306
|
+
rb_define_method(cClassifierVector, "[]=", rb_cvector_aset, 2);
|
|
307
|
+
rb_define_method(cClassifierVector, "to_a", rb_cvector_to_a, 0);
|
|
308
|
+
rb_define_method(cClassifierVector, "sum", rb_cvector_sum, 0);
|
|
309
|
+
rb_define_method(cClassifierVector, "each", rb_cvector_each, 0);
|
|
310
|
+
rb_define_method(cClassifierVector, "collect", rb_cvector_collect, 0);
|
|
311
|
+
rb_define_alias(cClassifierVector, "map", "collect");
|
|
312
|
+
rb_define_method(cClassifierVector, "normalize", rb_cvector_normalize, 0);
|
|
313
|
+
rb_define_method(cClassifierVector, "row", rb_cvector_row, 0);
|
|
314
|
+
rb_define_method(cClassifierVector, "col", rb_cvector_col, 0);
|
|
315
|
+
rb_define_method(cClassifierVector, "*", rb_cvector_mul, 1);
|
|
316
|
+
rb_define_method(cClassifierVector, "_dump", rb_cvector_dump, 1);
|
|
317
|
+
|
|
318
|
+
rb_include_module(cClassifierVector, rb_mEnumerable);
|
|
319
|
+
}
|