classifier 2.0.0 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CLAUDE.md +23 -13
- data/README.md +82 -67
- data/ext/classifier/classifier_ext.c +25 -0
- data/ext/classifier/extconf.rb +15 -0
- data/ext/classifier/linalg.h +64 -0
- data/ext/classifier/matrix.c +387 -0
- data/ext/classifier/svd.c +208 -0
- data/ext/classifier/vector.c +319 -0
- data/lib/classifier/bayes.rb +253 -33
- data/lib/classifier/errors.rb +16 -0
- data/lib/classifier/extensions/vector.rb +12 -4
- data/lib/classifier/lsi/content_node.rb +5 -5
- data/lib/classifier/lsi.rb +439 -141
- data/lib/classifier/storage/base.rb +50 -0
- data/lib/classifier/storage/file.rb +51 -0
- data/lib/classifier/storage/memory.rb +49 -0
- data/lib/classifier/storage.rb +9 -0
- data/lib/classifier.rb +2 -0
- data/sig/vendor/json.rbs +4 -0
- data/sig/vendor/mutex_m.rbs +16 -0
- data/test/test_helper.rb +2 -0
- metadata +36 -5
- data/lib/classifier/extensions/vector_serialize.rb +0 -18
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: e82726ac5c6e619e701be4591c8ed29e8b9f9a88236ad9d8d602fe3f748dcf43
|
|
4
|
+
data.tar.gz: 3771f2d2fce4992ed0cd5b1ad3849cc3e1c4fa310a144c1056e5f9d0c49579ef
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 913111c43ffd83a1a461023c6d1f4fef0f1efb7f4e591089db7dbf3d37a1cc98cb3b5de16888019ab1caf8db83ad828dbaf82d2cab237b991216bdb27f529d15
|
|
7
|
+
data.tar.gz: b7142000fa687a58481051921663665edad1ac3b183947e79cf63bb15a79a0ca7bc66cd76104fccea713162cf05e876bd609d5f4866a13ea2c184f8caa089547
|
data/CLAUDE.md
CHANGED
|
@@ -11,21 +11,28 @@ Ruby gem providing text classification via two algorithms:
|
|
|
11
11
|
## Common Commands
|
|
12
12
|
|
|
13
13
|
```bash
|
|
14
|
-
#
|
|
15
|
-
rake
|
|
14
|
+
# Compile native C extension
|
|
15
|
+
bundle exec rake compile
|
|
16
|
+
|
|
17
|
+
# Run all tests (compiles first)
|
|
18
|
+
bundle exec rake test
|
|
16
19
|
|
|
17
20
|
# Run a single test file
|
|
18
21
|
ruby -Ilib test/bayes/bayesian_test.rb
|
|
19
22
|
ruby -Ilib test/lsi/lsi_test.rb
|
|
20
23
|
|
|
21
|
-
# Run tests with
|
|
22
|
-
NATIVE_VECTOR=true rake test
|
|
24
|
+
# Run tests with pure Ruby (no native extension)
|
|
25
|
+
NATIVE_VECTOR=true bundle exec rake test
|
|
26
|
+
|
|
27
|
+
# Run benchmarks
|
|
28
|
+
bundle exec rake benchmark
|
|
29
|
+
bundle exec rake benchmark:compare
|
|
23
30
|
|
|
24
31
|
# Interactive console
|
|
25
|
-
rake console
|
|
32
|
+
bundle exec rake console
|
|
26
33
|
|
|
27
34
|
# Generate documentation
|
|
28
|
-
rake doc
|
|
35
|
+
bundle exec rake doc
|
|
29
36
|
```
|
|
30
37
|
|
|
31
38
|
## Architecture
|
|
@@ -39,7 +46,7 @@ rake doc
|
|
|
39
46
|
|
|
40
47
|
**LSI Classifier** (`lib/classifier/lsi.rb`)
|
|
41
48
|
- Uses Singular Value Decomposition (SVD) for semantic analysis
|
|
42
|
-
-
|
|
49
|
+
- Native C extension for 5-50x faster matrix operations; falls back to pure Ruby
|
|
43
50
|
- Key operations: `add_item`, `classify`, `find_related`, `search`
|
|
44
51
|
- `auto_rebuild` option controls automatic index rebuilding after changes
|
|
45
52
|
|
|
@@ -49,15 +56,18 @@ rake doc
|
|
|
49
56
|
- Uses `fast-stemmer` gem for Porter stemming
|
|
50
57
|
|
|
51
58
|
**Vector Extensions** (`lib/classifier/extensions/vector.rb`)
|
|
52
|
-
- Pure Ruby SVD implementation (`Matrix#SV_decomp`)
|
|
59
|
+
- Pure Ruby SVD implementation (`Matrix#SV_decomp`) - used as fallback
|
|
53
60
|
- Vector normalization and magnitude calculations
|
|
54
61
|
|
|
55
|
-
###
|
|
62
|
+
### Native C Extension (`ext/classifier/`)
|
|
63
|
+
|
|
64
|
+
LSI uses a native C extension for fast linear algebra operations:
|
|
65
|
+
- `Classifier::Linalg::Vector` - Vector operations (alloc, normalize, dot product)
|
|
66
|
+
- `Classifier::Linalg::Matrix` - Matrix operations (alloc, transpose, multiply)
|
|
67
|
+
- Jacobi SVD implementation for singular value decomposition
|
|
56
68
|
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
- Serialization handled via `vector_serialize.rb`
|
|
60
|
-
- Test without GSL: `NATIVE_VECTOR=true rake test`
|
|
69
|
+
Check current backend: `Classifier::LSI.backend` returns `:native` or `:ruby`
|
|
70
|
+
Force pure Ruby: `NATIVE_VECTOR=true bundle exec rake test`
|
|
61
71
|
|
|
62
72
|
### Content Nodes (`lib/classifier/lsi/content_node.rb`)
|
|
63
73
|
|
data/README.md
CHANGED
|
@@ -6,11 +6,14 @@
|
|
|
6
6
|
|
|
7
7
|
A Ruby library for text classification using Bayesian and Latent Semantic Indexing (LSI) algorithms.
|
|
8
8
|
|
|
9
|
+
**[Documentation](https://rubyclassifier.com/docs)** · **[Tutorials](https://rubyclassifier.com/docs/tutorials)** · **[Guides](https://rubyclassifier.com/docs/guides)**
|
|
10
|
+
|
|
9
11
|
## Table of Contents
|
|
10
12
|
|
|
11
13
|
- [Installation](#installation)
|
|
12
14
|
- [Bayesian Classifier](#bayesian-classifier)
|
|
13
15
|
- [LSI (Latent Semantic Indexing)](#lsi-latent-semantic-indexing)
|
|
16
|
+
- [Persistence](#persistence)
|
|
14
17
|
- [Performance](#performance)
|
|
15
18
|
- [Development](#development)
|
|
16
19
|
- [Contributing](#contributing)
|
|
@@ -36,47 +39,27 @@ Or install directly:
|
|
|
36
39
|
gem install classifier
|
|
37
40
|
```
|
|
38
41
|
|
|
39
|
-
###
|
|
40
|
-
|
|
41
|
-
For significantly faster LSI operations, install the [GNU Scientific Library](https://www.gnu.org/software/gsl/).
|
|
42
|
+
### Native C Extension
|
|
42
43
|
|
|
43
|
-
|
|
44
|
-
<summary><strong>Ruby 3+</strong></summary>
|
|
44
|
+
The gem includes a native C extension for fast LSI operations. It compiles automatically during gem installation. No external dependencies are required.
|
|
45
45
|
|
|
46
|
-
|
|
46
|
+
To verify the native extension is active:
|
|
47
47
|
|
|
48
|
-
```
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
apt-get install libgsl-dev # Ubuntu/Debian
|
|
52
|
-
|
|
53
|
-
# Build and install the gem
|
|
54
|
-
git clone https://github.com/cardmagic/rb-gsl.git
|
|
55
|
-
cd rb-gsl
|
|
56
|
-
git checkout fix/ruby-3.4-compatibility
|
|
57
|
-
gem build gsl.gemspec
|
|
58
|
-
gem install gsl-*.gem
|
|
48
|
+
```ruby
|
|
49
|
+
require 'classifier'
|
|
50
|
+
puts Classifier::LSI.backend # => :native
|
|
59
51
|
```
|
|
60
|
-
</details>
|
|
61
52
|
|
|
62
|
-
|
|
63
|
-
<summary><strong>Ruby 2.x</strong></summary>
|
|
53
|
+
To force pure Ruby mode (for debugging):
|
|
64
54
|
|
|
65
55
|
```bash
|
|
66
|
-
|
|
67
|
-
brew install gsl
|
|
68
|
-
gem install gsl
|
|
69
|
-
|
|
70
|
-
# Ubuntu/Debian
|
|
71
|
-
apt-get install libgsl-dev
|
|
72
|
-
gem install gsl
|
|
56
|
+
NATIVE_VECTOR=true ruby your_script.rb
|
|
73
57
|
```
|
|
74
|
-
</details>
|
|
75
58
|
|
|
76
|
-
|
|
59
|
+
To suppress the warning when the native extension isn't available:
|
|
77
60
|
|
|
78
61
|
```bash
|
|
79
|
-
|
|
62
|
+
SUPPRESS_LSI_WARNING=true ruby your_script.rb
|
|
80
63
|
```
|
|
81
64
|
|
|
82
65
|
### Compatibility
|
|
@@ -111,29 +94,10 @@ classifier.classify "Congratulations! You've won a prize!"
|
|
|
111
94
|
# => "Spam"
|
|
112
95
|
```
|
|
113
96
|
|
|
114
|
-
### Persistence with Madeleine
|
|
115
|
-
|
|
116
|
-
```ruby
|
|
117
|
-
require 'classifier'
|
|
118
|
-
require 'madeleine'
|
|
119
|
-
|
|
120
|
-
m = SnapshotMadeleine.new("classifier_data") {
|
|
121
|
-
Classifier::Bayes.new('Interesting', 'Uninteresting')
|
|
122
|
-
}
|
|
123
|
-
|
|
124
|
-
m.system.train_interesting "fascinating article about science"
|
|
125
|
-
m.system.train_uninteresting "boring repetitive content"
|
|
126
|
-
m.take_snapshot
|
|
127
|
-
|
|
128
|
-
# Later, restore and use:
|
|
129
|
-
m.system.classify "new scientific discovery"
|
|
130
|
-
# => "Interesting"
|
|
131
|
-
```
|
|
132
|
-
|
|
133
97
|
### Learn More
|
|
134
98
|
|
|
135
|
-
- [
|
|
136
|
-
- [
|
|
99
|
+
- [Bayes Basics Guide](https://rubyclassifier.com/docs/guides/bayes/basics) - In-depth documentation
|
|
100
|
+
- [Build a Spam Filter Tutorial](https://rubyclassifier.com/docs/tutorials/spam-filter) - Step-by-step guide
|
|
137
101
|
- [Paul Graham: A Plan for Spam](http://www.paulgraham.com/spam.html)
|
|
138
102
|
|
|
139
103
|
## LSI (Latent Semantic Indexing)
|
|
@@ -176,33 +140,83 @@ lsi.search "programming", 3
|
|
|
176
140
|
|
|
177
141
|
### Learn More
|
|
178
142
|
|
|
143
|
+
- [LSI Basics Guide](https://rubyclassifier.com/docs/guides/lsi/basics) - In-depth documentation
|
|
179
144
|
- [Wikipedia: Latent Semantic Analysis](http://en.wikipedia.org/wiki/Latent_semantic_analysis)
|
|
180
|
-
|
|
145
|
+
|
|
146
|
+
## Persistence
|
|
147
|
+
|
|
148
|
+
Save and load trained classifiers with pluggable storage backends. Works with both Bayes and LSI classifiers.
|
|
149
|
+
|
|
150
|
+
### File Storage
|
|
151
|
+
|
|
152
|
+
```ruby
|
|
153
|
+
require 'classifier'
|
|
154
|
+
|
|
155
|
+
classifier = Classifier::Bayes.new('Spam', 'Ham')
|
|
156
|
+
classifier.train_spam "Buy now! Limited offer!"
|
|
157
|
+
classifier.train_ham "Meeting tomorrow at 3pm"
|
|
158
|
+
|
|
159
|
+
# Configure storage and save
|
|
160
|
+
classifier.storage = Classifier::Storage::File.new(path: "spam_filter.json")
|
|
161
|
+
classifier.save
|
|
162
|
+
|
|
163
|
+
# Load later
|
|
164
|
+
loaded = Classifier::Bayes.load(storage: classifier.storage)
|
|
165
|
+
loaded.classify "Claim your prize now!"
|
|
166
|
+
# => "Spam"
|
|
167
|
+
```
|
|
168
|
+
|
|
169
|
+
### Custom Storage Backends
|
|
170
|
+
|
|
171
|
+
Create backends for Redis, PostgreSQL, S3, or any storage system:
|
|
172
|
+
|
|
173
|
+
```ruby
|
|
174
|
+
class RedisStorage < Classifier::Storage::Base
|
|
175
|
+
def initialize(redis:, key:)
|
|
176
|
+
super()
|
|
177
|
+
@redis, @key = redis, key
|
|
178
|
+
end
|
|
179
|
+
|
|
180
|
+
def write(data) = @redis.set(@key, data)
|
|
181
|
+
def read = @redis.get(@key)
|
|
182
|
+
def delete = @redis.del(@key)
|
|
183
|
+
def exists? = @redis.exists?(@key)
|
|
184
|
+
end
|
|
185
|
+
|
|
186
|
+
# Use it
|
|
187
|
+
classifier.storage = RedisStorage.new(redis: Redis.new, key: "classifier:spam")
|
|
188
|
+
classifier.save
|
|
189
|
+
```
|
|
190
|
+
|
|
191
|
+
### Learn More
|
|
192
|
+
|
|
193
|
+
- [Persistence Guide](https://rubyclassifier.com/docs/guides/persistence/basics) - Full documentation with examples
|
|
181
194
|
|
|
182
195
|
## Performance
|
|
183
196
|
|
|
184
|
-
###
|
|
197
|
+
### Native C Extension vs Pure Ruby
|
|
185
198
|
|
|
186
|
-
|
|
199
|
+
The native C extension provides dramatic speedups for LSI operations, especially `build_index` (SVD computation):
|
|
187
200
|
|
|
188
201
|
| Documents | build_index | Overall |
|
|
189
202
|
|-----------|-------------|---------|
|
|
190
|
-
| 5 |
|
|
191
|
-
| 10 |
|
|
192
|
-
| 15 |
|
|
203
|
+
| 5 | 7x faster | 2.6x |
|
|
204
|
+
| 10 | 25x faster | 4.6x |
|
|
205
|
+
| 15 | 112x faster | 14.5x |
|
|
206
|
+
| 20 | 385x faster | 48.7x |
|
|
193
207
|
|
|
194
208
|
<details>
|
|
195
|
-
<summary>Detailed benchmark (
|
|
209
|
+
<summary>Detailed benchmark (20 documents)</summary>
|
|
196
210
|
|
|
197
211
|
```
|
|
198
|
-
Operation
|
|
212
|
+
Operation Pure Ruby Native C Speedup
|
|
199
213
|
----------------------------------------------------------
|
|
200
|
-
build_index 0.
|
|
201
|
-
classify 0.
|
|
202
|
-
search 0.
|
|
203
|
-
find_related 0.
|
|
214
|
+
build_index 0.5540 0.0014 384.5x
|
|
215
|
+
classify 0.0190 0.0060 3.2x
|
|
216
|
+
search 0.0145 0.0037 3.9x
|
|
217
|
+
find_related 0.0098 0.0011 8.6x
|
|
204
218
|
----------------------------------------------------------
|
|
205
|
-
TOTAL 0.
|
|
219
|
+
TOTAL 0.5973 0.0123 48.7x
|
|
206
220
|
```
|
|
207
221
|
</details>
|
|
208
222
|
|
|
@@ -210,7 +224,7 @@ TOTAL 0.1725 0.0104 16.6x
|
|
|
210
224
|
|
|
211
225
|
```bash
|
|
212
226
|
rake benchmark # Run with current configuration
|
|
213
|
-
rake benchmark:compare # Compare
|
|
227
|
+
rake benchmark:compare # Compare native C vs pure Ruby
|
|
214
228
|
```
|
|
215
229
|
|
|
216
230
|
## Development
|
|
@@ -221,15 +235,16 @@ rake benchmark:compare # Compare GSL vs native Ruby
|
|
|
221
235
|
git clone https://github.com/cardmagic/classifier.git
|
|
222
236
|
cd classifier
|
|
223
237
|
bundle install
|
|
238
|
+
rake compile # Compile native C extension
|
|
224
239
|
```
|
|
225
240
|
|
|
226
241
|
### Running Tests
|
|
227
242
|
|
|
228
243
|
```bash
|
|
229
|
-
rake test # Run all tests
|
|
244
|
+
rake test # Run all tests (compiles first)
|
|
230
245
|
ruby -Ilib test/bayes/bayesian_test.rb # Run specific test file
|
|
231
246
|
|
|
232
|
-
# Test
|
|
247
|
+
# Test with pure Ruby (no native extension)
|
|
233
248
|
NATIVE_VECTOR=true rake test
|
|
234
249
|
```
|
|
235
250
|
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* classifier_ext.c
|
|
3
|
+
* Main entry point for the Classifier native linear algebra extension
|
|
4
|
+
*
|
|
5
|
+
* This extension provides zero-dependency Vector, Matrix, and SVD
|
|
6
|
+
* implementations for the Classifier gem's LSI functionality.
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
#include "linalg.h"
|
|
10
|
+
|
|
11
|
+
VALUE mClassifierLinalg;
|
|
12
|
+
VALUE cClassifierVector;
|
|
13
|
+
VALUE cClassifierMatrix;
|
|
14
|
+
|
|
15
|
+
void Init_classifier_ext(void)
|
|
16
|
+
{
|
|
17
|
+
/* Define Classifier::Linalg module */
|
|
18
|
+
VALUE mClassifier = rb_define_module("Classifier");
|
|
19
|
+
mClassifierLinalg = rb_define_module_under(mClassifier, "Linalg");
|
|
20
|
+
|
|
21
|
+
/* Initialize Vector and Matrix classes */
|
|
22
|
+
Init_vector();
|
|
23
|
+
Init_matrix();
|
|
24
|
+
Init_svd();
|
|
25
|
+
}
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
require 'mkmf'
|
|
2
|
+
|
|
3
|
+
# rubocop:disable Style/GlobalVars
|
|
4
|
+
if ENV['COVERAGE']
|
|
5
|
+
# Coverage flags: disable optimization for accurate line coverage
|
|
6
|
+
$CFLAGS << ' -O0 -g --coverage -Wall'
|
|
7
|
+
$LDFLAGS << ' --coverage'
|
|
8
|
+
else
|
|
9
|
+
# Optimization flags for performance
|
|
10
|
+
$CFLAGS << ' -O3 -ffast-math -Wall'
|
|
11
|
+
end
|
|
12
|
+
# rubocop:enable Style/GlobalVars
|
|
13
|
+
|
|
14
|
+
# Create the Makefile
|
|
15
|
+
create_makefile('classifier/classifier_ext')
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
#ifndef CLASSIFIER_LINALG_H
|
|
2
|
+
#define CLASSIFIER_LINALG_H
|
|
3
|
+
|
|
4
|
+
#include <ruby.h>
|
|
5
|
+
#include <math.h>
|
|
6
|
+
#include <stdlib.h>
|
|
7
|
+
#include <string.h>
|
|
8
|
+
|
|
9
|
+
/* Epsilon for numerical comparisons */
|
|
10
|
+
#define CLASSIFIER_EPSILON 1e-10
|
|
11
|
+
|
|
12
|
+
/* Vector structure */
|
|
13
|
+
typedef struct {
|
|
14
|
+
size_t size;
|
|
15
|
+
double *data;
|
|
16
|
+
int is_col; /* 0 = row vector, 1 = column vector */
|
|
17
|
+
} CVector;
|
|
18
|
+
|
|
19
|
+
/* Matrix structure */
|
|
20
|
+
typedef struct {
|
|
21
|
+
size_t rows;
|
|
22
|
+
size_t cols;
|
|
23
|
+
double *data; /* Row-major storage */
|
|
24
|
+
} CMatrix;
|
|
25
|
+
|
|
26
|
+
/* Ruby class references */
|
|
27
|
+
extern VALUE cClassifierVector;
|
|
28
|
+
extern VALUE cClassifierMatrix;
|
|
29
|
+
extern VALUE mClassifierLinalg;
|
|
30
|
+
|
|
31
|
+
/* Vector functions */
|
|
32
|
+
void Init_vector(void);
|
|
33
|
+
CVector *cvector_alloc(size_t size);
|
|
34
|
+
void cvector_free(void *ptr);
|
|
35
|
+
double cvector_magnitude(CVector *v);
|
|
36
|
+
CVector *cvector_normalize(CVector *v);
|
|
37
|
+
double cvector_sum(CVector *v);
|
|
38
|
+
double cvector_dot(CVector *a, CVector *b);
|
|
39
|
+
|
|
40
|
+
/* Matrix functions */
|
|
41
|
+
void Init_matrix(void);
|
|
42
|
+
CMatrix *cmatrix_alloc(size_t rows, size_t cols);
|
|
43
|
+
void cmatrix_free(void *ptr);
|
|
44
|
+
CMatrix *cmatrix_transpose(CMatrix *m);
|
|
45
|
+
CMatrix *cmatrix_multiply(CMatrix *a, CMatrix *b);
|
|
46
|
+
CVector *cmatrix_multiply_vector(CMatrix *m, CVector *v);
|
|
47
|
+
CMatrix *cmatrix_diagonal(CVector *v);
|
|
48
|
+
|
|
49
|
+
/* SVD functions */
|
|
50
|
+
void Init_svd(void);
|
|
51
|
+
void jacobi_svd(CMatrix *a, CMatrix **u, CMatrix **v, CVector **s);
|
|
52
|
+
|
|
53
|
+
/* TypedData definitions */
|
|
54
|
+
extern const rb_data_type_t cvector_type;
|
|
55
|
+
extern const rb_data_type_t cmatrix_type;
|
|
56
|
+
|
|
57
|
+
/* Helper macros */
|
|
58
|
+
#define GET_CVECTOR(obj, ptr) TypedData_Get_Struct(obj, CVector, &cvector_type, ptr)
|
|
59
|
+
#define GET_CMATRIX(obj, ptr) TypedData_Get_Struct(obj, CMatrix, &cmatrix_type, ptr)
|
|
60
|
+
|
|
61
|
+
/* Matrix element access (row-major) */
|
|
62
|
+
#define MAT_AT(m, i, j) ((m)->data[(i) * (m)->cols + (j)])
|
|
63
|
+
|
|
64
|
+
#endif /* CLASSIFIER_LINALG_H */
|