classifier 2.0.0 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: fea14969bc8a61283823b0b0f5bae013af968caf4676c383155e3b8682b948de
4
- data.tar.gz: 4d626c85d084ff75eba2ff305673734a6f25b668e773b1b5a3a0630a6b68df96
3
+ metadata.gz: e82726ac5c6e619e701be4591c8ed29e8b9f9a88236ad9d8d602fe3f748dcf43
4
+ data.tar.gz: 3771f2d2fce4992ed0cd5b1ad3849cc3e1c4fa310a144c1056e5f9d0c49579ef
5
5
  SHA512:
6
- metadata.gz: ef53c06db3326b1b6ebc14255b4ba198286c06e291cba3afc67bba360ca766a173f89269405d216751806ca72f885a87ac80ec24a031053f8e6f2987e8e2267e
7
- data.tar.gz: 8f120a9b78e802e6fd3e7172fd311b476745e27d5b3d301dc8d140296a451875e5aa33a901514bfdd1bc96c656ad1a43cbb3935a05223cd38548a71ba6a3a1c1
6
+ metadata.gz: 913111c43ffd83a1a461023c6d1f4fef0f1efb7f4e591089db7dbf3d37a1cc98cb3b5de16888019ab1caf8db83ad828dbaf82d2cab237b991216bdb27f529d15
7
+ data.tar.gz: b7142000fa687a58481051921663665edad1ac3b183947e79cf63bb15a79a0ca7bc66cd76104fccea713162cf05e876bd609d5f4866a13ea2c184f8caa089547
data/CLAUDE.md CHANGED
@@ -11,21 +11,28 @@ Ruby gem providing text classification via two algorithms:
11
11
  ## Common Commands
12
12
 
13
13
  ```bash
14
- # Run all tests
15
- rake test
14
+ # Compile native C extension
15
+ bundle exec rake compile
16
+
17
+ # Run all tests (compiles first)
18
+ bundle exec rake test
16
19
 
17
20
  # Run a single test file
18
21
  ruby -Ilib test/bayes/bayesian_test.rb
19
22
  ruby -Ilib test/lsi/lsi_test.rb
20
23
 
21
- # Run tests with native Ruby vector (without GSL)
22
- NATIVE_VECTOR=true rake test
24
+ # Run tests with pure Ruby (no native extension)
25
+ NATIVE_VECTOR=true bundle exec rake test
26
+
27
+ # Run benchmarks
28
+ bundle exec rake benchmark
29
+ bundle exec rake benchmark:compare
23
30
 
24
31
  # Interactive console
25
- rake console
32
+ bundle exec rake console
26
33
 
27
34
  # Generate documentation
28
- rake doc
35
+ bundle exec rake doc
29
36
  ```
30
37
 
31
38
  ## Architecture
@@ -39,7 +46,7 @@ rake doc
39
46
 
40
47
  **LSI Classifier** (`lib/classifier/lsi.rb`)
41
48
  - Uses Singular Value Decomposition (SVD) for semantic analysis
42
- - Optional GSL gem for 10x faster matrix operations; falls back to pure Ruby SVD
49
+ - Native C extension for 5-50x faster matrix operations; falls back to pure Ruby
43
50
  - Key operations: `add_item`, `classify`, `find_related`, `search`
44
51
  - `auto_rebuild` option controls automatic index rebuilding after changes
45
52
 
@@ -49,15 +56,18 @@ rake doc
49
56
  - Uses `fast-stemmer` gem for Porter stemming
50
57
 
51
58
  **Vector Extensions** (`lib/classifier/extensions/vector.rb`)
52
- - Pure Ruby SVD implementation (`Matrix#SV_decomp`)
59
+ - Pure Ruby SVD implementation (`Matrix#SV_decomp`) - used as fallback
53
60
  - Vector normalization and magnitude calculations
54
61
 
55
- ### GSL Integration
62
+ ### Native C Extension (`ext/classifier/`)
63
+
64
+ LSI uses a native C extension for fast linear algebra operations:
65
+ - `Classifier::Linalg::Vector` - Vector operations (alloc, normalize, dot product)
66
+ - `Classifier::Linalg::Matrix` - Matrix operations (alloc, transpose, multiply)
67
+ - Jacobi SVD implementation for singular value decomposition
56
68
 
57
- LSI checks for the `gsl` gem at load time. When available:
58
- - Uses `GSL::Matrix` and `GSL::Vector` for faster operations
59
- - Serialization handled via `vector_serialize.rb`
60
- - Test without GSL: `NATIVE_VECTOR=true rake test`
69
+ Check current backend: `Classifier::LSI.backend` returns `:native` or `:ruby`
70
+ Force pure Ruby: `NATIVE_VECTOR=true bundle exec rake test`
61
71
 
62
72
  ### Content Nodes (`lib/classifier/lsi/content_node.rb`)
63
73
 
data/README.md CHANGED
@@ -6,11 +6,14 @@
6
6
 
7
7
  A Ruby library for text classification using Bayesian and Latent Semantic Indexing (LSI) algorithms.
8
8
 
9
+ **[Documentation](https://rubyclassifier.com/docs)** · **[Tutorials](https://rubyclassifier.com/docs/tutorials)** · **[Guides](https://rubyclassifier.com/docs/guides)**
10
+
9
11
  ## Table of Contents
10
12
 
11
13
  - [Installation](#installation)
12
14
  - [Bayesian Classifier](#bayesian-classifier)
13
15
  - [LSI (Latent Semantic Indexing)](#lsi-latent-semantic-indexing)
16
+ - [Persistence](#persistence)
14
17
  - [Performance](#performance)
15
18
  - [Development](#development)
16
19
  - [Contributing](#contributing)
@@ -36,47 +39,27 @@ Or install directly:
36
39
  gem install classifier
37
40
  ```
38
41
 
39
- ### Optional: GSL for Faster LSI
40
-
41
- For significantly faster LSI operations, install the [GNU Scientific Library](https://www.gnu.org/software/gsl/).
42
+ ### Native C Extension
42
43
 
43
- <details>
44
- <summary><strong>Ruby 3+</strong></summary>
44
+ The gem includes a native C extension for fast LSI operations. It compiles automatically during gem installation. No external dependencies are required.
45
45
 
46
- The released `gsl` gem doesn't support Ruby 3+. Install from source:
46
+ To verify the native extension is active:
47
47
 
48
- ```bash
49
- # Install GSL library
50
- brew install gsl # macOS
51
- apt-get install libgsl-dev # Ubuntu/Debian
52
-
53
- # Build and install the gem
54
- git clone https://github.com/cardmagic/rb-gsl.git
55
- cd rb-gsl
56
- git checkout fix/ruby-3.4-compatibility
57
- gem build gsl.gemspec
58
- gem install gsl-*.gem
48
+ ```ruby
49
+ require 'classifier'
50
+ puts Classifier::LSI.backend # => :native
59
51
  ```
60
- </details>
61
52
 
62
- <details>
63
- <summary><strong>Ruby 2.x</strong></summary>
53
+ To force pure Ruby mode (for debugging), set `NATIVE_VECTOR=true`. Despite its name, this variable disables the native C extension:
64
54
 
65
55
  ```bash
66
- # macOS
67
- brew install gsl
68
- gem install gsl
69
-
70
- # Ubuntu/Debian
71
- apt-get install libgsl-dev
72
- gem install gsl
56
+ NATIVE_VECTOR=true ruby your_script.rb
73
57
  ```
74
- </details>
75
58
 
76
- When GSL is installed, Classifier automatically uses it. To suppress the GSL notice:
59
+ To suppress the warning shown when the native extension isn't available:
77
60
 
78
61
  ```bash
79
- SUPPRESS_GSL_WARNING=true ruby your_script.rb
62
+ SUPPRESS_LSI_WARNING=true ruby your_script.rb
80
63
  ```
81
64
 
82
65
  ### Compatibility
@@ -111,29 +94,10 @@ classifier.classify "Congratulations! You've won a prize!"
111
94
  # => "Spam"
112
95
  ```
113
96
 
114
- ### Persistence with Madeleine
115
-
116
- ```ruby
117
- require 'classifier'
118
- require 'madeleine'
119
-
120
- m = SnapshotMadeleine.new("classifier_data") {
121
- Classifier::Bayes.new('Interesting', 'Uninteresting')
122
- }
123
-
124
- m.system.train_interesting "fascinating article about science"
125
- m.system.train_uninteresting "boring repetitive content"
126
- m.take_snapshot
127
-
128
- # Later, restore and use:
129
- m.system.classify "new scientific discovery"
130
- # => "Interesting"
131
- ```
132
-
133
97
  ### Learn More
134
98
 
135
- - [Bayesian Filtering Explained](http://www.process.com/precisemail/bayesian_filtering.htm)
136
- - [Wikipedia: Bayesian Filtering](http://en.wikipedia.org/wiki/Bayesian_filtering)
99
+ - [Bayes Basics Guide](https://rubyclassifier.com/docs/guides/bayes/basics) - In-depth documentation
100
+ - [Build a Spam Filter Tutorial](https://rubyclassifier.com/docs/tutorials/spam-filter) - Step-by-step guide
137
101
  - [Paul Graham: A Plan for Spam](http://www.paulgraham.com/spam.html)
138
102
 
139
103
  ## LSI (Latent Semantic Indexing)
@@ -176,33 +140,83 @@ lsi.search "programming", 3
176
140
 
177
141
  ### Learn More
178
142
 
143
+ - [LSI Basics Guide](https://rubyclassifier.com/docs/guides/lsi/basics) - In-depth documentation
179
144
  - [Wikipedia: Latent Semantic Analysis](http://en.wikipedia.org/wiki/Latent_semantic_analysis)
180
- - [C2 Wiki: Latent Semantic Indexing](http://www.c2.com/cgi/wiki?LatentSemanticIndexing)
145
+
146
+ ## Persistence
147
+
148
+ Save and load trained classifiers with pluggable storage backends. Works with both Bayes and LSI classifiers.
149
+
150
+ ### File Storage
151
+
152
+ ```ruby
153
+ require 'classifier'
154
+
155
+ classifier = Classifier::Bayes.new('Spam', 'Ham')
156
+ classifier.train_spam "Buy now! Limited offer!"
157
+ classifier.train_ham "Meeting tomorrow at 3pm"
158
+
159
+ # Configure storage and save
160
+ classifier.storage = Classifier::Storage::File.new(path: "spam_filter.json")
161
+ classifier.save
162
+
163
+ # Load later
164
+ loaded = Classifier::Bayes.load(storage: classifier.storage)
165
+ loaded.classify "Claim your prize now!"
166
+ # => "Spam"
167
+ ```
168
+
169
+ ### Custom Storage Backends
170
+
171
+ Create backends for Redis, PostgreSQL, S3, or any storage system:
172
+
173
+ ```ruby
174
+ class RedisStorage < Classifier::Storage::Base
175
+ def initialize(redis:, key:)
176
+ super()
177
+ @redis, @key = redis, key
178
+ end
179
+
180
+ def write(data) = @redis.set(@key, data)
181
+ def read = @redis.get(@key)
182
+ def delete = @redis.del(@key)
183
+ def exists? = @redis.exists?(@key)
184
+ end
185
+
186
+ # Use it
187
+ classifier.storage = RedisStorage.new(redis: Redis.new, key: "classifier:spam")
188
+ classifier.save
189
+ ```
190
+
191
+ ### Learn More
192
+
193
+ - [Persistence Guide](https://rubyclassifier.com/docs/guides/persistence/basics) - Full documentation with examples
181
194
 
182
195
  ## Performance
183
196
 
184
- ### GSL vs Native Ruby
197
+ ### Native C Extension vs Pure Ruby
185
198
 
186
- GSL provides dramatic speedups for LSI operations, especially `build_index` (SVD computation):
199
+ The native C extension provides dramatic speedups for LSI operations, especially `build_index` (SVD computation):
187
200
 
188
201
  | Documents | build_index | Overall |
189
202
  |-----------|-------------|---------|
190
- | 5 | 4x faster | 2.5x |
191
- | 10 | 24x faster | 5.5x |
192
- | 15 | 116x faster | 17x |
203
+ | 5 | 7x faster | 2.6x |
204
+ | 10 | 25x faster | 4.6x |
205
+ | 15 | 112x faster | 14.5x |
206
+ | 20 | 385x faster | 48.7x |
193
207
 
194
208
  <details>
195
- <summary>Detailed benchmark (15 documents)</summary>
209
+ <summary>Detailed benchmark (20 documents)</summary>
196
210
 
197
211
  ```
198
- Operation Native GSL Speedup
212
+ Operation Pure Ruby Native C Speedup
199
213
  ----------------------------------------------------------
200
- build_index 0.1412 0.0012 116.2x
201
- classify 0.0142 0.0049 2.9x
202
- search 0.0102 0.0026 3.9x
203
- find_related 0.0069 0.0016 4.2x
214
+ build_index 0.5540 0.0014 384.5x
215
+ classify 0.0190 0.0060 3.2x
216
+ search 0.0145 0.0037 3.9x
217
+ find_related 0.0098 0.0011 8.6x
204
218
  ----------------------------------------------------------
205
- TOTAL 0.1725 0.0104 16.6x
219
+ TOTAL 0.5973 0.0123 48.7x
206
220
  ```
207
221
  </details>
208
222
 
@@ -210,7 +224,7 @@ TOTAL 0.1725 0.0104 16.6x
210
224
 
211
225
  ```bash
212
226
  rake benchmark # Run with current configuration
213
- rake benchmark:compare # Compare GSL vs native Ruby
227
+ rake benchmark:compare # Compare native C vs pure Ruby
214
228
  ```
215
229
 
216
230
  ## Development
@@ -221,15 +235,16 @@ rake benchmark:compare # Compare GSL vs native Ruby
221
235
  git clone https://github.com/cardmagic/classifier.git
222
236
  cd classifier
223
237
  bundle install
238
+ rake compile # Compile native C extension
224
239
  ```
225
240
 
226
241
  ### Running Tests
227
242
 
228
243
  ```bash
229
- rake test # Run all tests
244
+ rake test # Run all tests (compiles first)
230
245
  ruby -Ilib test/bayes/bayesian_test.rb # Run specific test file
231
246
 
232
- # Test without GSL (pure Ruby)
247
+ # Test with pure Ruby (no native extension)
233
248
  NATIVE_VECTOR=true rake test
234
249
  ```
235
250
 
@@ -0,0 +1,25 @@
1
+ /*
2
+ * classifier_ext.c
3
+ * Main entry point for the Classifier native linear algebra extension
4
+ *
5
+ * This extension provides zero-dependency Vector, Matrix, and SVD
6
+ * implementations for the Classifier gem's LSI functionality.
7
+ */
8
+
9
+ #include "linalg.h"
10
+
11
+ VALUE mClassifierLinalg; /* Classifier::Linalg module; assigned in Init_classifier_ext() */
12
+ VALUE cClassifierVector; /* NOTE(review): not assigned in this file -- presumably set by Init_vector(); confirm */
13
+ VALUE cClassifierMatrix; /* NOTE(review): not assigned in this file -- presumably set by Init_matrix(); confirm */
14
+
15
+ void Init_classifier_ext(void) /* Extension entry point: defines Classifier::Linalg, then runs the Vector, Matrix and SVD initializers. */
16
+ {
17
+ /* Define the top-level Classifier module and the Classifier::Linalg module nested under it */
18
+ VALUE mClassifier = rb_define_module("Classifier");
19
+ mClassifierLinalg = rb_define_module_under(mClassifier, "Linalg");
20
+
21
+ /* Run the Vector, Matrix and SVD initializers, in that order, after mClassifierLinalg is set. NOTE(review): they presumably register their classes under it -- confirm in their sources. */
22
+ Init_vector();
23
+ Init_matrix();
24
+ Init_svd();
25
+ }
@@ -0,0 +1,15 @@
1
+ require 'mkmf'
2
+
3
+ # rubocop:disable Style/GlobalVars
4
+ if ENV['COVERAGE'] # any value of COVERAGE, even an empty string, selects the coverage build
5
+ # Coverage build: -O0 -g disables optimization and keeps debug info for accurate line coverage; --coverage is appended to both the compile and the link flags
6
+ $CFLAGS << ' -O0 -g --coverage -Wall'
7
+ $LDFLAGS << ' --coverage'
8
+ else
9
+ # Default build: optimize for speed. NOTE(review): -ffast-math relaxes IEEE floating-point semantics (NaN/Inf handling, reassociation), so results may differ slightly from the pure Ruby fallback -- confirm this is acceptable for the SVD code
10
+ $CFLAGS << ' -O3 -ffast-math -Wall'
11
+ end
12
+ # rubocop:enable Style/GlobalVars
13
+
14
+ # Create the Makefile for the 'classifier/classifier_ext' extension target
15
+ create_makefile('classifier/classifier_ext')
@@ -0,0 +1,64 @@
1
+ #ifndef CLASSIFIER_LINALG_H
2
+ #define CLASSIFIER_LINALG_H
3
+
4
+ #include <ruby.h>
5
+ #include <math.h>
6
+ #include <stdlib.h>
7
+ #include <string.h>
8
+
9
+ /* Epsilon for numerical comparisons (tolerance of 1e-10). NOTE(review): the code that uses it is not visible in this header -- confirm the value suits those comparisons. */
10
+ #define CLASSIFIER_EPSILON 1e-10
11
+
12
+ /* Vector structure: a buffer of doubles plus a row/column orientation flag */
13
+ typedef struct {
14
+ size_t size; /* presumably the number of doubles in data -- TODO confirm against cvector_alloc */
15
+ double *data; /* element storage */
16
+ int is_col; /* 0 = row vector, 1 = column vector */
17
+ } CVector;
18
+
19
+ /* Matrix structure: rows x cols matrix of doubles held in a single buffer */
20
+ typedef struct {
21
+ size_t rows;
22
+ size_t cols;
23
+ double *data; /* Row-major storage: element (i, j) is data[i * cols + j], as MAT_AT computes */
24
+ } CMatrix;
25
+
26
+ /* Ruby class and module references shared across the extension's source files (defined in classifier_ext.c) */
27
+ extern VALUE cClassifierVector;
28
+ extern VALUE cClassifierMatrix;
29
+ extern VALUE mClassifierLinalg;
30
+
31
+ /* Vector functions (implemented outside this header). NOTE(review): functions returning CVector * presumably hand back newly allocated vectors to be released with cvector_free -- confirm ownership in the implementation. */
32
+ void Init_vector(void);
33
+ CVector *cvector_alloc(size_t size);
34
+ void cvector_free(void *ptr); /* takes void *, presumably so it can serve as the TypedData free callback -- confirm */
35
+ double cvector_magnitude(CVector *v);
36
+ CVector *cvector_normalize(CVector *v);
37
+ double cvector_sum(CVector *v);
38
+ double cvector_dot(CVector *a, CVector *b);
39
+
40
+ /* Matrix functions (implemented outside this header). NOTE(review): functions returning CMatrix * or CVector * presumably allocate their result for the caller to free -- confirm ownership and dimension checks in the implementation. */
41
+ void Init_matrix(void);
42
+ CMatrix *cmatrix_alloc(size_t rows, size_t cols);
43
+ void cmatrix_free(void *ptr); /* takes void *, presumably so it can serve as the TypedData free callback -- confirm */
44
+ CMatrix *cmatrix_transpose(CMatrix *m);
45
+ CMatrix *cmatrix_multiply(CMatrix *a, CMatrix *b);
46
+ CVector *cmatrix_multiply_vector(CMatrix *m, CVector *v);
47
+ CMatrix *cmatrix_diagonal(CVector *v);
48
+
49
+ /* SVD functions (implemented outside this header). jacobi_svd takes input matrix a and returns results through the u, v and s out-parameters. NOTE(review): it presumably allocates *u, *v and *s for the caller to free -- confirm. */
50
+ void Init_svd(void);
51
+ void jacobi_svd(CMatrix *a, CMatrix **u, CMatrix **v, CVector **s);
52
+
53
+ /* TypedData descriptors for wrapped CVector / CMatrix objects (declared here, defined elsewhere in the extension) */
54
+ extern const rb_data_type_t cvector_type;
55
+ extern const rb_data_type_t cmatrix_type;
56
+
57
+ /* Helper macros: store in ptr the CVector / CMatrix wrapped by Ruby object obj, using TypedData_Get_Struct with the matching descriptor */
58
+ #define GET_CVECTOR(obj, ptr) TypedData_Get_Struct(obj, CVector, &cvector_type, ptr)
59
+ #define GET_CMATRIX(obj, ptr) TypedData_Get_Struct(obj, CMatrix, &cmatrix_type, ptr)
60
+
61
+ /* Matrix element access (row-major): expands to the lvalue m->data[i * m->cols + j]. No bounds checking; m is evaluated twice, so pass an expression without side effects. */
62
+ #define MAT_AT(m, i, j) ((m)->data[(i) * (m)->cols + (j)])
63
+
64
+ #endif /* CLASSIFIER_LINALG_H */