classifier 1.4.4 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: f5bbc7714c3f1b5b6bf2b81484e071eb34c5510455d9520f8bd743ffe25a3bb6
4
- data.tar.gz: f9236a1e0c086e1bda93645d94ea21f6634b76acbb4c99e62709e1b011311509
3
+ metadata.gz: e82726ac5c6e619e701be4591c8ed29e8b9f9a88236ad9d8d602fe3f748dcf43
4
+ data.tar.gz: 3771f2d2fce4992ed0cd5b1ad3849cc3e1c4fa310a144c1056e5f9d0c49579ef
5
5
  SHA512:
6
- metadata.gz: 79596fac37a6591587859335d4dbb280f038c1504a6fba9a48f7f3d5c83c50bee959887a827588d7418503ff141bff81401125604ddc104af35863dd84842e28
7
- data.tar.gz: 5cd28357e92c65e10630700a65097350030e401f16d4aacaaf9407e8bd8dd2d200739965bab52d547054a90bfe25ab53935e6544f06ebe3f61d356cc972d8ead
6
+ metadata.gz: 913111c43ffd83a1a461023c6d1f4fef0f1efb7f4e591089db7dbf3d37a1cc98cb3b5de16888019ab1caf8db83ad828dbaf82d2cab237b991216bdb27f529d15
7
+ data.tar.gz: b7142000fa687a58481051921663665edad1ac3b183947e79cf63bb15a79a0ca7bc66cd76104fccea713162cf05e876bd609d5f4866a13ea2c184f8caa089547
data/CLAUDE.md ADDED
@@ -0,0 +1,77 @@
1
+ # CLAUDE.md
2
+
3
+ This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
4
+
5
+ ## Project Overview
6
+
7
+ Ruby gem providing text classification via two algorithms:
8
+ - **Bayes** (`Classifier::Bayes`) - Naive Bayesian classification
9
+ - **LSI** (`Classifier::LSI`) - Latent Semantic Indexing for semantic classification, clustering, and search
10
+
11
+ ## Common Commands
12
+
13
+ ```bash
14
+ # Compile native C extension
15
+ bundle exec rake compile
16
+
17
+ # Run all tests (compiles first)
18
+ bundle exec rake test
19
+
20
+ # Run a single test file
21
+ ruby -Ilib test/bayes/bayesian_test.rb
22
+ ruby -Ilib test/lsi/lsi_test.rb
23
+
24
+ # Run tests with pure Ruby (no native extension)
25
+ NATIVE_VECTOR=true bundle exec rake test
26
+
27
+ # Run benchmarks
28
+ bundle exec rake benchmark
29
+ bundle exec rake benchmark:compare
30
+
31
+ # Interactive console
32
+ bundle exec rake console
33
+
34
+ # Generate documentation
35
+ bundle exec rake doc
36
+ ```
37
+
38
+ ## Architecture
39
+
40
+ ### Core Components
41
+
42
+ **Bayesian Classifier** (`lib/classifier/bayes.rb`)
43
+ - Train with `train(category, text)` or dynamic methods like `train_spam(text)`
44
+ - Classify with `classify(text)` returning the best category
45
+ - Uses log probabilities for numerical stability
46
+
47
+ **LSI Classifier** (`lib/classifier/lsi.rb`)
48
+ - Uses Singular Value Decomposition (SVD) for semantic analysis
49
+ - Native C extension for 5-50x faster matrix operations; falls back to pure Ruby
50
+ - Key operations: `add_item`, `classify`, `find_related`, `search`
51
+ - `auto_rebuild` option controls automatic index rebuilding after changes
52
+
53
+ **String Extensions** (`lib/classifier/extensions/word_hash.rb`)
54
+ - `word_hash` / `clean_word_hash` - tokenize text to stemmed word frequencies
55
+ - `CORPUS_SKIP_WORDS` - stopwords filtered during tokenization
56
+ - Uses `fast-stemmer` gem for Porter stemming
57
+
58
+ **Vector Extensions** (`lib/classifier/extensions/vector.rb`)
59
+ - Pure Ruby SVD implementation (`Matrix#SV_decomp`) - used as fallback
60
+ - Vector normalization and magnitude calculations
61
+
62
+ ### Native C Extension (`ext/classifier/`)
63
+
64
+ LSI uses a native C extension for fast linear algebra operations:
65
+ - `Classifier::Linalg::Vector` - Vector operations (alloc, normalize, dot product)
66
+ - `Classifier::Linalg::Matrix` - Matrix operations (alloc, transpose, multiply)
67
+ - Jacobi SVD implementation for singular value decomposition
68
+
69
+ Check current backend: `Classifier::LSI.backend` returns `:native` or `:ruby`
70
+ Force pure Ruby: `NATIVE_VECTOR=true bundle exec rake test` (despite its name, setting `NATIVE_VECTOR=true` disables the native C extension)
71
+
72
+ ### Content Nodes (`lib/classifier/lsi/content_node.rb`)
73
+
74
+ Internal data structure storing:
75
+ - `word_hash` - term frequencies
76
+ - `raw_vector` / `raw_norm` - initial vector representation
77
+ - `lsi_vector` / `lsi_norm` - reduced dimensionality representation after SVD
data/README.md ADDED
@@ -0,0 +1,274 @@
1
+ # Classifier
2
+
3
+ [![Gem Version](https://badge.fury.io/rb/classifier.svg)](https://badge.fury.io/rb/classifier)
4
+ [![CI](https://github.com/cardmagic/classifier/actions/workflows/ruby.yml/badge.svg)](https://github.com/cardmagic/classifier/actions/workflows/ruby.yml)
5
+ [![License: LGPL](https://img.shields.io/badge/License-LGPL_2.1-blue.svg)](https://opensource.org/licenses/LGPL-2.1)
6
+
7
+ A Ruby library for text classification using Bayesian and Latent Semantic Indexing (LSI) algorithms.
8
+
9
+ **[Documentation](https://rubyclassifier.com/docs)** · **[Tutorials](https://rubyclassifier.com/docs/tutorials)** · **[Guides](https://rubyclassifier.com/docs/guides)**
10
+
11
+ ## Table of Contents
12
+
13
+ - [Installation](#installation)
14
+ - [Bayesian Classifier](#bayesian-classifier)
15
+ - [LSI (Latent Semantic Indexing)](#lsi-latent-semantic-indexing)
16
+ - [Persistence](#persistence)
17
+ - [Performance](#performance)
18
+ - [Development](#development)
19
+ - [Contributing](#contributing)
20
+ - [License](#license)
21
+
22
+ ## Installation
23
+
24
+ Add to your Gemfile:
25
+
26
+ ```ruby
27
+ gem 'classifier'
28
+ ```
29
+
30
+ Then run:
31
+
32
+ ```bash
33
+ bundle install
34
+ ```
35
+
36
+ Or install directly:
37
+
38
+ ```bash
39
+ gem install classifier
40
+ ```
41
+
42
+ ### Native C Extension
43
+
44
+ The gem includes a native C extension for fast LSI operations. It compiles automatically during gem installation. No external dependencies are required.
45
+
46
+ To verify the native extension is active:
47
+
48
+ ```ruby
49
+ require 'classifier'
50
+ puts Classifier::LSI.backend # => :native
51
+ ```
52
+
53
+ To force pure Ruby mode (for debugging):
54
+
55
+ ```bash
56
+ NATIVE_VECTOR=true ruby your_script.rb
57
+ ```
58
+
59
+ To suppress the warning when the native extension isn't available:
60
+
61
+ ```bash
62
+ SUPPRESS_LSI_WARNING=true ruby your_script.rb
63
+ ```
64
+
65
+ ### Compatibility
66
+
67
+ | Ruby Version | Status |
68
+ |--------------|--------|
69
+ | 4.0 | Supported |
70
+ | 3.4 | Supported |
71
+ | 3.3 | Supported |
72
+ | 3.2 | Supported |
73
+ | 3.1 | EOL (unsupported) |
74
+
75
+ ## Bayesian Classifier
76
+
77
+ Fast, accurate classification with modest memory requirements. Ideal for spam filtering, sentiment analysis, and content categorization.
78
+
79
+ ### Quick Start
80
+
81
+ ```ruby
82
+ require 'classifier'
83
+
84
+ classifier = Classifier::Bayes.new('Spam', 'Ham')
85
+
86
+ # Train the classifier
87
+ classifier.train_spam "Buy cheap viagra now! Limited offer!"
88
+ classifier.train_spam "You've won a million dollars! Claim now!"
89
+ classifier.train_ham "Meeting scheduled for tomorrow at 10am"
90
+ classifier.train_ham "Please review the attached document"
91
+
92
+ # Classify new text
93
+ classifier.classify "Congratulations! You've won a prize!"
94
+ # => "Spam"
95
+ ```
96
+
97
+ ### Learn More
98
+
99
+ - [Bayes Basics Guide](https://rubyclassifier.com/docs/guides/bayes/basics) - In-depth documentation
100
+ - [Build a Spam Filter Tutorial](https://rubyclassifier.com/docs/tutorials/spam-filter) - Step-by-step guide
101
+ - [Paul Graham: A Plan for Spam](http://www.paulgraham.com/spam.html)
102
+
103
+ ## LSI (Latent Semantic Indexing)
104
+
105
+ Semantic analysis using Singular Value Decomposition (SVD). More flexible than Bayesian classifiers, providing search, clustering, and classification based on meaning rather than just keywords.
106
+
107
+ ### Quick Start
108
+
109
+ ```ruby
110
+ require 'classifier'
111
+
112
+ lsi = Classifier::LSI.new
113
+
114
+ # Add documents with categories
115
+ lsi.add_item "Dogs are loyal pets that love to play fetch", :pets
116
+ lsi.add_item "Cats are independent and love to nap", :pets
117
+ lsi.add_item "Ruby is a dynamic programming language", :programming
118
+ lsi.add_item "Python is great for data science", :programming
119
+
120
+ # Classify new text
121
+ lsi.classify "My puppy loves to run around"
122
+ # => :pets
123
+
124
+ # Get classification with confidence score
125
+ lsi.classify_with_confidence "Learning to code in Ruby"
126
+ # => [:programming, 0.89]
127
+ ```
128
+
129
+ ### Search and Discovery
130
+
131
+ ```ruby
132
+ # Find similar documents
133
+ lsi.find_related "Dogs are great companions", 2
134
+ # => ["Dogs are loyal pets that love to play fetch", "Cats are independent..."]
135
+
136
+ # Search by keyword
137
+ lsi.search "programming", 3
138
+ # => ["Ruby is a dynamic programming language", "Python is great for..."]
139
+ ```
140
+
141
+ ### Learn More
142
+
143
+ - [LSI Basics Guide](https://rubyclassifier.com/docs/guides/lsi/basics) - In-depth documentation
144
+ - [Wikipedia: Latent Semantic Analysis](http://en.wikipedia.org/wiki/Latent_semantic_analysis)
145
+
146
+ ## Persistence
147
+
148
+ Save and load trained classifiers with pluggable storage backends. Works with both Bayes and LSI classifiers.
149
+
150
+ ### File Storage
151
+
152
+ ```ruby
153
+ require 'classifier'
154
+
155
+ classifier = Classifier::Bayes.new('Spam', 'Ham')
156
+ classifier.train_spam "Buy now! Limited offer!"
157
+ classifier.train_ham "Meeting tomorrow at 3pm"
158
+
159
+ # Configure storage and save
160
+ classifier.storage = Classifier::Storage::File.new(path: "spam_filter.json")
161
+ classifier.save
162
+
163
+ # Load later
164
+ loaded = Classifier::Bayes.load(storage: classifier.storage)
165
+ loaded.classify "Claim your prize now!"
166
+ # => "Spam"
167
+ ```
168
+
169
+ ### Custom Storage Backends
170
+
171
+ Create backends for Redis, PostgreSQL, S3, or any storage system:
172
+
173
+ ```ruby
174
+ class RedisStorage < Classifier::Storage::Base
175
+ def initialize(redis:, key:)
176
+ super()
177
+ @redis, @key = redis, key
178
+ end
179
+
180
+ def write(data) = @redis.set(@key, data)
181
+ def read = @redis.get(@key)
182
+ def delete = @redis.del(@key)
183
+ def exists? = @redis.exists?(@key)
184
+ end
185
+
186
+ # Use it
187
+ classifier.storage = RedisStorage.new(redis: Redis.new, key: "classifier:spam")
188
+ classifier.save
189
+ ```
190
+
191
+ ### Learn More
192
+
193
+ - [Persistence Guide](https://rubyclassifier.com/docs/guides/persistence/basics) - Full documentation with examples
194
+
195
+ ## Performance
196
+
197
+ ### Native C Extension vs Pure Ruby
198
+
199
+ The native C extension provides dramatic speedups for LSI operations, especially `build_index` (SVD computation):
200
+
201
+ | Documents | build_index | Overall |
202
+ |-----------|-------------|---------|
203
+ | 5 | 7x faster | 2.6x |
204
+ | 10 | 25x faster | 4.6x |
205
+ | 15 | 112x faster | 14.5x |
206
+ | 20 | 385x faster | 48.7x |
207
+
208
+ <details>
209
+ <summary>Detailed benchmark (20 documents)</summary>
210
+
211
+ ```
212
+ Operation Pure Ruby Native C Speedup
213
+ ----------------------------------------------------------
214
+ build_index 0.5540 0.0014 384.5x
215
+ classify 0.0190 0.0060 3.2x
216
+ search 0.0145 0.0037 3.9x
217
+ find_related 0.0098 0.0011 8.6x
218
+ ----------------------------------------------------------
219
+ TOTAL 0.5973 0.0123 48.7x
220
+ ```
221
+ </details>
222
+
223
+ ### Running Benchmarks
224
+
225
+ ```bash
226
+ rake benchmark # Run with current configuration
227
+ rake benchmark:compare # Compare native C vs pure Ruby
228
+ ```
229
+
230
+ ## Development
231
+
232
+ ### Setup
233
+
234
+ ```bash
235
+ git clone https://github.com/cardmagic/classifier.git
236
+ cd classifier
237
+ bundle install
238
+ rake compile # Compile native C extension
239
+ ```
240
+
241
+ ### Running Tests
242
+
243
+ ```bash
244
+ rake test # Run all tests (compiles first)
245
+ ruby -Ilib test/bayes/bayesian_test.rb # Run specific test file
246
+
247
+ # Test with pure Ruby (no native extension)
248
+ NATIVE_VECTOR=true rake test
249
+ ```
250
+
251
+ ### Console
252
+
253
+ ```bash
254
+ rake console
255
+ ```
256
+
257
+ ## Contributing
258
+
259
+ 1. Fork the repository
260
+ 2. Create your feature branch (`git checkout -b feature/amazing-feature`)
261
+ 3. Commit your changes (`git commit -am 'Add amazing feature'`)
262
+ 4. Push to the branch (`git push origin feature/amazing-feature`)
263
+ 5. Open a Pull Request
264
+
265
+ ## Authors
266
+
267
+ - **Lucas Carlson** - *Original author* - lucas@rufy.com
268
+ - **David Fayram II** - *LSI implementation* - dfayram@gmail.com
269
+ - **Cameron McBride** - cameron.mcbride@gmail.com
270
+ - **Ivan Acosta-Rubio** - ivan@softwarecriollo.com
271
+
272
+ ## License
273
+
274
+ This library is released under the [GNU Lesser General Public License (LGPL) 2.1](LICENSE).
@@ -0,0 +1,25 @@
1
+ /*
2
+ * classifier_ext.c
3
+ * Main entry point for the Classifier native linear algebra extension
4
+ *
5
+ * This extension provides zero-dependency Vector, Matrix, and SVD
6
+ * implementations for the Classifier gem's LSI functionality.
7
+ */
8
+
9
+ #include "linalg.h"
10
+
11
+ VALUE mClassifierLinalg;
12
+ VALUE cClassifierVector;
13
+ VALUE cClassifierMatrix;
14
+
15
+ void Init_classifier_ext(void)
16
+ {
17
+ /* Define the Classifier module and nest the Linalg module under it */
18
+ VALUE mClassifier = rb_define_module("Classifier");
19
+ mClassifierLinalg = rb_define_module_under(mClassifier, "Linalg");
20
+
21
+ /* Run the Vector, Matrix, and SVD initializers declared in linalg.h */
22
+ Init_vector();
23
+ Init_matrix();
24
+ Init_svd();
25
+ }
@@ -0,0 +1,15 @@
1
+ require 'mkmf'
2
+
3
+ # rubocop:disable Style/GlobalVars
4
+ if ENV['COVERAGE']
5
+ # Coverage build (COVERAGE set in the environment): disable optimization for accurate line coverage
6
+ $CFLAGS << ' -O0 -g --coverage -Wall'
7
+ $LDFLAGS << ' --coverage'
8
+ else
9
+ # Optimization flags for performance. NOTE(review): -ffast-math relaxes IEEE 754 semantics; confirm the numerical code tolerates it
10
+ $CFLAGS << ' -O3 -ffast-math -Wall'
11
+ end
12
+ # rubocop:enable Style/GlobalVars
13
+
14
+ # Create the Makefile for the classifier/classifier_ext extension
15
+ create_makefile('classifier/classifier_ext')
@@ -0,0 +1,64 @@
1
+ #ifndef CLASSIFIER_LINALG_H
2
+ #define CLASSIFIER_LINALG_H
3
+
4
+ #include <ruby.h>
5
+ #include <math.h>
6
+ #include <stdlib.h>
7
+ #include <string.h>
8
+
9
+ /* Epsilon for numerical comparisons */
10
+ #define CLASSIFIER_EPSILON 1e-10
11
+
12
+ /* Vector structure: a buffer of doubles plus a row/column orientation flag */
13
+ typedef struct {
14
+ size_t size; /* presumably the number of doubles in data -- TODO confirm in the vector implementation */
15
+ double *data;
16
+ int is_col; /* 0 = row vector, 1 = column vector */
17
+ } CVector;
18
+
19
+ /* Matrix structure: dimensions plus a flat buffer of doubles */
19
+ typedef struct {
20
+ size_t rows;
21
+ size_t cols; /* also the row stride used by MAT_AT */
22
+ double *data; /* Row-major storage: element (i, j) is data[i * cols + j] */
23
+ } CMatrix;
25
+
26
+ /* Ruby class references */
27
+ extern VALUE cClassifierVector;
28
+ extern VALUE cClassifierMatrix;
29
+ extern VALUE mClassifierLinalg;
30
+
31
+ /* Vector functions */
32
+ void Init_vector(void);
33
+ CVector *cvector_alloc(size_t size);
34
+ void cvector_free(void *ptr);
35
+ double cvector_magnitude(CVector *v);
36
+ CVector *cvector_normalize(CVector *v);
37
+ double cvector_sum(CVector *v);
38
+ double cvector_dot(CVector *a, CVector *b);
39
+
40
+ /* Matrix functions */
41
+ void Init_matrix(void);
42
+ CMatrix *cmatrix_alloc(size_t rows, size_t cols);
43
+ void cmatrix_free(void *ptr);
44
+ CMatrix *cmatrix_transpose(CMatrix *m);
45
+ CMatrix *cmatrix_multiply(CMatrix *a, CMatrix *b);
46
+ CVector *cmatrix_multiply_vector(CMatrix *m, CVector *v);
47
+ CMatrix *cmatrix_diagonal(CVector *v);
48
+
49
+ /* SVD functions */
50
+ void Init_svd(void);
51
+ void jacobi_svd(CMatrix *a, CMatrix **u, CMatrix **v, CVector **s);
52
+
53
+ /* TypedData definitions */
54
+ extern const rb_data_type_t cvector_type;
55
+ extern const rb_data_type_t cmatrix_type;
56
+
57
+ /* Helper macros */
58
+ #define GET_CVECTOR(obj, ptr) TypedData_Get_Struct(obj, CVector, &cvector_type, ptr)
59
+ #define GET_CMATRIX(obj, ptr) TypedData_Get_Struct(obj, CMatrix, &cmatrix_type, ptr)
60
+
61
+ /* Matrix element access (row-major); no bounds checking, and m is expanded twice, so pass a side-effect-free expression */
62
+ #define MAT_AT(m, i, j) ((m)->data[(i) * (m)->cols + (j)])
63
+
64
+ #endif /* CLASSIFIER_LINALG_H */