adaptive-simple-text-classifier 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- adaptive_simple_text_classifier-0.1.0/LICENSE +21 -0
- adaptive_simple_text_classifier-0.1.0/PKG-INFO +445 -0
- adaptive_simple_text_classifier-0.1.0/README.md +403 -0
- adaptive_simple_text_classifier-0.1.0/adaptive_classifier/__init__.py +56 -0
- adaptive_simple_text_classifier-0.1.0/adaptive_classifier/classifier.py +320 -0
- adaptive_simple_text_classifier-0.1.0/adaptive_classifier/embeddings.py +85 -0
- adaptive_simple_text_classifier-0.1.0/adaptive_classifier/index.py +279 -0
- adaptive_simple_text_classifier-0.1.0/adaptive_classifier/normalizer.py +142 -0
- adaptive_simple_text_classifier-0.1.0/adaptive_classifier/providers.py +328 -0
- adaptive_simple_text_classifier-0.1.0/adaptive_classifier/taxonomy.py +160 -0
- adaptive_simple_text_classifier-0.1.0/adaptive_classifier/types.py +95 -0
- adaptive_simple_text_classifier-0.1.0/adaptive_classifier/vector_stores.py +95 -0
- adaptive_simple_text_classifier-0.1.0/adaptive_simple_text_classifier.egg-info/PKG-INFO +445 -0
- adaptive_simple_text_classifier-0.1.0/adaptive_simple_text_classifier.egg-info/SOURCES.txt +19 -0
- adaptive_simple_text_classifier-0.1.0/adaptive_simple_text_classifier.egg-info/dependency_links.txt +1 -0
- adaptive_simple_text_classifier-0.1.0/adaptive_simple_text_classifier.egg-info/requires.txt +22 -0
- adaptive_simple_text_classifier-0.1.0/adaptive_simple_text_classifier.egg-info/top_level.txt +3 -0
- adaptive_simple_text_classifier-0.1.0/example/benchmark.py +531 -0
- adaptive_simple_text_classifier-0.1.0/pyproject.toml +74 -0
- adaptive_simple_text_classifier-0.1.0/setup.cfg +4 -0
- adaptive_simple_text_classifier-0.1.0/tests/test_core.py +103 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 2Lines Software Inc
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,445 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: adaptive-simple-text-classifier
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Self-building hybrid classifier: FAISS embeddings + LLM fallback with feedback loop
|
|
5
|
+
Author: 2Lines Software Inc
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/johncarpenter/adaptive-simple-text-classifier
|
|
8
|
+
Project-URL: Repository, https://github.com/johncarpenter/adaptive-simple-text-classifier
|
|
9
|
+
Project-URL: Issues, https://github.com/johncarpenter/adaptive-simple-text-classifier/issues
|
|
10
|
+
Keywords: classification,nlp,faiss,llm,embeddings,machine-learning
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
19
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
20
|
+
Classifier: Topic :: Text Processing
|
|
21
|
+
Requires-Python: >=3.10
|
|
22
|
+
Description-Content-Type: text/markdown
|
|
23
|
+
License-File: LICENSE
|
|
24
|
+
Requires-Dist: numpy>=1.24
|
|
25
|
+
Requires-Dist: faiss-cpu>=1.7
|
|
26
|
+
Requires-Dist: sentence-transformers>=2.2
|
|
27
|
+
Provides-Extra: anthropic
|
|
28
|
+
Requires-Dist: anthropic>=0.39; extra == "anthropic"
|
|
29
|
+
Provides-Extra: vertex
|
|
30
|
+
Requires-Dist: anthropic[vertex]>=0.39; extra == "vertex"
|
|
31
|
+
Provides-Extra: bedrock
|
|
32
|
+
Requires-Dist: anthropic[bedrock]>=0.39; extra == "bedrock"
|
|
33
|
+
Provides-Extra: all
|
|
34
|
+
Requires-Dist: anthropic>=0.39; extra == "all"
|
|
35
|
+
Requires-Dist: anthropic[vertex]>=0.39; extra == "all"
|
|
36
|
+
Requires-Dist: anthropic[bedrock]>=0.39; extra == "all"
|
|
37
|
+
Provides-Extra: dev
|
|
38
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
39
|
+
Requires-Dist: pytest-asyncio>=0.21; extra == "dev"
|
|
40
|
+
Requires-Dist: ruff>=0.4; extra == "dev"
|
|
41
|
+
Dynamic: license-file
|
|
42
|
+
|
|
43
|
+
# adaptive-simple-text-classifier
|
|
44
|
+
|
|
45
|
+
[](https://github.com/johncarpenter/adaptive-simple-text-classifier/actions/workflows/ci.yml)
|
|
46
|
+
[](https://pypi.org/project/adaptive-simple-text-classifier/)
|
|
47
|
+
[](https://pypi.org/project/adaptive-simple-text-classifier/)
|
|
48
|
+
[](LICENSE)
|
|
49
|
+
|
|
50
|
+
Self-building hybrid text classifier. FAISS embedding search with LLM fallback and automatic feedback loop.
|
|
51
|
+
|
|
52
|
+
Classifies messy, abbreviated text into structured taxonomies. The index grows as LLM results feed back, so accuracy improves and LLM costs decrease over time.
|
|
53
|
+
|
|
54
|
+
## Install
|
|
55
|
+
|
|
56
|
+
```bash
|
|
57
|
+
pip install adaptive-simple-text-classifier
|
|
58
|
+
|
|
59
|
+
# With LLM provider:
|
|
60
|
+
pip install adaptive-simple-text-classifier[anthropic] # Direct Anthropic API
|
|
61
|
+
pip install adaptive-simple-text-classifier[vertex] # Google Cloud Vertex AI
|
|
62
|
+
pip install adaptive-simple-text-classifier[bedrock] # AWS Bedrock
|
|
63
|
+
pip install adaptive-simple-text-classifier[all] # All providers
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
## How It Works
|
|
67
|
+
|
|
68
|
+
```
|
|
69
|
+
Input Text ──> Normalize ──> FAISS Search ──┬──> Confident? ──> Return result
|
|
70
|
+
│
|
|
71
|
+
└──> Uncertain? ──> LLM Classify ──> Return result
|
|
72
|
+
│
|
|
73
|
+
└──> Feed back into FAISS index
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
1. **First run**: Most items go to the LLM (cold start, only taxonomy labels in the index)
|
|
77
|
+
2. **LLM results** get embedded and stored back in the FAISS index
|
|
78
|
+
3. **Subsequent runs**: FAISS handles most items, LLM handles only novel patterns
|
|
79
|
+
4. **Over time**: Hit rate climbs toward 100%, LLM costs drop to near zero
|
|
80
|
+
|
|
81
|
+
## Quick Start
|
|
82
|
+
|
|
83
|
+
```python
|
|
84
|
+
from adaptive_classifier import AdaptiveClassifier, create_normalizer
|
|
85
|
+
|
|
86
|
+
# Define your taxonomy (nested dict, flat list, or YAML/JSON file)
|
|
87
|
+
taxonomy = {
|
|
88
|
+
"Food": {
|
|
89
|
+
"Burgers": ["Hamburger", "Cheeseburger", "Veggie Burger"],
|
|
90
|
+
"Pizza": ["Pepperoni", "Margherita", "Hawaiian"],
|
|
91
|
+
"Drinks": ["Coffee", "Juice", "Soda"],
|
|
92
|
+
},
|
|
93
|
+
"Retail": {
|
|
94
|
+
"Electronics": ["Phone", "Laptop", "Tablet"],
|
|
95
|
+
"Furniture": ["Chair", "Table", "Bookshelf"],
|
|
96
|
+
},
|
|
97
|
+
}
|
|
98
|
+
|
|
99
|
+
classifier = AdaptiveClassifier(
|
|
100
|
+
taxonomy=taxonomy,
|
|
101
|
+
provider="anthropic", # or "vertex", "bedrock", callable
|
|
102
|
+
index_path="./my_classifier", # persists to disk
|
|
103
|
+
confidence_threshold=0.65, # below this -> LLM fallback
|
|
104
|
+
normalizer=create_normalizer(), # expands abbreviations, strips noise
|
|
105
|
+
)
|
|
106
|
+
|
|
107
|
+
results = classifier.classify([
|
|
108
|
+
"chz brgr",
|
|
109
|
+
"lg pep pizza",
|
|
110
|
+
"bkshf oak - $249.99",
|
|
111
|
+
"iced coffee lg",
|
|
112
|
+
])
|
|
113
|
+
|
|
114
|
+
for r in results:
|
|
115
|
+
print(f"{r.input_text:30s} -> {r.category_path:40s} ({r.confidence:.2f}, {r.source.value})")
|
|
116
|
+
|
|
117
|
+
# Check stats
|
|
118
|
+
print(results.stats.to_dict())
|
|
119
|
+
# {'total': 4, 'embedding_hits': 1, 'llm_calls': 1, 'llm_items': 3, 'fed_back': 6, ...}
|
|
120
|
+
|
|
121
|
+
# Run again - more hits from the index, fewer LLM calls
|
|
122
|
+
results2 = classifier.classify(["double chz burger", "pepperoni pza sm"])
|
|
123
|
+
print(f"Hit rate: {results2.stats.embedding_hits}/{results2.stats.total}")
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
## Benchmark Results
|
|
127
|
+
|
|
128
|
+
Tested against the [Financial Transaction Categorization Dataset](https://huggingface.co/datasets/mitulshah/transaction-categorization) (4.5M records, 10 categories) with 50 training examples and 100 test records. See [`example/`](example/) for the full benchmark.
|
|
129
|
+
|
|
130
|
+
### Accuracy across 3 runs
|
|
131
|
+
|
|
132
|
+
```
|
|
133
|
+
Run Embed-only Hybrid+LLM Post-feedback
|
|
134
|
+
---------------------------------------------------------
|
|
135
|
+
#1 42.0% 90.0% 72.0%
|
|
136
|
+
#2 90.0% 82.0%
|
|
137
|
+
#3 90.0% 84.0%
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
### LLM usage decreasing as the index learns
|
|
141
|
+
|
|
142
|
+
```
|
|
143
|
+
Run LLM items LLM calls Index size
|
|
144
|
+
---------------------------------------------------------
|
|
145
|
+
#1 98 2 168
|
|
146
|
+
#2 48 1 216
|
|
147
|
+
#3 15 1 231
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
### Run 3 performance
|
|
151
|
+
|
|
152
|
+
| Metric | Hybrid+LLM | Post-feedback (no LLM) |
|
|
153
|
+
|--------|-----------|----------------------|
|
|
154
|
+
| Accuracy | 90.0% | 84.0% |
|
|
155
|
+
| Macro F1 | 0.8972 | 0.8366 |
|
|
156
|
+
| Throughput | 49.9 items/s | 471.7 items/s |
|
|
157
|
+
| Embedding hits | 85/100 | 100/100 |
|
|
158
|
+
| LLM fallback items | 15 | 0 |
|
|
159
|
+
|
|
160
|
+
By run 3, LLM usage has dropped 85% (98 -> 15 items) and embedding-only throughput is 9x faster than hybrid. The index grows from 70 vectors to 231 as LLM results feed back.
|
|
161
|
+
|
|
162
|
+
## Use Case Examples
|
|
163
|
+
|
|
164
|
+
### Banking Transaction Classification
|
|
165
|
+
|
|
166
|
+
```python
|
|
167
|
+
from adaptive_classifier import AdaptiveClassifier, create_normalizer
|
|
168
|
+
|
|
169
|
+
envelopes = {
|
|
170
|
+
"Housing": ["Rent", "Mortgage", "Property Tax", "Home Insurance", "Maintenance"],
|
|
171
|
+
"Transportation": ["Gas", "Car Payment", "Insurance", "Parking", "Transit"],
|
|
172
|
+
"Food": ["Groceries", "Restaurants", "Coffee Shops", "Fast Food"],
|
|
173
|
+
"Utilities": ["Electric", "Gas Utility", "Water", "Internet", "Phone"],
|
|
174
|
+
"Health": ["Doctor", "Dentist", "Pharmacy", "Gym"],
|
|
175
|
+
"Entertainment": ["Streaming", "Movies", "Games", "Books"],
|
|
176
|
+
"Savings": ["Emergency Fund", "Retirement", "Investment"],
|
|
177
|
+
}
|
|
178
|
+
|
|
179
|
+
classifier = AdaptiveClassifier(
|
|
180
|
+
taxonomy=envelopes,
|
|
181
|
+
provider="anthropic",
|
|
182
|
+
index_path="./budget_classifier",
|
|
183
|
+
normalizer=create_normalizer(
|
|
184
|
+
abbreviations={"wal-mart": "walmart grocery", "amzn": "amazon"},
|
|
185
|
+
strip_codes=True,
|
|
186
|
+
),
|
|
187
|
+
)
|
|
188
|
+
|
|
189
|
+
transactions = [
|
|
190
|
+
"WALMART SUPERCENTER #4532",
|
|
191
|
+
"SHELL OIL 57442",
|
|
192
|
+
"NETFLIX.COM",
|
|
193
|
+
"CITY OF CALGARY UTILITIES",
|
|
194
|
+
"TIM HORTONS #0891",
|
|
195
|
+
"PHARMACHOICE #112",
|
|
196
|
+
]
|
|
197
|
+
|
|
198
|
+
results = classifier.classify(transactions)
|
|
199
|
+
for r in results:
|
|
200
|
+
print(f"{r.input_text:35s} -> {r.leaf_label}")
|
|
201
|
+
```
|
|
202
|
+
|
|
203
|
+
### Property Valuation CRN Lookup
|
|
204
|
+
|
|
205
|
+
```python
|
|
206
|
+
from adaptive_classifier import AdaptiveClassifier, Taxonomy
|
|
207
|
+
|
|
208
|
+
crn_taxonomy = Taxonomy.from_flat([
|
|
209
|
+
"Furniture > Seating > Office Chair",
|
|
210
|
+
"Furniture > Seating > Dining Chair",
|
|
211
|
+
"Furniture > Storage > Bookshelf",
|
|
212
|
+
"Furniture > Storage > Filing Cabinet",
|
|
213
|
+
"Furniture > Tables > Desk",
|
|
214
|
+
"Furniture > Tables > Dining Table",
|
|
215
|
+
"Electronics > Computing > Desktop Computer",
|
|
216
|
+
"Electronics > Computing > Laptop",
|
|
217
|
+
"Electronics > Audio Visual > Television",
|
|
218
|
+
"Electronics > Audio Visual > Projector",
|
|
219
|
+
"Appliances > Kitchen > Refrigerator",
|
|
220
|
+
"Appliances > Kitchen > Dishwasher",
|
|
221
|
+
"Appliances > Laundry > Washing Machine",
|
|
222
|
+
])
|
|
223
|
+
|
|
224
|
+
classifier = AdaptiveClassifier(
|
|
225
|
+
taxonomy=crn_taxonomy,
|
|
226
|
+
provider="vertex",
|
|
227
|
+
index_path="./crn_classifier",
|
|
228
|
+
provider_kwargs={"project_id": "my-gcp-project", "region": "us-east5"},
|
|
229
|
+
)
|
|
230
|
+
|
|
231
|
+
items = [
|
|
232
|
+
"oak bkshf 5-shelf",
|
|
233
|
+
"Herman Miller Aeron",
|
|
234
|
+
"Samsung 65in QLED",
|
|
235
|
+
"ikea kallax",
|
|
236
|
+
"dell latitude 5540",
|
|
237
|
+
]
|
|
238
|
+
|
|
239
|
+
results = classifier.classify(items)
|
|
240
|
+
```
|
|
241
|
+
|
|
242
|
+
### POS Product Hierarchy
|
|
243
|
+
|
|
244
|
+
```python
|
|
245
|
+
from adaptive_classifier import AdaptiveClassifier, create_normalizer
|
|
246
|
+
|
|
247
|
+
menu = {
|
|
248
|
+
"Burgers": {
|
|
249
|
+
"Beef": ["Hamburger", "Cheeseburger", "Bacon Burger", "Double Burger"],
|
|
250
|
+
"Chicken": ["Chicken Burger", "Spicy Chicken", "Grilled Chicken"],
|
|
251
|
+
"Plant": ["Veggie Burger", "Beyond Burger"],
|
|
252
|
+
},
|
|
253
|
+
"Sides": {
|
|
254
|
+
"Fries": ["Regular Fries", "Sweet Potato Fries", "Poutine"],
|
|
255
|
+
"Salads": ["Garden Salad", "Caesar Salad", "Coleslaw"],
|
|
256
|
+
},
|
|
257
|
+
"Drinks": {
|
|
258
|
+
"Hot": ["Coffee", "Tea", "Hot Chocolate"],
|
|
259
|
+
"Cold": ["Soda", "Iced Tea", "Milkshake", "Water"],
|
|
260
|
+
},
|
|
261
|
+
}
|
|
262
|
+
|
|
263
|
+
classifier = AdaptiveClassifier(
|
|
264
|
+
taxonomy=menu,
|
|
265
|
+
provider="bedrock",
|
|
266
|
+
index_path="./pos_classifier",
|
|
267
|
+
normalizer=create_normalizer(
|
|
268
|
+
abbreviations={
|
|
269
|
+
"chz": "cheese", "brgr": "burger", "dbl": "double",
|
|
270
|
+
"reg": "regular", "sw pot": "sweet potato",
|
|
271
|
+
}
|
|
272
|
+
),
|
|
273
|
+
)
|
|
274
|
+
|
|
275
|
+
pos_entries = [
|
|
276
|
+
"dbl chz brgr",
|
|
277
|
+
"reg fry",
|
|
278
|
+
"lg coff blk",
|
|
279
|
+
"spcy chkn sndwch",
|
|
280
|
+
"grdn salad",
|
|
281
|
+
"sw pot fry",
|
|
282
|
+
]
|
|
283
|
+
|
|
284
|
+
results = classifier.classify(pos_entries)
|
|
285
|
+
```
|
|
286
|
+
|
|
287
|
+
## Pluggable LLM Backend
|
|
288
|
+
|
|
289
|
+
```python
|
|
290
|
+
# Any callable works
|
|
291
|
+
def my_custom_llm(items, system_prompt, user_prompt):
|
|
292
|
+
# Call OpenAI, local model, whatever
|
|
293
|
+
response = my_api.complete(system=system_prompt, user=user_prompt)
|
|
294
|
+
return response.text # Must return JSON string
|
|
295
|
+
|
|
296
|
+
classifier = AdaptiveClassifier(
|
|
297
|
+
taxonomy=my_taxonomy,
|
|
298
|
+
provider=my_custom_llm,
|
|
299
|
+
)
|
|
300
|
+
|
|
301
|
+
# Or implement the LLMProvider protocol directly
|
|
302
|
+
from adaptive_classifier import LLMProvider
|
|
303
|
+
|
|
304
|
+
class MyProvider:
|
|
305
|
+
def classify_batch(self, items, taxonomy_prompt, batch_size=50):
|
|
306
|
+
# Your implementation
|
|
307
|
+
return [{"input": item, "category": "..."} for item in items]
|
|
308
|
+
```
|
|
309
|
+
|
|
310
|
+
## Pluggable Vector Store
|
|
311
|
+
|
|
312
|
+
FAISS is the default, but you can swap in any vector backend by implementing the `VectorStore` protocol:
|
|
313
|
+
|
|
314
|
+
```python
|
|
315
|
+
from adaptive_classifier import AdaptiveClassifier, VectorStore
|
|
316
|
+
import numpy as np
|
|
317
|
+
from pathlib import Path
|
|
318
|
+
|
|
319
|
+
class MyVectorStore:
|
|
320
|
+
"""Drop-in replacement for any vector backend, e.g. Pinecone, Qdrant, or Annoy."""
|
|
321
|
+
|
|
322
|
+
@property
|
|
323
|
+
def size(self) -> int: ...
|
|
324
|
+
def add(self, vectors: np.ndarray) -> None: ...
|
|
325
|
+
def search(self, queries: np.ndarray, k: int) -> tuple[np.ndarray, np.ndarray]: ...
|
|
326
|
+
def reset(self) -> None: ...
|
|
327
|
+
def save(self, path: Path) -> None: ...
|
|
328
|
+
def load(self, path: Path) -> None: ...
|
|
329
|
+
|
|
330
|
+
classifier = AdaptiveClassifier(
|
|
331
|
+
taxonomy=my_taxonomy,
|
|
332
|
+
vector_store=MyVectorStore(),
|
|
333
|
+
)
|
|
334
|
+
```
|
|
335
|
+
|
|
336
|
+
## Pre-seeding with Known Mappings
|
|
337
|
+
|
|
338
|
+
```python
|
|
339
|
+
# If you already have labeled data, seed the index directly
|
|
340
|
+
classifier.add_examples({
|
|
341
|
+
"WALMART SUPERCENTER": "Food > Groceries",
|
|
342
|
+
"COSTCO WHOLESALE": "Food > Groceries",
|
|
343
|
+
"NETFLIX.COM": "Entertainment > Streaming",
|
|
344
|
+
"SPOTIFY": "Entertainment > Streaming",
|
|
345
|
+
})
|
|
346
|
+
```
|
|
347
|
+
|
|
348
|
+
## Configuration
|
|
349
|
+
|
|
350
|
+
| Parameter | Default | Description |
|
|
351
|
+
|-----------|---------|-------------|
|
|
352
|
+
| `confidence_threshold` | `0.65` | Below this -> LLM fallback |
|
|
353
|
+
| `k_neighbors` | `5` | Neighbors for majority voting |
|
|
354
|
+
| `llm_batch_size` | `50` | Items per LLM API call |
|
|
355
|
+
| `auto_feedback` | `True` | Feed LLM results back to index |
|
|
356
|
+
| `auto_save` | `True` | Save index after each classify() |
|
|
357
|
+
| `embedding_model` | `all-MiniLM-L6-v2` | Sentence transformer model |
|
|
358
|
+
| `vector_store` | `None` (FAISS) | Custom `VectorStore` backend |
|
|
359
|
+
|
|
360
|
+
## Taxonomy Formats
|
|
361
|
+
|
|
362
|
+
```python
|
|
363
|
+
# Nested dict
|
|
364
|
+
taxonomy = {"Category": {"Subcategory": ["Leaf1", "Leaf2"]}}
|
|
365
|
+
|
|
366
|
+
# Flat path list
|
|
367
|
+
taxonomy = ["Category > Subcategory > Leaf1", "Category > Subcategory > Leaf2"]
|
|
368
|
+
|
|
369
|
+
# From file
|
|
370
|
+
classifier = AdaptiveClassifier(taxonomy="./taxonomy.json")
|
|
371
|
+
classifier = AdaptiveClassifier(taxonomy="./taxonomy.yaml")
|
|
372
|
+
```
|
|
373
|
+
|
|
374
|
+
## Architecture
|
|
375
|
+
|
|
376
|
+
```
|
|
377
|
+
adaptive_classifier/
|
|
378
|
+
├── classifier.py # AdaptiveClassifier orchestrator
|
|
379
|
+
├── taxonomy.py # Taxonomy tree management
|
|
380
|
+
├── index.py # Vector index + persistence + feedback
|
|
381
|
+
├── vector_stores.py # Pluggable vector store backends (FAISS default)
|
|
382
|
+
├── embeddings.py # Embedding provider abstraction
|
|
383
|
+
├── providers.py # Pluggable LLM backends
|
|
384
|
+
├── normalizer.py # Text normalization / abbreviation expansion
|
|
385
|
+
└── types.py # Classification, BatchStats, etc.
|
|
386
|
+
```
|
|
387
|
+
|
|
388
|
+
## Development
|
|
389
|
+
|
|
390
|
+
### Setup
|
|
391
|
+
|
|
392
|
+
```bash
|
|
393
|
+
# Clone the repo
|
|
394
|
+
git clone https://github.com/johncarpenter/adaptive-simple-text-classifier.git
|
|
395
|
+
cd adaptive-simple-text-classifier
|
|
396
|
+
|
|
397
|
+
# Install with uv (recommended)
|
|
398
|
+
uv sync
|
|
399
|
+
|
|
400
|
+
# Or with pip
|
|
401
|
+
pip install -e ".[dev]"
|
|
402
|
+
```
|
|
403
|
+
|
|
404
|
+
### Testing
|
|
405
|
+
|
|
406
|
+
```bash
|
|
407
|
+
# Run unit tests
|
|
408
|
+
uv run pytest -v
|
|
409
|
+
|
|
410
|
+
# Run the benchmark (requires ANTHROPIC_API_KEY for hybrid mode)
|
|
411
|
+
uv run python example/benchmark.py --embedding-only # no API key needed
|
|
412
|
+
uv run python example/benchmark.py # full hybrid benchmark
|
|
413
|
+
uv run python example/benchmark.py --runs 3 # watch the learning curve
|
|
414
|
+
```
|
|
415
|
+
|
|
416
|
+
See [`example/README.md`](example/README.md) for benchmark details and dataset setup.
|
|
417
|
+
|
|
418
|
+
### Linting
|
|
419
|
+
|
|
420
|
+
```bash
|
|
421
|
+
uv run ruff check .
|
|
422
|
+
uv run ruff format .
|
|
423
|
+
```
|
|
424
|
+
|
|
425
|
+
## Contributing
|
|
426
|
+
|
|
427
|
+
Contributions are welcome! Please see [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines.
|
|
428
|
+
|
|
429
|
+
## Releasing
|
|
430
|
+
|
|
431
|
+
Releases are published to [PyPI](https://pypi.org/project/adaptive-simple-text-classifier/) automatically when a GitHub release is created.
|
|
432
|
+
|
|
433
|
+
To create a new release:
|
|
434
|
+
|
|
435
|
+
1. Update the version in `pyproject.toml` and `adaptive_classifier/__init__.py`
|
|
436
|
+
2. Commit: `git commit -am "Bump version to X.Y.Z"`
|
|
437
|
+
3. Tag: `git tag vX.Y.Z`
|
|
438
|
+
4. Push: `git push origin main --tags`
|
|
439
|
+
5. Create a [GitHub release](https://github.com/johncarpenter/adaptive-simple-text-classifier/releases/new) from the tag
|
|
440
|
+
|
|
441
|
+
The [publish workflow](.github/workflows/publish.yml) will build and upload to PyPI using trusted publishing.
|
|
442
|
+
|
|
443
|
+
## License
|
|
444
|
+
|
|
445
|
+
MIT - see [LICENSE](LICENSE) for details.
|