semantic-reducer 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- semantic_reducer-0.1.0/LICENSE +21 -0
- semantic_reducer-0.1.0/PKG-INFO +172 -0
- semantic_reducer-0.1.0/README.md +140 -0
- semantic_reducer-0.1.0/pyproject.toml +50 -0
- semantic_reducer-0.1.0/setup.cfg +4 -0
- semantic_reducer-0.1.0/src/semantic_reducer/__init__.py +3 -0
- semantic_reducer-0.1.0/src/semantic_reducer/reducer.py +150 -0
- semantic_reducer-0.1.0/src/semantic_reducer.egg-info/PKG-INFO +172 -0
- semantic_reducer-0.1.0/src/semantic_reducer.egg-info/SOURCES.txt +10 -0
- semantic_reducer-0.1.0/src/semantic_reducer.egg-info/dependency_links.txt +1 -0
- semantic_reducer-0.1.0/src/semantic_reducer.egg-info/requires.txt +5 -0
- semantic_reducer-0.1.0/src/semantic_reducer.egg-info/top_level.txt +1 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Ab_Kafi
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: semantic-reducer
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: O(1) semantic vocabulary reduction using BERT and FAISS.
|
|
5
|
+
Author-email: Your Name <your.email@example.com>
|
|
6
|
+
License: MIT License
|
|
7
|
+
Project-URL: Homepage, https://github.com/abkafi1234/Semantic-Reducer
|
|
8
|
+
Project-URL: Repository, https://github.com/abkafi1234/Semantic-Reducer
|
|
9
|
+
Project-URL: Bug Tracker, https://github.com/abkafi1234/Semantic-Reducer
|
|
10
|
+
Keywords: nlp,bert,faiss,embeddings,text-reduction,machine-learning
|
|
11
|
+
Classifier: Development Status :: 3 - Alpha
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: Intended Audience :: Science/Research
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Operating System :: OS Independent
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
21
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
22
|
+
Classifier: Topic :: Text Processing :: Linguistic
|
|
23
|
+
Requires-Python: >=3.8
|
|
24
|
+
Description-Content-Type: text/markdown
|
|
25
|
+
License-File: LICENSE
|
|
26
|
+
Requires-Dist: torch>=1.10.0
|
|
27
|
+
Requires-Dist: numpy>=1.21.0
|
|
28
|
+
Requires-Dist: faiss-cpu>=1.7.0
|
|
29
|
+
Requires-Dist: tqdm>=4.60.0
|
|
30
|
+
Requires-Dist: transformers>=4.20.0
|
|
31
|
+
Dynamic: license-file
|
|
32
|
+
|
|
33
|
+
# Semantic Reducer
|
|
34
|
+
|
|
35
|
+
A high-performance Python library that bridges the gap between **Deep Learning** and **traditional Machine Learning**.
|
|
36
|
+
|
|
37
|
+
`semantic-reducer` uses **BERT contextual embeddings** and **FAISS vector search** to semantically normalize and reduce the vocabulary of a text corpus.
|
|
38
|
+
|
|
39
|
+
By mapping semantically equivalent words (for example: *fast*, *quick*, *rapid*) to a single, highly frequent canonical word, it creates cleaner and denser input for traditional ML models such as **TF-IDF + SVM**, **Random Forest**, or **Logistic Regression**.
|
|
40
|
+
|
|
41
|
+
After the initial processing stage, inference on new text requires only a dictionary lookup, enabling **O(1) runtime complexity**.
|
|
42
|
+
|
|
43
|
+
---
|
|
44
|
+
|
|
45
|
+
# Features
|
|
46
|
+
|
|
47
|
+
- **Context-Aware Processing**
|
|
48
|
+
Uses Transformer models (default: `bert-base-multilingual-cased`) to capture contextual meaning instead of relying on static embeddings.
|
|
49
|
+
|
|
50
|
+
- **Smart Vocabulary Reduction**
|
|
51
|
+
Automatically replaces rare or obscure words with their most frequent semantic equivalents.
|
|
52
|
+
|
|
53
|
+
- **O(1) Inference Speed**
|
|
54
|
+
After training, the BERT and FAISS pipeline is no longer required. Text reduction happens via a lightweight Python dictionary.
|
|
55
|
+
|
|
56
|
+
- **Production Ready**
|
|
57
|
+
Supports saving and loading compiled reduction maps for deployment.
|
|
58
|
+
|
|
59
|
+
- **Hardware Agnostic**
|
|
60
|
+
Automatically detects and uses GPU (CUDA) if available, while remaining fully functional on CPU.
|
|
61
|
+
|
|
62
|
+
---
|
|
63
|
+
|
|
64
|
+
# Installation
|
|
65
|
+
|
|
66
|
+
Install via pip:
|
|
67
|
+
|
|
68
|
+
```bash
|
|
69
|
+
pip install semantic-reducer
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
---
|
|
73
|
+
|
|
74
|
+
# Quick Start
|
|
75
|
+
|
|
76
|
+
## 1. Training and Building the Reduction Map
|
|
77
|
+
|
|
78
|
+
```python
|
|
79
|
+
from semantic_reducer import SemanticReducer
|
|
80
|
+
|
|
81
|
+
# Initialize the reducer
|
|
82
|
+
# (downloads the BERT model on first run)
|
|
83
|
+
reducer = SemanticReducer()
|
|
84
|
+
|
|
85
|
+
# Training corpus
|
|
86
|
+
corpus = [
|
|
87
|
+
"The quick brown fox jumps over the lazy dog.",
|
|
88
|
+
"A fast dark-colored canine leaps above a tired hound.",
|
|
89
|
+
"Speedy foxes jump over sleepy dogs."
|
|
90
|
+
]
|
|
91
|
+
|
|
92
|
+
# Extract contextual representations
|
|
93
|
+
reducer.process_corpus_contextually(corpus, batch_size=2)
|
|
94
|
+
|
|
95
|
+
# Finalize embeddings and create FAISS index
|
|
96
|
+
reducer.finalize_embeddings()
|
|
97
|
+
reducer.build_index()
|
|
98
|
+
|
|
99
|
+
# Build the semantic reduction dictionary
|
|
100
|
+
reducer.build_reduction_map(
|
|
101
|
+
threshold=0.85, # similarity threshold
|
|
102
|
+
top_k=5 # number of neighbors to consider
|
|
103
|
+
)
|
|
104
|
+
|
|
105
|
+
# Save the trained system
|
|
106
|
+
reducer.save_system(prefix="my_corpus")
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
---
|
|
110
|
+
|
|
111
|
+
## 2. Lightning-Fast Inference
|
|
112
|
+
|
|
113
|
+
Once the reduction map is built, **BERT is no longer required**. New text is processed through a dictionary lookup.
|
|
114
|
+
|
|
115
|
+
```python
|
|
116
|
+
from semantic_reducer import SemanticReducer
|
|
117
|
+
|
|
118
|
+
# Initialize reducer
|
|
119
|
+
reducer = SemanticReducer()
|
|
120
|
+
|
|
121
|
+
# Load saved system
|
|
122
|
+
reducer.load_system(prefix="my_corpus")
|
|
123
|
+
|
|
124
|
+
# Reduce new text instantly
|
|
125
|
+
new_text = "The rapid fox leaped."
|
|
126
|
+
reduced_text = reducer.reduce_text(new_text)
|
|
127
|
+
|
|
128
|
+
print(f"Original: {new_text}")
|
|
129
|
+
print(f"Reduced: {reduced_text}")
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
---
|
|
133
|
+
|
|
134
|
+
# Workflow Overview
|
|
135
|
+
|
|
136
|
+
1. **Contextual Encoding**
|
|
137
|
+
Extract contextual word embeddings using BERT.
|
|
138
|
+
|
|
139
|
+
2. **Vector Indexing**
|
|
140
|
+
Store embeddings in a FAISS similarity search index.
|
|
141
|
+
|
|
142
|
+
3. **Semantic Clustering**
|
|
143
|
+
Identify semantically similar words.
|
|
144
|
+
|
|
145
|
+
4. **Canonical Replacement**
|
|
146
|
+
Replace each cluster with the most frequent word in the corpus.
|
|
147
|
+
|
|
148
|
+
5. **Dictionary Compilation**
|
|
149
|
+
Store the final mappings for fast inference.
|
|
150
|
+
|
|
151
|
+
---
|
|
152
|
+
|
|
153
|
+
# Requirements
|
|
154
|
+
|
|
155
|
+
- Python ≥ 3.8
|
|
156
|
+
- torch
|
|
157
|
+
- numpy
|
|
158
|
+
- faiss-cpu
|
|
159
|
+
- transformers
|
|
160
|
+
- tqdm
|
|
161
|
+
|
|
162
|
+
Install dependencies manually if needed:
|
|
163
|
+
|
|
164
|
+
```bash
|
|
165
|
+
pip install torch numpy faiss-cpu transformers tqdm
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
---
|
|
169
|
+
|
|
170
|
+
# License
|
|
171
|
+
|
|
172
|
+
This project is licensed under the **MIT License**.
|
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
# Semantic Reducer
|
|
2
|
+
|
|
3
|
+
A high-performance Python library that bridges the gap between **Deep Learning** and **traditional Machine Learning**.
|
|
4
|
+
|
|
5
|
+
`semantic-reducer` uses **BERT contextual embeddings** and **FAISS vector search** to semantically normalize and reduce the vocabulary of a text corpus.
|
|
6
|
+
|
|
7
|
+
By mapping semantically equivalent words (for example: *fast*, *quick*, *rapid*) to a single, highly frequent canonical word, it creates cleaner and denser input for traditional ML models such as **TF-IDF + SVM**, **Random Forest**, or **Logistic Regression**.
|
|
8
|
+
|
|
9
|
+
After the initial processing stage, inference on new text requires only a dictionary lookup, enabling **O(1) runtime complexity**.
|
|
10
|
+
|
|
11
|
+
---
|
|
12
|
+
|
|
13
|
+
# Features
|
|
14
|
+
|
|
15
|
+
- **Context-Aware Processing**
|
|
16
|
+
Uses Transformer models (default: `bert-base-multilingual-cased`) to capture contextual meaning instead of relying on static embeddings.
|
|
17
|
+
|
|
18
|
+
- **Smart Vocabulary Reduction**
|
|
19
|
+
Automatically replaces rare or obscure words with their most frequent semantic equivalents.
|
|
20
|
+
|
|
21
|
+
- **O(1) Inference Speed**
|
|
22
|
+
After training, the BERT and FAISS pipeline is no longer required. Text reduction happens via a lightweight Python dictionary.
|
|
23
|
+
|
|
24
|
+
- **Production Ready**
|
|
25
|
+
Supports saving and loading compiled reduction maps for deployment.
|
|
26
|
+
|
|
27
|
+
- **Hardware Agnostic**
|
|
28
|
+
Automatically detects and uses GPU (CUDA) if available, while remaining fully functional on CPU.
|
|
29
|
+
|
|
30
|
+
---
|
|
31
|
+
|
|
32
|
+
# Installation
|
|
33
|
+
|
|
34
|
+
Install via pip:
|
|
35
|
+
|
|
36
|
+
```bash
|
|
37
|
+
pip install semantic-reducer
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
---
|
|
41
|
+
|
|
42
|
+
# Quick Start
|
|
43
|
+
|
|
44
|
+
## 1. Training and Building the Reduction Map
|
|
45
|
+
|
|
46
|
+
```python
|
|
47
|
+
from semantic_reducer import SemanticReducer
|
|
48
|
+
|
|
49
|
+
# Initialize the reducer
|
|
50
|
+
# (downloads the BERT model on first run)
|
|
51
|
+
reducer = SemanticReducer()
|
|
52
|
+
|
|
53
|
+
# Training corpus
|
|
54
|
+
corpus = [
|
|
55
|
+
"The quick brown fox jumps over the lazy dog.",
|
|
56
|
+
"A fast dark-colored canine leaps above a tired hound.",
|
|
57
|
+
"Speedy foxes jump over sleepy dogs."
|
|
58
|
+
]
|
|
59
|
+
|
|
60
|
+
# Extract contextual representations
|
|
61
|
+
reducer.process_corpus_contextually(corpus, batch_size=2)
|
|
62
|
+
|
|
63
|
+
# Finalize embeddings and create FAISS index
|
|
64
|
+
reducer.finalize_embeddings()
|
|
65
|
+
reducer.build_index()
|
|
66
|
+
|
|
67
|
+
# Build the semantic reduction dictionary
|
|
68
|
+
reducer.build_reduction_map(
|
|
69
|
+
threshold=0.85, # similarity threshold
|
|
70
|
+
top_k=5 # number of neighbors to consider
|
|
71
|
+
)
|
|
72
|
+
|
|
73
|
+
# Save the trained system
|
|
74
|
+
reducer.save_system(prefix="my_corpus")
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
---
|
|
78
|
+
|
|
79
|
+
## 2. Lightning-Fast Inference
|
|
80
|
+
|
|
81
|
+
Once the reduction map is built, **BERT is no longer required**. New text is processed through a dictionary lookup.
|
|
82
|
+
|
|
83
|
+
```python
|
|
84
|
+
from semantic_reducer import SemanticReducer
|
|
85
|
+
|
|
86
|
+
# Initialize reducer
|
|
87
|
+
reducer = SemanticReducer()
|
|
88
|
+
|
|
89
|
+
# Load saved system
|
|
90
|
+
reducer.load_system(prefix="my_corpus")
|
|
91
|
+
|
|
92
|
+
# Reduce new text instantly
|
|
93
|
+
new_text = "The rapid fox leaped."
|
|
94
|
+
reduced_text = reducer.reduce_text(new_text)
|
|
95
|
+
|
|
96
|
+
print(f"Original: {new_text}")
|
|
97
|
+
print(f"Reduced: {reduced_text}")
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
---
|
|
101
|
+
|
|
102
|
+
# Workflow Overview
|
|
103
|
+
|
|
104
|
+
1. **Contextual Encoding**
|
|
105
|
+
Extract contextual word embeddings using BERT.
|
|
106
|
+
|
|
107
|
+
2. **Vector Indexing**
|
|
108
|
+
Store embeddings in a FAISS similarity search index.
|
|
109
|
+
|
|
110
|
+
3. **Semantic Clustering**
|
|
111
|
+
Identify semantically similar words.
|
|
112
|
+
|
|
113
|
+
4. **Canonical Replacement**
|
|
114
|
+
Replace each cluster with the most frequent word in the corpus.
|
|
115
|
+
|
|
116
|
+
5. **Dictionary Compilation**
|
|
117
|
+
Store the final mappings for fast inference.
|
|
118
|
+
|
|
119
|
+
---
|
|
120
|
+
|
|
121
|
+
# Requirements
|
|
122
|
+
|
|
123
|
+
- Python ≥ 3.8
|
|
124
|
+
- torch
|
|
125
|
+
- numpy
|
|
126
|
+
- faiss-cpu
|
|
127
|
+
- transformers
|
|
128
|
+
- tqdm
|
|
129
|
+
|
|
130
|
+
Install dependencies manually if needed:
|
|
131
|
+
|
|
132
|
+
```bash
|
|
133
|
+
pip install torch numpy faiss-cpu transformers tqdm
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
---
|
|
137
|
+
|
|
138
|
+
# License
|
|
139
|
+
|
|
140
|
+
This project is licensed under the **MIT License**.
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "semantic-reducer"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
authors = [
|
|
9
|
+
{ name="Your Name", email="your.email@example.com" },
|
|
10
|
+
]
|
|
11
|
+
description = "O(1) semantic vocabulary reduction using BERT and FAISS."
|
|
12
|
+
readme = "README.md"
|
|
13
|
+
requires-python = ">=3.8"
|
|
14
|
+
license = { text = "MIT License" }
|
|
15
|
+
keywords = ["nlp", "bert", "faiss", "embeddings", "text-reduction", "machine-learning"]
|
|
16
|
+
|
|
17
|
+
# Classifiers help users filter and find your project on PyPI
|
|
18
|
+
classifiers = [
|
|
19
|
+
"Development Status :: 3 - Alpha",
|
|
20
|
+
"Intended Audience :: Developers",
|
|
21
|
+
"Intended Audience :: Science/Research",
|
|
22
|
+
"License :: OSI Approved :: MIT License",
|
|
23
|
+
"Operating System :: OS Independent",
|
|
24
|
+
"Programming Language :: Python :: 3",
|
|
25
|
+
"Programming Language :: Python :: 3.8",
|
|
26
|
+
"Programming Language :: Python :: 3.9",
|
|
27
|
+
"Programming Language :: Python :: 3.10",
|
|
28
|
+
"Programming Language :: Python :: 3.11",
|
|
29
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
30
|
+
"Topic :: Text Processing :: Linguistic",
|
|
31
|
+
]
|
|
32
|
+
|
|
33
|
+
# The libraries installed automatically when someone pip installs your package
|
|
34
|
+
dependencies = [
|
|
35
|
+
"torch>=1.10.0",
|
|
36
|
+
"numpy>=1.21.0",
|
|
37
|
+
"faiss-cpu>=1.7.0",
|
|
38
|
+
"tqdm>=4.60.0",
|
|
39
|
+
"transformers>=4.20.0"
|
|
40
|
+
]
|
|
41
|
+
|
|
42
|
+
# These links will appear on the left sidebar of your PyPI page
|
|
43
|
+
[project.urls]
|
|
44
|
+
Homepage = "https://github.com/abkafi1234/Semantic-Reducer"
|
|
45
|
+
Repository = "https://github.com/abkafi1234/Semantic-Reducer"
|
|
46
|
+
"Bug Tracker" = "https://github.com/abkafi1234/Semantic-Reducer/issues"
|
|
47
|
+
|
|
48
|
+
# Explicitly tells the build tool to look inside the src/ folder for your code
|
|
49
|
+
[tool.setuptools.packages.find]
|
|
50
|
+
where = ["src"]
|
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
import torch
|
|
2
|
+
import numpy as np
|
|
3
|
+
import faiss
|
|
4
|
+
import pickle
|
|
5
|
+
import os
|
|
6
|
+
from tqdm import tqdm
|
|
7
|
+
from collections import defaultdict, Counter
|
|
8
|
+
from transformers import AutoTokenizer, AutoModel
|
|
9
|
+
|
|
10
|
+
class SemanticReducer:
    """Reduce a corpus vocabulary by mapping words to frequent semantic equivalents.

    Pipeline: BERT contextual embeddings -> FAISS cosine-similarity search ->
    a plain ``dict`` (``reduction_map``) used for O(1) per-word lookups at
    inference time. After ``build_reduction_map`` (or ``load_system``), neither
    BERT nor FAISS is needed to call ``reduce_text``.
    """

    def __init__(self, model_name='bert-base-multilingual-cased', cache_dir='./cache'):
        """Load the tokenizer/model and initialize empty accumulators.

        Args:
            model_name: Hugging Face model id. Must support a *fast* tokenizer,
                which provides the subword-to-word alignment via ``word_ids``.
            cache_dir: Directory where ``save_system``/``load_system`` store the
                pickled reduction map. Created if missing.
        """
        self.model_name = model_name
        self.cache_dir = cache_dir
        os.makedirs(cache_dir, exist_ok=True)

        # A fast tokenizer is required: only BatchEncoding from a fast
        # tokenizer exposes .word_ids() for subword->word aggregation.
        self.tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
        self.model = AutoModel.from_pretrained(model_name)
        self.model.eval()  # inference only: disable dropout etc.

        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model.to(self.device)

        # Running sums and occurrence counts used to average each word's
        # contextual embeddings across the whole corpus.
        self.word_counts = Counter()
        self.word_embeddings_sum = defaultdict(lambda: np.zeros(self.model.config.hidden_size))
        self.vocab_embeddings = None  # (vocab, hidden) float32; set by finalize_embeddings()
        self.vocab_list = []
        self.index = None             # FAISS index; set by build_index()
        self.reduction_map = {}       # word -> canonical word; O(1) inference dictionary

    def process_corpus_contextually(self, sentences, batch_size=16):
        """Accumulate sense-averaged contextual embeddings for every corpus word.

        Each whitespace-delimited word receives the mean of its subword vectors
        for every occurrence; occurrences are summed into
        ``word_embeddings_sum`` and counted in ``word_counts`` so that
        ``finalize_embeddings`` can later average them.

        Args:
            sentences: Iterable of raw sentence strings.
            batch_size: Number of sentences encoded per forward pass.
        """
        print("Extracting contextual embeddings from corpus...")

        # Pre-tokenize on whitespace so word boundaries are defined by the
        # corpus, not by BERT's subword vocabulary.
        split_sentences = [str(sent).strip().split() for sent in sentences]

        for i in tqdm(range(0, len(split_sentences), batch_size), desc="Processing Batches"):
            batch = split_sentences[i:i + batch_size]

            # Tokenize with word alignment. Keep the BatchEncoding object:
            # its .word_ids() method is needed below.
            inputs = self.tokenizer(batch, is_split_into_words=True, return_tensors='pt',
                                    padding=True, truncation=True, max_length=512)

            # Move tensors to the device for the model WITHOUT overwriting
            # 'inputs' — a plain dict would lose the word_ids() method.
            model_inputs = {k: v.to(self.device) for k, v in inputs.items()}

            with torch.no_grad():
                outputs = self.model(**model_inputs)

            # Use the final hidden layer as the contextual representation.
            hidden_states = outputs.last_hidden_state.cpu().numpy()

            # Aggregate subword vectors back into whole-word vectors.
            for batch_idx in range(len(batch)):
                word_ids = inputs.word_ids(batch_index=batch_idx)
                original_words = batch[batch_idx]

                # Group subword embeddings by the word they belong to.
                word_embeddings_temp = defaultdict(list)
                for seq_idx, word_idx in enumerate(word_ids):
                    if word_idx is not None:  # skip [CLS], [SEP], [PAD]
                        word_embeddings_temp[word_idx].append(hidden_states[batch_idx, seq_idx, :])

                # Mean over subwords = this occurrence's word embedding;
                # add it into the running corpus-wide sum.
                for word_idx, embeddings in word_embeddings_temp.items():
                    # Safeguard against out-of-bounds ids (e.g. truncation artifacts).
                    if word_idx < len(original_words):
                        word = original_words[word_idx]
                        avg_word_emb = np.mean(embeddings, axis=0)

                        self.word_embeddings_sum[word] += avg_word_emb
                        self.word_counts[word] += 1

    def finalize_embeddings(self):
        """Average the accumulated contextual embeddings into static vectors.

        Produces ``vocab_list`` and the L2-normalized ``vocab_embeddings``
        matrix (one row per distinct word).

        Raises:
            RuntimeError: If no corpus has been processed yet.
        """
        if not self.word_counts:
            raise RuntimeError(
                "No corpus processed; call process_corpus_contextually() first.")
        print("Finalizing sense-averaged embeddings...")
        self.vocab_list = list(self.word_counts.keys())
        self.vocab_embeddings = np.zeros((len(self.vocab_list), self.model.config.hidden_size), dtype=np.float32)

        for idx, word in enumerate(self.vocab_list):
            # Mean contextual embedding = occurrence sum / occurrence count.
            self.vocab_embeddings[idx] = self.word_embeddings_sum[word] / self.word_counts[word]

        # L2-normalize so FAISS inner product equals cosine similarity.
        faiss.normalize_L2(self.vocab_embeddings)

    def build_index(self):
        """Build the FAISS index over the finalized vocabulary embeddings.

        Uses an inner-product index, which is cosine similarity because the
        vectors were L2-normalized in ``finalize_embeddings``.

        Raises:
            RuntimeError: If ``finalize_embeddings`` has not been called.
        """
        if self.vocab_embeddings is None:
            raise RuntimeError(
                "Embeddings not finalized; call finalize_embeddings() first.")
        dim = self.vocab_embeddings.shape[1]
        self.index = faiss.IndexFlatIP(dim)  # cosine similarity on L2-normalized vectors
        self.index.add(self.vocab_embeddings)
        print(f"Built FAISS index with {self.index.ntotal} vectors of dimension {dim}.")

    def build_reduction_map(self, threshold=0.9, top_k=10):
        """Create the word -> canonical-word mapping dictionary.

        For each word, its semantic neighbors (cosine similarity >= threshold)
        are collected and the most frequent one in the corpus becomes the
        canonical replacement; ties break toward the shorter word. Words with
        no qualifying neighbor map to themselves.

        Args:
            threshold: Minimum cosine similarity for a neighbor to qualify.
            top_k: Number of nearest neighbors to consider per word.

        Raises:
            RuntimeError: If the FAISS index has not been built.
        """
        if self.index is None:
            raise RuntimeError("Index not built; call build_index() first.")
        print(f"Building semantic reduction map (threshold={threshold})...")
        self.reduction_map = {}

        # FAISS pads results with index -1 when k exceeds the index size;
        # clamp so every returned (distance, index) pair is real.
        top_k = min(top_k, self.index.ntotal)

        # Search all words against the index at once for efficiency.
        distances, indices = self.index.search(self.vocab_embeddings, top_k)

        for i, word in enumerate(tqdm(self.vocab_list, desc="Mapping Vocabulary")):
            candidates = []
            for j in range(top_k):
                dist = distances[i][j]
                neighbor_idx = indices[i][j]

                if dist >= threshold:
                    neighbor_word = self.vocab_list[neighbor_idx]
                    candidates.append(neighbor_word)

            if candidates:
                # Rank by corpus frequency (descending); on ties, prefer the
                # shorter word as a secondary heuristic.
                candidates.sort(key=lambda w: (self.word_counts[w], -len(w)), reverse=True)

                # The most frequent semantic neighbor becomes the canonical token.
                self.reduction_map[word] = candidates[0]
            else:
                self.reduction_map[word] = word

    def reduce_text(self, text):
        """Reduce new text via O(1) dictionary lookups per word.

        Unknown words pass through unchanged; whitespace is collapsed to
        single spaces by the split/join round trip.
        """
        words = text.split()
        return ' '.join([self.reduction_map.get(w, w) for w in words])

    def save_system(self, prefix="semred"):
        """Pickle the reduction map — the only artifact needed for inference."""
        file_path = os.path.join(self.cache_dir, f"{prefix}_map.pkl")
        with open(file_path, 'wb') as f:
            pickle.dump(self.reduction_map, f)
        print(f"System saved successfully to {file_path}")

    def load_system(self, prefix="semred"):
        """Load a previously saved mapping dictionary for fast inference.

        NOTE(review): uses pickle — only load files you created yourself;
        unpickling untrusted data can execute arbitrary code.

        Raises:
            FileNotFoundError: If no saved map exists under ``cache_dir``.
        """
        file_path = os.path.join(self.cache_dir, f"{prefix}_map.pkl")
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"Could not find a saved system at {file_path}")

        with open(file_path, 'rb') as f:
            self.reduction_map = pickle.load(f)
        print(f"System loaded successfully from {file_path}")
|
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: semantic-reducer
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: O(1) semantic vocabulary reduction using BERT and FAISS.
|
|
5
|
+
Author-email: Your Name <your.email@example.com>
|
|
6
|
+
License: MIT License
|
|
7
|
+
Project-URL: Homepage, https://github.com/abkafi1234/Semantic-Reducer
|
|
8
|
+
Project-URL: Repository, https://github.com/abkafi1234/Semantic-Reducer
|
|
9
|
+
Project-URL: Bug Tracker, https://github.com/abkafi1234/Semantic-Reducer
|
|
10
|
+
Keywords: nlp,bert,faiss,embeddings,text-reduction,machine-learning
|
|
11
|
+
Classifier: Development Status :: 3 - Alpha
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: Intended Audience :: Science/Research
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Operating System :: OS Independent
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
21
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
22
|
+
Classifier: Topic :: Text Processing :: Linguistic
|
|
23
|
+
Requires-Python: >=3.8
|
|
24
|
+
Description-Content-Type: text/markdown
|
|
25
|
+
License-File: LICENSE
|
|
26
|
+
Requires-Dist: torch>=1.10.0
|
|
27
|
+
Requires-Dist: numpy>=1.21.0
|
|
28
|
+
Requires-Dist: faiss-cpu>=1.7.0
|
|
29
|
+
Requires-Dist: tqdm>=4.60.0
|
|
30
|
+
Requires-Dist: transformers>=4.20.0
|
|
31
|
+
Dynamic: license-file
|
|
32
|
+
|
|
33
|
+
# Semantic Reducer
|
|
34
|
+
|
|
35
|
+
A high-performance Python library that bridges the gap between **Deep Learning** and **traditional Machine Learning**.
|
|
36
|
+
|
|
37
|
+
`semantic-reducer` uses **BERT contextual embeddings** and **FAISS vector search** to semantically normalize and reduce the vocabulary of a text corpus.
|
|
38
|
+
|
|
39
|
+
By mapping semantically equivalent words (for example: *fast*, *quick*, *rapid*) to a single, highly frequent canonical word, it creates cleaner and denser input for traditional ML models such as **TF-IDF + SVM**, **Random Forest**, or **Logistic Regression**.
|
|
40
|
+
|
|
41
|
+
After the initial processing stage, inference on new text requires only a dictionary lookup, enabling **O(1) runtime complexity**.
|
|
42
|
+
|
|
43
|
+
---
|
|
44
|
+
|
|
45
|
+
# Features
|
|
46
|
+
|
|
47
|
+
- **Context-Aware Processing**
|
|
48
|
+
Uses Transformer models (default: `bert-base-multilingual-cased`) to capture contextual meaning instead of relying on static embeddings.
|
|
49
|
+
|
|
50
|
+
- **Smart Vocabulary Reduction**
|
|
51
|
+
Automatically replaces rare or obscure words with their most frequent semantic equivalents.
|
|
52
|
+
|
|
53
|
+
- **O(1) Inference Speed**
|
|
54
|
+
After training, the BERT and FAISS pipeline is no longer required. Text reduction happens via a lightweight Python dictionary.
|
|
55
|
+
|
|
56
|
+
- **Production Ready**
|
|
57
|
+
Supports saving and loading compiled reduction maps for deployment.
|
|
58
|
+
|
|
59
|
+
- **Hardware Agnostic**
|
|
60
|
+
Automatically detects and uses GPU (CUDA) if available, while remaining fully functional on CPU.
|
|
61
|
+
|
|
62
|
+
---
|
|
63
|
+
|
|
64
|
+
# Installation
|
|
65
|
+
|
|
66
|
+
Install via pip:
|
|
67
|
+
|
|
68
|
+
```bash
|
|
69
|
+
pip install semantic-reducer
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
---
|
|
73
|
+
|
|
74
|
+
# Quick Start
|
|
75
|
+
|
|
76
|
+
## 1. Training and Building the Reduction Map
|
|
77
|
+
|
|
78
|
+
```python
|
|
79
|
+
from semantic_reducer import SemanticReducer
|
|
80
|
+
|
|
81
|
+
# Initialize the reducer
|
|
82
|
+
# (downloads the BERT model on first run)
|
|
83
|
+
reducer = SemanticReducer()
|
|
84
|
+
|
|
85
|
+
# Training corpus
|
|
86
|
+
corpus = [
|
|
87
|
+
"The quick brown fox jumps over the lazy dog.",
|
|
88
|
+
"A fast dark-colored canine leaps above a tired hound.",
|
|
89
|
+
"Speedy foxes jump over sleepy dogs."
|
|
90
|
+
]
|
|
91
|
+
|
|
92
|
+
# Extract contextual representations
|
|
93
|
+
reducer.process_corpus_contextually(corpus, batch_size=2)
|
|
94
|
+
|
|
95
|
+
# Finalize embeddings and create FAISS index
|
|
96
|
+
reducer.finalize_embeddings()
|
|
97
|
+
reducer.build_index()
|
|
98
|
+
|
|
99
|
+
# Build the semantic reduction dictionary
|
|
100
|
+
reducer.build_reduction_map(
|
|
101
|
+
threshold=0.85, # similarity threshold
|
|
102
|
+
top_k=5 # number of neighbors to consider
|
|
103
|
+
)
|
|
104
|
+
|
|
105
|
+
# Save the trained system
|
|
106
|
+
reducer.save_system(prefix="my_corpus")
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
---
|
|
110
|
+
|
|
111
|
+
## 2. Lightning-Fast Inference
|
|
112
|
+
|
|
113
|
+
Once the reduction map is built, **BERT is no longer required**. New text is processed through a dictionary lookup.
|
|
114
|
+
|
|
115
|
+
```python
|
|
116
|
+
from semantic_reducer import SemanticReducer
|
|
117
|
+
|
|
118
|
+
# Initialize reducer
|
|
119
|
+
reducer = SemanticReducer()
|
|
120
|
+
|
|
121
|
+
# Load saved system
|
|
122
|
+
reducer.load_system(prefix="my_corpus")
|
|
123
|
+
|
|
124
|
+
# Reduce new text instantly
|
|
125
|
+
new_text = "The rapid fox leaped."
|
|
126
|
+
reduced_text = reducer.reduce_text(new_text)
|
|
127
|
+
|
|
128
|
+
print(f"Original: {new_text}")
|
|
129
|
+
print(f"Reduced: {reduced_text}")
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
---
|
|
133
|
+
|
|
134
|
+
# Workflow Overview
|
|
135
|
+
|
|
136
|
+
1. **Contextual Encoding**
|
|
137
|
+
Extract contextual word embeddings using BERT.
|
|
138
|
+
|
|
139
|
+
2. **Vector Indexing**
|
|
140
|
+
Store embeddings in a FAISS similarity search index.
|
|
141
|
+
|
|
142
|
+
3. **Semantic Clustering**
|
|
143
|
+
Identify semantically similar words.
|
|
144
|
+
|
|
145
|
+
4. **Canonical Replacement**
|
|
146
|
+
Replace each cluster with the most frequent word in the corpus.
|
|
147
|
+
|
|
148
|
+
5. **Dictionary Compilation**
|
|
149
|
+
Store the final mappings for fast inference.
|
|
150
|
+
|
|
151
|
+
---
|
|
152
|
+
|
|
153
|
+
# Requirements
|
|
154
|
+
|
|
155
|
+
- Python ≥ 3.8
|
|
156
|
+
- torch
|
|
157
|
+
- numpy
|
|
158
|
+
- faiss-cpu
|
|
159
|
+
- transformers
|
|
160
|
+
- tqdm
|
|
161
|
+
|
|
162
|
+
Install dependencies manually if needed:
|
|
163
|
+
|
|
164
|
+
```bash
|
|
165
|
+
pip install torch numpy faiss-cpu transformers tqdm
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
---
|
|
169
|
+
|
|
170
|
+
# License
|
|
171
|
+
|
|
172
|
+
This project is licensed under the **MIT License**.
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
README.md
|
|
3
|
+
pyproject.toml
|
|
4
|
+
src/semantic_reducer/__init__.py
|
|
5
|
+
src/semantic_reducer/reducer.py
|
|
6
|
+
src/semantic_reducer.egg-info/PKG-INFO
|
|
7
|
+
src/semantic_reducer.egg-info/SOURCES.txt
|
|
8
|
+
src/semantic_reducer.egg-info/dependency_links.txt
|
|
9
|
+
src/semantic_reducer.egg-info/requires.txt
|
|
10
|
+
src/semantic_reducer.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
semantic_reducer
|