sentence-embedder 1.0.0.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sentence_embedder-1.0.0/PKG-INFO +206 -0
- sentence_embedder-1.0.0/README.md +167 -0
- sentence_embedder-1.0.0/pyproject.toml +63 -0
- sentence_embedder-1.0.0/sentence_embedder/__init__.py +23 -0
- sentence_embedder-1.0.0/sentence_embedder/cache.py +127 -0
- sentence_embedder-1.0.0/sentence_embedder/embedder.py +229 -0
- sentence_embedder-1.0.0/sentence_embedder.egg-info/PKG-INFO +206 -0
- sentence_embedder-1.0.0/sentence_embedder.egg-info/SOURCES.txt +11 -0
- sentence_embedder-1.0.0/sentence_embedder.egg-info/dependency_links.txt +1 -0
- sentence_embedder-1.0.0/sentence_embedder.egg-info/requires.txt +19 -0
- sentence_embedder-1.0.0/sentence_embedder.egg-info/top_level.txt +1 -0
- sentence_embedder-1.0.0/setup.cfg +4 -0
- sentence_embedder-1.0.0/tests/test_embedder.py +137 -0
|
@@ -0,0 +1,206 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: sentence-embedder
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Lightweight Python library for sentence/semantic embeddings — SentenceTransformers & OpenAI in one unified API.
|
|
5
|
+
Author-email: Your Name <you@example.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/your-org/sentence-embedder
|
|
8
|
+
Project-URL: Repository, https://github.com/your-org/sentence-embedder
|
|
9
|
+
Project-URL: Bug Tracker, https://github.com/your-org/sentence-embedder/issues
|
|
10
|
+
Project-URL: Changelog, https://github.com/your-org/sentence-embedder/blob/main/CHANGELOG.md
|
|
11
|
+
Keywords: embeddings,nlp,sentence-transformers,semantic-search,openai,machine-learning
|
|
12
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: Intended Audience :: Science/Research
|
|
15
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
21
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
22
|
+
Requires-Python: >=3.10
|
|
23
|
+
Description-Content-Type: text/markdown
|
|
24
|
+
Requires-Dist: numpy>=1.24
|
|
25
|
+
Provides-Extra: st
|
|
26
|
+
Requires-Dist: sentence-transformers>=2.7; extra == "st"
|
|
27
|
+
Provides-Extra: openai
|
|
28
|
+
Requires-Dist: openai>=1.0; extra == "openai"
|
|
29
|
+
Provides-Extra: all
|
|
30
|
+
Requires-Dist: sentence-transformers>=2.7; extra == "all"
|
|
31
|
+
Requires-Dist: openai>=1.0; extra == "all"
|
|
32
|
+
Provides-Extra: dev
|
|
33
|
+
Requires-Dist: pytest>=7; extra == "dev"
|
|
34
|
+
Requires-Dist: pytest-cov; extra == "dev"
|
|
35
|
+
Requires-Dist: ruff; extra == "dev"
|
|
36
|
+
Requires-Dist: mypy; extra == "dev"
|
|
37
|
+
Requires-Dist: build; extra == "dev"
|
|
38
|
+
Requires-Dist: twine; extra == "dev"
|
|
39
|
+
|
|
40
|
+
# sentence-embedder
|
|
41
|
+
|
|
42
|
+
A lightweight **internal Python library** for generating sentence/semantic embeddings with a clean, unified API.
|
|
43
|
+
|
|
44
|
+
---
|
|
45
|
+
|
|
46
|
+
## Features
|
|
47
|
+
|
|
48
|
+
| Feature | Details |
|
|
49
|
+
|---|---|
|
|
50
|
+
| **Backends** | SentenceTransformers (local, offline) · OpenAI API |
|
|
51
|
+
| **Single & batch** | `embed()` and `embed_batch()` |
|
|
52
|
+
| **Similarity** | Cosine similarity between two sentences |
|
|
53
|
+
| **Semantic search** | `most_similar()` over an in-memory corpus |
|
|
54
|
+
| **Disk cache** | `EmbeddingCache` — skip re-embedding seen texts |
|
|
55
|
+
| **Typed** | Full type hints, compatible with `mypy` |
|
|
56
|
+
|
|
57
|
+
---
|
|
58
|
+
|
|
59
|
+
## Installation
|
|
60
|
+
|
|
61
|
+
```bash
|
|
62
|
+
# Clone / copy this package into your project, then:
|
|
63
|
+
|
|
64
|
+
# SentenceTransformers backend (local, recommended)
|
|
65
|
+
pip install -e ".[st]"
|
|
66
|
+
|
|
67
|
+
# OpenAI backend
|
|
68
|
+
pip install -e ".[openai]"
|
|
69
|
+
|
|
70
|
+
# Both
|
|
71
|
+
pip install -e ".[all]"
|
|
72
|
+
|
|
73
|
+
# With dev tools (pytest, ruff, mypy)
|
|
74
|
+
pip install -e ".[all,dev]"
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
---
|
|
78
|
+
|
|
79
|
+
## Quick Start
|
|
80
|
+
|
|
81
|
+
### 1 — Basic embedding
|
|
82
|
+
|
|
83
|
+
```python
|
|
84
|
+
from sentence_embedder import SentenceEmbedder
|
|
85
|
+
|
|
86
|
+
embedder = SentenceEmbedder() # defaults: SentenceTransformers, all-MiniLM-L6-v2
|
|
87
|
+
|
|
88
|
+
vec = embedder.embed("The cat sat on the mat.")
|
|
89
|
+
print(vec.shape) # (384,)
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
### 2 — Batch embedding
|
|
93
|
+
|
|
94
|
+
```python
|
|
95
|
+
texts = ["Hello world", "Machine learning is fun", "I love Python"]
|
|
96
|
+
vecs = embedder.embed_batch(texts)
|
|
97
|
+
print(vecs.shape) # (3, 384)
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
### 3 — Cosine similarity
|
|
101
|
+
|
|
102
|
+
```python
|
|
103
|
+
score = embedder.similarity("fast car", "quick automobile")
|
|
104
|
+
print(score) # ~0.85
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
### 4 — Semantic search
|
|
108
|
+
|
|
109
|
+
```python
|
|
110
|
+
corpus = [
|
|
111
|
+
"The stock market crashed today.",
|
|
112
|
+
"Scientists discover a new planet.",
|
|
113
|
+
"Football team wins the championship.",
|
|
114
|
+
"A new AI model beats human performance.",
|
|
115
|
+
]
|
|
116
|
+
|
|
117
|
+
results = embedder.most_similar("breakthrough in artificial intelligence", corpus, top_k=2)
|
|
118
|
+
for sentence, score in results:
|
|
119
|
+
print(f"{score:.3f} {sentence}")
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
### 5 — Disk cache (avoid re-embedding)
|
|
123
|
+
|
|
124
|
+
```python
|
|
125
|
+
from sentence_embedder import SentenceEmbedder, EmbeddingCache
|
|
126
|
+
|
|
127
|
+
base = SentenceEmbedder()
|
|
128
|
+
embedder = EmbeddingCache(base, cache_dir=".cache/embeddings")
|
|
129
|
+
|
|
130
|
+
vec = embedder.embed("Hello world") # computed → stored on disk
|
|
131
|
+
vec = embedder.embed("Hello world") # loaded from cache instantly
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
---
|
|
135
|
+
|
|
136
|
+
## OpenAI Backend
|
|
137
|
+
|
|
138
|
+
```python
|
|
139
|
+
from sentence_embedder import SentenceEmbedder
|
|
140
|
+
|
|
141
|
+
embedder = SentenceEmbedder(
|
|
142
|
+
backend="openai",
|
|
143
|
+
model_name="text-embedding-3-small",
|
|
144
|
+
openai_api_key="sk-...", # or set OPENAI_API_KEY env var
|
|
145
|
+
)
|
|
146
|
+
|
|
147
|
+
vec = embedder.embed("Hello from OpenAI!")
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
---
|
|
151
|
+
|
|
152
|
+
## API Reference
|
|
153
|
+
|
|
154
|
+
### `SentenceEmbedder`
|
|
155
|
+
|
|
156
|
+
| Method | Signature | Description |
|
|
157
|
+
|---|---|---|
|
|
158
|
+
| `embed` | `(text: str) → ndarray` | Embed one sentence → 1-D vector |
|
|
159
|
+
| `embed_batch` | `(texts: List[str]) → ndarray` | Embed many → 2-D array `(N, dim)` |
|
|
160
|
+
| `similarity` | `(a: str, b: str) → float` | Cosine similarity in `[-1, 1]` |
|
|
161
|
+
| `most_similar` | `(query, corpus, top_k=5) → List[tuple]` | Ranked `(sentence, score)` pairs |
|
|
162
|
+
|
|
163
|
+
### `EmbeddingCache`
|
|
164
|
+
|
|
165
|
+
| Method | Signature | Description |
|
|
166
|
+
|---|---|---|
|
|
167
|
+
| `embed` | `(text: str) → ndarray` | Cache-aware single embed |
|
|
168
|
+
| `embed_batch` | `(texts: List[str]) → ndarray` | Cache-aware batch embed |
|
|
169
|
+
| `clear` | `() → None` | Wipe the cache database |
|
|
170
|
+
|
|
171
|
+
---
|
|
172
|
+
|
|
173
|
+
## Running Tests
|
|
174
|
+
|
|
175
|
+
```bash
|
|
176
|
+
pytest
|
|
177
|
+
# With coverage:
|
|
178
|
+
pytest --cov=sentence_embedder --cov-report=term-missing
|
|
179
|
+
```
|
|
180
|
+
|
|
181
|
+
---
|
|
182
|
+
|
|
183
|
+
## Package Structure
|
|
184
|
+
|
|
185
|
+
```
|
|
186
|
+
sentence_embedder/
|
|
187
|
+
├── sentence_embedder/
|
|
188
|
+
│ ├── __init__.py # Public exports
|
|
189
|
+
│ ├── embedder.py # SentenceEmbedder (core)
|
|
190
|
+
│ └── cache.py # EmbeddingCache (disk cache)
|
|
191
|
+
├── tests/
|
|
192
|
+
│ └── test_embedder.py # Unit tests (no model/network needed)
|
|
193
|
+
├── pyproject.toml # Build config & dependencies
|
|
194
|
+
└── README.md
|
|
195
|
+
```
|
|
196
|
+
|
|
197
|
+
---
|
|
198
|
+
|
|
199
|
+
## Choosing a Model
|
|
200
|
+
|
|
201
|
+
| Model | Dim | Speed | Quality | Use case |
|
|
202
|
+
|---|---|---|---|---|
|
|
203
|
+
| `all-MiniLM-L6-v2` | 384 | ⚡⚡⚡ | ★★★ | Default, general purpose |
|
|
204
|
+
| `all-mpnet-base-v2` | 768 | ⚡⚡ | ★★★★ | Higher quality |
|
|
205
|
+
| `multi-qa-MiniLM-L6-cos-v1` | 384 | ⚡⚡⚡ | ★★★★ | Q&A / search |
|
|
206
|
+
| `text-embedding-3-small` *(OpenAI)* | 1536 | API | ★★★★★ | Best quality, needs key |
|
|
@@ -0,0 +1,167 @@
|
|
|
1
|
+
# sentence-embedder
|
|
2
|
+
|
|
3
|
+
A lightweight **internal Python library** for generating sentence/semantic embeddings with a clean, unified API.
|
|
4
|
+
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
## Features
|
|
8
|
+
|
|
9
|
+
| Feature | Details |
|
|
10
|
+
|---|---|
|
|
11
|
+
| **Backends** | SentenceTransformers (local, offline) · OpenAI API |
|
|
12
|
+
| **Single & batch** | `embed()` and `embed_batch()` |
|
|
13
|
+
| **Similarity** | Cosine similarity between two sentences |
|
|
14
|
+
| **Semantic search** | `most_similar()` over an in-memory corpus |
|
|
15
|
+
| **Disk cache** | `EmbeddingCache` — skip re-embedding seen texts |
|
|
16
|
+
| **Typed** | Full type hints, compatible with `mypy` |
|
|
17
|
+
|
|
18
|
+
---
|
|
19
|
+
|
|
20
|
+
## Installation
|
|
21
|
+
|
|
22
|
+
```bash
|
|
23
|
+
# Clone / copy this package into your project, then:
|
|
24
|
+
|
|
25
|
+
# SentenceTransformers backend (local, recommended)
|
|
26
|
+
pip install -e ".[st]"
|
|
27
|
+
|
|
28
|
+
# OpenAI backend
|
|
29
|
+
pip install -e ".[openai]"
|
|
30
|
+
|
|
31
|
+
# Both
|
|
32
|
+
pip install -e ".[all]"
|
|
33
|
+
|
|
34
|
+
# With dev tools (pytest, ruff, mypy)
|
|
35
|
+
pip install -e ".[all,dev]"
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
---
|
|
39
|
+
|
|
40
|
+
## Quick Start
|
|
41
|
+
|
|
42
|
+
### 1 — Basic embedding
|
|
43
|
+
|
|
44
|
+
```python
|
|
45
|
+
from sentence_embedder import SentenceEmbedder
|
|
46
|
+
|
|
47
|
+
embedder = SentenceEmbedder() # defaults: SentenceTransformers, all-MiniLM-L6-v2
|
|
48
|
+
|
|
49
|
+
vec = embedder.embed("The cat sat on the mat.")
|
|
50
|
+
print(vec.shape) # (384,)
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
### 2 — Batch embedding
|
|
54
|
+
|
|
55
|
+
```python
|
|
56
|
+
texts = ["Hello world", "Machine learning is fun", "I love Python"]
|
|
57
|
+
vecs = embedder.embed_batch(texts)
|
|
58
|
+
print(vecs.shape) # (3, 384)
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
### 3 — Cosine similarity
|
|
62
|
+
|
|
63
|
+
```python
|
|
64
|
+
score = embedder.similarity("fast car", "quick automobile")
|
|
65
|
+
print(score) # ~0.85
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
### 4 — Semantic search
|
|
69
|
+
|
|
70
|
+
```python
|
|
71
|
+
corpus = [
|
|
72
|
+
"The stock market crashed today.",
|
|
73
|
+
"Scientists discover a new planet.",
|
|
74
|
+
"Football team wins the championship.",
|
|
75
|
+
"A new AI model beats human performance.",
|
|
76
|
+
]
|
|
77
|
+
|
|
78
|
+
results = embedder.most_similar("breakthrough in artificial intelligence", corpus, top_k=2)
|
|
79
|
+
for sentence, score in results:
|
|
80
|
+
print(f"{score:.3f} {sentence}")
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
### 5 — Disk cache (avoid re-embedding)
|
|
84
|
+
|
|
85
|
+
```python
|
|
86
|
+
from sentence_embedder import SentenceEmbedder, EmbeddingCache
|
|
87
|
+
|
|
88
|
+
base = SentenceEmbedder()
|
|
89
|
+
embedder = EmbeddingCache(base, cache_dir=".cache/embeddings")
|
|
90
|
+
|
|
91
|
+
vec = embedder.embed("Hello world") # computed → stored on disk
|
|
92
|
+
vec = embedder.embed("Hello world") # loaded from cache instantly
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
---
|
|
96
|
+
|
|
97
|
+
## OpenAI Backend
|
|
98
|
+
|
|
99
|
+
```python
|
|
100
|
+
from sentence_embedder import SentenceEmbedder
|
|
101
|
+
|
|
102
|
+
embedder = SentenceEmbedder(
|
|
103
|
+
backend="openai",
|
|
104
|
+
model_name="text-embedding-3-small",
|
|
105
|
+
openai_api_key="sk-...", # or set OPENAI_API_KEY env var
|
|
106
|
+
)
|
|
107
|
+
|
|
108
|
+
vec = embedder.embed("Hello from OpenAI!")
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
---
|
|
112
|
+
|
|
113
|
+
## API Reference
|
|
114
|
+
|
|
115
|
+
### `SentenceEmbedder`
|
|
116
|
+
|
|
117
|
+
| Method | Signature | Description |
|
|
118
|
+
|---|---|---|
|
|
119
|
+
| `embed` | `(text: str) → ndarray` | Embed one sentence → 1-D vector |
|
|
120
|
+
| `embed_batch` | `(texts: List[str]) → ndarray` | Embed many → 2-D array `(N, dim)` |
|
|
121
|
+
| `similarity` | `(a: str, b: str) → float` | Cosine similarity in `[-1, 1]` |
|
|
122
|
+
| `most_similar` | `(query, corpus, top_k=5) → List[tuple]` | Ranked `(sentence, score)` pairs |
|
|
123
|
+
|
|
124
|
+
### `EmbeddingCache`
|
|
125
|
+
|
|
126
|
+
| Method | Signature | Description |
|
|
127
|
+
|---|---|---|
|
|
128
|
+
| `embed` | `(text: str) → ndarray` | Cache-aware single embed |
|
|
129
|
+
| `embed_batch` | `(texts: List[str]) → ndarray` | Cache-aware batch embed |
|
|
130
|
+
| `clear` | `() → None` | Wipe the cache database |
|
|
131
|
+
|
|
132
|
+
---
|
|
133
|
+
|
|
134
|
+
## Running Tests
|
|
135
|
+
|
|
136
|
+
```bash
|
|
137
|
+
pytest
|
|
138
|
+
# With coverage:
|
|
139
|
+
pytest --cov=sentence_embedder --cov-report=term-missing
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
---
|
|
143
|
+
|
|
144
|
+
## Package Structure
|
|
145
|
+
|
|
146
|
+
```
|
|
147
|
+
sentence_embedder/
|
|
148
|
+
├── sentence_embedder/
|
|
149
|
+
│ ├── __init__.py # Public exports
|
|
150
|
+
│ ├── embedder.py # SentenceEmbedder (core)
|
|
151
|
+
│ └── cache.py # EmbeddingCache (disk cache)
|
|
152
|
+
├── tests/
|
|
153
|
+
│ └── test_embedder.py # Unit tests (no model/network needed)
|
|
154
|
+
├── pyproject.toml # Build config & dependencies
|
|
155
|
+
└── README.md
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
---
|
|
159
|
+
|
|
160
|
+
## Choosing a Model
|
|
161
|
+
|
|
162
|
+
| Model | Dim | Speed | Quality | Use case |
|
|
163
|
+
|---|---|---|---|---|
|
|
164
|
+
| `all-MiniLM-L6-v2` | 384 | ⚡⚡⚡ | ★★★ | Default, general purpose |
|
|
165
|
+
| `all-mpnet-base-v2` | 768 | ⚡⚡ | ★★★★ | Higher quality |
|
|
166
|
+
| `multi-qa-MiniLM-L6-cos-v1` | 384 | ⚡⚡⚡ | ★★★★ | Q&A / search |
|
|
167
|
+
| `text-embedding-3-small` *(OpenAI)* | 1536 | API | ★★★★★ | Best quality, needs key |
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "sentence-embedder"
|
|
7
|
+
version = "1.0.0"
|
|
8
|
+
description = "Lightweight Python library for sentence/semantic embeddings — SentenceTransformers & OpenAI in one unified API."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.10"
|
|
11
|
+
license = { text = "MIT" }
|
|
12
|
+
authors = [
|
|
13
|
+
{ name = "Your Name", email = "you@example.com" }
|
|
14
|
+
]
|
|
15
|
+
keywords = [
|
|
16
|
+
"embeddings", "nlp", "sentence-transformers",
|
|
17
|
+
"semantic-search", "openai", "machine-learning"
|
|
18
|
+
]
|
|
19
|
+
classifiers = [
|
|
20
|
+
"Development Status :: 5 - Production/Stable",
|
|
21
|
+
"Intended Audience :: Developers",
|
|
22
|
+
"Intended Audience :: Science/Research",
|
|
23
|
+
"License :: OSI Approved :: MIT License",
|
|
24
|
+
"Programming Language :: Python :: 3",
|
|
25
|
+
"Programming Language :: Python :: 3.10",
|
|
26
|
+
"Programming Language :: Python :: 3.11",
|
|
27
|
+
"Programming Language :: Python :: 3.12",
|
|
28
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
29
|
+
"Topic :: Software Development :: Libraries :: Python Modules",
|
|
30
|
+
]
|
|
31
|
+
|
|
32
|
+
dependencies = [
|
|
33
|
+
"numpy>=1.24",
|
|
34
|
+
]
|
|
35
|
+
|
|
36
|
+
[project.urls]
|
|
37
|
+
Homepage = "https://github.com/your-org/sentence-embedder"
|
|
38
|
+
Repository = "https://github.com/your-org/sentence-embedder"
|
|
39
|
+
"Bug Tracker" = "https://github.com/your-org/sentence-embedder/issues"
|
|
40
|
+
Changelog = "https://github.com/your-org/sentence-embedder/blob/main/CHANGELOG.md"
|
|
41
|
+
|
|
42
|
+
[project.optional-dependencies]
|
|
43
|
+
st = ["sentence-transformers>=2.7"]
|
|
44
|
+
openai = ["openai>=1.0"]
|
|
45
|
+
all = ["sentence-transformers>=2.7", "openai>=1.0"]
|
|
46
|
+
dev = ["pytest>=7", "pytest-cov", "ruff", "mypy", "build", "twine"]
|
|
47
|
+
|
|
48
|
+
[tool.setuptools.packages.find]
|
|
49
|
+
where = ["."]
|
|
50
|
+
include = ["sentence_embedder*"]
|
|
51
|
+
|
|
52
|
+
[tool.pytest.ini_options]
|
|
53
|
+
testpaths = ["tests"]
|
|
54
|
+
addopts = "-v --tb=short"
|
|
55
|
+
|
|
56
|
+
[tool.ruff]
|
|
57
|
+
line-length = 88
|
|
58
|
+
target-version = "py310"
|
|
59
|
+
|
|
60
|
+
[tool.mypy]
|
|
61
|
+
python_version = "3.10"
|
|
62
|
+
strict = false
|
|
63
|
+
ignore_missing_imports = true
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
"""
|
|
2
|
+
sentence_embedder
|
|
3
|
+
=================
|
|
4
|
+
A lightweight internal library for generating sentence/semantic embeddings.
|
|
5
|
+
|
|
6
|
+
Quick start
|
|
7
|
+
-----------
|
|
8
|
+
>>> from sentence_embedder import SentenceEmbedder
|
|
9
|
+
>>> embedder = SentenceEmbedder() # uses all-MiniLM-L6-v2
|
|
10
|
+
>>> vec = embedder.embed("The cat sat on the mat.")
|
|
11
|
+
>>> vec.shape
|
|
12
|
+
(384,)
|
|
13
|
+
|
|
14
|
+
>>> results = embedder.most_similar("fast car", ["racing vehicle", "slow snail", "quick automobile"])
|
|
15
|
+
>>> results[0]
|
|
16
|
+
('quick automobile', 0.87...)
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
from sentence_embedder.embedder import SentenceEmbedder
|
|
20
|
+
from sentence_embedder.cache import EmbeddingCache
|
|
21
|
+
|
|
22
|
+
__all__ = ["SentenceEmbedder", "EmbeddingCache"]
|
|
23
|
+
__version__ = "1.0.0"
|
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
"""
|
|
2
|
+
cache.py
|
|
3
|
+
--------
|
|
4
|
+
Optional disk-based embedding cache to avoid re-computing embeddings
|
|
5
|
+
for texts that have already been processed.
|
|
6
|
+
|
|
7
|
+
Uses a simple ``shelve`` database keyed by ``(model_name, text)``.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import hashlib
|
|
13
|
+
import logging
|
|
14
|
+
import shelve
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
from typing import List, Optional
|
|
17
|
+
|
|
18
|
+
import numpy as np
|
|
19
|
+
|
|
20
|
+
logger = logging.getLogger(__name__)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class EmbeddingCache:
|
|
24
|
+
"""
|
|
25
|
+
Transparent disk cache for embedding vectors.
|
|
26
|
+
|
|
27
|
+
Wrap any :class:`~sentence_embedder.SentenceEmbedder` with this class
|
|
28
|
+
to skip re-embedding texts that were already processed.
|
|
29
|
+
|
|
30
|
+
Args:
|
|
31
|
+
embedder: A :class:`~sentence_embedder.SentenceEmbedder` instance.
|
|
32
|
+
cache_dir (str | Path): Directory where the cache database is stored.
|
|
33
|
+
Created automatically if it does not exist.
|
|
34
|
+
|
|
35
|
+
Example:
|
|
36
|
+
>>> from sentence_embedder import SentenceEmbedder
|
|
37
|
+
>>> from sentence_embedder.cache import EmbeddingCache
|
|
38
|
+
>>> base = SentenceEmbedder()
|
|
39
|
+
>>> cached = EmbeddingCache(base, cache_dir=".cache/embeddings")
|
|
40
|
+
>>> vec = cached.embed("Hello world") # computed and stored
|
|
41
|
+
>>> vec = cached.embed("Hello world") # retrieved from cache
|
|
42
|
+
"""
|
|
43
|
+
|
|
44
|
+
def __init__(self, embedder, cache_dir: str | Path = ".cache/embeddings") -> None:
|
|
45
|
+
self._embedder = embedder
|
|
46
|
+
cache_path = Path(cache_dir)
|
|
47
|
+
cache_path.mkdir(parents=True, exist_ok=True)
|
|
48
|
+
self._db_path = str(cache_path / "embeddings")
|
|
49
|
+
logger.info("EmbeddingCache initialised at '%s'.", self._db_path)
|
|
50
|
+
|
|
51
|
+
# ------------------------------------------------------------------
|
|
52
|
+
# Public API (mirrors SentenceEmbedder)
|
|
53
|
+
# ------------------------------------------------------------------
|
|
54
|
+
|
|
55
|
+
def embed(self, text: str) -> np.ndarray:
|
|
56
|
+
"""
|
|
57
|
+
Embed a single text, using the cache when available.
|
|
58
|
+
|
|
59
|
+
Args:
|
|
60
|
+
text (str): Input text.
|
|
61
|
+
|
|
62
|
+
Returns:
|
|
63
|
+
np.ndarray: Embedding vector.
|
|
64
|
+
"""
|
|
65
|
+
key = self._make_key(text)
|
|
66
|
+
with shelve.open(self._db_path) as db:
|
|
67
|
+
if key in db:
|
|
68
|
+
logger.debug("Cache hit for text hash '%s'.", key)
|
|
69
|
+
return db[key]
|
|
70
|
+
|
|
71
|
+
vec = self._embedder.embed(text)
|
|
72
|
+
with shelve.open(self._db_path) as db:
|
|
73
|
+
db[key] = vec
|
|
74
|
+
return vec
|
|
75
|
+
|
|
76
|
+
def embed_batch(self, texts: List[str]) -> np.ndarray:
|
|
77
|
+
"""
|
|
78
|
+
Embed a list of texts, only calling the model for cache misses.
|
|
79
|
+
|
|
80
|
+
Args:
|
|
81
|
+
texts (List[str]): Input texts.
|
|
82
|
+
|
|
83
|
+
Returns:
|
|
84
|
+
np.ndarray: 2-D array of shape ``(len(texts), dim)``.
|
|
85
|
+
"""
|
|
86
|
+
keys = [self._make_key(t) for t in texts]
|
|
87
|
+
results: List[Optional[np.ndarray]] = [None] * len(texts)
|
|
88
|
+
missing_indices: List[int] = []
|
|
89
|
+
|
|
90
|
+
with shelve.open(self._db_path) as db:
|
|
91
|
+
for i, key in enumerate(keys):
|
|
92
|
+
if key in db:
|
|
93
|
+
results[i] = db[key]
|
|
94
|
+
else:
|
|
95
|
+
missing_indices.append(i)
|
|
96
|
+
|
|
97
|
+
if missing_indices:
|
|
98
|
+
missing_texts = [texts[i] for i in missing_indices]
|
|
99
|
+
new_vecs = self._embedder.embed_batch(missing_texts)
|
|
100
|
+
with shelve.open(self._db_path) as db:
|
|
101
|
+
for idx, vec in zip(missing_indices, new_vecs):
|
|
102
|
+
db[keys[idx]] = vec
|
|
103
|
+
results[idx] = vec
|
|
104
|
+
|
|
105
|
+
logger.debug(
|
|
106
|
+
"embed_batch: %d cached, %d computed.",
|
|
107
|
+
len(texts) - len(missing_indices),
|
|
108
|
+
len(missing_indices),
|
|
109
|
+
)
|
|
110
|
+
return np.stack(results)
|
|
111
|
+
|
|
112
|
+
def clear(self) -> None:
|
|
113
|
+
"""Delete all cached embeddings."""
|
|
114
|
+
import os
|
|
115
|
+
for suffix in ["", ".db", ".dir", ".bak", ".dat"]:
|
|
116
|
+
path = Path(self._db_path + suffix)
|
|
117
|
+
if path.exists():
|
|
118
|
+
os.remove(path)
|
|
119
|
+
logger.info("Embedding cache cleared.")
|
|
120
|
+
|
|
121
|
+
# ------------------------------------------------------------------
|
|
122
|
+
# Helpers
|
|
123
|
+
# ------------------------------------------------------------------
|
|
124
|
+
|
|
125
|
+
def _make_key(self, text: str) -> str:
|
|
126
|
+
payload = f"{self._embedder.model_name}::{text}"
|
|
127
|
+
return hashlib.sha256(payload.encode()).hexdigest()
|
|
@@ -0,0 +1,229 @@
|
|
|
1
|
+
"""
|
|
2
|
+
embedder.py
|
|
3
|
+
-----------
|
|
4
|
+
Core module for generating sentence/semantic embeddings.
|
|
5
|
+
Supports multiple backends: SentenceTransformers and OpenAI.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import logging
|
|
11
|
+
from typing import List
|
|
12
|
+
|
|
13
|
+
import numpy as np
|
|
14
|
+
|
|
15
|
+
logger = logging.getLogger(__name__)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class SentenceEmbedder:
|
|
19
|
+
"""
|
|
20
|
+
A unified interface for generating sentence/semantic embeddings.
|
|
21
|
+
|
|
22
|
+
Supports the following backends:
|
|
23
|
+
- ``"sentencetransformers"`` — Local HuggingFace models (default, offline-friendly)
|
|
24
|
+
- ``"openai"`` — OpenAI text-embedding-* API models
|
|
25
|
+
|
|
26
|
+
Args:
|
|
27
|
+
backend (str): Which embedding backend to use. One of
|
|
28
|
+
``"sentencetransformers"`` or ``"openai"``. Defaults to
|
|
29
|
+
``"sentencetransformers"``.
|
|
30
|
+
model_name (str): Model identifier.
|
|
31
|
+
- For SentenceTransformers: any model on HuggingFace Hub,
|
|
32
|
+
e.g. ``"all-MiniLM-L6-v2"`` (default).
|
|
33
|
+
- For OpenAI: e.g. ``"text-embedding-3-small"``.
|
|
34
|
+
openai_api_key (str | None): API key for OpenAI backend. If ``None``,
|
|
35
|
+
falls back to the ``OPENAI_API_KEY`` environment variable.
|
|
36
|
+
device (str): Torch device for SentenceTransformers, e.g. ``"cpu"``
|
|
37
|
+
or ``"cuda"``. Ignored for OpenAI backend.
|
|
38
|
+
|
|
39
|
+
Example:
|
|
40
|
+
>>> embedder = SentenceEmbedder()
|
|
41
|
+
>>> vec = embedder.embed("Hello world")
|
|
42
|
+
>>> vec.shape
|
|
43
|
+
(384,)
|
|
44
|
+
"""
|
|
45
|
+
|
|
46
|
+
SUPPORTED_BACKENDS = {"sentencetransformers", "openai"}
|
|
47
|
+
|
|
48
|
+
def __init__(
|
|
49
|
+
self,
|
|
50
|
+
backend: str = "sentencetransformers",
|
|
51
|
+
model_name: str = "all-MiniLM-L6-v2",
|
|
52
|
+
openai_api_key: str | None = None,
|
|
53
|
+
device: str = "cpu",
|
|
54
|
+
) -> None:
|
|
55
|
+
backend = backend.lower()
|
|
56
|
+
if backend not in self.SUPPORTED_BACKENDS:
|
|
57
|
+
raise ValueError(
|
|
58
|
+
f"Unsupported backend '{backend}'. "
|
|
59
|
+
f"Choose from: {self.SUPPORTED_BACKENDS}"
|
|
60
|
+
)
|
|
61
|
+
|
|
62
|
+
self.backend = backend
|
|
63
|
+
self.model_name = model_name
|
|
64
|
+
self.device = device
|
|
65
|
+
self._model = None
|
|
66
|
+
self._openai_client = None
|
|
67
|
+
|
|
68
|
+
if backend == "openai":
|
|
69
|
+
self._init_openai(openai_api_key)
|
|
70
|
+
else:
|
|
71
|
+
self._init_sentence_transformers()
|
|
72
|
+
|
|
73
|
+
# ------------------------------------------------------------------
|
|
74
|
+
# Initialisation helpers
|
|
75
|
+
# ------------------------------------------------------------------
|
|
76
|
+
|
|
77
|
+
def _init_sentence_transformers(self) -> None:
|
|
78
|
+
try:
|
|
79
|
+
from sentence_transformers import SentenceTransformer
|
|
80
|
+
except ImportError as exc:
|
|
81
|
+
raise ImportError(
|
|
82
|
+
"sentence-transformers is not installed. "
|
|
83
|
+
"Run: pip install sentence-transformers"
|
|
84
|
+
) from exc
|
|
85
|
+
|
|
86
|
+
logger.info("Loading SentenceTransformer model '%s'…", self.model_name)
|
|
87
|
+
self._model = SentenceTransformer(self.model_name, device=self.device)
|
|
88
|
+
logger.info("Model loaded.")
|
|
89
|
+
|
|
90
|
+
def _init_openai(self, api_key: str | None) -> None:
|
|
91
|
+
try:
|
|
92
|
+
from openai import OpenAI
|
|
93
|
+
except ImportError as exc:
|
|
94
|
+
raise ImportError(
|
|
95
|
+
"openai is not installed. Run: pip install openai"
|
|
96
|
+
) from exc
|
|
97
|
+
|
|
98
|
+
import os
|
|
99
|
+
|
|
100
|
+
key = api_key or os.getenv("OPENAI_API_KEY")
|
|
101
|
+
if not key:
|
|
102
|
+
raise EnvironmentError(
|
|
103
|
+
"OpenAI API key not provided. Pass openai_api_key= or set "
|
|
104
|
+
"the OPENAI_API_KEY environment variable."
|
|
105
|
+
)
|
|
106
|
+
self._openai_client = OpenAI(api_key=key)
|
|
107
|
+
logger.info("OpenAI client initialised with model '%s'.", self.model_name)
|
|
108
|
+
|
|
109
|
+
# ------------------------------------------------------------------
|
|
110
|
+
# Public API
|
|
111
|
+
# ------------------------------------------------------------------
|
|
112
|
+
|
|
113
|
+
def embed(self, text: str) -> np.ndarray:
|
|
114
|
+
"""
|
|
115
|
+
Embed a single sentence/document.
|
|
116
|
+
|
|
117
|
+
Args:
|
|
118
|
+
text (str): Input text to embed.
|
|
119
|
+
|
|
120
|
+
Returns:
|
|
121
|
+
np.ndarray: 1-D float32 embedding vector.
|
|
122
|
+
|
|
123
|
+
Raises:
|
|
124
|
+
ValueError: If ``text`` is empty.
|
|
125
|
+
"""
|
|
126
|
+
if not text or not text.strip():
|
|
127
|
+
raise ValueError("Input text must not be empty.")
|
|
128
|
+
return self.embed_batch([text])[0]
|
|
129
|
+
|
|
130
|
+
def embed_batch(self, texts: List[str]) -> np.ndarray:
|
|
131
|
+
"""
|
|
132
|
+
Embed a list of sentences/documents.
|
|
133
|
+
|
|
134
|
+
Args:
|
|
135
|
+
texts (List[str]): List of input strings. Empty strings are
|
|
136
|
+
rejected.
|
|
137
|
+
|
|
138
|
+
Returns:
|
|
139
|
+
np.ndarray: 2-D float32 array of shape ``(len(texts), dim)``.
|
|
140
|
+
|
|
141
|
+
Raises:
|
|
142
|
+
ValueError: If ``texts`` is empty or contains blank strings.
|
|
143
|
+
"""
|
|
144
|
+
if not texts:
|
|
145
|
+
raise ValueError("texts list must not be empty.")
|
|
146
|
+
if any(not t or not t.strip() for t in texts):
|
|
147
|
+
raise ValueError("texts list must not contain empty strings.")
|
|
148
|
+
|
|
149
|
+
if self.backend == "sentencetransformers":
|
|
150
|
+
return self._embed_st(texts)
|
|
151
|
+
return self._embed_openai(texts)
|
|
152
|
+
|
|
153
|
+
def similarity(self, a: str, b: str) -> float:
|
|
154
|
+
"""
|
|
155
|
+
Compute cosine similarity between two sentences.
|
|
156
|
+
|
|
157
|
+
Args:
|
|
158
|
+
a (str): First sentence.
|
|
159
|
+
b (str): Second sentence.
|
|
160
|
+
|
|
161
|
+
Returns:
|
|
162
|
+
float: Cosine similarity in [-1, 1].
|
|
163
|
+
"""
|
|
164
|
+
vecs = self.embed_batch([a, b])
|
|
165
|
+
return float(_cosine_similarity(vecs[0], vecs[1]))
|
|
166
|
+
|
|
167
|
+
def most_similar(
|
|
168
|
+
self,
|
|
169
|
+
query: str,
|
|
170
|
+
corpus: List[str],
|
|
171
|
+
top_k: int = 5,
|
|
172
|
+
) -> List[tuple[str, float]]:
|
|
173
|
+
"""
|
|
174
|
+
Return the ``top_k`` most semantically similar sentences from a corpus.
|
|
175
|
+
|
|
176
|
+
Args:
|
|
177
|
+
query (str): The query sentence.
|
|
178
|
+
corpus (List[str]): Pool of candidate sentences.
|
|
179
|
+
top_k (int): Number of results to return.
|
|
180
|
+
|
|
181
|
+
Returns:
|
|
182
|
+
List[tuple[str, float]]: Ranked list of ``(sentence, score)``
|
|
183
|
+
pairs, highest similarity first.
|
|
184
|
+
"""
|
|
185
|
+
if not corpus:
|
|
186
|
+
raise ValueError("corpus must not be empty.")
|
|
187
|
+
top_k = min(top_k, len(corpus))
|
|
188
|
+
|
|
189
|
+
query_vec = self.embed(query)
|
|
190
|
+
corpus_vecs = self.embed_batch(corpus)
|
|
191
|
+
|
|
192
|
+
scores = [
|
|
193
|
+
float(_cosine_similarity(query_vec, cv)) for cv in corpus_vecs
|
|
194
|
+
]
|
|
195
|
+
ranked = sorted(zip(corpus, scores), key=lambda x: x[1], reverse=True)
|
|
196
|
+
return ranked[:top_k]
|
|
197
|
+
|
|
198
|
+
# ------------------------------------------------------------------
|
|
199
|
+
# Backend implementations
|
|
200
|
+
# ------------------------------------------------------------------
|
|
201
|
+
|
|
202
|
+
def _embed_st(self, texts: List[str]) -> np.ndarray:
|
|
203
|
+
vecs = self._model.encode(
|
|
204
|
+
texts,
|
|
205
|
+
convert_to_numpy=True,
|
|
206
|
+
show_progress_bar=len(texts) > 50,
|
|
207
|
+
)
|
|
208
|
+
return vecs.astype(np.float32)
|
|
209
|
+
|
|
210
|
+
def _embed_openai(self, texts: List[str]) -> np.ndarray:
|
|
211
|
+
response = self._openai_client.embeddings.create(
|
|
212
|
+
input=texts,
|
|
213
|
+
model=self.model_name,
|
|
214
|
+
)
|
|
215
|
+
vecs = [item.embedding for item in response.data]
|
|
216
|
+
return np.array(vecs, dtype=np.float32)
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
# ------------------------------------------------------------------
|
|
220
|
+
# Utilities
|
|
221
|
+
# ------------------------------------------------------------------
|
|
222
|
+
|
|
223
|
+
def _cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    """Return the cosine similarity of two vectors (0.0 when either is all-zero)."""
    na, nb = np.linalg.norm(a), np.linalg.norm(b)
    return 0.0 if na == 0 or nb == 0 else float(np.dot(a, b) / (na * nb))
|
|
@@ -0,0 +1,206 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: sentence-embedder
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Lightweight Python library for sentence/semantic embeddings — SentenceTransformers & OpenAI in one unified API.
|
|
5
|
+
Author-email: Your Name <you@example.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/your-org/sentence-embedder
|
|
8
|
+
Project-URL: Repository, https://github.com/your-org/sentence-embedder
|
|
9
|
+
Project-URL: Bug Tracker, https://github.com/your-org/sentence-embedder/issues
|
|
10
|
+
Project-URL: Changelog, https://github.com/your-org/sentence-embedder/blob/main/CHANGELOG.md
|
|
11
|
+
Keywords: embeddings,nlp,sentence-transformers,semantic-search,openai,machine-learning
|
|
12
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: Intended Audience :: Science/Research
|
|
15
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
21
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
22
|
+
Requires-Python: >=3.10
|
|
23
|
+
Description-Content-Type: text/markdown
|
|
24
|
+
Requires-Dist: numpy>=1.24
|
|
25
|
+
Provides-Extra: st
|
|
26
|
+
Requires-Dist: sentence-transformers>=2.7; extra == "st"
|
|
27
|
+
Provides-Extra: openai
|
|
28
|
+
Requires-Dist: openai>=1.0; extra == "openai"
|
|
29
|
+
Provides-Extra: all
|
|
30
|
+
Requires-Dist: sentence-transformers>=2.7; extra == "all"
|
|
31
|
+
Requires-Dist: openai>=1.0; extra == "all"
|
|
32
|
+
Provides-Extra: dev
|
|
33
|
+
Requires-Dist: pytest>=7; extra == "dev"
|
|
34
|
+
Requires-Dist: pytest-cov; extra == "dev"
|
|
35
|
+
Requires-Dist: ruff; extra == "dev"
|
|
36
|
+
Requires-Dist: mypy; extra == "dev"
|
|
37
|
+
Requires-Dist: build; extra == "dev"
|
|
38
|
+
Requires-Dist: twine; extra == "dev"
|
|
39
|
+
|
|
40
|
+
# sentence-embedder
|
|
41
|
+
|
|
42
|
+
A lightweight **internal Python library** for generating sentence/semantic embeddings with a clean, unified API.
|
|
43
|
+
|
|
44
|
+
---
|
|
45
|
+
|
|
46
|
+
## Features
|
|
47
|
+
|
|
48
|
+
| Feature | Details |
|
|
49
|
+
|---|---|
|
|
50
|
+
| **Backends** | SentenceTransformers (local, offline) · OpenAI API |
|
|
51
|
+
| **Single & batch** | `embed()` and `embed_batch()` |
|
|
52
|
+
| **Similarity** | Cosine similarity between two sentences |
|
|
53
|
+
| **Semantic search** | `most_similar()` over an in-memory corpus |
|
|
54
|
+
| **Disk cache** | `EmbeddingCache` — skip re-embedding seen texts |
|
|
55
|
+
| **Typed** | Full type hints, compatible with `mypy` |
|
|
56
|
+
|
|
57
|
+
---
|
|
58
|
+
|
|
59
|
+
## Installation
|
|
60
|
+
|
|
61
|
+
```bash
|
|
62
|
+
# Clone / copy this package into your project, then:
|
|
63
|
+
|
|
64
|
+
# SentenceTransformers backend (local, recommended)
|
|
65
|
+
pip install -e ".[st]"
|
|
66
|
+
|
|
67
|
+
# OpenAI backend
|
|
68
|
+
pip install -e ".[openai]"
|
|
69
|
+
|
|
70
|
+
# Both
|
|
71
|
+
pip install -e ".[all]"
|
|
72
|
+
|
|
73
|
+
# With dev tools (pytest, ruff, mypy)
|
|
74
|
+
pip install -e ".[all,dev]"
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
---
|
|
78
|
+
|
|
79
|
+
## Quick Start
|
|
80
|
+
|
|
81
|
+
### 1 — Basic embedding
|
|
82
|
+
|
|
83
|
+
```python
|
|
84
|
+
from sentence_embedder import SentenceEmbedder
|
|
85
|
+
|
|
86
|
+
embedder = SentenceEmbedder() # defaults: SentenceTransformers, all-MiniLM-L6-v2
|
|
87
|
+
|
|
88
|
+
vec = embedder.embed("The cat sat on the mat.")
|
|
89
|
+
print(vec.shape) # (384,)
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
### 2 — Batch embedding
|
|
93
|
+
|
|
94
|
+
```python
|
|
95
|
+
texts = ["Hello world", "Machine learning is fun", "I love Python"]
|
|
96
|
+
vecs = embedder.embed_batch(texts)
|
|
97
|
+
print(vecs.shape) # (3, 384)
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
### 3 — Cosine similarity
|
|
101
|
+
|
|
102
|
+
```python
|
|
103
|
+
score = embedder.similarity("fast car", "quick automobile")
|
|
104
|
+
print(score) # ~0.85
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
### 4 — Semantic search
|
|
108
|
+
|
|
109
|
+
```python
|
|
110
|
+
corpus = [
|
|
111
|
+
"The stock market crashed today.",
|
|
112
|
+
"Scientists discover a new planet.",
|
|
113
|
+
"Football team wins the championship.",
|
|
114
|
+
"A new AI model beats human performance.",
|
|
115
|
+
]
|
|
116
|
+
|
|
117
|
+
results = embedder.most_similar("breakthrough in artificial intelligence", corpus, top_k=2)
|
|
118
|
+
for sentence, score in results:
|
|
119
|
+
print(f"{score:.3f} {sentence}")
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
### 5 — Disk cache (avoid re-embedding)
|
|
123
|
+
|
|
124
|
+
```python
|
|
125
|
+
from sentence_embedder import SentenceEmbedder, EmbeddingCache
|
|
126
|
+
|
|
127
|
+
base = SentenceEmbedder()
|
|
128
|
+
embedder = EmbeddingCache(base, cache_dir=".cache/embeddings")
|
|
129
|
+
|
|
130
|
+
vec = embedder.embed("Hello world") # computed → stored on disk
|
|
131
|
+
vec = embedder.embed("Hello world") # loaded from cache instantly
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
---
|
|
135
|
+
|
|
136
|
+
## OpenAI Backend
|
|
137
|
+
|
|
138
|
+
```python
|
|
139
|
+
from sentence_embedder import SentenceEmbedder
|
|
140
|
+
|
|
141
|
+
embedder = SentenceEmbedder(
|
|
142
|
+
backend="openai",
|
|
143
|
+
model_name="text-embedding-3-small",
|
|
144
|
+
openai_api_key="sk-...", # or set OPENAI_API_KEY env var
|
|
145
|
+
)
|
|
146
|
+
|
|
147
|
+
vec = embedder.embed("Hello from OpenAI!")
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
---
|
|
151
|
+
|
|
152
|
+
## API Reference
|
|
153
|
+
|
|
154
|
+
### `SentenceEmbedder`
|
|
155
|
+
|
|
156
|
+
| Method | Signature | Description |
|
|
157
|
+
|---|---|---|
|
|
158
|
+
| `embed` | `(text: str) → ndarray` | Embed one sentence → 1-D vector |
|
|
159
|
+
| `embed_batch` | `(texts: List[str]) → ndarray` | Embed many → 2-D array `(N, dim)` |
|
|
160
|
+
| `similarity` | `(a: str, b: str) → float` | Cosine similarity in `[-1, 1]` |
|
|
161
|
+
| `most_similar` | `(query, corpus, top_k=5) → List[tuple]` | Ranked `(sentence, score)` pairs |
|
|
162
|
+
|
|
163
|
+
### `EmbeddingCache`
|
|
164
|
+
|
|
165
|
+
| Method | Signature | Description |
|
|
166
|
+
|---|---|---|
|
|
167
|
+
| `embed` | `(text: str) → ndarray` | Cache-aware single embed |
|
|
168
|
+
| `embed_batch` | `(texts: List[str]) → ndarray` | Cache-aware batch embed |
|
|
169
|
+
| `clear` | `() → None` | Wipe the cache database |
|
|
170
|
+
|
|
171
|
+
---
|
|
172
|
+
|
|
173
|
+
## Running Tests
|
|
174
|
+
|
|
175
|
+
```bash
|
|
176
|
+
pytest
|
|
177
|
+
# With coverage:
|
|
178
|
+
pytest --cov=sentence_embedder --cov-report=term-missing
|
|
179
|
+
```
|
|
180
|
+
|
|
181
|
+
---
|
|
182
|
+
|
|
183
|
+
## Package Structure
|
|
184
|
+
|
|
185
|
+
```
|
|
186
|
+
sentence_embedder/
|
|
187
|
+
├── sentence_embedder/
|
|
188
|
+
│ ├── __init__.py # Public exports
|
|
189
|
+
│ ├── embedder.py # SentenceEmbedder (core)
|
|
190
|
+
│ └── cache.py # EmbeddingCache (disk cache)
|
|
191
|
+
├── tests/
|
|
192
|
+
│ └── test_embedder.py # Unit tests (no model/network needed)
|
|
193
|
+
├── pyproject.toml # Build config & dependencies
|
|
194
|
+
└── README.md
|
|
195
|
+
```
|
|
196
|
+
|
|
197
|
+
---
|
|
198
|
+
|
|
199
|
+
## Choosing a Model
|
|
200
|
+
|
|
201
|
+
| Model | Dim | Speed | Quality | Use case |
|
|
202
|
+
|---|---|---|---|---|
|
|
203
|
+
| `all-MiniLM-L6-v2` | 384 | ⚡⚡⚡ | ★★★ | Default, general purpose |
|
|
204
|
+
| `all-mpnet-base-v2` | 768 | ⚡⚡ | ★★★★ | Higher quality |
|
|
205
|
+
| `multi-qa-MiniLM-L6-cos-v1` | 384 | ⚡⚡⚡ | ★★★★ | Q&A / search |
|
|
206
|
+
| `text-embedding-3-small` *(OpenAI)* | 1536 | API | ★★★★★ | Best quality, needs key |
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
README.md
|
|
2
|
+
pyproject.toml
|
|
3
|
+
sentence_embedder/__init__.py
|
|
4
|
+
sentence_embedder/cache.py
|
|
5
|
+
sentence_embedder/embedder.py
|
|
6
|
+
sentence_embedder.egg-info/PKG-INFO
|
|
7
|
+
sentence_embedder.egg-info/SOURCES.txt
|
|
8
|
+
sentence_embedder.egg-info/dependency_links.txt
|
|
9
|
+
sentence_embedder.egg-info/requires.txt
|
|
10
|
+
sentence_embedder.egg-info/top_level.txt
|
|
11
|
+
tests/test_embedder.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
sentence_embedder
|
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
"""
|
|
2
|
+
tests/test_embedder.py
|
|
3
|
+
----------------------
|
|
4
|
+
Unit tests for SentenceEmbedder and EmbeddingCache.
|
|
5
|
+
Uses mocks so no model or network is required.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import numpy as np
|
|
9
|
+
import pytest
|
|
10
|
+
from unittest.mock import MagicMock, patch
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
# ---------------------------------------------------------------------------
|
|
14
|
+
# Helpers
|
|
15
|
+
# ---------------------------------------------------------------------------
|
|
16
|
+
|
|
17
|
+
def make_mock_st_model(dim: int = 8):
    """Build a stand-in for a SentenceTransformer model.

    Its ``encode`` ignores all keyword options and returns a random
    float32 matrix of shape ``(len(texts), dim)``.
    """
    def fake_encode(texts, **_kwargs):
        return np.random.rand(len(texts), dim).astype(np.float32)

    mock = MagicMock()
    mock.encode = fake_encode
    return mock
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
# ---------------------------------------------------------------------------
|
|
25
|
+
# SentenceEmbedder — SentenceTransformers backend
|
|
26
|
+
# ---------------------------------------------------------------------------
|
|
27
|
+
|
|
28
|
+
class TestSentenceEmbedderST:
    """Tests using the (mocked) SentenceTransformers backend."""

    @pytest.fixture()
    def embedder(self):
        # Patch out model loading so no download or network access occurs.
        target = (
            "sentence_embedder.embedder.SentenceEmbedder._init_sentence_transformers"
        )
        with patch(target):
            from sentence_embedder import SentenceEmbedder

            instance = SentenceEmbedder(backend="sentencetransformers")
            instance._model = make_mock_st_model(dim=8)
            return instance

    def test_embed_returns_1d_array(self, embedder):
        vector = embedder.embed("Hello world")
        assert vector.ndim == 1
        assert vector.shape == (8,)

    def test_embed_batch_returns_2d_array(self, embedder):
        matrix = embedder.embed_batch(["Hello", "World", "Test"])
        assert matrix.ndim == 2
        assert matrix.shape == (3, 8)

    def test_embed_empty_string_raises(self, embedder):
        with pytest.raises(ValueError, match="empty"):
            embedder.embed("")

    def test_embed_batch_empty_list_raises(self, embedder):
        with pytest.raises(ValueError, match="empty"):
            embedder.embed_batch([])

    def test_embed_batch_blank_string_raises(self, embedder):
        with pytest.raises(ValueError, match="empty"):
            embedder.embed_batch(["hello", " "])

    def test_similarity_returns_float_in_range(self, embedder):
        value = embedder.similarity("cat", "dog")
        assert isinstance(value, float)
        assert -1.0 <= value <= 1.0

    def test_most_similar_returns_ranked_list(self, embedder):
        docs = ["apple", "banana", "cherry", "date"]
        ranked = embedder.most_similar("fruit", docs, top_k=2)
        assert len(ranked) == 2
        for sentence, score in ranked:
            assert isinstance(sentence, str)
            assert isinstance(score, float)

    def test_most_similar_top_k_capped(self, embedder):
        # top_k larger than the corpus must be clamped to the corpus size.
        ranked = embedder.most_similar("query", ["a", "b"], top_k=10)
        assert len(ranked) == 2

    def test_unsupported_backend_raises(self):
        from sentence_embedder import SentenceEmbedder

        with pytest.raises(ValueError, match="Unsupported backend"):
            SentenceEmbedder(backend="fakegpt")
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
# ---------------------------------------------------------------------------
|
|
87
|
+
# EmbeddingCache
|
|
88
|
+
# ---------------------------------------------------------------------------
|
|
89
|
+
|
|
90
|
+
class TestEmbeddingCache:
    """Tests for the disk cache wrapper.

    The fixture yields ``(base, cached)`` where ``base._call_count``
    records how many texts the underlying embedder was asked to encode,
    letting each test distinguish cache hits from recomputation.
    """

    @pytest.fixture()
    def embedder(self, tmp_path):
        """A mock embedder that counts calls."""
        with patch(
            "sentence_embedder.embedder.SentenceEmbedder._init_sentence_transformers"
        ):
            from sentence_embedder import SentenceEmbedder, EmbeddingCache

            emb = SentenceEmbedder(backend="sentencetransformers")
            emb._model = make_mock_st_model(dim=8)

            # NOTE(review): the cache is built BEFORE emb.embed_batch is
            # replaced below — this only counts correctly if EmbeddingCache
            # looks up emb.embed_batch at call time rather than binding it
            # at construction; confirm against cache.py.
            cached = EmbeddingCache(emb, cache_dir=str(tmp_path / "cache"))
            # Track how many times the underlying model is called
            original_embed_batch = emb.embed_batch
            emb._call_count = 0

            def counting_embed_batch(texts):
                # Counts texts (not calls), so a batch of 3 adds 3.
                emb._call_count += len(texts)
                return original_embed_batch(texts)

            emb.embed_batch = counting_embed_batch
            return emb, cached

    def test_cache_miss_then_hit(self, embedder):
        # First embed is a miss (one model call); second must be a hit.
        base, cached = embedder
        _ = cached.embed("The sky is blue")
        assert base._call_count == 1
        _ = cached.embed("The sky is blue")
        assert base._call_count == 1  # no second call

    def test_batch_partial_cache(self, embedder):
        # With two of three texts pre-cached, only the third is recomputed.
        base, cached = embedder
        texts = ["sentence one", "sentence two", "sentence three"]
        _ = cached.embed_batch(texts[:2])  # cache first two
        base._call_count = 0  # reset counter
        _ = cached.embed_batch(texts)  # only "sentence three" should be new
        assert base._call_count == 1

    def test_clear_cache(self, embedder, tmp_path):
        # clear() must drop stored vectors so the next embed recomputes.
        base, cached = embedder
        _ = cached.embed("some text")
        cached.clear()
        base._call_count = 0
        _ = cached.embed("some text")  # must recompute after clear
        assert base._call_count == 1
|