sentence-embedder 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,206 @@
1
+ Metadata-Version: 2.4
2
+ Name: sentence-embedder
3
+ Version: 1.0.0
4
+ Summary: Lightweight Python library for sentence/semantic embeddings — SentenceTransformers & OpenAI in one unified API.
5
+ Author-email: Your Name <you@example.com>
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/your-org/sentence-embedder
8
+ Project-URL: Repository, https://github.com/your-org/sentence-embedder
9
+ Project-URL: Bug Tracker, https://github.com/your-org/sentence-embedder/issues
10
+ Project-URL: Changelog, https://github.com/your-org/sentence-embedder/blob/main/CHANGELOG.md
11
+ Keywords: embeddings,nlp,sentence-transformers,semantic-search,openai,machine-learning
12
+ Classifier: Development Status :: 5 - Production/Stable
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: Intended Audience :: Science/Research
15
+ Classifier: License :: OSI Approved :: MIT License
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.10
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
21
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
22
+ Requires-Python: >=3.10
23
+ Description-Content-Type: text/markdown
24
+ Requires-Dist: numpy>=1.24
25
+ Provides-Extra: st
26
+ Requires-Dist: sentence-transformers>=2.7; extra == "st"
27
+ Provides-Extra: openai
28
+ Requires-Dist: openai>=1.0; extra == "openai"
29
+ Provides-Extra: all
30
+ Requires-Dist: sentence-transformers>=2.7; extra == "all"
31
+ Requires-Dist: openai>=1.0; extra == "all"
32
+ Provides-Extra: dev
33
+ Requires-Dist: pytest>=7; extra == "dev"
34
+ Requires-Dist: pytest-cov; extra == "dev"
35
+ Requires-Dist: ruff; extra == "dev"
36
+ Requires-Dist: mypy; extra == "dev"
37
+ Requires-Dist: build; extra == "dev"
38
+ Requires-Dist: twine; extra == "dev"
39
+
40
+ # sentence-embedder
41
+
42
+ A lightweight **internal Python library** for generating sentence/semantic embeddings with a clean, unified API.
43
+
44
+ ---
45
+
46
+ ## Features
47
+
48
+ | Feature | Details |
49
+ |---|---|
50
+ | **Backends** | SentenceTransformers (local, offline) · OpenAI API |
51
+ | **Single & batch** | `embed()` and `embed_batch()` |
52
+ | **Similarity** | Cosine similarity between two sentences |
53
+ | **Semantic search** | `most_similar()` over an in-memory corpus |
54
+ | **Disk cache** | `EmbeddingCache` — skip re-embedding seen texts |
55
+ | **Typed** | Full type hints, compatible with `mypy` |
56
+
57
+ ---
58
+
59
+ ## Installation
60
+
61
+ ```bash
62
+ # Clone / copy this package into your project, then:
63
+
64
+ # SentenceTransformers backend (local, recommended)
65
+ pip install -e ".[st]"
66
+
67
+ # OpenAI backend
68
+ pip install -e ".[openai]"
69
+
70
+ # Both
71
+ pip install -e ".[all]"
72
+
73
+ # With dev tools (pytest, ruff, mypy)
74
+ pip install -e ".[all,dev]"
75
+ ```
76
+
77
+ ---
78
+
79
+ ## Quick Start
80
+
81
+ ### 1 — Basic embedding
82
+
83
+ ```python
84
+ from sentence_embedder import SentenceEmbedder
85
+
86
+ embedder = SentenceEmbedder() # defaults: SentenceTransformers, all-MiniLM-L6-v2
87
+
88
+ vec = embedder.embed("The cat sat on the mat.")
89
+ print(vec.shape) # (384,)
90
+ ```
91
+
92
+ ### 2 — Batch embedding
93
+
94
+ ```python
95
+ texts = ["Hello world", "Machine learning is fun", "I love Python"]
96
+ vecs = embedder.embed_batch(texts)
97
+ print(vecs.shape) # (3, 384)
98
+ ```
99
+
100
+ ### 3 — Cosine similarity
101
+
102
+ ```python
103
+ score = embedder.similarity("fast car", "quick automobile")
104
+ print(score) # ~0.85
105
+ ```
106
+
107
+ ### 4 — Semantic search
108
+
109
+ ```python
110
+ corpus = [
111
+ "The stock market crashed today.",
112
+ "Scientists discover a new planet.",
113
+ "Football team wins the championship.",
114
+ "A new AI model beats human performance.",
115
+ ]
116
+
117
+ results = embedder.most_similar("breakthrough in artificial intelligence", corpus, top_k=2)
118
+ for sentence, score in results:
119
+ print(f"{score:.3f} {sentence}")
120
+ ```
121
+
122
+ ### 5 — Disk cache (avoid re-embedding)
123
+
124
+ ```python
125
+ from sentence_embedder import SentenceEmbedder, EmbeddingCache
126
+
127
+ base = SentenceEmbedder()
128
+ embedder = EmbeddingCache(base, cache_dir=".cache/embeddings")
129
+
130
+ vec = embedder.embed("Hello world") # computed → stored on disk
131
+ vec = embedder.embed("Hello world") # loaded from cache instantly
132
+ ```
133
+
134
+ ---
135
+
136
+ ## OpenAI Backend
137
+
138
+ ```python
139
+ from sentence_embedder import SentenceEmbedder
140
+
141
+ embedder = SentenceEmbedder(
142
+ backend="openai",
143
+ model_name="text-embedding-3-small",
144
+ openai_api_key="sk-...", # or set OPENAI_API_KEY env var
145
+ )
146
+
147
+ vec = embedder.embed("Hello from OpenAI!")
148
+ ```
149
+
150
+ ---
151
+
152
+ ## API Reference
153
+
154
+ ### `SentenceEmbedder`
155
+
156
+ | Method | Signature | Description |
157
+ |---|---|---|
158
+ | `embed` | `(text: str) → ndarray` | Embed one sentence → 1-D vector |
159
+ | `embed_batch` | `(texts: List[str]) → ndarray` | Embed many → 2-D array `(N, dim)` |
160
+ | `similarity` | `(a: str, b: str) → float` | Cosine similarity in `[-1, 1]` |
161
+ | `most_similar` | `(query, corpus, top_k=5) → List[tuple]` | Ranked `(sentence, score)` pairs |
162
+
163
+ ### `EmbeddingCache`
164
+
165
+ | Method | Signature | Description |
166
+ |---|---|---|
167
+ | `embed` | `(text: str) → ndarray` | Cache-aware single embed |
168
+ | `embed_batch` | `(texts: List[str]) → ndarray` | Cache-aware batch embed |
169
+ | `clear` | `() → None` | Wipe the cache database |
170
+
171
+ ---
172
+
173
+ ## Running Tests
174
+
175
+ ```bash
176
+ pytest
177
+ # With coverage:
178
+ pytest --cov=sentence_embedder --cov-report=term-missing
179
+ ```
180
+
181
+ ---
182
+
183
+ ## Package Structure
184
+
185
+ ```
186
+ sentence_embedder/
187
+ ├── sentence_embedder/
188
+ │ ├── __init__.py # Public exports
189
+ │ ├── embedder.py # SentenceEmbedder (core)
190
+ │ └── cache.py # EmbeddingCache (disk cache)
191
+ ├── tests/
192
+ │ └── test_embedder.py # Unit tests (no model/network needed)
193
+ ├── pyproject.toml # Build config & dependencies
194
+ └── README.md
195
+ ```
196
+
197
+ ---
198
+
199
+ ## Choosing a Model
200
+
201
+ | Model | Dim | Speed | Quality | Use case |
202
+ |---|---|---|---|---|
203
+ | `all-MiniLM-L6-v2` | 384 | ⚡⚡⚡ | ★★★ | Default, general purpose |
204
+ | `all-mpnet-base-v2` | 768 | ⚡⚡ | ★★★★ | Higher quality |
205
+ | `multi-qa-MiniLM-L6-cos-v1` | 384 | ⚡⚡⚡ | ★★★★ | Q&A / search |
206
+ | `text-embedding-3-small` *(OpenAI)* | 1536 | API | ★★★★★ | Best quality, needs key |
@@ -0,0 +1,167 @@
1
+ # sentence-embedder
2
+
3
+ A lightweight **internal Python library** for generating sentence/semantic embeddings with a clean, unified API.
4
+
5
+ ---
6
+
7
+ ## Features
8
+
9
+ | Feature | Details |
10
+ |---|---|
11
+ | **Backends** | SentenceTransformers (local, offline) · OpenAI API |
12
+ | **Single & batch** | `embed()` and `embed_batch()` |
13
+ | **Similarity** | Cosine similarity between two sentences |
14
+ | **Semantic search** | `most_similar()` over an in-memory corpus |
15
+ | **Disk cache** | `EmbeddingCache` — skip re-embedding seen texts |
16
+ | **Typed** | Full type hints, compatible with `mypy` |
17
+
18
+ ---
19
+
20
+ ## Installation
21
+
22
+ ```bash
23
+ # Clone / copy this package into your project, then:
24
+
25
+ # SentenceTransformers backend (local, recommended)
26
+ pip install -e ".[st]"
27
+
28
+ # OpenAI backend
29
+ pip install -e ".[openai]"
30
+
31
+ # Both
32
+ pip install -e ".[all]"
33
+
34
+ # With dev tools (pytest, ruff, mypy)
35
+ pip install -e ".[all,dev]"
36
+ ```
37
+
38
+ ---
39
+
40
+ ## Quick Start
41
+
42
+ ### 1 — Basic embedding
43
+
44
+ ```python
45
+ from sentence_embedder import SentenceEmbedder
46
+
47
+ embedder = SentenceEmbedder() # defaults: SentenceTransformers, all-MiniLM-L6-v2
48
+
49
+ vec = embedder.embed("The cat sat on the mat.")
50
+ print(vec.shape) # (384,)
51
+ ```
52
+
53
+ ### 2 — Batch embedding
54
+
55
+ ```python
56
+ texts = ["Hello world", "Machine learning is fun", "I love Python"]
57
+ vecs = embedder.embed_batch(texts)
58
+ print(vecs.shape) # (3, 384)
59
+ ```
60
+
61
+ ### 3 — Cosine similarity
62
+
63
+ ```python
64
+ score = embedder.similarity("fast car", "quick automobile")
65
+ print(score) # ~0.85
66
+ ```
67
+
68
+ ### 4 — Semantic search
69
+
70
+ ```python
71
+ corpus = [
72
+ "The stock market crashed today.",
73
+ "Scientists discover a new planet.",
74
+ "Football team wins the championship.",
75
+ "A new AI model beats human performance.",
76
+ ]
77
+
78
+ results = embedder.most_similar("breakthrough in artificial intelligence", corpus, top_k=2)
79
+ for sentence, score in results:
80
+ print(f"{score:.3f} {sentence}")
81
+ ```
82
+
83
+ ### 5 — Disk cache (avoid re-embedding)
84
+
85
+ ```python
86
+ from sentence_embedder import SentenceEmbedder, EmbeddingCache
87
+
88
+ base = SentenceEmbedder()
89
+ embedder = EmbeddingCache(base, cache_dir=".cache/embeddings")
90
+
91
+ vec = embedder.embed("Hello world") # computed → stored on disk
92
+ vec = embedder.embed("Hello world") # loaded from cache instantly
93
+ ```
94
+
95
+ ---
96
+
97
+ ## OpenAI Backend
98
+
99
+ ```python
100
+ from sentence_embedder import SentenceEmbedder
101
+
102
+ embedder = SentenceEmbedder(
103
+ backend="openai",
104
+ model_name="text-embedding-3-small",
105
+ openai_api_key="sk-...", # or set OPENAI_API_KEY env var
106
+ )
107
+
108
+ vec = embedder.embed("Hello from OpenAI!")
109
+ ```
110
+
111
+ ---
112
+
113
+ ## API Reference
114
+
115
+ ### `SentenceEmbedder`
116
+
117
+ | Method | Signature | Description |
118
+ |---|---|---|
119
+ | `embed` | `(text: str) → ndarray` | Embed one sentence → 1-D vector |
120
+ | `embed_batch` | `(texts: List[str]) → ndarray` | Embed many → 2-D array `(N, dim)` |
121
+ | `similarity` | `(a: str, b: str) → float` | Cosine similarity in `[-1, 1]` |
122
+ | `most_similar` | `(query, corpus, top_k=5) → List[tuple]` | Ranked `(sentence, score)` pairs |
123
+
124
+ ### `EmbeddingCache`
125
+
126
+ | Method | Signature | Description |
127
+ |---|---|---|
128
+ | `embed` | `(text: str) → ndarray` | Cache-aware single embed |
129
+ | `embed_batch` | `(texts: List[str]) → ndarray` | Cache-aware batch embed |
130
+ | `clear` | `() → None` | Wipe the cache database |
131
+
132
+ ---
133
+
134
+ ## Running Tests
135
+
136
+ ```bash
137
+ pytest
138
+ # With coverage:
139
+ pytest --cov=sentence_embedder --cov-report=term-missing
140
+ ```
141
+
142
+ ---
143
+
144
+ ## Package Structure
145
+
146
+ ```
147
+ sentence_embedder/
148
+ ├── sentence_embedder/
149
+ │ ├── __init__.py # Public exports
150
+ │ ├── embedder.py # SentenceEmbedder (core)
151
+ │ └── cache.py # EmbeddingCache (disk cache)
152
+ ├── tests/
153
+ │ └── test_embedder.py # Unit tests (no model/network needed)
154
+ ├── pyproject.toml # Build config & dependencies
155
+ └── README.md
156
+ ```
157
+
158
+ ---
159
+
160
+ ## Choosing a Model
161
+
162
+ | Model | Dim | Speed | Quality | Use case |
163
+ |---|---|---|---|---|
164
+ | `all-MiniLM-L6-v2` | 384 | ⚡⚡⚡ | ★★★ | Default, general purpose |
165
+ | `all-mpnet-base-v2` | 768 | ⚡⚡ | ★★★★ | Higher quality |
166
+ | `multi-qa-MiniLM-L6-cos-v1` | 384 | ⚡⚡⚡ | ★★★★ | Q&A / search |
167
+ | `text-embedding-3-small` *(OpenAI)* | 1536 | API | ★★★★★ | Best quality, needs key |
@@ -0,0 +1,63 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "sentence-embedder"
7
+ version = "1.0.0"
8
+ description = "Lightweight Python library for sentence/semantic embeddings — SentenceTransformers & OpenAI in one unified API."
9
+ readme = "README.md"
10
+ requires-python = ">=3.10"
11
+ license = { text = "MIT" }
12
+ authors = [
13
+ { name = "Your Name", email = "you@example.com" }
14
+ ]
15
+ keywords = [
16
+ "embeddings", "nlp", "sentence-transformers",
17
+ "semantic-search", "openai", "machine-learning"
18
+ ]
19
+ classifiers = [
20
+ "Development Status :: 5 - Production/Stable",
21
+ "Intended Audience :: Developers",
22
+ "Intended Audience :: Science/Research",
23
+ "License :: OSI Approved :: MIT License",
24
+ "Programming Language :: Python :: 3",
25
+ "Programming Language :: Python :: 3.10",
26
+ "Programming Language :: Python :: 3.11",
27
+ "Programming Language :: Python :: 3.12",
28
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
29
+ "Topic :: Software Development :: Libraries :: Python Modules",
30
+ ]
31
+
32
+ dependencies = [
33
+ "numpy>=1.24",
34
+ ]
35
+
36
+ [project.urls]
37
+ Homepage = "https://github.com/your-org/sentence-embedder"
38
+ Repository = "https://github.com/your-org/sentence-embedder"
39
+ "Bug Tracker" = "https://github.com/your-org/sentence-embedder/issues"
40
+ Changelog = "https://github.com/your-org/sentence-embedder/blob/main/CHANGELOG.md"
41
+
42
+ [project.optional-dependencies]
43
+ st = ["sentence-transformers>=2.7"]
44
+ openai = ["openai>=1.0"]
45
+ all = ["sentence-transformers>=2.7", "openai>=1.0"]
46
+ dev = ["pytest>=7", "pytest-cov", "ruff", "mypy", "build", "twine"]
47
+
48
+ [tool.setuptools.packages.find]
49
+ where = ["."]
50
+ include = ["sentence_embedder*"]
51
+
52
+ [tool.pytest.ini_options]
53
+ testpaths = ["tests"]
54
+ addopts = "-v --tb=short"
55
+
56
+ [tool.ruff]
57
+ line-length = 88
58
+ target-version = "py310"
59
+
60
+ [tool.mypy]
61
+ python_version = "3.10"
62
+ strict = false
63
+ ignore_missing_imports = true
@@ -0,0 +1,23 @@
1
+ """
2
+ sentence_embedder
3
+ =================
4
+ A lightweight internal library for generating sentence/semantic embeddings.
5
+
6
+ Quick start
7
+ -----------
8
+ >>> from sentence_embedder import SentenceEmbedder
9
+ >>> embedder = SentenceEmbedder() # uses all-MiniLM-L6-v2
10
+ >>> vec = embedder.embed("The cat sat on the mat.")
11
+ >>> vec.shape
12
+ (384,)
13
+
14
+ >>> results = embedder.most_similar("fast car", ["racing vehicle", "slow snail", "quick automobile"])
15
+ >>> results[0]
16
+ ('quick automobile', 0.87...)
17
+ """
18
+
19
+ from sentence_embedder.embedder import SentenceEmbedder
20
+ from sentence_embedder.cache import EmbeddingCache
21
+
22
+ __all__ = ["SentenceEmbedder", "EmbeddingCache"]
23
+ __version__ = "1.0.0"
@@ -0,0 +1,127 @@
1
+ """
2
+ cache.py
3
+ --------
4
+ Optional disk-based embedding cache to avoid re-computing embeddings
5
+ for texts that have already been processed.
6
+
7
+ Uses a simple ``shelve`` database keyed by ``(model_name, text)``.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import hashlib
13
+ import logging
14
+ import shelve
15
+ from pathlib import Path
16
+ from typing import List, Optional
17
+
18
+ import numpy as np
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+
23
class EmbeddingCache:
    """
    Transparent disk cache for embedding vectors.

    Wrap any :class:`~sentence_embedder.SentenceEmbedder` with this class
    to skip re-embedding texts that were already processed. Entries are
    keyed by a SHA-256 hash of ``model_name::text``, so different models
    never share cache entries.

    Args:
        embedder: A :class:`~sentence_embedder.SentenceEmbedder` instance
            (any object exposing ``embed``, ``embed_batch`` and
            ``model_name`` works).
        cache_dir (str | Path): Directory where the cache database is stored.
            Created automatically if it does not exist.

    Example:
        >>> from sentence_embedder import SentenceEmbedder
        >>> from sentence_embedder.cache import EmbeddingCache
        >>> base = SentenceEmbedder()
        >>> cached = EmbeddingCache(base, cache_dir=".cache/embeddings")
        >>> vec = cached.embed("Hello world")  # computed and stored
        >>> vec = cached.embed("Hello world")  # retrieved from cache
    """

    # Logger kept on the class so the cache works even when used standalone.
    _log = logging.getLogger(__name__)

    def __init__(self, embedder, cache_dir: str | Path = ".cache/embeddings") -> None:
        self._embedder = embedder
        cache_path = Path(cache_dir)
        cache_path.mkdir(parents=True, exist_ok=True)
        # shelve appends its own backend-specific suffixes (.db/.dat/.dir/…),
        # so we store the extension-less base path.
        self._db_path = str(cache_path / "embeddings")
        self._log.info("EmbeddingCache initialised at '%s'.", self._db_path)

    # ------------------------------------------------------------------
    # Public API (mirrors SentenceEmbedder)
    # ------------------------------------------------------------------

    def embed(self, text: str) -> np.ndarray:
        """
        Embed a single text, using the cache when available.

        Args:
            text (str): Input text.

        Returns:
            np.ndarray: Embedding vector.
        """
        key = self._make_key(text)
        # Single shelve session for both the lookup and the write-back
        # (the original opened the DB twice per cache miss).
        with shelve.open(self._db_path) as db:
            if key in db:
                self._log.debug("Cache hit for text hash '%s'.", key)
                return db[key]
            vec = self._embedder.embed(text)
            db[key] = vec
        return vec

    def embed_batch(self, texts: List[str]) -> np.ndarray:
        """
        Embed a list of texts, only calling the model for cache misses.

        Args:
            texts (List[str]): Input texts. Assumed non-empty — the wrapped
                embedder's own validation applies on a full miss, but an
                all-hit empty list would fail at ``np.stack``.

        Returns:
            np.ndarray: 2-D array of shape ``(len(texts), dim)``.
        """
        keys = [self._make_key(t) for t in texts]
        results: List[Optional[np.ndarray]] = [None] * len(texts)
        missing_indices: List[int] = []

        with shelve.open(self._db_path) as db:
            for i, key in enumerate(keys):
                if key in db:
                    results[i] = db[key]
                else:
                    missing_indices.append(i)

            if missing_indices:
                missing_texts = [texts[i] for i in missing_indices]
                new_vecs = self._embedder.embed_batch(missing_texts)
                for idx, vec in zip(missing_indices, new_vecs):
                    db[keys[idx]] = vec
                    results[idx] = vec

        self._log.debug(
            "embed_batch: %d cached, %d computed.",
            len(texts) - len(missing_indices),
            len(missing_indices),
        )
        return np.stack(results)

    def clear(self) -> None:
        """Delete all cached embeddings (removes every shelve backend file)."""
        # shelve's backend (dbm.dumb/ndbm/gdbm) decides which suffixes exist.
        for suffix in ("", ".db", ".dir", ".bak", ".dat"):
            path = Path(self._db_path + suffix)
            if path.exists():
                path.unlink()
        self._log.info("Embedding cache cleared.")

    # ------------------------------------------------------------------
    # Helpers
    # ------------------------------------------------------------------

    def _make_key(self, text: str) -> str:
        """Stable cache key: SHA-256 of ``model_name::text``."""
        payload = f"{self._embedder.model_name}::{text}"
        return hashlib.sha256(payload.encode()).hexdigest()
@@ -0,0 +1,229 @@
1
+ """
2
+ embedder.py
3
+ -----------
4
+ Core module for generating sentence/semantic embeddings.
5
+ Supports multiple backends: SentenceTransformers and OpenAI.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import logging
11
+ from typing import List
12
+
13
+ import numpy as np
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
class SentenceEmbedder:
    """
    A unified interface for generating sentence/semantic embeddings.

    Supports the following backends:
    - ``"sentencetransformers"`` — Local HuggingFace models (default, offline-friendly)
    - ``"openai"`` — OpenAI text-embedding-* API models

    Args:
        backend (str): Which embedding backend to use. One of
            ``"sentencetransformers"`` or ``"openai"``. Defaults to
            ``"sentencetransformers"``.
        model_name (str): Model identifier.
            - For SentenceTransformers: any model on HuggingFace Hub,
              e.g. ``"all-MiniLM-L6-v2"`` (default).
            - For OpenAI: e.g. ``"text-embedding-3-small"``.
        openai_api_key (str | None): API key for OpenAI backend. If ``None``,
            falls back to the ``OPENAI_API_KEY`` environment variable.
        device (str): Torch device for SentenceTransformers, e.g. ``"cpu"``
            or ``"cuda"``. Ignored for OpenAI backend.

    Example:
        >>> embedder = SentenceEmbedder()
        >>> vec = embedder.embed("Hello world")
        >>> vec.shape
        (384,)
    """

    SUPPORTED_BACKENDS = {"sentencetransformers", "openai"}

    def __init__(
        self,
        backend: str = "sentencetransformers",
        model_name: str = "all-MiniLM-L6-v2",
        openai_api_key: str | None = None,
        device: str = "cpu",
    ) -> None:
        backend = backend.lower()
        if backend not in self.SUPPORTED_BACKENDS:
            raise ValueError(
                f"Unsupported backend '{backend}'. "
                f"Choose from: {self.SUPPORTED_BACKENDS}"
            )

        self.backend = backend
        self.model_name = model_name
        self.device = device
        self._model = None
        self._openai_client = None

        if backend == "openai":
            self._init_openai(openai_api_key)
        else:
            self._init_sentence_transformers()

    # ------------------------------------------------------------------
    # Initialisation helpers
    # ------------------------------------------------------------------

    def _init_sentence_transformers(self) -> None:
        """Lazily import and load the local SentenceTransformer model."""
        try:
            from sentence_transformers import SentenceTransformer
        except ImportError as exc:
            raise ImportError(
                "sentence-transformers is not installed. "
                "Run: pip install sentence-transformers"
            ) from exc

        logger.info("Loading SentenceTransformer model '%s'…", self.model_name)
        self._model = SentenceTransformer(self.model_name, device=self.device)
        logger.info("Model loaded.")

    def _init_openai(self, api_key: str | None) -> None:
        """Create the OpenAI client, resolving the key from arg or env."""
        try:
            from openai import OpenAI
        except ImportError as exc:
            raise ImportError(
                "openai is not installed. Run: pip install openai"
            ) from exc

        import os

        key = api_key or os.getenv("OPENAI_API_KEY")
        if not key:
            raise EnvironmentError(
                "OpenAI API key not provided. Pass openai_api_key= or set "
                "the OPENAI_API_KEY environment variable."
            )
        self._openai_client = OpenAI(api_key=key)
        logger.info("OpenAI client initialised with model '%s'.", self.model_name)

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def embed(self, text: str) -> np.ndarray:
        """
        Embed a single sentence/document.

        Args:
            text (str): Input text to embed.

        Returns:
            np.ndarray: 1-D float32 embedding vector.

        Raises:
            ValueError: If ``text`` is empty.
        """
        if not text or not text.strip():
            raise ValueError("Input text must not be empty.")
        return self.embed_batch([text])[0]

    def embed_batch(self, texts: List[str]) -> np.ndarray:
        """
        Embed a list of sentences/documents.

        Args:
            texts (List[str]): List of input strings. Empty strings are
                rejected.

        Returns:
            np.ndarray: 2-D float32 array of shape ``(len(texts), dim)``.

        Raises:
            ValueError: If ``texts`` is empty or contains blank strings.
        """
        if not texts:
            raise ValueError("texts list must not be empty.")
        if any(not t or not t.strip() for t in texts):
            raise ValueError("texts list must not contain empty strings.")

        if self.backend == "sentencetransformers":
            return self._embed_st(texts)
        return self._embed_openai(texts)

    def similarity(self, a: str, b: str) -> float:
        """
        Compute cosine similarity between two sentences.

        Args:
            a (str): First sentence.
            b (str): Second sentence.

        Returns:
            float: Cosine similarity in [-1, 1].
        """
        vecs = self.embed_batch([a, b])
        return float(_cosine_similarity(vecs[0], vecs[1]))

    def most_similar(
        self,
        query: str,
        corpus: List[str],
        top_k: int = 5,
    ) -> List[tuple[str, float]]:
        """
        Return the ``top_k`` most semantically similar sentences from a corpus.

        Args:
            query (str): The query sentence.
            corpus (List[str]): Pool of candidate sentences.
            top_k (int): Number of results to return. Clamped to
                ``len(corpus)``.

        Returns:
            List[tuple[str, float]]: Ranked list of ``(sentence, score)``
            pairs, highest similarity first.

        Raises:
            ValueError: If ``corpus`` is empty, or the query/corpus contain
                blank strings.
        """
        if not corpus:
            raise ValueError("corpus must not be empty.")
        top_k = min(top_k, len(corpus))

        # Embed query + corpus in ONE backend call: halves the number of
        # model invocations / API round-trips versus embedding separately.
        all_vecs = self.embed_batch([query] + corpus)
        query_vec, corpus_vecs = all_vecs[0], all_vecs[1:]

        scores = [
            float(_cosine_similarity(query_vec, cv)) for cv in corpus_vecs
        ]
        ranked = sorted(zip(corpus, scores), key=lambda x: x[1], reverse=True)
        return ranked[:top_k]

    # ------------------------------------------------------------------
    # Backend implementations
    # ------------------------------------------------------------------

    def _embed_st(self, texts: List[str]) -> np.ndarray:
        """Encode with the local SentenceTransformer model."""
        vecs = self._model.encode(
            texts,
            convert_to_numpy=True,
            # Only show a progress bar for non-trivial batches.
            show_progress_bar=len(texts) > 50,
        )
        return vecs.astype(np.float32)

    def _embed_openai(self, texts: List[str]) -> np.ndarray:
        """Encode via the OpenAI embeddings endpoint."""
        response = self._openai_client.embeddings.create(
            input=texts,
            model=self.model_name,
        )
        # NOTE(review): assumes response.data preserves input order — this
        # matches OpenAI SDK behavior (items carry an `index` field).
        vecs = [item.embedding for item in response.data]
        return np.array(vecs, dtype=np.float32)
217
+
218
+
219
+ # ------------------------------------------------------------------
220
+ # Utilities
221
+ # ------------------------------------------------------------------
222
+
223
def _cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    """Cosine similarity of two vectors; 0.0 when either vector is all-zero."""
    denom = float(np.linalg.norm(a)) * float(np.linalg.norm(b))
    if denom == 0.0:
        # A zero vector has no direction — define similarity as 0.
        return 0.0
    return float(np.dot(a, b) / denom)
@@ -0,0 +1,206 @@
1
+ Metadata-Version: 2.4
2
+ Name: sentence-embedder
3
+ Version: 1.0.0
4
+ Summary: Lightweight Python library for sentence/semantic embeddings — SentenceTransformers & OpenAI in one unified API.
5
+ Author-email: Your Name <you@example.com>
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/your-org/sentence-embedder
8
+ Project-URL: Repository, https://github.com/your-org/sentence-embedder
9
+ Project-URL: Bug Tracker, https://github.com/your-org/sentence-embedder/issues
10
+ Project-URL: Changelog, https://github.com/your-org/sentence-embedder/blob/main/CHANGELOG.md
11
+ Keywords: embeddings,nlp,sentence-transformers,semantic-search,openai,machine-learning
12
+ Classifier: Development Status :: 5 - Production/Stable
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: Intended Audience :: Science/Research
15
+ Classifier: License :: OSI Approved :: MIT License
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.10
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
21
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
22
+ Requires-Python: >=3.10
23
+ Description-Content-Type: text/markdown
24
+ Requires-Dist: numpy>=1.24
25
+ Provides-Extra: st
26
+ Requires-Dist: sentence-transformers>=2.7; extra == "st"
27
+ Provides-Extra: openai
28
+ Requires-Dist: openai>=1.0; extra == "openai"
29
+ Provides-Extra: all
30
+ Requires-Dist: sentence-transformers>=2.7; extra == "all"
31
+ Requires-Dist: openai>=1.0; extra == "all"
32
+ Provides-Extra: dev
33
+ Requires-Dist: pytest>=7; extra == "dev"
34
+ Requires-Dist: pytest-cov; extra == "dev"
35
+ Requires-Dist: ruff; extra == "dev"
36
+ Requires-Dist: mypy; extra == "dev"
37
+ Requires-Dist: build; extra == "dev"
38
+ Requires-Dist: twine; extra == "dev"
39
+
40
+ # sentence-embedder
41
+
42
+ A lightweight **internal Python library** for generating sentence/semantic embeddings with a clean, unified API.
43
+
44
+ ---
45
+
46
+ ## Features
47
+
48
+ | Feature | Details |
49
+ |---|---|
50
+ | **Backends** | SentenceTransformers (local, offline) · OpenAI API |
51
+ | **Single & batch** | `embed()` and `embed_batch()` |
52
+ | **Similarity** | Cosine similarity between two sentences |
53
+ | **Semantic search** | `most_similar()` over an in-memory corpus |
54
+ | **Disk cache** | `EmbeddingCache` — skip re-embedding seen texts |
55
+ | **Typed** | Full type hints, compatible with `mypy` |
56
+
57
+ ---
58
+
59
+ ## Installation
60
+
61
+ ```bash
62
+ # Clone / copy this package into your project, then:
63
+
64
+ # SentenceTransformers backend (local, recommended)
65
+ pip install -e ".[st]"
66
+
67
+ # OpenAI backend
68
+ pip install -e ".[openai]"
69
+
70
+ # Both
71
+ pip install -e ".[all]"
72
+
73
+ # With dev tools (pytest, ruff, mypy)
74
+ pip install -e ".[all,dev]"
75
+ ```
76
+
77
+ ---
78
+
79
+ ## Quick Start
80
+
81
+ ### 1 — Basic embedding
82
+
83
+ ```python
84
+ from sentence_embedder import SentenceEmbedder
85
+
86
+ embedder = SentenceEmbedder() # defaults: SentenceTransformers, all-MiniLM-L6-v2
87
+
88
+ vec = embedder.embed("The cat sat on the mat.")
89
+ print(vec.shape) # (384,)
90
+ ```
91
+
92
+ ### 2 — Batch embedding
93
+
94
+ ```python
95
+ texts = ["Hello world", "Machine learning is fun", "I love Python"]
96
+ vecs = embedder.embed_batch(texts)
97
+ print(vecs.shape) # (3, 384)
98
+ ```
99
+
100
+ ### 3 — Cosine similarity
101
+
102
+ ```python
103
+ score = embedder.similarity("fast car", "quick automobile")
104
+ print(score) # ~0.85
105
+ ```
106
+
107
+ ### 4 — Semantic search
108
+
109
+ ```python
110
+ corpus = [
111
+ "The stock market crashed today.",
112
+ "Scientists discover a new planet.",
113
+ "Football team wins the championship.",
114
+ "A new AI model beats human performance.",
115
+ ]
116
+
117
+ results = embedder.most_similar("breakthrough in artificial intelligence", corpus, top_k=2)
118
+ for sentence, score in results:
119
+ print(f"{score:.3f} {sentence}")
120
+ ```
121
+
122
+ ### 5 — Disk cache (avoid re-embedding)
123
+
124
+ ```python
125
+ from sentence_embedder import SentenceEmbedder, EmbeddingCache
126
+
127
+ base = SentenceEmbedder()
128
+ embedder = EmbeddingCache(base, cache_dir=".cache/embeddings")
129
+
130
+ vec = embedder.embed("Hello world") # computed → stored on disk
131
+ vec = embedder.embed("Hello world") # loaded from cache instantly
132
+ ```
133
+
134
+ ---
135
+
136
+ ## OpenAI Backend
137
+
138
+ ```python
139
+ from sentence_embedder import SentenceEmbedder
140
+
141
+ embedder = SentenceEmbedder(
142
+ backend="openai",
143
+ model_name="text-embedding-3-small",
144
+ openai_api_key="sk-...", # or set OPENAI_API_KEY env var
145
+ )
146
+
147
+ vec = embedder.embed("Hello from OpenAI!")
148
+ ```
149
+
150
+ ---
151
+
152
+ ## API Reference
153
+
154
+ ### `SentenceEmbedder`
155
+
156
+ | Method | Signature | Description |
157
+ |---|---|---|
158
+ | `embed` | `(text: str) → ndarray` | Embed one sentence → 1-D vector |
159
+ | `embed_batch` | `(texts: List[str]) → ndarray` | Embed many → 2-D array `(N, dim)` |
160
+ | `similarity` | `(a: str, b: str) → float` | Cosine similarity in `[-1, 1]` |
161
+ | `most_similar` | `(query, corpus, top_k=5) → List[tuple]` | Ranked `(sentence, score)` pairs |
162
+
163
+ ### `EmbeddingCache`
164
+
165
+ | Method | Signature | Description |
166
+ |---|---|---|
167
+ | `embed` | `(text: str) → ndarray` | Cache-aware single embed |
168
+ | `embed_batch` | `(texts: List[str]) → ndarray` | Cache-aware batch embed |
169
+ | `clear` | `() → None` | Wipe the cache database |
170
+
171
+ ---
172
+
173
+ ## Running Tests
174
+
175
+ ```bash
176
+ pytest
177
+ # With coverage:
178
+ pytest --cov=sentence_embedder --cov-report=term-missing
179
+ ```
180
+
181
+ ---
182
+
183
+ ## Package Structure
184
+
185
+ ```
186
+ sentence_embedder/
187
+ ├── sentence_embedder/
188
+ │ ├── __init__.py # Public exports
189
+ │ ├── embedder.py # SentenceEmbedder (core)
190
+ │ └── cache.py # EmbeddingCache (disk cache)
191
+ ├── tests/
192
+ │ └── test_embedder.py # Unit tests (no model/network needed)
193
+ ├── pyproject.toml # Build config & dependencies
194
+ └── README.md
195
+ ```
196
+
197
+ ---
198
+
199
+ ## Choosing a Model
200
+
201
+ | Model | Dim | Speed | Quality | Use case |
202
+ |---|---|---|---|---|
203
+ | `all-MiniLM-L6-v2` | 384 | ⚡⚡⚡ | ★★★ | Default, general purpose |
204
+ | `all-mpnet-base-v2` | 768 | ⚡⚡ | ★★★★ | Higher quality |
205
+ | `multi-qa-MiniLM-L6-cos-v1` | 384 | ⚡⚡⚡ | ★★★★ | Q&A / search |
206
+ | `text-embedding-3-small` *(OpenAI)* | 1536 | API | ★★★★★ | Best quality, needs key |
@@ -0,0 +1,11 @@
1
+ README.md
2
+ pyproject.toml
3
+ sentence_embedder/__init__.py
4
+ sentence_embedder/cache.py
5
+ sentence_embedder/embedder.py
6
+ sentence_embedder.egg-info/PKG-INFO
7
+ sentence_embedder.egg-info/SOURCES.txt
8
+ sentence_embedder.egg-info/dependency_links.txt
9
+ sentence_embedder.egg-info/requires.txt
10
+ sentence_embedder.egg-info/top_level.txt
11
+ tests/test_embedder.py
@@ -0,0 +1,19 @@
1
+ numpy>=1.24
2
+
3
+ [all]
4
+ sentence-transformers>=2.7
5
+ openai>=1.0
6
+
7
+ [dev]
8
+ pytest>=7
9
+ pytest-cov
10
+ ruff
11
+ mypy
12
+ build
13
+ twine
14
+
15
+ [openai]
16
+ openai>=1.0
17
+
18
+ [st]
19
+ sentence-transformers>=2.7
@@ -0,0 +1 @@
1
+ sentence_embedder
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,137 @@
1
+ """
2
+ tests/test_embedder.py
3
+ ----------------------
4
+ Unit tests for SentenceEmbedder and EmbeddingCache.
5
+ Uses mocks so no model or network is required.
6
+ """
7
+
8
+ import numpy as np
9
+ import pytest
10
+ from unittest.mock import MagicMock, patch
11
+
12
+
13
+ # ---------------------------------------------------------------------------
14
+ # Helpers
15
+ # ---------------------------------------------------------------------------
16
+
17
def make_mock_st_model(dim: int = 8):
    """Return a mock SentenceTransformer-like model.

    The mock's ``encode`` accepts a list of texts and returns a float32
    array of shape ``(len(texts), dim)``.  Each row is derived from a
    per-text RNG seed, so the same text always yields the same vector
    within one interpreter run — repeated encodes (e.g. a cache hit vs.
    a fresh computation) agree, unlike purely random output which would
    make any vector-equality check flaky.
    """
    model = MagicMock()

    def encode(texts, **kw):
        # Seed one Generator per text so embeddings are reproducible per
        # input.  hash() is stable within a single process, which is all
        # the tests require.
        rows = [
            np.random.default_rng(abs(hash(text)) % (2**32)).random(dim)
            for text in texts
        ]
        # reshape keeps the (0, dim) shape for an empty batch, matching
        # what np.random.rand(0, dim) would have produced.
        return np.asarray(rows, dtype=np.float32).reshape(-1, dim)

    model.encode = encode
    return model
22
+
23
+
24
+ # ---------------------------------------------------------------------------
25
+ # SentenceEmbedder — SentenceTransformers backend
26
+ # ---------------------------------------------------------------------------
27
+
28
class TestSentenceEmbedderST:
    """Exercise SentenceEmbedder against a mocked SentenceTransformers backend."""

    @pytest.fixture()
    def embedder(self):
        # Patch out the heavy model download/load; the mock model below
        # stands in for the real SentenceTransformer.
        target = (
            "sentence_embedder.embedder.SentenceEmbedder._init_sentence_transformers"
        )
        with patch(target):
            from sentence_embedder import SentenceEmbedder

            instance = SentenceEmbedder(backend="sentencetransformers")
            instance._model = make_mock_st_model(dim=8)
            return instance

    def test_embed_returns_1d_array(self, embedder):
        vector = embedder.embed("Hello world")
        assert vector.shape == (8,)
        assert vector.ndim == 1

    def test_embed_batch_returns_2d_array(self, embedder):
        matrix = embedder.embed_batch(["Hello", "World", "Test"])
        assert matrix.shape == (3, 8)
        assert matrix.ndim == 2

    def test_embed_empty_string_raises(self, embedder):
        with pytest.raises(ValueError, match="empty"):
            embedder.embed("")

    def test_embed_batch_empty_list_raises(self, embedder):
        with pytest.raises(ValueError, match="empty"):
            embedder.embed_batch([])

    def test_embed_batch_blank_string_raises(self, embedder):
        with pytest.raises(ValueError, match="empty"):
            embedder.embed_batch(["hello", " "])

    def test_similarity_returns_float_in_range(self, embedder):
        value = embedder.similarity("cat", "dog")
        assert isinstance(value, float)
        assert -1.0 <= value <= 1.0

    def test_most_similar_returns_ranked_list(self, embedder):
        candidates = ["apple", "banana", "cherry", "date"]
        ranked = embedder.most_similar("fruit", candidates, top_k=2)
        assert len(ranked) == 2
        for sentence, score in ranked:
            assert isinstance(sentence, str)
            assert isinstance(score, float)

    def test_most_similar_top_k_capped(self, embedder):
        ranked = embedder.most_similar("query", ["a", "b"], top_k=10)
        assert len(ranked) == 2

    def test_unsupported_backend_raises(self):
        from sentence_embedder import SentenceEmbedder

        with pytest.raises(ValueError, match="Unsupported backend"):
            SentenceEmbedder(backend="fakegpt")
84
+
85
+
86
+ # ---------------------------------------------------------------------------
87
+ # EmbeddingCache
88
+ # ---------------------------------------------------------------------------
89
+
90
class TestEmbeddingCache:
    """Tests for the disk cache wrapper."""

    @pytest.fixture()
    def embedder(self, tmp_path):
        """Return ``(base, cached)``: a call-counting embedder and its cache.

        ``base._call_count`` records how many texts actually reached the
        underlying (mock) model, so each test can distinguish cache hits
        from recomputation.
        """
        with patch(
            "sentence_embedder.embedder.SentenceEmbedder._init_sentence_transformers"
        ):
            from sentence_embedder import SentenceEmbedder, EmbeddingCache

            emb = SentenceEmbedder(backend="sentencetransformers")
            emb._model = make_mock_st_model(dim=8)

            cached = EmbeddingCache(emb, cache_dir=str(tmp_path / "cache"))
            # Wrap embed_batch so the tests can observe how many texts are
            # forwarded to the model on each call.
            original_embed_batch = emb.embed_batch
            emb._call_count = 0

            def counting_embed_batch(texts):
                emb._call_count += len(texts)
                return original_embed_batch(texts)

            emb.embed_batch = counting_embed_batch
            return emb, cached

    def test_cache_miss_then_hit(self, embedder):
        base, cached = embedder
        _ = cached.embed("The sky is blue")
        assert base._call_count == 1  # miss: model was invoked once
        _ = cached.embed("The sky is blue")
        assert base._call_count == 1  # hit: no second model call

    def test_batch_partial_cache(self, embedder):
        base, cached = embedder
        texts = ["sentence one", "sentence two", "sentence three"]
        _ = cached.embed_batch(texts[:2])  # warm the cache with the first two
        base._call_count = 0  # reset counter
        _ = cached.embed_batch(texts)  # only "sentence three" should be new
        assert base._call_count == 1

    def test_clear_cache(self, embedder):
        base, cached = embedder
        _ = cached.embed("some text")
        cached.clear()
        base._call_count = 0
        _ = cached.embed("some text")  # must recompute after clear
        assert base._call_count == 1