kgmodule-utils 0.2.1__tar.gz → 0.2.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {kgmodule_utils-0.2.1 → kgmodule_utils-0.2.3}/PKG-INFO +1 -1
- {kgmodule_utils-0.2.1 → kgmodule_utils-0.2.3}/pyproject.toml +22 -1
- {kgmodule_utils-0.2.1 → kgmodule_utils-0.2.3}/src/kg_utils/__init__.py +3 -1
- kgmodule_utils-0.2.3/src/kg_utils/embedder.py +242 -0
- {kgmodule_utils-0.2.1 → kgmodule_utils-0.2.3}/LICENSE +0 -0
- {kgmodule_utils-0.2.1 → kgmodule_utils-0.2.3}/README.md +0 -0
- {kgmodule_utils-0.2.1 → kgmodule_utils-0.2.3}/src/kg_utils/embed.py +0 -0
- {kgmodule_utils-0.2.1 → kgmodule_utils-0.2.3}/src/kg_utils/py.typed +0 -0
- {kgmodule_utils-0.2.1 → kgmodule_utils-0.2.3}/src/kg_utils/snapshots/__init__.py +0 -0
- {kgmodule_utils-0.2.1 → kgmodule_utils-0.2.3}/src/kg_utils/snapshots/manager.py +0 -0
- {kgmodule_utils-0.2.1 → kgmodule_utils-0.2.3}/src/kg_utils/snapshots/models.py +0 -0
- {kgmodule_utils-0.2.1 → kgmodule_utils-0.2.3}/src/kg_utils/types/__init__.py +0 -0
- {kgmodule_utils-0.2.1 → kgmodule_utils-0.2.3}/src/kg_utils/types/extractor.py +0 -0
- {kgmodule_utils-0.2.1 → kgmodule_utils-0.2.3}/src/kg_utils/types/module.py +0 -0
- {kgmodule_utils-0.2.1 → kgmodule_utils-0.2.3}/src/kg_utils/types/specs.py +0 -0
|
@@ -10,7 +10,7 @@ build-backend = "poetry.core.masonry.api"
|
|
|
10
10
|
|
|
11
11
|
[project]
|
|
12
12
|
name = "kgmodule-utils"
|
|
13
|
-
version = "0.2.1"
|
|
13
|
+
version = "0.2.3"
|
|
14
14
|
description = "Shared types and snapshot infrastructure for the KGModule SDK"
|
|
15
15
|
readme = "README.md"
|
|
16
16
|
license = { text = "Elastic-2.0" }
|
|
@@ -52,8 +52,29 @@ target-version = ["py312", "py313"]
|
|
|
52
52
|
line-length = 100
|
|
53
53
|
target-version = "py312"
|
|
54
54
|
|
|
55
|
+
[tool.pylint.main]
|
|
56
|
+
source-roots = ["src"]
|
|
57
|
+
init-hook = "import sys; sys.path.insert(0, 'src')"
|
|
58
|
+
|
|
59
|
+
[tool.pylint."messages control"]
|
|
60
|
+
disable = [
|
|
61
|
+
"missing-module-docstring",
|
|
62
|
+
]
|
|
63
|
+
|
|
55
64
|
[tool.mypy]
|
|
56
65
|
python_version = "3.12"
|
|
57
66
|
strict = true
|
|
58
67
|
warn_unused_ignores = true
|
|
59
68
|
disallow_untyped_defs = true
|
|
69
|
+
|
|
70
|
+
[[tool.mypy.overrides]]
|
|
71
|
+
module = [
|
|
72
|
+
"sentence_transformers.*",
|
|
73
|
+
"transformers.*",
|
|
74
|
+
"numpy.*",
|
|
75
|
+
]
|
|
76
|
+
ignore_missing_imports = true
|
|
77
|
+
|
|
78
|
+
[[tool.mypy.overrides]]
|
|
79
|
+
module = "kg_utils.embedder"
|
|
80
|
+
disallow_untyped_calls = false
|
|
@@ -5,6 +5,8 @@ Sub-packages:
|
|
|
5
5
|
kg_utils.snapshots — Snapshot, SnapshotManager, SnapshotManifest, etc.
|
|
6
6
|
kg_utils.embed — Embedder protocol, DEFAULT_MODEL, KNOWN_MODELS,
|
|
7
7
|
kg_model_cache_dir(), resolve_model_path().
|
|
8
|
+
kg_utils.embedder — Concrete SentenceTransformerEmbedder, get_embedder(),
|
|
9
|
+
wrap_embedder(), load_sentence_transformer().
|
|
8
10
|
"""
|
|
9
11
|
|
|
10
|
-
__version__ = "0.2.1"
|
|
12
|
+
__version__ = "0.2.3"
|
|
@@ -0,0 +1,242 @@
|
|
|
1
|
+
"""kg_utils.embedder — Concrete SentenceTransformer embedding for the KGModule stack.
|
|
2
|
+
|
|
3
|
+
All model-loading logic lives here so that the ``local_files_only`` guard,
|
|
4
|
+
KNOWN_MODELS alias resolution, and path convention are defined exactly once.
|
|
5
|
+
Every KG module (doc_kg, diary_kg, code_kg, …) imports from here instead of
|
|
6
|
+
reimplementing the load sequence.
|
|
7
|
+
|
|
8
|
+
Contents
|
|
9
|
+
--------
|
|
10
|
+
Embedder
|
|
11
|
+
Abstract base class with ``embed_texts`` + ``embed_query`` + ``dim``.
|
|
12
|
+
|
|
13
|
+
SentenceTransformerEmbedder
|
|
14
|
+
Concrete implementation. Always uses ``local_files_only=True`` when the
|
|
15
|
+
model is cached locally — prevents HuggingFace HEAD requests that leave
|
|
16
|
+
stale thread/network state and cause SIGBUS on MPS.
|
|
17
|
+
|
|
18
|
+
load_sentence_transformer(model_name)
|
|
19
|
+
Raw ``SentenceTransformer`` factory with the canonical safe-load sequence.
|
|
20
|
+
Use when you need the bare model object (e.g. multi-process workers that
|
|
21
|
+
each load their own copy by name).
|
|
22
|
+
|
|
23
|
+
get_embedder(model_name)
|
|
24
|
+
High-level factory returning a ready-to-use ``SentenceTransformerEmbedder``.
|
|
25
|
+
|
|
26
|
+
wrap_embedder(st_model, model_name)
|
|
27
|
+
Wrap an already-loaded ``SentenceTransformer`` as an ``Embedder``. Use
|
|
28
|
+
this to share a live model between pipeline stages (e.g. DiaryTransformer
|
|
29
|
+
→ DocKG) without loading a second copy on MPS/CUDA.
|
|
30
|
+
|
|
31
|
+
Author: Eric G. Suchanek, PhD
|
|
32
|
+
License: Elastic 2.0
|
|
33
|
+
"""
|
|
34
|
+
|
|
35
|
+
from __future__ import annotations
|
|
36
|
+
|
|
37
|
+
import os
|
|
38
|
+
from typing import Any
|
|
39
|
+
|
|
40
|
+
from kg_utils.embed import DEFAULT_MODEL, KNOWN_MODELS, resolve_model_path
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
# ---------------------------------------------------------------------------
|
|
44
|
+
# Abstract base
|
|
45
|
+
# ---------------------------------------------------------------------------
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
class Embedder:
    """Base embedding backend for the KGModule stack.

    Subclasses must override :meth:`embed_texts`; :meth:`embed_query` has a
    default implementation built on top of it.

    :param dim: Embedding dimension — set by concrete ``__init__``.
    """

    dim: int

    def embed_texts(self, texts: list[str]) -> list[list[float]]:
        """Embed a list of strings into float32 vectors.

        :param texts: Input strings.
        :return: One float32 vector per input.
        :raises NotImplementedError: Always, unless overridden by a subclass.
        """
        # Name the concrete class so a missing override is easy to locate.
        raise NotImplementedError(f"{type(self).__name__} must implement embed_texts()")

    def embed_query(self, query: str) -> list[float]:
        """Embed a single query string.

        Delegates to :meth:`embed_texts` with a one-element list and returns
        the first (only) vector.

        :param query: Query string.
        :return: Float32 vector.
        """
        return self.embed_texts([query])[0]
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
# ---------------------------------------------------------------------------
|
|
74
|
+
# Canonical model loader
|
|
75
|
+
# ---------------------------------------------------------------------------
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def load_sentence_transformer(model_name: str = DEFAULT_MODEL) -> Any:
    """Load a ``SentenceTransformer`` with the canonical safe-load sequence.

    Resolution order:

    1. Resolve KNOWN_MODELS alias → HuggingFace repo ID.
    2. If ``resolve_model_path()`` returns an existing path, load from that
       local path with ``local_files_only=True`` — no HF HEAD requests.
    3. Otherwise try ``local_files_only=True`` (hits HF's own cache layout).
    4. Fall back to a load without ``local_files_only`` only if step 3 raises
       ``OSError``.

    The ``local_files_only=True`` guard on step 2 is critical on MPS: HF HEAD
    retry loops leave stale thread state that causes SIGBUS on the first
    ``encode()`` call.

    ``TQDM_DISABLE`` is set to ``"1"`` only for the duration of the load and is
    restored to its previous value (or removed) afterwards, so calling this
    function does not permanently alter the process environment.

    :param model_name: HuggingFace model ID or KNOWN_MODELS alias.
    :return: Loaded ``SentenceTransformer`` instance.
    """
    from sentence_transformers import SentenceTransformer  # pylint: disable=import-outside-toplevel

    # Silence transformers' own logging when that package is importable.
    try:
        from transformers import logging as hf_logging  # pylint: disable=import-outside-toplevel
    except ImportError:
        pass
    else:
        hf_logging.set_verbosity_error()

    # Scope the environment override to this call; the previous value is put
    # back in ``finally`` so it does not leak to the caller or child processes.
    previous_tqdm = os.environ.get("TQDM_DISABLE")
    os.environ["TQDM_DISABLE"] = "1"
    try:
        resolved = KNOWN_MODELS.get(model_name, model_name)
        # Remote code is only trusted for repos under the ``nomic-ai/`` namespace.
        trust_remote = "nomic-ai/" in resolved
        local_path = resolve_model_path(resolved)

        if local_path.exists():
            return SentenceTransformer(
                str(local_path),
                local_files_only=True,
                trust_remote_code=trust_remote,
            )
        try:
            return SentenceTransformer(
                resolved,
                local_files_only=True,
                trust_remote_code=trust_remote,
            )
        except OSError:
            # Not available locally: retry without the local-only restriction.
            return SentenceTransformer(resolved, trust_remote_code=trust_remote)
    finally:
        if previous_tqdm is None:
            os.environ.pop("TQDM_DISABLE", None)
        else:
            os.environ["TQDM_DISABLE"] = previous_tqdm
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
# ---------------------------------------------------------------------------
|
|
128
|
+
# Concrete embedder
|
|
129
|
+
# ---------------------------------------------------------------------------
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
class SentenceTransformerEmbedder(Embedder):
    """Concrete embedder backed by ``sentence-transformers``.

    Delegates model loading to :func:`load_sentence_transformer` so the
    ``local_files_only`` guard is always in effect.

    :param model_name: HuggingFace model ID or KNOWN_MODELS alias.
    """

    # Dimension assumed when the loaded model reports none.
    _FALLBACK_DIM = 384

    def __init__(self, model_name: str = DEFAULT_MODEL) -> None:
        # Silence transformers' own logging when that package is importable.
        try:
            from transformers import logging as hf_logging  # pylint: disable=import-outside-toplevel

            hf_logging.set_verbosity_error()
        except ImportError:
            pass

        # Set TQDM_DISABLE only while loading, then restore the caller's value.
        _prev = os.environ.get("TQDM_DISABLE")
        os.environ["TQDM_DISABLE"] = "1"
        try:
            self.model = load_sentence_transformer(model_name)
        finally:
            if _prev is None:
                os.environ.pop("TQDM_DISABLE", None)
            else:
                os.environ["TQDM_DISABLE"] = _prev

        # Store the resolved repo ID rather than the alias that was passed in.
        self.model_name: str = KNOWN_MODELS.get(model_name, model_name)
        self.dim: int = self._resolve_dim(self.model)

    @classmethod
    def _resolve_dim(cls, model: Any) -> int:
        """Return the embedding dimension reported by *model*, or the fallback."""
        # ST ≥5.4 renamed to get_embedding_dimension; ≤5.3 only had get_sentence_embedding_dimension.
        dim_fn = getattr(model, "get_embedding_dimension", None) or getattr(
            model, "get_sentence_embedding_dimension", None
        )
        reported = dim_fn() if dim_fn is not None else None
        return reported or cls._FALLBACK_DIM

    def embed_texts(self, texts: list[str], encode_batch_size: int = 512) -> list[list[float]]:
        """Embed a list of strings into float32 vectors.

        :param texts: Input strings.
        :param encode_batch_size: Passed to ``model.encode()`` — tune down if OOM on MPS.
        :return: One normalized float32 vector per input; ``[]`` for empty input.
        """
        import numpy as np  # pylint: disable=import-outside-toplevel

        # Nothing to encode: skip the model call entirely.
        if not texts:
            return []

        vecs = self.model.encode(
            texts,
            batch_size=encode_batch_size,
            normalize_embeddings=True,
            show_progress_bar=False,
        )
        return [np.asarray(v, dtype="float32").tolist() for v in vecs]

    def embed_query(self, query: str) -> list[float]:
        """Embed a single query string into a float32 vector.

        :param query: Query string.
        :return: Normalized float32 vector.
        """
        import numpy as np  # pylint: disable=import-outside-toplevel

        # Progress bar explicitly disabled, matching ``embed_texts``.
        vec = self.model.encode(
            [query],
            normalize_embeddings=True,
            show_progress_bar=False,
        )[0]
        return list(np.asarray(vec, dtype="float32").tolist())

    def __repr__(self) -> str:
        return f"SentenceTransformerEmbedder(model={self.model_name!r}, dim={self.dim})"
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
# ---------------------------------------------------------------------------
|
|
194
|
+
# Factory functions
|
|
195
|
+
# ---------------------------------------------------------------------------
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
def get_embedder(model_name: str = DEFAULT_MODEL) -> SentenceTransformerEmbedder:
    """Build a :class:`SentenceTransformerEmbedder` for the given model.

    Thin convenience factory: constructs the embedder, which loads the model
    in its ``__init__``, and hands it back.

    :param model_name: HuggingFace model ID or KNOWN_MODELS alias.
    :return: Configured embedder instance.
    """
    embedder = SentenceTransformerEmbedder(model_name)
    return embedder
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
def wrap_embedder(st_model: Any, model_name: str = DEFAULT_MODEL) -> Embedder:
    """Wrap an already-loaded ``SentenceTransformer`` as an :class:`Embedder`.

    Use this when a live model is already on the GPU (e.g. DiaryTransformer →
    DocKG handoff) to avoid loading a second copy on MPS/CUDA.

    :param st_model: Live ``SentenceTransformer`` instance.
    :param model_name: Model name stored as metadata on the wrapper.
    :return: An :class:`Embedder` that delegates all calls to *st_model*.
    """
    import numpy as np  # pylint: disable=import-outside-toplevel

    resolved = KNOWN_MODELS.get(model_name, model_name)
    # ST ≥5.4 renamed to get_embedding_dimension; ≤5.3 only had get_sentence_embedding_dimension.
    _dim_fn = getattr(st_model, "get_embedding_dimension", None) or getattr(
        st_model, "get_sentence_embedding_dimension", None
    )
    # Falls back to 384 when the model exposes no dimension or reports none.
    _dim = (_dim_fn() if _dim_fn is not None else None) or 384

    class _WrappedEmbedder(Embedder):
        """Embedder that forwards every call to the captured ``st_model``."""

        model_name: str = resolved
        dim: int = _dim

        def embed_texts(
            self, texts: list[str], encode_batch_size: int = 512
        ) -> list[list[float]]:
            """Embed *texts* into normalized float32 vectors.

            :param texts: Input strings.
            :param encode_batch_size: Passed to ``st_model.encode()`` as ``batch_size``.
            :return: One vector per input; ``[]`` for empty input.
            """
            # Nothing to encode: skip the model call entirely.
            if not texts:
                return []
            vecs = st_model.encode(
                texts,
                batch_size=encode_batch_size,
                normalize_embeddings=True,
                show_progress_bar=False,
            )
            return [np.asarray(v, dtype="float32").tolist() for v in vecs]

        def embed_query(self, query: str) -> list[float]:
            """Embed a single query string into a normalized float32 vector."""
            # Progress bar explicitly disabled, matching ``embed_texts``.
            vec = st_model.encode(
                [query],
                normalize_embeddings=True,
                show_progress_bar=False,
            )[0]
            return list(np.asarray(vec, dtype="float32").tolist())

        def __repr__(self) -> str:
            return f"_WrappedEmbedder(model={self.model_name!r}, dim={self.dim})"

    return _WrappedEmbedder()
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|