kgmodule-utils 0.2.1__tar.gz → 0.2.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: kgmodule-utils
3
- Version: 0.2.1
3
+ Version: 0.2.3
4
4
  Summary: Shared types and snapshot infrastructure for the KGModule SDK
5
5
  License: Elastic-2.0
6
6
  License-File: LICENSE
@@ -10,7 +10,7 @@ build-backend = "poetry.core.masonry.api"
10
10
 
11
11
  [project]
12
12
  name = "kgmodule-utils"
13
- version = "0.2.1"
13
+ version = "0.2.3"
14
14
  description = "Shared types and snapshot infrastructure for the KGModule SDK"
15
15
  readme = "README.md"
16
16
  license = { text = "Elastic-2.0" }
@@ -52,8 +52,29 @@ target-version = ["py312", "py313"]
52
52
  line-length = 100
53
53
  target-version = "py312"
54
54
 
55
+ [tool.pylint.main]
56
+ source-roots = ["src"]
57
+ init-hook = "import sys; sys.path.insert(0, 'src')"
58
+
59
+ [tool.pylint."messages control"]
60
+ disable = [
61
+ "missing-module-docstring",
62
+ ]
63
+
55
64
  [tool.mypy]
56
65
  python_version = "3.12"
57
66
  strict = true
58
67
  warn_unused_ignores = true
59
68
  disallow_untyped_defs = true
69
+
70
+ [[tool.mypy.overrides]]
71
+ module = [
72
+ "sentence_transformers.*",
73
+ "transformers.*",
74
+ "numpy.*",
75
+ ]
76
+ ignore_missing_imports = true
77
+
78
+ [[tool.mypy.overrides]]
79
+ module = "kg_utils.embedder"
80
+ disallow_untyped_calls = false
@@ -5,6 +5,8 @@ Sub-packages:
5
5
  kg_utils.snapshots — Snapshot, SnapshotManager, SnapshotManifest, etc.
6
6
  kg_utils.embed — Embedder protocol, DEFAULT_MODEL, KNOWN_MODELS,
7
7
  kg_model_cache_dir(), resolve_model_path().
8
+ kg_utils.embedder — Concrete SentenceTransformerEmbedder, get_embedder(),
9
+ wrap_embedder(), load_sentence_transformer().
8
10
  """
9
11
 
10
- __version__ = "0.2.1"
12
+ __version__ = "0.2.3"
@@ -0,0 +1,242 @@
1
+ """kg_utils.embedder — Concrete SentenceTransformer embedding for the KGModule stack.
2
+
3
+ All model-loading logic lives here so that the ``local_files_only`` guard,
4
+ KNOWN_MODELS alias resolution, and path convention are defined exactly once.
5
+ Every KG module (doc_kg, diary_kg, code_kg, …) imports from here instead of
6
+ reimplementing the load sequence.
7
+
8
+ Contents
9
+ --------
10
+ Embedder
11
+ Abstract base class with ``embed_texts`` + ``embed_query`` + ``dim``.
12
+
13
+ SentenceTransformerEmbedder
14
+ Concrete implementation. Always uses ``local_files_only=True`` when the
15
+ model is cached locally — prevents HuggingFace HEAD requests that leave
16
+ stale thread/network state and cause SIGBUS on MPS.
17
+
18
+ load_sentence_transformer(model_name)
19
+ Raw ``SentenceTransformer`` factory with the canonical safe-load sequence.
20
+ Use when you need the bare model object (e.g. multi-process workers that
21
+ each load their own copy by name).
22
+
23
+ get_embedder(model_name)
24
+ High-level factory returning a ready-to-use ``SentenceTransformerEmbedder``.
25
+
26
+ wrap_embedder(st_model, model_name)
27
+ Wrap an already-loaded ``SentenceTransformer`` as an ``Embedder``. Use
28
+ this to share a live model between pipeline stages (e.g. DiaryTransformer
29
+ → DocKG) without loading a second copy on MPS/CUDA.
30
+
31
+ Author: Eric G. Suchanek, PhD
32
+ License: Elastic 2.0
33
+ """
34
+
35
+ from __future__ import annotations
36
+
37
+ import os
38
+ from typing import Any
39
+
40
+ from kg_utils.embed import DEFAULT_MODEL, KNOWN_MODELS, resolve_model_path
41
+
42
+
43
+ # ---------------------------------------------------------------------------
44
+ # Abstract base
45
+ # ---------------------------------------------------------------------------
46
+
47
+
48
class Embedder:
    """Base interface for embedding backends in the KGModule stack.

    Subclasses override :meth:`embed_texts`; :meth:`embed_query` has a default
    implementation that is built on top of it.

    :param dim: Embedding dimension — set by concrete ``__init__``.
    """

    dim: int

    def embed_texts(self, texts: list[str]) -> list[list[float]]:
        """Turn a batch of strings into embedding vectors.

        :param texts: Input strings.
        :return: One vector (list of floats) per input string.
        :raises NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError

    def embed_query(self, query: str) -> list[float]:
        """Embed one query string by delegating to :meth:`embed_texts`.

        :param query: Query string.
        :return: The vector produced for *query*.
        """
        # A query is treated as a batch of one; the single result is unwrapped.
        batch = [query]
        vectors = self.embed_texts(batch)
        return vectors[0]
71
+
72
+
73
+ # ---------------------------------------------------------------------------
74
+ # Canonical model loader
75
+ # ---------------------------------------------------------------------------
76
+
77
+
78
def load_sentence_transformer(model_name: str = DEFAULT_MODEL) -> Any:
    """Load a ``SentenceTransformer`` with the canonical safe-load sequence.

    Resolution order:

    1. Resolve KNOWN_MODELS alias → HuggingFace repo ID.
    2. If ``resolve_model_path()`` returns a path that exists, load from that
       local path with ``local_files_only=True`` — no HF HEAD requests.
    3. Otherwise try the repo ID with ``local_files_only=True`` (hits HF's own
       cache layout).
    4. Fall back to a live network fetch only if step 3 raises ``OSError``.

    The ``local_files_only=True`` guard is critical on MPS: HF HEAD retry
    loops leave stale thread state that causes SIGBUS on the first
    ``encode()`` call.

    Side effect: sets ``TQDM_DISABLE=1`` in ``os.environ`` and leaves it set.

    :param model_name: HuggingFace model ID or KNOWN_MODELS alias.
    :return: Loaded ``SentenceTransformer`` instance.
    """
    # Set this BEFORE importing sentence_transformers: that import pulls in
    # tqdm, and tqdm picks up its ``TQDM_*`` overrides when it is imported.
    # NOTE(review): import-time behaviour is from tqdm >= 4.66 — confirm
    # against the pinned tqdm version.
    os.environ["TQDM_DISABLE"] = "1"

    from sentence_transformers import SentenceTransformer  # pylint: disable=import-outside-toplevel

    try:
        from transformers import logging as hf_logging  # pylint: disable=import-outside-toplevel
    except ImportError:
        # transformers logging is optional here; nothing to silence without it.
        pass
    else:
        hf_logging.set_verbosity_error()

    resolved = KNOWN_MODELS.get(model_name, model_name)
    # Remote code is only trusted for model IDs containing "nomic-ai/".
    trust_remote = "nomic-ai/" in resolved
    local_path = resolve_model_path(resolved)

    if local_path.exists():
        # Step 2: project-local copy, never touches the network.
        return SentenceTransformer(
            str(local_path),
            local_files_only=True,
            trust_remote_code=trust_remote,
        )

    try:
        # Step 3: HuggingFace's own cache, still offline.
        return SentenceTransformer(
            resolved,
            local_files_only=True,
            trust_remote_code=trust_remote,
        )
    except OSError:
        # Step 4: not available offline — allow a network fetch.
        return SentenceTransformer(resolved, trust_remote_code=trust_remote)
125
+
126
+
127
+ # ---------------------------------------------------------------------------
128
+ # Concrete embedder
129
+ # ---------------------------------------------------------------------------
130
+
131
+
132
class SentenceTransformerEmbedder(Embedder):
    """Concrete embedder backed by ``sentence-transformers``.

    Delegates model loading to :func:`load_sentence_transformer` so the
    ``local_files_only`` guard is always in effect.

    Attributes set by ``__init__``:

    * ``model`` — the loaded model object.
    * ``model_name`` — *model_name* after KNOWN_MODELS alias resolution.
    * ``dim`` — embedding dimension reported by the model, or 384 when the
      model exposes no dimension accessor or it returns a falsy value.

    :param model_name: HuggingFace model ID or KNOWN_MODELS alias.
    """

    def __init__(self, model_name: str = DEFAULT_MODEL) -> None:
        try:
            from transformers import logging as hf_logging  # pylint: disable=import-outside-toplevel
        except ImportError:
            pass
        else:
            hf_logging.set_verbosity_error()

        # Force TQDM_DISABLE=1 for the duration of the load, then put the
        # caller's value (or absence of one) back.
        _prev = os.environ.get("TQDM_DISABLE")
        os.environ["TQDM_DISABLE"] = "1"
        try:
            self.model = load_sentence_transformer(model_name)
        finally:
            if _prev is None:
                os.environ.pop("TQDM_DISABLE", None)
            else:
                os.environ["TQDM_DISABLE"] = _prev

        self.model_name: str = KNOWN_MODELS.get(model_name, model_name)
        # ST ≥5.4 renamed to get_embedding_dimension; ≤5.3 only had get_sentence_embedding_dimension.
        _dim_fn = getattr(self.model, "get_embedding_dimension", None) or getattr(
            self.model, "get_sentence_embedding_dimension", None
        )
        self.dim: int = (_dim_fn() if _dim_fn is not None else None) or 384

    def embed_texts(self, texts: list[str], encode_batch_size: int = 512) -> list[list[float]]:
        """Embed a list of strings into normalized float32 vectors.

        :param texts: Input strings. An empty list returns ``[]`` without
            calling the model.
        :param encode_batch_size: Passed to ``model.encode()`` — tune down if OOM on MPS.
        :return: One vector (list of floats) per input string.
        """
        if not texts:
            return []

        import numpy as np  # pylint: disable=import-outside-toplevel

        vecs = self.model.encode(
            texts,
            batch_size=encode_batch_size,
            normalize_embeddings=True,
            show_progress_bar=False,
        )
        return [np.asarray(v, dtype="float32").tolist() for v in vecs]

    def embed_query(self, query: str) -> list[float]:
        """Embed a single query string into a normalized float32 vector.

        Routed through :meth:`embed_texts` so queries use the same
        ``encode()`` options as batches, including ``show_progress_bar=False``.

        :param query: Query string.
        :return: The vector for *query*.
        """
        return self.embed_texts([query])[0]

    def __repr__(self) -> str:
        return f"SentenceTransformerEmbedder(model={self.model_name!r}, dim={self.dim})"
191
+
192
+
193
+ # ---------------------------------------------------------------------------
194
+ # Factory functions
195
+ # ---------------------------------------------------------------------------
196
+
197
+
198
def get_embedder(model_name: str = DEFAULT_MODEL) -> SentenceTransformerEmbedder:
    """Build a :class:`SentenceTransformerEmbedder` for *model_name*.

    :param model_name: HuggingFace model ID or KNOWN_MODELS alias.
    :return: Configured embedder instance.
    """
    embedder = SentenceTransformerEmbedder(model_name)
    return embedder
205
+
206
+
207
def wrap_embedder(st_model: Any, model_name: str = DEFAULT_MODEL) -> Embedder:
    """Wrap an already-loaded ``SentenceTransformer`` as an :class:`Embedder`.

    Use this when a live model is already on the GPU (e.g. DiaryTransformer →
    DocKG handoff) to avoid loading a second copy on MPS/CUDA.

    The wrapper's ``dim`` is read from *st_model*; it falls back to 384 when
    the model exposes no dimension accessor or it returns a falsy value.

    :param st_model: Live ``SentenceTransformer`` instance.
    :param model_name: Model name stored as metadata on the wrapper (after
        KNOWN_MODELS alias resolution).
    :return: An :class:`Embedder` that delegates all calls to *st_model*.
    """
    import numpy as np  # pylint: disable=import-outside-toplevel

    resolved = KNOWN_MODELS.get(model_name, model_name)
    # Prefer the newer accessor name, fall back to the older one.
    _dim_fn = getattr(st_model, "get_embedding_dimension", None) or getattr(
        st_model, "get_sentence_embedding_dimension", None
    )
    _dim = (_dim_fn() if _dim_fn is not None else None) or 384

    class _WrappedEmbedder(Embedder):
        """Closure-based adapter around the caller's live *st_model*."""

        model_name: str = resolved
        dim: int = _dim

        def embed_texts(self, texts: list[str]) -> list[list[float]]:
            """Embed *texts*; an empty list returns ``[]`` without calling the model."""
            if not texts:
                return []
            vecs = st_model.encode(
                texts,
                batch_size=512,
                normalize_embeddings=True,
                show_progress_bar=False,
            )
            return [np.asarray(v, dtype="float32").tolist() for v in vecs]

        def embed_query(self, query: str) -> list[float]:
            """Embed one query with the same ``encode()`` options as a batch."""
            # Going through embed_texts keeps show_progress_bar=False for
            # single queries as well.
            return self.embed_texts([query])[0]

    return _WrappedEmbedder()
File without changes
File without changes