kgmodule-utils 0.2.4__tar.gz → 0.3.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,270 @@
1
+ Metadata-Version: 2.4
2
+ Name: kgmodule-utils
3
+ Version: 0.3.1
4
+ Summary: Shared types, graph store, semantic index, and pipeline base for the KGModule SDK
5
+ License: Elastic-2.0
6
+ License-File: LICENSE
7
+ Keywords: knowledge-graph,kgmodule,sdk,types,snapshots
8
+ Author: Eric G. Suchanek, PhD
9
+ Author-email: suchanek@flux-frontiers.com
10
+ Requires-Python: >=3.12,<3.14
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3.12
15
+ Classifier: Programming Language :: Python :: 3.13
16
+ Provides-Extra: semantic
17
+ Requires-Dist: lancedb (>=0.19.0) ; extra == "semantic"
18
+ Requires-Dist: numpy (>=1.24.0) ; extra == "semantic"
19
+ Requires-Dist: sentence-transformers (>=5.4.1) ; extra == "semantic"
20
+ Requires-Dist: torch (>=2.5.1) ; extra == "semantic"
21
+ Requires-Dist: transformers (>=4.40.0,<4.57) ; extra == "semantic"
22
+ Project-URL: Repository, https://github.com/Flux-Frontiers/kg_utils
23
+ Description-Content-Type: text/markdown
24
+
25
+
26
+ [![Python](https://img.shields.io/badge/python-3.12%20%7C%203.13-blue.svg)](https://www.python.org/)
27
+ [![License: Elastic-2.0](https://img.shields.io/badge/License-Elastic%202.0-blue.svg)](https://www.elastic.co/licensing/elastic-license)
28
+ [![Version](https://img.shields.io/badge/version-0.3.1-blue.svg)](https://github.com/Flux-Frontiers/KG_utils/releases)
29
+ [![CI](https://github.com/Flux-Frontiers/KG_utils/actions/workflows/ci.yml/badge.svg)](https://github.com/Flux-Frontiers/KG_utils/actions/workflows/ci.yml)
30
+ [![Poetry](https://img.shields.io/endpoint?url=https://python-poetry.org/badge/v0.json)](https://python-poetry.org/)
31
+
32
+ # kgmodule-utils
33
+
34
+ **kgmodule-utils** — Shared graph store, semantic index, pipeline base, and snapshot infrastructure for the KGModule SDK.
35
+
36
+ *Author: Eric G. Suchanek, PhD*
37
+
38
+ *Flux-Frontiers, Liberty TWP, OH*
39
+
40
+ ---
41
+
42
+ ## Overview
43
+
44
+ kgmodule-utils is the **shared SDK layer** for the Flux-Frontiers knowledge-graph ecosystem. It provides everything a domain KG module needs — from type abstractions and SQLite graph storage through LanceDB vector indexing and a full build/query/pack pipeline — so domain authors implement only what is specific to their source domain.
45
+
46
+ Every KGModule implementation — [PyCodeKG](https://github.com/Flux-Frontiers/pycode_kg), [DocKG](https://github.com/Flux-Frontiers/doc_kg), and others — subclasses `KGModule` from here and implements exactly three methods: `make_extractor()`, `kind()`, and `analyze()`.
47
+
48
+ ---
49
+
50
+ ## Features
51
+
52
+ - **`kg_utils.specs`** — `NodeSpec`, `EdgeSpec`, `BuildStats`, `QueryResult`, `SnippetPack` dataclasses
53
+ - **`kg_utils.extractor`** — `KGExtractor` ABC: `extract()`, `node_kinds()`, `edge_kinds()`, `coverage_metric()`
54
+ - **`kg_utils.store`** — `GraphStore`: SQLite-backed node/edge store with BFS expansion, symbol resolution, caller lookup, and provenance recording
55
+ - **`kg_utils.semantic`** — `SemanticIndex` (LanceDB), `SentenceTransformerEmbedder`, `SeedHit`, model registry, `resolve_model_path()`
56
+ - **`kg_utils.pipeline`** — `KGModule`: full build → query → pack pipeline base with hybrid semantic + lexical reranking and snippet extraction
57
+ - **`kg_utils.embedder`** — `get_embedder()`, `wrap_embedder()`, `load_sentence_transformer()` factory functions
58
+ - **`kg_utils.embed`** — `Embedder` protocol, `DEFAULT_MODEL`, `KNOWN_MODELS`, `resolve_model_path()`
59
+ - **`kg_utils.snapshots`** — `Snapshot`, `SnapshotManager`, `SnapshotManifest` for temporal metric tracking
60
+
61
+ ---
62
+
63
+ ## Installation
64
+
65
+ **Requirements:** Python ≥ 3.12, < 3.14
66
+
67
+ ### Core only (stdlib, no optional deps)
68
+
69
+ ```bash
70
+ pip install kgmodule-utils
71
+ ```
72
+
73
+ ### With semantic search (LanceDB + sentence-transformers)
74
+
75
+ ```bash
76
+ pip install 'kgmodule-utils[semantic]'
77
+ ```
78
+
79
+ ### In a Poetry project
80
+
81
+ ```toml
82
+ [tool.poetry.dependencies]
83
+ kgmodule-utils = { version = ">=0.3.1", extras = ["semantic"] }
84
+ ```
85
+
86
+ ---
87
+
88
+ ## Quick Start
89
+
90
+ ### Build a domain KG module
91
+
92
+ ```python
93
+ from collections.abc import Iterator
94
+ from pathlib import Path
95
+
96
+ from kg_utils.extractor import KGExtractor
97
+ from kg_utils.pipeline import KGModule
98
+ from kg_utils.specs import EdgeSpec, NodeSpec
99
+
100
+
101
+ class MyExtractor(KGExtractor):
102
+ def node_kinds(self) -> list[str]:
103
+ return ["document", "section"]
104
+
105
+ def edge_kinds(self) -> list[str]:
106
+ return ["CONTAINS"]
107
+
108
+ def meaningful_node_kinds(self) -> list[str]:
109
+ return ["section"]
110
+
111
+ def extract(self) -> Iterator[NodeSpec | EdgeSpec]:
112
+ for doc in self.repo_path.glob("**/*.md"):
113
+ doc_id = f"document:{doc}"
114
+ yield NodeSpec(node_id=doc_id, kind="document",
115
+ name=doc.stem, qualname=doc.stem,
116
+ source_path=str(doc))
117
+ # … yield sections and CONTAINS edges
118
+
119
+
120
+ class MyKG(KGModule):
121
+ _default_dir = ".mykg"
122
+
123
+ def make_extractor(self) -> KGExtractor:
124
+ return MyExtractor(self.repo_root)
125
+
126
+ def kind(self) -> str:
127
+ return "my"
128
+
129
+ def analyze(self) -> str:
130
+ s = self.stats()
131
+ return f"# MyKG\nnodes={s['total_nodes']}"
132
+
133
+
134
+ # Build and query
135
+ kg = MyKG("/path/to/repo")
136
+ kg.build(wipe=True)
137
+
138
+ result = kg.query("authentication flow", k=8, hop=1)
139
+ pack = kg.pack("error handling", max_nodes=10)
140
+ print(pack.to_markdown())
141
+ ```
142
+
143
+ ### Track metrics over time
144
+
145
+ ```python
146
+ from kg_utils.snapshots import SnapshotManager
147
+
148
+ mgr = SnapshotManager(".mykg/snapshots", package_name="my-kg")
149
+
150
+ snapshot = mgr.capture(
151
+ version="1.0.0",
152
+ branch="main",
153
+ graph_stats_dict=kg.stats(),
154
+ )
155
+ mgr.save_snapshot(snapshot)
156
+
157
+ snaps = mgr.list_snapshots(limit=5)
158
+ delta = mgr.diff_snapshots(snaps[-1]["key"], snaps[0]["key"])
159
+ ```
160
+
161
+ ---
162
+
163
+ ## API Reference
164
+
165
+ ### `kg_utils.specs`
166
+
167
+ | Class | Description |
168
+ |---|---|
169
+ | `NodeSpec` | Graph node: `node_id`, `kind`, `name`, `qualname`, `source_path`, `lineno`, `end_lineno`, `docstring`, `metadata` |
170
+ | `EdgeSpec` | Graph edge: `source_id`, `target_id`, `relation`, `weight`, `metadata` |
171
+ | `BuildStats` | Build result: node/edge counts, indexed rows, embedding dim |
172
+ | `QueryResult` | Query result: nodes, edges, seeds, hop, relevance metadata |
173
+ | `SnippetPack` | Pack result: nodes with snippets, `to_markdown()`, `to_json()`, `save()` |
174
+
175
+ ### `kg_utils.extractor`
176
+
177
+ | Class | Description |
178
+ |---|---|
179
+ | `KGExtractor` | ABC — implement `node_kinds()`, `edge_kinds()`, `extract()` |
180
+
181
+ ### `kg_utils.store`
182
+
183
+ | Class | Description |
184
+ |---|---|
185
+ | `GraphStore` | SQLite persistence: `write()`, `expand()`, `query_nodes()`, `resolve_symbols()`, `callers_of()`, `stats()` |
186
+
187
+ ### `kg_utils.semantic`
188
+
189
+ | Class / function | Description |
190
+ |---|---|
191
+ | `SemanticIndex` | LanceDB vector index: `build()`, `search()` |
192
+ | `SentenceTransformerEmbedder` | Local embedding via sentence-transformers |
193
+ | `resolve_model_path()` | Resolve model name / alias to local cache path |
194
+ | `suppress_ingestion_logging()` | Silence verbose HF / tqdm output during ingestion |
195
+
196
+ ### `kg_utils.pipeline`
197
+
198
+ | Class | Description |
199
+ |---|---|
200
+ | `KGModule` | Concrete base — implement `make_extractor()`, `kind()`, `analyze()`; get `build()`, `query()`, `pack()`, `stats()` for free |
201
+
202
+ ### `kg_utils.snapshots`
203
+
204
+ | Class | Description |
205
+ |---|---|
206
+ | `Snapshot` | Temporal snapshot keyed by git tree hash with metrics and deltas |
207
+ | `SnapshotManager` | Capture, persist, load, list, diff, and prune snapshots |
208
+ | `SnapshotManifest` | Fast-lookup index with format versioning |
209
+
210
+ ---
211
+
212
+ ## Project Structure
213
+
214
+ ```
215
+ KG_utils/
216
+ ├── pyproject.toml
217
+ ├── src/
218
+ │ └── kg_utils/
219
+ │ ├── __init__.py
220
+ │ ├── specs.py # NodeSpec, EdgeSpec, BuildStats, QueryResult, SnippetPack
221
+ │ ├── extractor.py # KGExtractor ABC
222
+ │ ├── store.py # GraphStore (SQLite)
223
+ │ ├── semantic.py # SemanticIndex, SentenceTransformerEmbedder, SeedHit
224
+ │ ├── pipeline.py # KGModule concrete base class
225
+ │ ├── module.py # Re-export shim
226
+ │ ├── embed.py # Embedder protocol, model registry
227
+ │ ├── embedder.py # SentenceTransformerEmbedder factory functions
228
+ │ └── snapshots/
229
+ │ ├── __init__.py
230
+ │ ├── models.py # Snapshot, SnapshotManifest, PruneResult
231
+ │ └── manager.py # SnapshotManager
232
+ └── tests/
233
+ ├── test_store.py # GraphStore unit tests
234
+ ├── test_pipeline_utils.py # Pipeline utility function tests
235
+ ├── test_pipeline_module.py # End-to-end integration tests (--integration)
236
+ ├── test_types.py # Spec dataclass and KGExtractor tests
237
+ ├── test_snapshots.py # Snapshot lifecycle tests
238
+ └── test_integration.py # Cross-module integration tests
239
+ ```
240
+
241
+ ---
242
+
243
+ ## Development
244
+
245
+ ```bash
246
+ git clone https://github.com/Flux-Frontiers/KG_utils.git
247
+ cd KG_utils
248
+ poetry install --with dev
249
+ ```
250
+
251
+ Run the fast test suite (no model downloads):
252
+
253
+ ```bash
254
+ poetry run pytest -m "not integration"
255
+ ```
256
+
257
+ Run all tests including semantic/integration (requires `[semantic]` extra):
258
+
259
+ ```bash
260
+ poetry run pytest
261
+ ```
262
+
263
+ ---
264
+
265
+ ## License
266
+
267
+ [Elastic License 2.0](https://www.elastic.co/licensing/elastic-license) — see [LICENSE](LICENSE).
268
+
269
+ Free to use, modify, and distribute, subject to the license's limitations — most notably, you may not offer the software to third parties as a hosted or managed service. Internal commercial use is permitted.
270
+
@@ -0,0 +1,245 @@
1
+
2
+ [![Python](https://img.shields.io/badge/python-3.12%20%7C%203.13-blue.svg)](https://www.python.org/)
3
+ [![License: Elastic-2.0](https://img.shields.io/badge/License-Elastic%202.0-blue.svg)](https://www.elastic.co/licensing/elastic-license)
4
+ [![Version](https://img.shields.io/badge/version-0.3.1-blue.svg)](https://github.com/Flux-Frontiers/KG_utils/releases)
5
+ [![CI](https://github.com/Flux-Frontiers/KG_utils/actions/workflows/ci.yml/badge.svg)](https://github.com/Flux-Frontiers/KG_utils/actions/workflows/ci.yml)
6
+ [![Poetry](https://img.shields.io/endpoint?url=https://python-poetry.org/badge/v0.json)](https://python-poetry.org/)
7
+
8
+ # kgmodule-utils
9
+
10
+ **kgmodule-utils** — Shared graph store, semantic index, pipeline base, and snapshot infrastructure for the KGModule SDK.
11
+
12
+ *Author: Eric G. Suchanek, PhD*
13
+
14
+ *Flux-Frontiers, Liberty TWP, OH*
15
+
16
+ ---
17
+
18
+ ## Overview
19
+
20
+ kgmodule-utils is the **shared SDK layer** for the Flux-Frontiers knowledge-graph ecosystem. It provides everything a domain KG module needs — from type abstractions and SQLite graph storage through LanceDB vector indexing and a full build/query/pack pipeline — so domain authors implement only what is specific to their source domain.
21
+
22
+ Every KGModule implementation — [PyCodeKG](https://github.com/Flux-Frontiers/pycode_kg), [DocKG](https://github.com/Flux-Frontiers/doc_kg), and others — subclasses `KGModule` from here and implements exactly three methods: `make_extractor()`, `kind()`, and `analyze()`.
23
+
24
+ ---
25
+
26
+ ## Features
27
+
28
+ - **`kg_utils.specs`** — `NodeSpec`, `EdgeSpec`, `BuildStats`, `QueryResult`, `SnippetPack` dataclasses
29
+ - **`kg_utils.extractor`** — `KGExtractor` ABC: `extract()`, `node_kinds()`, `edge_kinds()`, `coverage_metric()`
30
+ - **`kg_utils.store`** — `GraphStore`: SQLite-backed node/edge store with BFS expansion, symbol resolution, caller lookup, and provenance recording
31
+ - **`kg_utils.semantic`** — `SemanticIndex` (LanceDB), `SentenceTransformerEmbedder`, `SeedHit`, model registry, `resolve_model_path()`
32
+ - **`kg_utils.pipeline`** — `KGModule`: full build → query → pack pipeline base with hybrid semantic + lexical reranking and snippet extraction
33
+ - **`kg_utils.embedder`** — `get_embedder()`, `wrap_embedder()`, `load_sentence_transformer()` factory functions
34
+ - **`kg_utils.embed`** — `Embedder` protocol, `DEFAULT_MODEL`, `KNOWN_MODELS`, `resolve_model_path()`
35
+ - **`kg_utils.snapshots`** — `Snapshot`, `SnapshotManager`, `SnapshotManifest` for temporal metric tracking
36
+
37
+ ---
38
+
39
+ ## Installation
40
+
41
+ **Requirements:** Python ≥ 3.12, < 3.14
42
+
43
+ ### Core only (stdlib, no optional deps)
44
+
45
+ ```bash
46
+ pip install kgmodule-utils
47
+ ```
48
+
49
+ ### With semantic search (LanceDB + sentence-transformers)
50
+
51
+ ```bash
52
+ pip install 'kgmodule-utils[semantic]'
53
+ ```
54
+
55
+ ### In a Poetry project
56
+
57
+ ```toml
58
+ [tool.poetry.dependencies]
59
+ kgmodule-utils = { version = ">=0.3.1", extras = ["semantic"] }
60
+ ```
61
+
62
+ ---
63
+
64
+ ## Quick Start
65
+
66
+ ### Build a domain KG module
67
+
68
+ ```python
69
+ from collections.abc import Iterator
70
+ from pathlib import Path
71
+
72
+ from kg_utils.extractor import KGExtractor
73
+ from kg_utils.pipeline import KGModule
74
+ from kg_utils.specs import EdgeSpec, NodeSpec
75
+
76
+
77
+ class MyExtractor(KGExtractor):
78
+ def node_kinds(self) -> list[str]:
79
+ return ["document", "section"]
80
+
81
+ def edge_kinds(self) -> list[str]:
82
+ return ["CONTAINS"]
83
+
84
+ def meaningful_node_kinds(self) -> list[str]:
85
+ return ["section"]
86
+
87
+ def extract(self) -> Iterator[NodeSpec | EdgeSpec]:
88
+ for doc in self.repo_path.glob("**/*.md"):
89
+ doc_id = f"document:{doc}"
90
+ yield NodeSpec(node_id=doc_id, kind="document",
91
+ name=doc.stem, qualname=doc.stem,
92
+ source_path=str(doc))
93
+ # … yield sections and CONTAINS edges
94
+
95
+
96
+ class MyKG(KGModule):
97
+ _default_dir = ".mykg"
98
+
99
+ def make_extractor(self) -> KGExtractor:
100
+ return MyExtractor(self.repo_root)
101
+
102
+ def kind(self) -> str:
103
+ return "my"
104
+
105
+ def analyze(self) -> str:
106
+ s = self.stats()
107
+ return f"# MyKG\nnodes={s['total_nodes']}"
108
+
109
+
110
+ # Build and query
111
+ kg = MyKG("/path/to/repo")
112
+ kg.build(wipe=True)
113
+
114
+ result = kg.query("authentication flow", k=8, hop=1)
115
+ pack = kg.pack("error handling", max_nodes=10)
116
+ print(pack.to_markdown())
117
+ ```
118
+
119
+ ### Track metrics over time
120
+
121
+ ```python
122
+ from kg_utils.snapshots import SnapshotManager
123
+
124
+ mgr = SnapshotManager(".mykg/snapshots", package_name="my-kg")
125
+
126
+ snapshot = mgr.capture(
127
+ version="1.0.0",
128
+ branch="main",
129
+ graph_stats_dict=kg.stats(),
130
+ )
131
+ mgr.save_snapshot(snapshot)
132
+
133
+ snaps = mgr.list_snapshots(limit=5)
134
+ delta = mgr.diff_snapshots(snaps[-1]["key"], snaps[0]["key"])
135
+ ```
136
+
137
+ ---
138
+
139
+ ## API Reference
140
+
141
+ ### `kg_utils.specs`
142
+
143
+ | Class | Description |
144
+ |---|---|
145
+ | `NodeSpec` | Graph node: `node_id`, `kind`, `name`, `qualname`, `source_path`, `lineno`, `end_lineno`, `docstring`, `metadata` |
146
+ | `EdgeSpec` | Graph edge: `source_id`, `target_id`, `relation`, `weight`, `metadata` |
147
+ | `BuildStats` | Build result: node/edge counts, indexed rows, embedding dim |
148
+ | `QueryResult` | Query result: nodes, edges, seeds, hop, relevance metadata |
149
+ | `SnippetPack` | Pack result: nodes with snippets, `to_markdown()`, `to_json()`, `save()` |
150
+
151
+ ### `kg_utils.extractor`
152
+
153
+ | Class | Description |
154
+ |---|---|
155
+ | `KGExtractor` | ABC — implement `node_kinds()`, `edge_kinds()`, `extract()` |
156
+
157
+ ### `kg_utils.store`
158
+
159
+ | Class | Description |
160
+ |---|---|
161
+ | `GraphStore` | SQLite persistence: `write()`, `expand()`, `query_nodes()`, `resolve_symbols()`, `callers_of()`, `stats()` |
162
+
163
+ ### `kg_utils.semantic`
164
+
165
+ | Class / function | Description |
166
+ |---|---|
167
+ | `SemanticIndex` | LanceDB vector index: `build()`, `search()` |
168
+ | `SentenceTransformerEmbedder` | Local embedding via sentence-transformers |
169
+ | `resolve_model_path()` | Resolve model name / alias to local cache path |
170
+ | `suppress_ingestion_logging()` | Silence verbose HF / tqdm output during ingestion |
171
+
172
+ ### `kg_utils.pipeline`
173
+
174
+ | Class | Description |
175
+ |---|---|
176
+ | `KGModule` | Concrete base — implement `make_extractor()`, `kind()`, `analyze()`; get `build()`, `query()`, `pack()`, `stats()` for free |
177
+
178
+ ### `kg_utils.snapshots`
179
+
180
+ | Class | Description |
181
+ |---|---|
182
+ | `Snapshot` | Temporal snapshot keyed by git tree hash with metrics and deltas |
183
+ | `SnapshotManager` | Capture, persist, load, list, diff, and prune snapshots |
184
+ | `SnapshotManifest` | Fast-lookup index with format versioning |
185
+
186
+ ---
187
+
188
+ ## Project Structure
189
+
190
+ ```
191
+ KG_utils/
192
+ ├── pyproject.toml
193
+ ├── src/
194
+ │ └── kg_utils/
195
+ │ ├── __init__.py
196
+ │ ├── specs.py # NodeSpec, EdgeSpec, BuildStats, QueryResult, SnippetPack
197
+ │ ├── extractor.py # KGExtractor ABC
198
+ │ ├── store.py # GraphStore (SQLite)
199
+ │ ├── semantic.py # SemanticIndex, SentenceTransformerEmbedder, SeedHit
200
+ │ ├── pipeline.py # KGModule concrete base class
201
+ │ ├── module.py # Re-export shim
202
+ │ ├── embed.py # Embedder protocol, model registry
203
+ │ ├── embedder.py # SentenceTransformerEmbedder factory functions
204
+ │ └── snapshots/
205
+ │ ├── __init__.py
206
+ │ ├── models.py # Snapshot, SnapshotManifest, PruneResult
207
+ │ └── manager.py # SnapshotManager
208
+ └── tests/
209
+ ├── test_store.py # GraphStore unit tests
210
+ ├── test_pipeline_utils.py # Pipeline utility function tests
211
+ ├── test_pipeline_module.py # End-to-end integration tests (--integration)
212
+ ├── test_types.py # Spec dataclass and KGExtractor tests
213
+ ├── test_snapshots.py # Snapshot lifecycle tests
214
+ └── test_integration.py # Cross-module integration tests
215
+ ```
216
+
217
+ ---
218
+
219
+ ## Development
220
+
221
+ ```bash
222
+ git clone https://github.com/Flux-Frontiers/KG_utils.git
223
+ cd KG_utils
224
+ poetry install --with dev
225
+ ```
226
+
227
+ Run the fast test suite (no model downloads):
228
+
229
+ ```bash
230
+ poetry run pytest -m "not integration"
231
+ ```
232
+
233
+ Run all tests including semantic/integration (requires `[semantic]` extra):
234
+
235
+ ```bash
236
+ poetry run pytest
237
+ ```
238
+
239
+ ---
240
+
241
+ ## License
242
+
243
+ [Elastic License 2.0](https://www.elastic.co/licensing/elastic-license) — see [LICENSE](LICENSE).
244
+
245
+ Free to use, modify, and distribute, subject to the license's limitations — most notably, you may not offer the software to third parties as a hosted or managed service. Internal commercial use is permitted.
@@ -10,8 +10,8 @@ build-backend = "poetry.core.masonry.api"
10
10
 
11
11
  [project]
12
12
  name = "kgmodule-utils"
13
- version = "0.2.4"
14
- description = "Shared types and snapshot infrastructure for the KGModule SDK"
13
+ version = "0.3.1"
14
+ description = "Shared types, graph store, semantic index, and pipeline base for the KGModule SDK"
15
15
  readme = "README.md"
16
16
  license = { text = "Elastic-2.0" }
17
17
  authors = [
@@ -19,7 +19,7 @@ authors = [
19
19
  ]
20
20
  keywords = ["knowledge-graph", "kgmodule", "sdk", "types", "snapshots"]
21
21
  classifiers = [
22
- "Development Status :: 3 - Alpha",
22
+ "Development Status :: 4 - Beta",
23
23
  "Intended Audience :: Developers",
24
24
  "Programming Language :: Python :: 3",
25
25
  "Programming Language :: Python :: 3.12",
@@ -28,12 +28,28 @@ classifiers = [
28
28
  requires-python = ">=3.12,<3.14"
29
29
  dependencies = []
30
30
 
31
+ [project.optional-dependencies]
32
+ semantic = [
33
+ "lancedb>=0.19.0",
34
+ "numpy>=1.24.0",
35
+ "sentence-transformers>=5.4.1",
36
+ "torch>=2.5.1",
37
+ "transformers>=4.40.0,<4.57",
38
+ ]
39
+
31
40
  [project.urls]
32
41
  Repository = "https://github.com/Flux-Frontiers/kg_utils"
33
42
 
34
43
  [tool.poetry]
35
44
  packages = [{include = "kg_utils", from = "src"}]
36
45
 
46
+ [tool.poetry.group.kgdeps]
47
+ optional = true
48
+
49
+ [tool.poetry.group.kgdeps.dependencies]
50
+ pycode-kg = ">=0.18.1"
51
+ doc-kg = ">=0.15.2"
52
+
37
53
  [tool.poetry.group.dev]
38
54
  optional = true
39
55
 
@@ -72,6 +88,7 @@ module = [
72
88
  "sentence_transformers.*",
73
89
  "transformers.*",
74
90
  "numpy.*",
91
+ "lancedb",
75
92
  ]
76
93
  ignore_missing_imports = true
77
94
 
@@ -0,0 +1,20 @@
1
+ """kg_utils — Shared types, store, semantic index, and pipeline base for the KGModule SDK.
2
+
3
+ Sub-packages / modules:
4
+ kg_utils.specs — NodeSpec, EdgeSpec, BuildStats, QueryResult, SnippetPack.
5
+ kg_utils.extractor — KGExtractor abstract base class.
6
+ kg_utils.store — GraphStore: SQLite-backed authoritative node/edge store.
7
+ kg_utils.semantic — Embedder, SentenceTransformerEmbedder, SemanticIndex, SeedHit.
8
+ kg_utils.pipeline — KGModule: concrete base class with full build/query/pack pipeline.
9
+ kg_utils.snapshots — Snapshot, SnapshotManager, SnapshotManifest, etc.
10
+ kg_utils.embed — Embedder protocol, DEFAULT_MODEL, KNOWN_MODELS,
11
+ kg_model_cache_dir(), resolve_model_path().
12
+ kg_utils.embedder — Concrete SentenceTransformerEmbedder, get_embedder(),
13
+ wrap_embedder(), load_sentence_transformer().
14
+
15
+ Optional extras
16
+ ---------------
17
+ pip install 'kgmodule-utils[semantic]' # lancedb + sentence-transformers
18
+ """
19
+
20
+ __version__ = "0.3.1"
@@ -99,30 +99,47 @@ def load_sentence_transformer(model_name: str = DEFAULT_MODEL) -> Any:
99
99
  from transformers import logging as hf_logging # pylint: disable=import-outside-toplevel
100
100
 
101
101
  hf_logging.set_verbosity_error()
102
- hf_logging.disable_progress_bar() # TQDM_DISABLE alone misses transformers' _tqdm_active gate
103
- except ImportError:
102
+ # TQDM_DISABLE alone misses transformers' _tqdm_active gate
103
+ hf_logging.disable_progress_bar()
104
+ except (ImportError, ValueError):
104
105
  pass
105
106
 
106
107
  os.environ["TQDM_DISABLE"] = "1"
107
108
 
109
+ import torch # pylint: disable=import-outside-toplevel
110
+
111
+ if torch.cuda.is_available():
112
+ device = "cuda"
113
+ else:
114
+ try:
115
+ device = "mps" if torch.backends.mps.is_available() else "cpu"
116
+ except AttributeError:
117
+ device = "cpu"
118
+
108
119
  resolved = KNOWN_MODELS.get(model_name, model_name)
109
120
  trust_remote = "nomic-ai/" in resolved
110
121
  local_path = resolve_model_path(resolved)
111
122
 
112
123
  if local_path.exists():
113
- return SentenceTransformer(
124
+ model = SentenceTransformer(
114
125
  str(local_path),
115
126
  local_files_only=True,
116
127
  trust_remote_code=trust_remote,
128
+ device=device,
117
129
  )
118
- try:
119
- return SentenceTransformer(
120
- resolved,
121
- local_files_only=True,
122
- trust_remote_code=trust_remote,
123
- )
124
- except OSError:
125
- return SentenceTransformer(resolved, trust_remote_code=trust_remote)
130
+ else:
131
+ try:
132
+ model = SentenceTransformer(
133
+ resolved,
134
+ local_files_only=True,
135
+ trust_remote_code=trust_remote,
136
+ device=device,
137
+ )
138
+ except OSError:
139
+ model = SentenceTransformer(resolved, trust_remote_code=trust_remote, device=device)
140
+
141
+ model = model.to(device)
142
+ return model
126
143
 
127
144
 
128
145
  # ---------------------------------------------------------------------------
@@ -145,7 +162,7 @@ class SentenceTransformerEmbedder(Embedder):
145
162
 
146
163
  hf_logging.set_verbosity_error()
147
164
  hf_logging.disable_progress_bar()
148
- except ImportError:
165
+ except (ImportError, ValueError):
149
166
  pass
150
167
 
151
168
  _prev = os.environ.get("TQDM_DISABLE")
@@ -159,7 +176,7 @@ class SentenceTransformerEmbedder(Embedder):
159
176
  os.environ["TQDM_DISABLE"] = _prev
160
177
 
161
178
  self.model_name: str = KNOWN_MODELS.get(model_name, model_name)
162
- # ST ≥5.4 renamed to get_embedding_dimension; ≤5.3 only had get_sentence_embedding_dimension.
179
+ # ST ≥5.4 renamed it to get_embedding_dimension; ≤5.3 has get_sentence_embedding_dimension.
163
180
  _dim_fn = getattr(self.model, "get_embedding_dimension", None) or getattr(
164
181
  self.model, "get_sentence_embedding_dimension", None
165
182
  )