kgmodule-utils 0.2.4__tar.gz → 0.3.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kgmodule_utils-0.3.1/PKG-INFO +270 -0
- kgmodule_utils-0.3.1/README.md +245 -0
- {kgmodule_utils-0.2.4 → kgmodule_utils-0.3.1}/pyproject.toml +20 -3
- kgmodule_utils-0.3.1/src/kg_utils/__init__.py +20 -0
- {kgmodule_utils-0.2.4 → kgmodule_utils-0.3.1}/src/kg_utils/embedder.py +30 -13
- {kgmodule_utils-0.2.4/src/kg_utils/types → kgmodule_utils-0.3.1/src/kg_utils}/extractor.py +28 -20
- {kgmodule_utils-0.2.4/src/kg_utils/types → kgmodule_utils-0.3.1/src/kg_utils}/module.py +7 -3
- kgmodule_utils-0.3.1/src/kg_utils/pipeline.py +853 -0
- kgmodule_utils-0.3.1/src/kg_utils/semantic.py +452 -0
- kgmodule_utils-0.3.1/src/kg_utils/specs.py +286 -0
- kgmodule_utils-0.3.1/src/kg_utils/store.py +672 -0
- kgmodule_utils-0.2.4/PKG-INFO +0 -210
- kgmodule_utils-0.2.4/README.md +0 -191
- kgmodule_utils-0.2.4/src/kg_utils/__init__.py +0 -12
- kgmodule_utils-0.2.4/src/kg_utils/types/__init__.py +0 -14
- kgmodule_utils-0.2.4/src/kg_utils/types/specs.py +0 -90
- {kgmodule_utils-0.2.4 → kgmodule_utils-0.3.1}/LICENSE +0 -0
- {kgmodule_utils-0.2.4 → kgmodule_utils-0.3.1}/src/kg_utils/embed.py +0 -0
- {kgmodule_utils-0.2.4 → kgmodule_utils-0.3.1}/src/kg_utils/py.typed +0 -0
- {kgmodule_utils-0.2.4 → kgmodule_utils-0.3.1}/src/kg_utils/snapshots/__init__.py +0 -0
- {kgmodule_utils-0.2.4 → kgmodule_utils-0.3.1}/src/kg_utils/snapshots/manager.py +0 -0
- {kgmodule_utils-0.2.4 → kgmodule_utils-0.3.1}/src/kg_utils/snapshots/models.py +0 -0
|
@@ -0,0 +1,270 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: kgmodule-utils
|
|
3
|
+
Version: 0.3.1
|
|
4
|
+
Summary: Shared types, graph store, semantic index, and pipeline base for the KGModule SDK
|
|
5
|
+
License: Elastic-2.0
|
|
6
|
+
License-File: LICENSE
|
|
7
|
+
Keywords: knowledge-graph,kgmodule,sdk,types,snapshots
|
|
8
|
+
Author: Eric G. Suchanek, PhD
|
|
9
|
+
Author-email: suchanek@flux-frontiers.com
|
|
10
|
+
Requires-Python: >=3.12,<3.14
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
16
|
+
Provides-Extra: semantic
|
|
17
|
+
Requires-Dist: lancedb (>=0.19.0) ; extra == "semantic"
|
|
18
|
+
Requires-Dist: numpy (>=1.24.0) ; extra == "semantic"
|
|
19
|
+
Requires-Dist: sentence-transformers (>=5.4.1) ; extra == "semantic"
|
|
20
|
+
Requires-Dist: torch (>=2.5.1) ; extra == "semantic"
|
|
21
|
+
Requires-Dist: transformers (>=4.40.0,<4.57) ; extra == "semantic"
|
|
22
|
+
Project-URL: Repository, https://github.com/Flux-Frontiers/kg_utils
|
|
23
|
+
Description-Content-Type: text/markdown
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
[Python](https://www.python.org/)
|
|
27
|
+
[License: Elastic-2.0](https://www.elastic.co/licensing/elastic-license)
|
|
28
|
+
[Releases](https://github.com/Flux-Frontiers/KG_utils/releases)
|
|
29
|
+
[CI](https://github.com/Flux-Frontiers/KG_utils/actions/workflows/ci.yml)
|
|
30
|
+
[Poetry](https://python-poetry.org/)
|
|
31
|
+
|
|
32
|
+
# kgmodule-utils
|
|
33
|
+
|
|
34
|
+
**kgmodule-utils** — Shared graph store, semantic index, pipeline base, and snapshot infrastructure for the KGModule SDK.
|
|
35
|
+
|
|
36
|
+
*Author: Eric G. Suchanek, PhD*
|
|
37
|
+
|
|
38
|
+
*Flux-Frontiers, Liberty TWP, OH*
|
|
39
|
+
|
|
40
|
+
---
|
|
41
|
+
|
|
42
|
+
## Overview
|
|
43
|
+
|
|
44
|
+
kgmodule-utils is the **shared SDK layer** for the Flux-Frontiers knowledge-graph ecosystem. It provides everything a domain KG module needs — from type abstractions and SQLite graph storage through LanceDB vector indexing and a full build/query/pack pipeline — so domain authors implement only what is specific to their source domain.
|
|
45
|
+
|
|
46
|
+
Every KGModule implementation — [PyCodeKG](https://github.com/Flux-Frontiers/pycode_kg), [DocKG](https://github.com/Flux-Frontiers/doc_kg), and others — subclasses `KGModule` from here and implements exactly three methods: `make_extractor()`, `kind()`, and `analyze()`.
|
|
47
|
+
|
|
48
|
+
---
|
|
49
|
+
|
|
50
|
+
## Features
|
|
51
|
+
|
|
52
|
+
- **`kg_utils.specs`** — `NodeSpec`, `EdgeSpec`, `BuildStats`, `QueryResult`, `SnippetPack` dataclasses
|
|
53
|
+
- **`kg_utils.extractor`** — `KGExtractor` ABC: `extract()`, `node_kinds()`, `edge_kinds()`, `coverage_metric()`
|
|
54
|
+
- **`kg_utils.store`** — `GraphStore`: SQLite-backed node/edge store with BFS expansion, symbol resolution, caller lookup, and provenance recording
|
|
55
|
+
- **`kg_utils.semantic`** — `SemanticIndex` (LanceDB), `SentenceTransformerEmbedder`, `SeedHit`, model registry, `resolve_model_path()`
|
|
56
|
+
- **`kg_utils.pipeline`** — `KGModule`: full build → query → pack pipeline base with hybrid semantic + lexical reranking and snippet extraction
|
|
57
|
+
- **`kg_utils.embedder`** — `get_embedder()`, `wrap_embedder()`, `load_sentence_transformer()` factory functions
|
|
58
|
+
- **`kg_utils.embed`** — `Embedder` protocol, `DEFAULT_MODEL`, `KNOWN_MODELS`, `resolve_model_path()`
|
|
59
|
+
- **`kg_utils.snapshots`** — `Snapshot`, `SnapshotManager`, `SnapshotManifest` for temporal metric tracking
|
|
60
|
+
|
|
61
|
+
---
|
|
62
|
+
|
|
63
|
+
## Installation
|
|
64
|
+
|
|
65
|
+
**Requirements:** Python ≥ 3.12, < 3.14
|
|
66
|
+
|
|
67
|
+
### Core only (stdlib, no optional deps)
|
|
68
|
+
|
|
69
|
+
```bash
|
|
70
|
+
pip install kgmodule-utils
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
### With semantic search (LanceDB + sentence-transformers)
|
|
74
|
+
|
|
75
|
+
```bash
|
|
76
|
+
pip install 'kgmodule-utils[semantic]'
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
### In a Poetry project
|
|
80
|
+
|
|
81
|
+
```toml
|
|
82
|
+
[tool.poetry.dependencies]
|
|
83
|
+
kgmodule-utils = { version = ">=0.3.1", extras = ["semantic"] }
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
---
|
|
87
|
+
|
|
88
|
+
## Quick Start
|
|
89
|
+
|
|
90
|
+
### Build a domain KG module
|
|
91
|
+
|
|
92
|
+
```python
|
|
93
|
+
from collections.abc import Iterator
|
|
94
|
+
from pathlib import Path
|
|
95
|
+
|
|
96
|
+
from kg_utils.extractor import KGExtractor
|
|
97
|
+
from kg_utils.pipeline import KGModule
|
|
98
|
+
from kg_utils.specs import EdgeSpec, NodeSpec
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
class MyExtractor(KGExtractor):
|
|
102
|
+
def node_kinds(self) -> list[str]:
|
|
103
|
+
return ["document", "section"]
|
|
104
|
+
|
|
105
|
+
def edge_kinds(self) -> list[str]:
|
|
106
|
+
return ["CONTAINS"]
|
|
107
|
+
|
|
108
|
+
def meaningful_node_kinds(self) -> list[str]:
|
|
109
|
+
return ["section"]
|
|
110
|
+
|
|
111
|
+
def extract(self) -> Iterator[NodeSpec | EdgeSpec]:
|
|
112
|
+
for doc in self.repo_path.glob("**/*.md"):
|
|
113
|
+
doc_id = f"document:{doc}"
|
|
114
|
+
yield NodeSpec(node_id=doc_id, kind="document",
|
|
115
|
+
name=doc.stem, qualname=doc.stem,
|
|
116
|
+
source_path=str(doc))
|
|
117
|
+
# … yield sections and CONTAINS edges
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
class MyKG(KGModule):
|
|
121
|
+
_default_dir = ".mykg"
|
|
122
|
+
|
|
123
|
+
def make_extractor(self) -> KGExtractor:
|
|
124
|
+
return MyExtractor(self.repo_root)
|
|
125
|
+
|
|
126
|
+
def kind(self) -> str:
|
|
127
|
+
return "my"
|
|
128
|
+
|
|
129
|
+
def analyze(self) -> str:
|
|
130
|
+
s = self.stats()
|
|
131
|
+
return f"# MyKG\nnodes={s['total_nodes']}"
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
# Build and query
|
|
135
|
+
kg = MyKG("/path/to/repo")
|
|
136
|
+
kg.build(wipe=True)
|
|
137
|
+
|
|
138
|
+
result = kg.query("authentication flow", k=8, hop=1)
|
|
139
|
+
pack = kg.pack("error handling", max_nodes=10)
|
|
140
|
+
print(pack.to_markdown())
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
### Track metrics over time
|
|
144
|
+
|
|
145
|
+
```python
|
|
146
|
+
from kg_utils.snapshots import SnapshotManager
|
|
147
|
+
|
|
148
|
+
mgr = SnapshotManager(".mykg/snapshots", package_name="my-kg")
|
|
149
|
+
|
|
150
|
+
snapshot = mgr.capture(
|
|
151
|
+
version="1.0.0",
|
|
152
|
+
branch="main",
|
|
153
|
+
graph_stats_dict=kg.stats(),
|
|
154
|
+
)
|
|
155
|
+
mgr.save_snapshot(snapshot)
|
|
156
|
+
|
|
157
|
+
snaps = mgr.list_snapshots(limit=5)
|
|
158
|
+
delta = mgr.diff_snapshots(snaps[-1]["key"], snaps[0]["key"])
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
---
|
|
162
|
+
|
|
163
|
+
## API Reference
|
|
164
|
+
|
|
165
|
+
### `kg_utils.specs`
|
|
166
|
+
|
|
167
|
+
| Class | Description |
|
|
168
|
+
|---|---|
|
|
169
|
+
| `NodeSpec` | Graph node: `node_id`, `kind`, `name`, `qualname`, `source_path`, `lineno`, `end_lineno`, `docstring`, `metadata` |
|
|
170
|
+
| `EdgeSpec` | Graph edge: `source_id`, `target_id`, `relation`, `weight`, `metadata` |
|
|
171
|
+
| `BuildStats` | Build result: node/edge counts, indexed rows, embedding dim |
|
|
172
|
+
| `QueryResult` | Query result: nodes, edges, seeds, hop, relevance metadata |
|
|
173
|
+
| `SnippetPack` | Pack result: nodes with snippets, `to_markdown()`, `to_json()`, `save()` |
|
|
174
|
+
|
|
175
|
+
### `kg_utils.extractor`
|
|
176
|
+
|
|
177
|
+
| Class | Description |
|
|
178
|
+
|---|---|
|
|
179
|
+
| `KGExtractor` | ABC — implement `node_kinds()`, `edge_kinds()`, `extract()` |
|
|
180
|
+
|
|
181
|
+
### `kg_utils.store`
|
|
182
|
+
|
|
183
|
+
| Class | Description |
|
|
184
|
+
|---|---|
|
|
185
|
+
| `GraphStore` | SQLite persistence: `write()`, `expand()`, `query_nodes()`, `resolve_symbols()`, `callers_of()`, `stats()` |
|
|
186
|
+
|
|
187
|
+
### `kg_utils.semantic`
|
|
188
|
+
|
|
189
|
+
| Class / function | Description |
|
|
190
|
+
|---|---|
|
|
191
|
+
| `SemanticIndex` | LanceDB vector index: `build()`, `search()` |
|
|
192
|
+
| `SentenceTransformerEmbedder` | Local embedding via sentence-transformers |
|
|
193
|
+
| `resolve_model_path()` | Resolve model name / alias to local cache path |
|
|
194
|
+
| `suppress_ingestion_logging()` | Silence verbose HF / tqdm output during ingestion |
|
|
195
|
+
|
|
196
|
+
### `kg_utils.pipeline`
|
|
197
|
+
|
|
198
|
+
| Class | Description |
|
|
199
|
+
|---|---|
|
|
200
|
+
| `KGModule` | Concrete base — implement `make_extractor()`, `kind()`, `analyze()`; get `build()`, `query()`, `pack()`, `stats()` for free |
|
|
201
|
+
|
|
202
|
+
### `kg_utils.snapshots`
|
|
203
|
+
|
|
204
|
+
| Class | Description |
|
|
205
|
+
|---|---|
|
|
206
|
+
| `Snapshot` | Temporal snapshot keyed by git tree hash with metrics and deltas |
|
|
207
|
+
| `SnapshotManager` | Capture, persist, load, list, diff, and prune snapshots |
|
|
208
|
+
| `SnapshotManifest` | Fast-lookup index with format versioning |
|
|
209
|
+
|
|
210
|
+
---
|
|
211
|
+
|
|
212
|
+
## Project Structure
|
|
213
|
+
|
|
214
|
+
```
|
|
215
|
+
KG_utils/
|
|
216
|
+
├── pyproject.toml
|
|
217
|
+
├── src/
|
|
218
|
+
│ └── kg_utils/
|
|
219
|
+
│ ├── __init__.py
|
|
220
|
+
│ ├── specs.py # NodeSpec, EdgeSpec, BuildStats, QueryResult, SnippetPack
|
|
221
|
+
│ ├── extractor.py # KGExtractor ABC
|
|
222
|
+
│ ├── store.py # GraphStore (SQLite)
|
|
223
|
+
│ ├── semantic.py # SemanticIndex, SentenceTransformerEmbedder, SeedHit
|
|
224
|
+
│ ├── pipeline.py # KGModule concrete base class
|
|
225
|
+
│ ├── module.py # Re-export shim
|
|
226
|
+
│ ├── embed.py # Embedder protocol, model registry
|
|
227
|
+
│ ├── embedder.py # SentenceTransformerEmbedder factory functions
|
|
228
|
+
│ └── snapshots/
|
|
229
|
+
│ ├── __init__.py
|
|
230
|
+
│ ├── models.py # Snapshot, SnapshotManifest, PruneResult
|
|
231
|
+
│ └── manager.py # SnapshotManager
|
|
232
|
+
└── tests/
|
|
233
|
+
├── test_store.py # GraphStore unit tests
|
|
234
|
+
├── test_pipeline_utils.py # Pipeline utility function tests
|
|
235
|
+
├── test_pipeline_module.py # End-to-end integration tests (--integration)
|
|
236
|
+
├── test_types.py # Spec dataclass and KGExtractor tests
|
|
237
|
+
├── test_snapshots.py # Snapshot lifecycle tests
|
|
238
|
+
└── test_integration.py # Cross-module integration tests
|
|
239
|
+
```
|
|
240
|
+
|
|
241
|
+
---
|
|
242
|
+
|
|
243
|
+
## Development
|
|
244
|
+
|
|
245
|
+
```bash
|
|
246
|
+
git clone https://github.com/Flux-Frontiers/KG_utils.git
|
|
247
|
+
cd KG_utils
|
|
248
|
+
poetry install --with dev
|
|
249
|
+
```
|
|
250
|
+
|
|
251
|
+
Run the fast test suite (no model downloads):
|
|
252
|
+
|
|
253
|
+
```bash
|
|
254
|
+
poetry run pytest -m "not integration"
|
|
255
|
+
```
|
|
256
|
+
|
|
257
|
+
Run all tests including semantic/integration (requires `[semantic]` extra):
|
|
258
|
+
|
|
259
|
+
```bash
|
|
260
|
+
poetry run pytest
|
|
261
|
+
```
|
|
262
|
+
|
|
263
|
+
---
|
|
264
|
+
|
|
265
|
+
## License
|
|
266
|
+
|
|
267
|
+
[Elastic License 2.0](https://www.elastic.co/licensing/elastic-license) — see [LICENSE](LICENSE).
|
|
268
|
+
|
|
269
|
+
Free to use, modify, and distribute. You may not offer the software as a hosted or managed service to third parties. Internal commercial use is permitted.
|
|
270
|
+
|
|
@@ -0,0 +1,245 @@
|
|
|
1
|
+
|
|
2
|
+
[Python](https://www.python.org/)
|
|
3
|
+
[License: Elastic-2.0](https://www.elastic.co/licensing/elastic-license)
|
|
4
|
+
[Releases](https://github.com/Flux-Frontiers/KG_utils/releases)
|
|
5
|
+
[CI](https://github.com/Flux-Frontiers/KG_utils/actions/workflows/ci.yml)
|
|
6
|
+
[Poetry](https://python-poetry.org/)
|
|
7
|
+
|
|
8
|
+
# kgmodule-utils
|
|
9
|
+
|
|
10
|
+
**kgmodule-utils** — Shared graph store, semantic index, pipeline base, and snapshot infrastructure for the KGModule SDK.
|
|
11
|
+
|
|
12
|
+
*Author: Eric G. Suchanek, PhD*
|
|
13
|
+
|
|
14
|
+
*Flux-Frontiers, Liberty TWP, OH*
|
|
15
|
+
|
|
16
|
+
---
|
|
17
|
+
|
|
18
|
+
## Overview
|
|
19
|
+
|
|
20
|
+
kgmodule-utils is the **shared SDK layer** for the Flux-Frontiers knowledge-graph ecosystem. It provides everything a domain KG module needs — from type abstractions and SQLite graph storage through LanceDB vector indexing and a full build/query/pack pipeline — so domain authors implement only what is specific to their source domain.
|
|
21
|
+
|
|
22
|
+
Every KGModule implementation — [PyCodeKG](https://github.com/Flux-Frontiers/pycode_kg), [DocKG](https://github.com/Flux-Frontiers/doc_kg), and others — subclasses `KGModule` from here and implements exactly three methods: `make_extractor()`, `kind()`, and `analyze()`.
|
|
23
|
+
|
|
24
|
+
---
|
|
25
|
+
|
|
26
|
+
## Features
|
|
27
|
+
|
|
28
|
+
- **`kg_utils.specs`** — `NodeSpec`, `EdgeSpec`, `BuildStats`, `QueryResult`, `SnippetPack` dataclasses
|
|
29
|
+
- **`kg_utils.extractor`** — `KGExtractor` ABC: `extract()`, `node_kinds()`, `edge_kinds()`, `coverage_metric()`
|
|
30
|
+
- **`kg_utils.store`** — `GraphStore`: SQLite-backed node/edge store with BFS expansion, symbol resolution, caller lookup, and provenance recording
|
|
31
|
+
- **`kg_utils.semantic`** — `SemanticIndex` (LanceDB), `SentenceTransformerEmbedder`, `SeedHit`, model registry, `resolve_model_path()`
|
|
32
|
+
- **`kg_utils.pipeline`** — `KGModule`: full build → query → pack pipeline base with hybrid semantic + lexical reranking and snippet extraction
|
|
33
|
+
- **`kg_utils.embedder`** — `get_embedder()`, `wrap_embedder()`, `load_sentence_transformer()` factory functions
|
|
34
|
+
- **`kg_utils.embed`** — `Embedder` protocol, `DEFAULT_MODEL`, `KNOWN_MODELS`, `resolve_model_path()`
|
|
35
|
+
- **`kg_utils.snapshots`** — `Snapshot`, `SnapshotManager`, `SnapshotManifest` for temporal metric tracking
|
|
36
|
+
|
|
37
|
+
---
|
|
38
|
+
|
|
39
|
+
## Installation
|
|
40
|
+
|
|
41
|
+
**Requirements:** Python ≥ 3.12, < 3.14
|
|
42
|
+
|
|
43
|
+
### Core only (stdlib, no optional deps)
|
|
44
|
+
|
|
45
|
+
```bash
|
|
46
|
+
pip install kgmodule-utils
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
### With semantic search (LanceDB + sentence-transformers)
|
|
50
|
+
|
|
51
|
+
```bash
|
|
52
|
+
pip install 'kgmodule-utils[semantic]'
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
### In a Poetry project
|
|
56
|
+
|
|
57
|
+
```toml
|
|
58
|
+
[tool.poetry.dependencies]
|
|
59
|
+
kgmodule-utils = { version = ">=0.3.1", extras = ["semantic"] }
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
---
|
|
63
|
+
|
|
64
|
+
## Quick Start
|
|
65
|
+
|
|
66
|
+
### Build a domain KG module
|
|
67
|
+
|
|
68
|
+
```python
|
|
69
|
+
from collections.abc import Iterator
|
|
70
|
+
from pathlib import Path
|
|
71
|
+
|
|
72
|
+
from kg_utils.extractor import KGExtractor
|
|
73
|
+
from kg_utils.pipeline import KGModule
|
|
74
|
+
from kg_utils.specs import EdgeSpec, NodeSpec
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
class MyExtractor(KGExtractor):
|
|
78
|
+
def node_kinds(self) -> list[str]:
|
|
79
|
+
return ["document", "section"]
|
|
80
|
+
|
|
81
|
+
def edge_kinds(self) -> list[str]:
|
|
82
|
+
return ["CONTAINS"]
|
|
83
|
+
|
|
84
|
+
def meaningful_node_kinds(self) -> list[str]:
|
|
85
|
+
return ["section"]
|
|
86
|
+
|
|
87
|
+
def extract(self) -> Iterator[NodeSpec | EdgeSpec]:
|
|
88
|
+
for doc in self.repo_path.glob("**/*.md"):
|
|
89
|
+
doc_id = f"document:{doc}"
|
|
90
|
+
yield NodeSpec(node_id=doc_id, kind="document",
|
|
91
|
+
name=doc.stem, qualname=doc.stem,
|
|
92
|
+
source_path=str(doc))
|
|
93
|
+
# … yield sections and CONTAINS edges
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
class MyKG(KGModule):
|
|
97
|
+
_default_dir = ".mykg"
|
|
98
|
+
|
|
99
|
+
def make_extractor(self) -> KGExtractor:
|
|
100
|
+
return MyExtractor(self.repo_root)
|
|
101
|
+
|
|
102
|
+
def kind(self) -> str:
|
|
103
|
+
return "my"
|
|
104
|
+
|
|
105
|
+
def analyze(self) -> str:
|
|
106
|
+
s = self.stats()
|
|
107
|
+
return f"# MyKG\nnodes={s['total_nodes']}"
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
# Build and query
|
|
111
|
+
kg = MyKG("/path/to/repo")
|
|
112
|
+
kg.build(wipe=True)
|
|
113
|
+
|
|
114
|
+
result = kg.query("authentication flow", k=8, hop=1)
|
|
115
|
+
pack = kg.pack("error handling", max_nodes=10)
|
|
116
|
+
print(pack.to_markdown())
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
### Track metrics over time
|
|
120
|
+
|
|
121
|
+
```python
|
|
122
|
+
from kg_utils.snapshots import SnapshotManager
|
|
123
|
+
|
|
124
|
+
mgr = SnapshotManager(".mykg/snapshots", package_name="my-kg")
|
|
125
|
+
|
|
126
|
+
snapshot = mgr.capture(
|
|
127
|
+
version="1.0.0",
|
|
128
|
+
branch="main",
|
|
129
|
+
graph_stats_dict=kg.stats(),
|
|
130
|
+
)
|
|
131
|
+
mgr.save_snapshot(snapshot)
|
|
132
|
+
|
|
133
|
+
snaps = mgr.list_snapshots(limit=5)
|
|
134
|
+
delta = mgr.diff_snapshots(snaps[-1]["key"], snaps[0]["key"])
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
---
|
|
138
|
+
|
|
139
|
+
## API Reference
|
|
140
|
+
|
|
141
|
+
### `kg_utils.specs`
|
|
142
|
+
|
|
143
|
+
| Class | Description |
|
|
144
|
+
|---|---|
|
|
145
|
+
| `NodeSpec` | Graph node: `node_id`, `kind`, `name`, `qualname`, `source_path`, `lineno`, `end_lineno`, `docstring`, `metadata` |
|
|
146
|
+
| `EdgeSpec` | Graph edge: `source_id`, `target_id`, `relation`, `weight`, `metadata` |
|
|
147
|
+
| `BuildStats` | Build result: node/edge counts, indexed rows, embedding dim |
|
|
148
|
+
| `QueryResult` | Query result: nodes, edges, seeds, hop, relevance metadata |
|
|
149
|
+
| `SnippetPack` | Pack result: nodes with snippets, `to_markdown()`, `to_json()`, `save()` |
|
|
150
|
+
|
|
151
|
+
### `kg_utils.extractor`
|
|
152
|
+
|
|
153
|
+
| Class | Description |
|
|
154
|
+
|---|---|
|
|
155
|
+
| `KGExtractor` | ABC — implement `node_kinds()`, `edge_kinds()`, `extract()` |
|
|
156
|
+
|
|
157
|
+
### `kg_utils.store`
|
|
158
|
+
|
|
159
|
+
| Class | Description |
|
|
160
|
+
|---|---|
|
|
161
|
+
| `GraphStore` | SQLite persistence: `write()`, `expand()`, `query_nodes()`, `resolve_symbols()`, `callers_of()`, `stats()` |
|
|
162
|
+
|
|
163
|
+
### `kg_utils.semantic`
|
|
164
|
+
|
|
165
|
+
| Class / function | Description |
|
|
166
|
+
|---|---|
|
|
167
|
+
| `SemanticIndex` | LanceDB vector index: `build()`, `search()` |
|
|
168
|
+
| `SentenceTransformerEmbedder` | Local embedding via sentence-transformers |
|
|
169
|
+
| `resolve_model_path()` | Resolve model name / alias to local cache path |
|
|
170
|
+
| `suppress_ingestion_logging()` | Silence verbose HF / tqdm output during ingestion |
|
|
171
|
+
|
|
172
|
+
### `kg_utils.pipeline`
|
|
173
|
+
|
|
174
|
+
| Class | Description |
|
|
175
|
+
|---|---|
|
|
176
|
+
| `KGModule` | Concrete base — implement `make_extractor()`, `kind()`, `analyze()`; get `build()`, `query()`, `pack()`, `stats()` for free |
|
|
177
|
+
|
|
178
|
+
### `kg_utils.snapshots`
|
|
179
|
+
|
|
180
|
+
| Class | Description |
|
|
181
|
+
|---|---|
|
|
182
|
+
| `Snapshot` | Temporal snapshot keyed by git tree hash with metrics and deltas |
|
|
183
|
+
| `SnapshotManager` | Capture, persist, load, list, diff, and prune snapshots |
|
|
184
|
+
| `SnapshotManifest` | Fast-lookup index with format versioning |
|
|
185
|
+
|
|
186
|
+
---
|
|
187
|
+
|
|
188
|
+
## Project Structure
|
|
189
|
+
|
|
190
|
+
```
|
|
191
|
+
KG_utils/
|
|
192
|
+
├── pyproject.toml
|
|
193
|
+
├── src/
|
|
194
|
+
│ └── kg_utils/
|
|
195
|
+
│ ├── __init__.py
|
|
196
|
+
│ ├── specs.py # NodeSpec, EdgeSpec, BuildStats, QueryResult, SnippetPack
|
|
197
|
+
│ ├── extractor.py # KGExtractor ABC
|
|
198
|
+
│ ├── store.py # GraphStore (SQLite)
|
|
199
|
+
│ ├── semantic.py # SemanticIndex, SentenceTransformerEmbedder, SeedHit
|
|
200
|
+
│ ├── pipeline.py # KGModule concrete base class
|
|
201
|
+
│ ├── module.py # Re-export shim
|
|
202
|
+
│ ├── embed.py # Embedder protocol, model registry
|
|
203
|
+
│ ├── embedder.py # SentenceTransformerEmbedder factory functions
|
|
204
|
+
│ └── snapshots/
|
|
205
|
+
│ ├── __init__.py
|
|
206
|
+
│ ├── models.py # Snapshot, SnapshotManifest, PruneResult
|
|
207
|
+
│ └── manager.py # SnapshotManager
|
|
208
|
+
└── tests/
|
|
209
|
+
├── test_store.py # GraphStore unit tests
|
|
210
|
+
├── test_pipeline_utils.py # Pipeline utility function tests
|
|
211
|
+
├── test_pipeline_module.py # End-to-end integration tests (--integration)
|
|
212
|
+
├── test_types.py # Spec dataclass and KGExtractor tests
|
|
213
|
+
├── test_snapshots.py # Snapshot lifecycle tests
|
|
214
|
+
└── test_integration.py # Cross-module integration tests
|
|
215
|
+
```
|
|
216
|
+
|
|
217
|
+
---
|
|
218
|
+
|
|
219
|
+
## Development
|
|
220
|
+
|
|
221
|
+
```bash
|
|
222
|
+
git clone https://github.com/Flux-Frontiers/KG_utils.git
|
|
223
|
+
cd KG_utils
|
|
224
|
+
poetry install --with dev
|
|
225
|
+
```
|
|
226
|
+
|
|
227
|
+
Run the fast test suite (no model downloads):
|
|
228
|
+
|
|
229
|
+
```bash
|
|
230
|
+
poetry run pytest -m "not integration"
|
|
231
|
+
```
|
|
232
|
+
|
|
233
|
+
Run all tests including semantic/integration (requires `[semantic]` extra):
|
|
234
|
+
|
|
235
|
+
```bash
|
|
236
|
+
poetry run pytest
|
|
237
|
+
```
|
|
238
|
+
|
|
239
|
+
---
|
|
240
|
+
|
|
241
|
+
## License
|
|
242
|
+
|
|
243
|
+
[Elastic License 2.0](https://www.elastic.co/licensing/elastic-license) — see [LICENSE](LICENSE).
|
|
244
|
+
|
|
245
|
+
Free to use, modify, and distribute. You may not offer the software as a hosted or managed service to third parties. Internal commercial use is permitted.
|
|
@@ -10,8 +10,8 @@ build-backend = "poetry.core.masonry.api"
|
|
|
10
10
|
|
|
11
11
|
[project]
|
|
12
12
|
name = "kgmodule-utils"
|
|
13
|
-
version = "0.2.4"
|
|
14
|
-
description = "Shared types and
|
|
13
|
+
version = "0.3.1"
|
|
14
|
+
description = "Shared types, graph store, semantic index, and pipeline base for the KGModule SDK"
|
|
15
15
|
readme = "README.md"
|
|
16
16
|
license = { text = "Elastic-2.0" }
|
|
17
17
|
authors = [
|
|
@@ -19,7 +19,7 @@ authors = [
|
|
|
19
19
|
]
|
|
20
20
|
keywords = ["knowledge-graph", "kgmodule", "sdk", "types", "snapshots"]
|
|
21
21
|
classifiers = [
|
|
22
|
-
"Development Status ::
|
|
22
|
+
"Development Status :: 4 - Beta",
|
|
23
23
|
"Intended Audience :: Developers",
|
|
24
24
|
"Programming Language :: Python :: 3",
|
|
25
25
|
"Programming Language :: Python :: 3.12",
|
|
@@ -28,12 +28,28 @@ classifiers = [
|
|
|
28
28
|
requires-python = ">=3.12,<3.14"
|
|
29
29
|
dependencies = []
|
|
30
30
|
|
|
31
|
+
[project.optional-dependencies]
|
|
32
|
+
semantic = [
|
|
33
|
+
"lancedb>=0.19.0",
|
|
34
|
+
"numpy>=1.24.0",
|
|
35
|
+
"sentence-transformers>=5.4.1",
|
|
36
|
+
"torch>=2.5.1",
|
|
37
|
+
"transformers>=4.40.0,<4.57",
|
|
38
|
+
]
|
|
39
|
+
|
|
31
40
|
[project.urls]
|
|
32
41
|
Repository = "https://github.com/Flux-Frontiers/kg_utils"
|
|
33
42
|
|
|
34
43
|
[tool.poetry]
|
|
35
44
|
packages = [{include = "kg_utils", from = "src"}]
|
|
36
45
|
|
|
46
|
+
[tool.poetry.group.kgdeps]
|
|
47
|
+
optional = true
|
|
48
|
+
|
|
49
|
+
[tool.poetry.group.kgdeps.dependencies]
|
|
50
|
+
pycode-kg = ">=0.18.1"
|
|
51
|
+
doc-kg = ">=0.15.2"
|
|
52
|
+
|
|
37
53
|
[tool.poetry.group.dev]
|
|
38
54
|
optional = true
|
|
39
55
|
|
|
@@ -72,6 +88,7 @@ module = [
|
|
|
72
88
|
"sentence_transformers.*",
|
|
73
89
|
"transformers.*",
|
|
74
90
|
"numpy.*",
|
|
91
|
+
"lancedb",
|
|
75
92
|
]
|
|
76
93
|
ignore_missing_imports = true
|
|
77
94
|
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
"""kg_utils — Shared types, store, semantic index, and pipeline base for the KGModule SDK.
|
|
2
|
+
|
|
3
|
+
Sub-packages / modules:
|
|
4
|
+
kg_utils.specs — NodeSpec, EdgeSpec, BuildStats, QueryResult, SnippetPack.
|
|
5
|
+
kg_utils.extractor — KGExtractor abstract base class.
|
|
6
|
+
kg_utils.store — GraphStore: SQLite-backed authoritative node/edge store.
|
|
7
|
+
kg_utils.semantic — Embedder, SentenceTransformerEmbedder, SemanticIndex, SeedHit.
|
|
8
|
+
kg_utils.pipeline — KGModule: concrete base class with full build/query/pack pipeline.
|
|
9
|
+
kg_utils.snapshots — Snapshot, SnapshotManager, SnapshotManifest, etc.
|
|
10
|
+
kg_utils.embed — Embedder protocol, DEFAULT_MODEL, KNOWN_MODELS,
|
|
11
|
+
kg_model_cache_dir(), resolve_model_path().
|
|
12
|
+
kg_utils.embedder — Concrete SentenceTransformerEmbedder, get_embedder(),
|
|
13
|
+
wrap_embedder(), load_sentence_transformer().
|
|
14
|
+
|
|
15
|
+
Optional extras
|
|
16
|
+
---------------
|
|
17
|
+
pip install 'kgmodule-utils[semantic]' # lancedb + sentence-transformers
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
__version__ = "0.3.1"
|
|
@@ -99,30 +99,47 @@ def load_sentence_transformer(model_name: str = DEFAULT_MODEL) -> Any:
|
|
|
99
99
|
from transformers import logging as hf_logging # pylint: disable=import-outside-toplevel
|
|
100
100
|
|
|
101
101
|
hf_logging.set_verbosity_error()
|
|
102
|
-
|
|
103
|
-
|
|
102
|
+
# TQDM_DISABLE alone misses transformers' _tqdm_active gate
|
|
103
|
+
hf_logging.disable_progress_bar()
|
|
104
|
+
except (ImportError, ValueError):
|
|
104
105
|
pass
|
|
105
106
|
|
|
106
107
|
os.environ["TQDM_DISABLE"] = "1"
|
|
107
108
|
|
|
109
|
+
import torch # pylint: disable=import-outside-toplevel
|
|
110
|
+
|
|
111
|
+
if torch.cuda.is_available():
|
|
112
|
+
device = "cuda"
|
|
113
|
+
else:
|
|
114
|
+
try:
|
|
115
|
+
device = "mps" if torch.backends.mps.is_available() else "cpu"
|
|
116
|
+
except AttributeError:
|
|
117
|
+
device = "cpu"
|
|
118
|
+
|
|
108
119
|
resolved = KNOWN_MODELS.get(model_name, model_name)
|
|
109
120
|
trust_remote = "nomic-ai/" in resolved
|
|
110
121
|
local_path = resolve_model_path(resolved)
|
|
111
122
|
|
|
112
123
|
if local_path.exists():
|
|
113
|
-
|
|
124
|
+
model = SentenceTransformer(
|
|
114
125
|
str(local_path),
|
|
115
126
|
local_files_only=True,
|
|
116
127
|
trust_remote_code=trust_remote,
|
|
128
|
+
device=device,
|
|
117
129
|
)
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
130
|
+
else:
|
|
131
|
+
try:
|
|
132
|
+
model = SentenceTransformer(
|
|
133
|
+
resolved,
|
|
134
|
+
local_files_only=True,
|
|
135
|
+
trust_remote_code=trust_remote,
|
|
136
|
+
device=device,
|
|
137
|
+
)
|
|
138
|
+
except OSError:
|
|
139
|
+
model = SentenceTransformer(resolved, trust_remote_code=trust_remote, device=device)
|
|
140
|
+
|
|
141
|
+
model = model.to(device)
|
|
142
|
+
return model
|
|
126
143
|
|
|
127
144
|
|
|
128
145
|
# ---------------------------------------------------------------------------
|
|
@@ -145,7 +162,7 @@ class SentenceTransformerEmbedder(Embedder):
|
|
|
145
162
|
|
|
146
163
|
hf_logging.set_verbosity_error()
|
|
147
164
|
hf_logging.disable_progress_bar()
|
|
148
|
-
except ImportError:
|
|
165
|
+
except (ImportError, ValueError):
|
|
149
166
|
pass
|
|
150
167
|
|
|
151
168
|
_prev = os.environ.get("TQDM_DISABLE")
|
|
@@ -159,7 +176,7 @@ class SentenceTransformerEmbedder(Embedder):
|
|
|
159
176
|
os.environ["TQDM_DISABLE"] = _prev
|
|
160
177
|
|
|
161
178
|
self.model_name: str = KNOWN_MODELS.get(model_name, model_name)
|
|
162
|
-
# ST ≥5.4 renamed
|
|
179
|
+
# ST ≥5.4 renamed this method to get_embedding_dimension; ≤5.3 called it get_sentence_embedding_dimension.
|
|
163
180
|
_dim_fn = getattr(self.model, "get_embedding_dimension", None) or getattr(
|
|
164
181
|
self.model, "get_sentence_embedding_dimension", None
|
|
165
182
|
)
|