kgmodule-utils 0.2.3__tar.gz → 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {kgmodule_utils-0.2.3 → kgmodule_utils-0.3.0}/PKG-INFO +9 -3
- {kgmodule_utils-0.2.3 → kgmodule_utils-0.3.0}/pyproject.toml +20 -3
- kgmodule_utils-0.3.0/src/kg_utils/__init__.py +20 -0
- {kgmodule_utils-0.2.3 → kgmodule_utils-0.3.0}/src/kg_utils/embedder.py +31 -12
- kgmodule_utils-0.3.0/src/kg_utils/extractor.py +76 -0
- kgmodule_utils-0.3.0/src/kg_utils/module.py +91 -0
- kgmodule_utils-0.3.0/src/kg_utils/pipeline.py +853 -0
- kgmodule_utils-0.3.0/src/kg_utils/semantic.py +452 -0
- kgmodule_utils-0.3.0/src/kg_utils/specs.py +286 -0
- kgmodule_utils-0.3.0/src/kg_utils/store.py +672 -0
- kgmodule_utils-0.2.3/src/kg_utils/__init__.py +0 -12
- {kgmodule_utils-0.2.3 → kgmodule_utils-0.3.0}/LICENSE +0 -0
- {kgmodule_utils-0.2.3 → kgmodule_utils-0.3.0}/README.md +0 -0
- {kgmodule_utils-0.2.3 → kgmodule_utils-0.3.0}/src/kg_utils/embed.py +0 -0
- {kgmodule_utils-0.2.3 → kgmodule_utils-0.3.0}/src/kg_utils/py.typed +0 -0
- {kgmodule_utils-0.2.3 → kgmodule_utils-0.3.0}/src/kg_utils/snapshots/__init__.py +0 -0
- {kgmodule_utils-0.2.3 → kgmodule_utils-0.3.0}/src/kg_utils/snapshots/manager.py +0 -0
- {kgmodule_utils-0.2.3 → kgmodule_utils-0.3.0}/src/kg_utils/snapshots/models.py +0 -0
- {kgmodule_utils-0.2.3 → kgmodule_utils-0.3.0}/src/kg_utils/types/__init__.py +0 -0
- {kgmodule_utils-0.2.3 → kgmodule_utils-0.3.0}/src/kg_utils/types/extractor.py +0 -0
- {kgmodule_utils-0.2.3 → kgmodule_utils-0.3.0}/src/kg_utils/types/module.py +0 -0
- {kgmodule_utils-0.2.3 → kgmodule_utils-0.3.0}/src/kg_utils/types/specs.py +0 -0
|
@@ -1,18 +1,24 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: kgmodule-utils
|
|
3
|
-
Version: 0.2.3
|
|
4
|
-
Summary: Shared types and
|
|
3
|
+
Version: 0.3.0
|
|
4
|
+
Summary: Shared types, graph store, semantic index, and pipeline base for the KGModule SDK
|
|
5
5
|
License: Elastic-2.0
|
|
6
6
|
License-File: LICENSE
|
|
7
7
|
Keywords: knowledge-graph,kgmodule,sdk,types,snapshots
|
|
8
8
|
Author: Eric G. Suchanek, PhD
|
|
9
9
|
Author-email: suchanek@flux-frontiers.com
|
|
10
10
|
Requires-Python: >=3.12,<3.14
|
|
11
|
-
Classifier: Development Status ::
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
12
|
Classifier: Intended Audience :: Developers
|
|
13
13
|
Classifier: Programming Language :: Python :: 3
|
|
14
14
|
Classifier: Programming Language :: Python :: 3.12
|
|
15
15
|
Classifier: Programming Language :: Python :: 3.13
|
|
16
|
+
Provides-Extra: semantic
|
|
17
|
+
Requires-Dist: lancedb (>=0.19.0) ; extra == "semantic"
|
|
18
|
+
Requires-Dist: numpy (>=1.24.0) ; extra == "semantic"
|
|
19
|
+
Requires-Dist: sentence-transformers (>=5.4.1) ; extra == "semantic"
|
|
20
|
+
Requires-Dist: torch (>=2.5.1) ; extra == "semantic"
|
|
21
|
+
Requires-Dist: transformers (>=4.40.0,<4.57) ; extra == "semantic"
|
|
16
22
|
Project-URL: Repository, https://github.com/Flux-Frontiers/kg_utils
|
|
17
23
|
Description-Content-Type: text/markdown
|
|
18
24
|
|
|
@@ -10,8 +10,8 @@ build-backend = "poetry.core.masonry.api"
|
|
|
10
10
|
|
|
11
11
|
[project]
|
|
12
12
|
name = "kgmodule-utils"
|
|
13
|
-
version = "0.2.3"
|
|
14
|
-
description = "Shared types and
|
|
13
|
+
version = "0.3.0"
|
|
14
|
+
description = "Shared types, graph store, semantic index, and pipeline base for the KGModule SDK"
|
|
15
15
|
readme = "README.md"
|
|
16
16
|
license = { text = "Elastic-2.0" }
|
|
17
17
|
authors = [
|
|
@@ -19,7 +19,7 @@ authors = [
|
|
|
19
19
|
]
|
|
20
20
|
keywords = ["knowledge-graph", "kgmodule", "sdk", "types", "snapshots"]
|
|
21
21
|
classifiers = [
|
|
22
|
-
"Development Status ::
|
|
22
|
+
"Development Status :: 4 - Beta",
|
|
23
23
|
"Intended Audience :: Developers",
|
|
24
24
|
"Programming Language :: Python :: 3",
|
|
25
25
|
"Programming Language :: Python :: 3.12",
|
|
@@ -28,12 +28,28 @@ classifiers = [
|
|
|
28
28
|
requires-python = ">=3.12,<3.14"
|
|
29
29
|
dependencies = []
|
|
30
30
|
|
|
31
|
+
[project.optional-dependencies]
|
|
32
|
+
semantic = [
|
|
33
|
+
"lancedb>=0.19.0",
|
|
34
|
+
"numpy>=1.24.0",
|
|
35
|
+
"sentence-transformers>=5.4.1",
|
|
36
|
+
"torch>=2.5.1",
|
|
37
|
+
"transformers>=4.40.0,<4.57",
|
|
38
|
+
]
|
|
39
|
+
|
|
31
40
|
[project.urls]
|
|
32
41
|
Repository = "https://github.com/Flux-Frontiers/kg_utils"
|
|
33
42
|
|
|
34
43
|
[tool.poetry]
|
|
35
44
|
packages = [{include = "kg_utils", from = "src"}]
|
|
36
45
|
|
|
46
|
+
[tool.poetry.group.kgdeps]
|
|
47
|
+
optional = true
|
|
48
|
+
|
|
49
|
+
[tool.poetry.group.kgdeps.dependencies]
|
|
50
|
+
pycode-kg = ">=0.18.1"
|
|
51
|
+
doc-kg = ">=0.15.2"
|
|
52
|
+
|
|
37
53
|
[tool.poetry.group.dev]
|
|
38
54
|
optional = true
|
|
39
55
|
|
|
@@ -72,6 +88,7 @@ module = [
|
|
|
72
88
|
"sentence_transformers.*",
|
|
73
89
|
"transformers.*",
|
|
74
90
|
"numpy.*",
|
|
91
|
+
"lancedb",
|
|
75
92
|
]
|
|
76
93
|
ignore_missing_imports = true
|
|
77
94
|
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
"""kg_utils — Shared types, store, semantic index, and pipeline base for the KGModule SDK.
|
|
2
|
+
|
|
3
|
+
Sub-packages / modules:
|
|
4
|
+
kg_utils.types — NodeSpec, EdgeSpec, BuildStats, QueryResult, SnippetPack,
|
|
5
|
+
KGExtractor (abstract), KGModule (abstract interface).
|
|
6
|
+
kg_utils.store — GraphStore: SQLite-backed authoritative node/edge store.
|
|
7
|
+
kg_utils.semantic — Embedder, SentenceTransformerEmbedder, SemanticIndex, SeedHit.
|
|
8
|
+
kg_utils.pipeline — KGModule: concrete base class with full build/query/pack pipeline.
|
|
9
|
+
kg_utils.snapshots — Snapshot, SnapshotManager, SnapshotManifest, etc.
|
|
10
|
+
kg_utils.embed — Embedder protocol, DEFAULT_MODEL, KNOWN_MODELS,
|
|
11
|
+
kg_model_cache_dir(), resolve_model_path().
|
|
12
|
+
kg_utils.embedder — Concrete SentenceTransformerEmbedder, get_embedder(),
|
|
13
|
+
wrap_embedder(), load_sentence_transformer().
|
|
14
|
+
|
|
15
|
+
Optional extras
|
|
16
|
+
---------------
|
|
17
|
+
pip install 'kgmodule-utils[semantic]' # lancedb + sentence-transformers
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
__version__ = "0.3.0"
|
|
@@ -99,29 +99,47 @@ def load_sentence_transformer(model_name: str = DEFAULT_MODEL) -> Any:
|
|
|
99
99
|
from transformers import logging as hf_logging # pylint: disable=import-outside-toplevel
|
|
100
100
|
|
|
101
101
|
hf_logging.set_verbosity_error()
|
|
102
|
-
|
|
102
|
+
# TQDM_DISABLE alone misses transformers' _tqdm_active gate
|
|
103
|
+
hf_logging.disable_progress_bar()
|
|
104
|
+
except (ImportError, ValueError):
|
|
103
105
|
pass
|
|
104
106
|
|
|
105
107
|
os.environ["TQDM_DISABLE"] = "1"
|
|
106
108
|
|
|
109
|
+
import torch # pylint: disable=import-outside-toplevel
|
|
110
|
+
|
|
111
|
+
if torch.cuda.is_available():
|
|
112
|
+
device = "cuda"
|
|
113
|
+
else:
|
|
114
|
+
try:
|
|
115
|
+
device = "mps" if torch.backends.mps.is_available() else "cpu"
|
|
116
|
+
except AttributeError:
|
|
117
|
+
device = "cpu"
|
|
118
|
+
|
|
107
119
|
resolved = KNOWN_MODELS.get(model_name, model_name)
|
|
108
120
|
trust_remote = "nomic-ai/" in resolved
|
|
109
121
|
local_path = resolve_model_path(resolved)
|
|
110
122
|
|
|
111
123
|
if local_path.exists():
|
|
112
|
-
|
|
124
|
+
model = SentenceTransformer(
|
|
113
125
|
str(local_path),
|
|
114
126
|
local_files_only=True,
|
|
115
127
|
trust_remote_code=trust_remote,
|
|
128
|
+
device=device,
|
|
116
129
|
)
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
130
|
+
else:
|
|
131
|
+
try:
|
|
132
|
+
model = SentenceTransformer(
|
|
133
|
+
resolved,
|
|
134
|
+
local_files_only=True,
|
|
135
|
+
trust_remote_code=trust_remote,
|
|
136
|
+
device=device,
|
|
137
|
+
)
|
|
138
|
+
except OSError:
|
|
139
|
+
model = SentenceTransformer(resolved, trust_remote_code=trust_remote, device=device)
|
|
140
|
+
|
|
141
|
+
model = model.to(device)
|
|
142
|
+
return model
|
|
125
143
|
|
|
126
144
|
|
|
127
145
|
# ---------------------------------------------------------------------------
|
|
@@ -143,7 +161,8 @@ class SentenceTransformerEmbedder(Embedder):
|
|
|
143
161
|
from transformers import logging as hf_logging # pylint: disable=import-outside-toplevel
|
|
144
162
|
|
|
145
163
|
hf_logging.set_verbosity_error()
|
|
146
|
-
|
|
164
|
+
hf_logging.disable_progress_bar()
|
|
165
|
+
except (ImportError, ValueError):
|
|
147
166
|
pass
|
|
148
167
|
|
|
149
168
|
_prev = os.environ.get("TQDM_DISABLE")
|
|
@@ -157,7 +176,7 @@ class SentenceTransformerEmbedder(Embedder):
|
|
|
157
176
|
os.environ["TQDM_DISABLE"] = _prev
|
|
158
177
|
|
|
159
178
|
self.model_name: str = KNOWN_MODELS.get(model_name, model_name)
|
|
160
|
-
# ST ≥5.4 renamed
|
|
179
|
+
# ST ≥5.4 renamed the method to get_embedding_dimension; ≤5.3 only has get_sentence_embedding_dimension.
|
|
161
180
|
_dim_fn = getattr(self.model, "get_embedding_dimension", None) or getattr(
|
|
162
181
|
self.model, "get_sentence_embedding_dimension", None
|
|
163
182
|
)
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
"""kg_utils/extractor.py — Abstract base class for KG extractors."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from abc import ABC, abstractmethod
|
|
6
|
+
from collections.abc import Iterator
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
from kg_utils.specs import EdgeSpec, NodeSpec
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class KGExtractor(ABC):
|
|
14
|
+
"""Abstract extraction protocol for any knowledge graph domain.
|
|
15
|
+
|
|
16
|
+
Subclass and implement :meth:`node_kinds`, :meth:`edge_kinds`, and
|
|
17
|
+
:meth:`extract`. The concrete :class:`~kg_utils.pipeline.KGModule`
|
|
18
|
+
infrastructure (:class:`~kg_utils.store.GraphStore`,
|
|
19
|
+
:class:`~kg_utils.semantic.SemanticIndex`, snapshot management) is
|
|
20
|
+
provided by the framework — you only implement domain-specific parsing.
|
|
21
|
+
|
|
22
|
+
:param repo_path: Absolute path to the repository or corpus root.
|
|
23
|
+
:param config: Optional domain-specific configuration dict.
|
|
24
|
+
"""
|
|
25
|
+
|
|
26
|
+
def __init__(self, repo_path: Path, config: dict[str, Any] | None = None) -> None:
|
|
27
|
+
self.repo_path = Path(repo_path).resolve()
|
|
28
|
+
self.config = config or {}
|
|
29
|
+
|
|
30
|
+
@abstractmethod
|
|
31
|
+
def node_kinds(self) -> list[str]:
|
|
32
|
+
"""Return canonical node kind names emitted by this extractor.
|
|
33
|
+
|
|
34
|
+
:return: List of node kind strings (e.g. ``['module', 'class', 'function']``).
|
|
35
|
+
"""
|
|
36
|
+
|
|
37
|
+
@abstractmethod
|
|
38
|
+
def edge_kinds(self) -> list[str]:
|
|
39
|
+
"""Return canonical edge relation types emitted by this extractor.
|
|
40
|
+
|
|
41
|
+
:return: List of edge relation strings (e.g. ``['CONTAINS', 'CALLS']``).
|
|
42
|
+
"""
|
|
43
|
+
|
|
44
|
+
@abstractmethod
|
|
45
|
+
def extract(self) -> Iterator[NodeSpec | EdgeSpec]:
|
|
46
|
+
"""Traverse the source and yield NodeSpec / EdgeSpec objects.
|
|
47
|
+
|
|
48
|
+
Implementations must be deterministic: the same source should produce
|
|
49
|
+
the same stream on every call.
|
|
50
|
+
|
|
51
|
+
:return: Iterator of :class:`NodeSpec` and :class:`EdgeSpec` objects.
|
|
52
|
+
"""
|
|
53
|
+
|
|
54
|
+
def meaningful_node_kinds(self) -> list[str]:
|
|
55
|
+
"""Return node kinds included in the vector index and coverage metrics.
|
|
56
|
+
|
|
57
|
+
Default: all of :meth:`node_kinds`. Override to exclude structural
|
|
58
|
+
stubs (e.g., unresolved import placeholders) from semantic indexing.
|
|
59
|
+
|
|
60
|
+
:return: Subset of node_kinds() to index semantically.
|
|
61
|
+
"""
|
|
62
|
+
return self.node_kinds()
|
|
63
|
+
|
|
64
|
+
def coverage_metric(self, nodes: list[NodeSpec]) -> float:
|
|
65
|
+
"""Compute a domain coverage quality metric.
|
|
66
|
+
|
|
67
|
+
Default: fraction of meaningful nodes with a non-empty docstring.
|
|
68
|
+
|
|
69
|
+
:param nodes: All extracted NodeSpec objects.
|
|
70
|
+
:return: Coverage score in [0.0, 1.0].
|
|
71
|
+
"""
|
|
72
|
+
meaningful = [n for n in nodes if n.kind in self.meaningful_node_kinds()]
|
|
73
|
+
if not meaningful:
|
|
74
|
+
return 0.0
|
|
75
|
+
covered = sum(1 for n in meaningful if n.docstring.strip())
|
|
76
|
+
return covered / len(meaningful)
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
"""kg_utils/module.py — Minimal abstract interface for KG modules.
|
|
2
|
+
|
|
3
|
+
For the concrete, production-grade base class with full build/query/pack
|
|
4
|
+
infrastructure, use :class:`kg_utils.pipeline.KGModule` instead.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Any
|
|
11
|
+
|
|
12
|
+
from kg_utils.extractor import KGExtractor
|
|
13
|
+
from kg_utils.specs import QueryResult, SnippetPack
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class KGModule:
|
|
17
|
+
"""Base class for knowledge-graph modules.
|
|
18
|
+
|
|
19
|
+
Subclasses must implement :meth:`make_extractor`, :meth:`kind`,
|
|
20
|
+
and should override :meth:`build`, :meth:`query`, :meth:`stats`,
|
|
21
|
+
:meth:`pack`, and :meth:`analyze` with domain-specific logic.
|
|
22
|
+
|
|
23
|
+
:param repo_root: Absolute path to the repository or corpus root.
|
|
24
|
+
:param db_path: Path for the SQLite graph database.
|
|
25
|
+
:param lancedb_dir: Path for the LanceDB vector index directory.
|
|
26
|
+
:param config: Optional domain-specific configuration dict.
|
|
27
|
+
"""
|
|
28
|
+
|
|
29
|
+
def __init__(
|
|
30
|
+
self,
|
|
31
|
+
repo_root: Path,
|
|
32
|
+
db_path: Path | None = None,
|
|
33
|
+
lancedb_dir: Path | None = None,
|
|
34
|
+
config: dict[str, Any] | None = None,
|
|
35
|
+
) -> None:
|
|
36
|
+
self.repo_root = repo_root
|
|
37
|
+
self.db_path = db_path
|
|
38
|
+
self.lancedb_dir = lancedb_dir
|
|
39
|
+
self.config = config or {}
|
|
40
|
+
|
|
41
|
+
def make_extractor(self) -> KGExtractor:
|
|
42
|
+
"""Return the domain extractor for this module.
|
|
43
|
+
|
|
44
|
+
:return: KGExtractor subclass instance.
|
|
45
|
+
"""
|
|
46
|
+
raise NotImplementedError
|
|
47
|
+
|
|
48
|
+
def kind(self) -> str:
|
|
49
|
+
"""Return the KGKind string for this module.
|
|
50
|
+
|
|
51
|
+
:return: Kind string (e.g. "code", "meta", "doc").
|
|
52
|
+
"""
|
|
53
|
+
raise NotImplementedError
|
|
54
|
+
|
|
55
|
+
def build(self, wipe: bool = False) -> None:
|
|
56
|
+
"""Build the knowledge graph index.
|
|
57
|
+
|
|
58
|
+
:param wipe: If True, delete existing index before building.
|
|
59
|
+
"""
|
|
60
|
+
raise NotImplementedError
|
|
61
|
+
|
|
62
|
+
def query(self, q: str, k: int = 8, **kwargs: Any) -> QueryResult:
|
|
63
|
+
"""Query the knowledge graph.
|
|
64
|
+
|
|
65
|
+
:param q: Natural-language query string.
|
|
66
|
+
:param k: Number of results to return.
|
|
67
|
+
:return: QueryResult with matched nodes and edges.
|
|
68
|
+
"""
|
|
69
|
+
raise NotImplementedError
|
|
70
|
+
|
|
71
|
+
def stats(self) -> dict[str, Any]:
|
|
72
|
+
"""Return statistics about the knowledge graph.
|
|
73
|
+
|
|
74
|
+
:return: Dict with keys like total_nodes, total_edges, etc.
|
|
75
|
+
"""
|
|
76
|
+
raise NotImplementedError
|
|
77
|
+
|
|
78
|
+
def pack(self, q: str, **kwargs: Any) -> SnippetPack:
|
|
79
|
+
"""Pack query results with source context.
|
|
80
|
+
|
|
81
|
+
:param q: Natural-language query string.
|
|
82
|
+
:return: SnippetPack with nodes, edges, and snippets.
|
|
83
|
+
"""
|
|
84
|
+
raise NotImplementedError
|
|
85
|
+
|
|
86
|
+
def analyze(self) -> str:
|
|
87
|
+
"""Run full analysis and return a Markdown report.
|
|
88
|
+
|
|
89
|
+
:return: Markdown-formatted analysis report.
|
|
90
|
+
"""
|
|
91
|
+
raise NotImplementedError
|