kgmodule-utils 0.2.4__tar.gz → 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {kgmodule_utils-0.2.4 → kgmodule_utils-0.3.0}/PKG-INFO +9 -3
- {kgmodule_utils-0.2.4 → kgmodule_utils-0.3.0}/pyproject.toml +20 -3
- kgmodule_utils-0.3.0/src/kg_utils/__init__.py +20 -0
- {kgmodule_utils-0.2.4 → kgmodule_utils-0.3.0}/src/kg_utils/embedder.py +30 -13
- kgmodule_utils-0.3.0/src/kg_utils/extractor.py +76 -0
- kgmodule_utils-0.3.0/src/kg_utils/module.py +91 -0
- kgmodule_utils-0.3.0/src/kg_utils/pipeline.py +853 -0
- kgmodule_utils-0.3.0/src/kg_utils/semantic.py +452 -0
- kgmodule_utils-0.3.0/src/kg_utils/specs.py +286 -0
- kgmodule_utils-0.3.0/src/kg_utils/store.py +672 -0
- kgmodule_utils-0.2.4/src/kg_utils/__init__.py +0 -12
- {kgmodule_utils-0.2.4 → kgmodule_utils-0.3.0}/LICENSE +0 -0
- {kgmodule_utils-0.2.4 → kgmodule_utils-0.3.0}/README.md +0 -0
- {kgmodule_utils-0.2.4 → kgmodule_utils-0.3.0}/src/kg_utils/embed.py +0 -0
- {kgmodule_utils-0.2.4 → kgmodule_utils-0.3.0}/src/kg_utils/py.typed +0 -0
- {kgmodule_utils-0.2.4 → kgmodule_utils-0.3.0}/src/kg_utils/snapshots/__init__.py +0 -0
- {kgmodule_utils-0.2.4 → kgmodule_utils-0.3.0}/src/kg_utils/snapshots/manager.py +0 -0
- {kgmodule_utils-0.2.4 → kgmodule_utils-0.3.0}/src/kg_utils/snapshots/models.py +0 -0
- {kgmodule_utils-0.2.4 → kgmodule_utils-0.3.0}/src/kg_utils/types/__init__.py +0 -0
- {kgmodule_utils-0.2.4 → kgmodule_utils-0.3.0}/src/kg_utils/types/extractor.py +0 -0
- {kgmodule_utils-0.2.4 → kgmodule_utils-0.3.0}/src/kg_utils/types/module.py +0 -0
- {kgmodule_utils-0.2.4 → kgmodule_utils-0.3.0}/src/kg_utils/types/specs.py +0 -0
|
@@ -1,18 +1,24 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: kgmodule-utils
|
|
3
|
-
Version: 0.2.4
|
|
4
|
-
Summary: Shared types and
|
|
3
|
+
Version: 0.3.0
|
|
4
|
+
Summary: Shared types, graph store, semantic index, and pipeline base for the KGModule SDK
|
|
5
5
|
License: Elastic-2.0
|
|
6
6
|
License-File: LICENSE
|
|
7
7
|
Keywords: knowledge-graph,kgmodule,sdk,types,snapshots
|
|
8
8
|
Author: Eric G. Suchanek, PhD
|
|
9
9
|
Author-email: suchanek@flux-frontiers.com
|
|
10
10
|
Requires-Python: >=3.12,<3.14
|
|
11
|
-
Classifier: Development Status ::
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
12
|
Classifier: Intended Audience :: Developers
|
|
13
13
|
Classifier: Programming Language :: Python :: 3
|
|
14
14
|
Classifier: Programming Language :: Python :: 3.12
|
|
15
15
|
Classifier: Programming Language :: Python :: 3.13
|
|
16
|
+
Provides-Extra: semantic
|
|
17
|
+
Requires-Dist: lancedb (>=0.19.0) ; extra == "semantic"
|
|
18
|
+
Requires-Dist: numpy (>=1.24.0) ; extra == "semantic"
|
|
19
|
+
Requires-Dist: sentence-transformers (>=5.4.1) ; extra == "semantic"
|
|
20
|
+
Requires-Dist: torch (>=2.5.1) ; extra == "semantic"
|
|
21
|
+
Requires-Dist: transformers (>=4.40.0,<4.57) ; extra == "semantic"
|
|
16
22
|
Project-URL: Repository, https://github.com/Flux-Frontiers/kg_utils
|
|
17
23
|
Description-Content-Type: text/markdown
|
|
18
24
|
|
|
@@ -10,8 +10,8 @@ build-backend = "poetry.core.masonry.api"
|
|
|
10
10
|
|
|
11
11
|
[project]
|
|
12
12
|
name = "kgmodule-utils"
|
|
13
|
-
version = "0.2.4"
|
|
14
|
-
description = "Shared types and
|
|
13
|
+
version = "0.3.0"
|
|
14
|
+
description = "Shared types, graph store, semantic index, and pipeline base for the KGModule SDK"
|
|
15
15
|
readme = "README.md"
|
|
16
16
|
license = { text = "Elastic-2.0" }
|
|
17
17
|
authors = [
|
|
@@ -19,7 +19,7 @@ authors = [
|
|
|
19
19
|
]
|
|
20
20
|
keywords = ["knowledge-graph", "kgmodule", "sdk", "types", "snapshots"]
|
|
21
21
|
classifiers = [
|
|
22
|
-
"Development Status ::
|
|
22
|
+
"Development Status :: 4 - Beta",
|
|
23
23
|
"Intended Audience :: Developers",
|
|
24
24
|
"Programming Language :: Python :: 3",
|
|
25
25
|
"Programming Language :: Python :: 3.12",
|
|
@@ -28,12 +28,28 @@ classifiers = [
|
|
|
28
28
|
requires-python = ">=3.12,<3.14"
|
|
29
29
|
dependencies = []
|
|
30
30
|
|
|
31
|
+
[project.optional-dependencies]
|
|
32
|
+
semantic = [
|
|
33
|
+
"lancedb>=0.19.0",
|
|
34
|
+
"numpy>=1.24.0",
|
|
35
|
+
"sentence-transformers>=5.4.1",
|
|
36
|
+
"torch>=2.5.1",
|
|
37
|
+
"transformers>=4.40.0,<4.57",
|
|
38
|
+
]
|
|
39
|
+
|
|
31
40
|
[project.urls]
|
|
32
41
|
Repository = "https://github.com/Flux-Frontiers/kg_utils"
|
|
33
42
|
|
|
34
43
|
[tool.poetry]
|
|
35
44
|
packages = [{include = "kg_utils", from = "src"}]
|
|
36
45
|
|
|
46
|
+
[tool.poetry.group.kgdeps]
|
|
47
|
+
optional = true
|
|
48
|
+
|
|
49
|
+
[tool.poetry.group.kgdeps.dependencies]
|
|
50
|
+
pycode-kg = ">=0.18.1"
|
|
51
|
+
doc-kg = ">=0.15.2"
|
|
52
|
+
|
|
37
53
|
[tool.poetry.group.dev]
|
|
38
54
|
optional = true
|
|
39
55
|
|
|
@@ -72,6 +88,7 @@ module = [
|
|
|
72
88
|
"sentence_transformers.*",
|
|
73
89
|
"transformers.*",
|
|
74
90
|
"numpy.*",
|
|
91
|
+
"lancedb",
|
|
75
92
|
]
|
|
76
93
|
ignore_missing_imports = true
|
|
77
94
|
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
"""kg_utils — Shared types, store, semantic index, and pipeline base for the KGModule SDK.
|
|
2
|
+
|
|
3
|
+
Sub-packages / modules:
|
|
4
|
+
kg_utils.types — NodeSpec, EdgeSpec, BuildStats, QueryResult, SnippetPack,
|
|
5
|
+
KGExtractor (abstract), KGModule (abstract interface).
|
|
6
|
+
kg_utils.store — GraphStore: SQLite-backed authoritative node/edge store.
|
|
7
|
+
kg_utils.semantic — Embedder, SentenceTransformerEmbedder, SemanticIndex, SeedHit.
|
|
8
|
+
kg_utils.pipeline — KGModule: concrete base class with full build/query/pack pipeline.
|
|
9
|
+
kg_utils.snapshots — Snapshot, SnapshotManager, SnapshotManifest, etc.
|
|
10
|
+
kg_utils.embed — Embedder protocol, DEFAULT_MODEL, KNOWN_MODELS,
|
|
11
|
+
kg_model_cache_dir(), resolve_model_path().
|
|
12
|
+
kg_utils.embedder — Concrete SentenceTransformerEmbedder, get_embedder(),
|
|
13
|
+
wrap_embedder(), load_sentence_transformer().
|
|
14
|
+
|
|
15
|
+
Optional extras
|
|
16
|
+
---------------
|
|
17
|
+
pip install 'kgmodule-utils[semantic]' # lancedb, numpy, sentence-transformers, torch, transformers
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
__version__ = "0.3.0"
|
|
@@ -99,30 +99,47 @@ def load_sentence_transformer(model_name: str = DEFAULT_MODEL) -> Any:
|
|
|
99
99
|
from transformers import logging as hf_logging # pylint: disable=import-outside-toplevel
|
|
100
100
|
|
|
101
101
|
hf_logging.set_verbosity_error()
|
|
102
|
-
|
|
103
|
-
|
|
102
|
+
# TQDM_DISABLE alone misses transformers' _tqdm_active gate
|
|
103
|
+
hf_logging.disable_progress_bar()
|
|
104
|
+
except (ImportError, ValueError):
|
|
104
105
|
pass
|
|
105
106
|
|
|
106
107
|
os.environ["TQDM_DISABLE"] = "1"
|
|
107
108
|
|
|
109
|
+
import torch # pylint: disable=import-outside-toplevel
|
|
110
|
+
|
|
111
|
+
if torch.cuda.is_available():
|
|
112
|
+
device = "cuda"
|
|
113
|
+
else:
|
|
114
|
+
try:
|
|
115
|
+
device = "mps" if torch.backends.mps.is_available() else "cpu"
|
|
116
|
+
except AttributeError:
|
|
117
|
+
device = "cpu"
|
|
118
|
+
|
|
108
119
|
resolved = KNOWN_MODELS.get(model_name, model_name)
|
|
109
120
|
trust_remote = "nomic-ai/" in resolved
|
|
110
121
|
local_path = resolve_model_path(resolved)
|
|
111
122
|
|
|
112
123
|
if local_path.exists():
|
|
113
|
-
|
|
124
|
+
model = SentenceTransformer(
|
|
114
125
|
str(local_path),
|
|
115
126
|
local_files_only=True,
|
|
116
127
|
trust_remote_code=trust_remote,
|
|
128
|
+
device=device,
|
|
117
129
|
)
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
130
|
+
else:
|
|
131
|
+
try:
|
|
132
|
+
model = SentenceTransformer(
|
|
133
|
+
resolved,
|
|
134
|
+
local_files_only=True,
|
|
135
|
+
trust_remote_code=trust_remote,
|
|
136
|
+
device=device,
|
|
137
|
+
)
|
|
138
|
+
except OSError:
|
|
139
|
+
model = SentenceTransformer(resolved, trust_remote_code=trust_remote, device=device)
|
|
140
|
+
|
|
141
|
+
model = model.to(device)
|
|
142
|
+
return model
|
|
126
143
|
|
|
127
144
|
|
|
128
145
|
# ---------------------------------------------------------------------------
|
|
@@ -145,7 +162,7 @@ class SentenceTransformerEmbedder(Embedder):
|
|
|
145
162
|
|
|
146
163
|
hf_logging.set_verbosity_error()
|
|
147
164
|
hf_logging.disable_progress_bar()
|
|
148
|
-
except ImportError:
|
|
165
|
+
except (ImportError, ValueError):
|
|
149
166
|
pass
|
|
150
167
|
|
|
151
168
|
_prev = os.environ.get("TQDM_DISABLE")
|
|
@@ -159,7 +176,7 @@ class SentenceTransformerEmbedder(Embedder):
|
|
|
159
176
|
os.environ["TQDM_DISABLE"] = _prev
|
|
160
177
|
|
|
161
178
|
self.model_name: str = KNOWN_MODELS.get(model_name, model_name)
|
|
162
|
-
# ST ≥5.4 renamed
|
|
179
|
+
# ST ≥5.4 renamed the method to get_embedding_dimension; ST ≤5.3 only has get_sentence_embedding_dimension.
|
|
163
180
|
_dim_fn = getattr(self.model, "get_embedding_dimension", None) or getattr(
|
|
164
181
|
self.model, "get_sentence_embedding_dimension", None
|
|
165
182
|
)
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
"""kg_utils/extractor.py — Abstract base class for KG extractors."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from abc import ABC, abstractmethod
|
|
6
|
+
from collections.abc import Iterator
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
from kg_utils.specs import EdgeSpec, NodeSpec
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class KGExtractor(ABC):
    """Abstract extraction protocol for any knowledge graph domain.

    Subclass and implement :meth:`node_kinds`, :meth:`edge_kinds`, and
    :meth:`extract`.  The concrete :class:`~kg_utils.pipeline.KGModule`
    infrastructure (:class:`~kg_utils.store.GraphStore`,
    :class:`~kg_utils.semantic.SemanticIndex`, snapshot management) is
    provided by the framework — you only implement domain-specific parsing.

    :param repo_path: Path to the repository or corpus root.  It is resolved
        to an absolute path on construction.
    :param config: Optional domain-specific configuration dict.  A falsy
        value (``None`` or an empty dict) is replaced by a new empty dict.
    """

    def __init__(self, repo_path: Path, config: dict[str, Any] | None = None) -> None:
        self.repo_path = Path(repo_path).resolve()
        self.config = config or {}

    @abstractmethod
    def node_kinds(self) -> list[str]:
        """Return canonical node kind names emitted by this extractor.

        :return: List of node kind strings (e.g. ``['module', 'class', 'function']``).
        """

    @abstractmethod
    def edge_kinds(self) -> list[str]:
        """Return canonical edge relation types emitted by this extractor.

        :return: List of edge relation strings (e.g. ``['CONTAINS', 'CALLS']``).
        """

    @abstractmethod
    def extract(self) -> Iterator[NodeSpec | EdgeSpec]:
        """Traverse the source and yield NodeSpec / EdgeSpec objects.

        Implementations must be deterministic: the same source should produce
        the same stream on every call.

        :return: Iterator of :class:`NodeSpec` and :class:`EdgeSpec` objects.
        """

    def meaningful_node_kinds(self) -> list[str]:
        """Return node kinds included in the vector index and coverage metrics.

        Default: all of :meth:`node_kinds`.  Override to exclude structural
        stubs (e.g., unresolved import placeholders) from semantic indexing.

        :return: Subset of node_kinds() to index semantically.
        """
        return self.node_kinds()

    def coverage_metric(self, nodes: list[NodeSpec]) -> float:
        """Compute a domain coverage quality metric.

        Default: fraction of meaningful nodes with a non-empty docstring.
        A node is meaningful when its ``kind`` is one of
        :meth:`meaningful_node_kinds`; a docstring counts as non-empty when
        it still has content after stripping whitespace.

        :param nodes: All extracted NodeSpec objects.
        :return: Coverage score in [0.0, 1.0]; ``0.0`` when no node is meaningful.
        """
        # Resolve the kinds once: the hook may be overridden with non-trivial
        # logic, and a frozenset gives constant-time membership per node.
        meaningful_kinds = frozenset(self.meaningful_node_kinds())
        meaningful = [node for node in nodes if node.kind in meaningful_kinds]
        if not meaningful:
            return 0.0
        # ``or ""`` treats a missing (None) docstring as undocumented instead
        # of raising AttributeError on ``.strip()``.
        covered = sum(1 for node in meaningful if (node.docstring or "").strip())
        return covered / len(meaningful)
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
"""kg_utils/module.py — Minimal abstract interface for KG modules.
|
|
2
|
+
|
|
3
|
+
For the concrete, production-grade base class with full build/query/pack
|
|
4
|
+
infrastructure, use :class:`kg_utils.pipeline.KGModule` instead.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Any
|
|
11
|
+
|
|
12
|
+
from kg_utils.extractor import KGExtractor
|
|
13
|
+
from kg_utils.specs import QueryResult, SnippetPack
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class KGModule:
    """Base class for knowledge-graph modules.

    Subclasses must implement :meth:`make_extractor`, :meth:`kind`,
    and should override :meth:`build`, :meth:`query`, :meth:`stats`,
    :meth:`pack`, and :meth:`analyze` with domain-specific logic.  Every one
    of these methods raises :class:`NotImplementedError` until overridden.

    :param repo_root: Absolute path to the repository or corpus root.
    :param db_path: Path for the SQLite graph database.
    :param lancedb_dir: Path for the LanceDB vector index directory.
    :param config: Optional domain-specific configuration dict.  A falsy
        value (``None`` or an empty dict) is replaced by a new empty dict.
    """

    def __init__(
        self,
        repo_root: Path,
        db_path: Path | None = None,
        lancedb_dir: Path | None = None,
        config: dict[str, Any] | None = None,
    ) -> None:
        # Arguments are stored as given; no path normalisation happens here.
        self.repo_root = repo_root
        self.db_path = db_path
        self.lancedb_dir = lancedb_dir
        self.config = config or {}

    def _unimplemented(self, method: str) -> NotImplementedError:
        """Build the error raised by a stub, naming the concrete class and method.

        :param method: Name of the method that has not been overridden.
        :return: A ready-to-raise NotImplementedError.
        """
        return NotImplementedError(f"{type(self).__name__}.{method}() is not implemented")

    def make_extractor(self) -> KGExtractor:
        """Return the domain extractor for this module.

        :return: KGExtractor subclass instance.
        :raises NotImplementedError: Always, unless overridden.
        """
        raise self._unimplemented("make_extractor")

    def kind(self) -> str:
        """Return the KGKind string for this module.

        :return: Kind string (e.g. "code", "meta", "doc").
        :raises NotImplementedError: Always, unless overridden.
        """
        raise self._unimplemented("kind")

    def build(self, wipe: bool = False) -> None:
        """Build the knowledge graph index.

        :param wipe: If True, delete existing index before building.
        :raises NotImplementedError: Always, unless overridden.
        """
        raise self._unimplemented("build")

    def query(self, q: str, k: int = 8, **kwargs: Any) -> QueryResult:
        """Query the knowledge graph.

        :param q: Natural-language query string.
        :param k: Number of results to return.
        :return: QueryResult with matched nodes and edges.
        :raises NotImplementedError: Always, unless overridden.
        """
        raise self._unimplemented("query")

    def stats(self) -> dict[str, Any]:
        """Return statistics about the knowledge graph.

        :return: Dict with keys like total_nodes, total_edges, etc.
        :raises NotImplementedError: Always, unless overridden.
        """
        raise self._unimplemented("stats")

    def pack(self, q: str, **kwargs: Any) -> SnippetPack:
        """Pack query results with source context.

        :param q: Natural-language query string.
        :return: SnippetPack with nodes, edges, and snippets.
        :raises NotImplementedError: Always, unless overridden.
        """
        raise self._unimplemented("pack")

    def analyze(self) -> str:
        """Run full analysis and return a Markdown report.

        :return: Markdown-formatted analysis report.
        :raises NotImplementedError: Always, unless overridden.
        """
        raise self._unimplemented("analyze")
|