kgmodule-utils 0.2.3__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (22)
  1. {kgmodule_utils-0.2.3 → kgmodule_utils-0.3.0}/PKG-INFO +9 -3
  2. {kgmodule_utils-0.2.3 → kgmodule_utils-0.3.0}/pyproject.toml +20 -3
  3. kgmodule_utils-0.3.0/src/kg_utils/__init__.py +20 -0
  4. {kgmodule_utils-0.2.3 → kgmodule_utils-0.3.0}/src/kg_utils/embedder.py +31 -12
  5. kgmodule_utils-0.3.0/src/kg_utils/extractor.py +76 -0
  6. kgmodule_utils-0.3.0/src/kg_utils/module.py +91 -0
  7. kgmodule_utils-0.3.0/src/kg_utils/pipeline.py +853 -0
  8. kgmodule_utils-0.3.0/src/kg_utils/semantic.py +452 -0
  9. kgmodule_utils-0.3.0/src/kg_utils/specs.py +286 -0
  10. kgmodule_utils-0.3.0/src/kg_utils/store.py +672 -0
  11. kgmodule_utils-0.2.3/src/kg_utils/__init__.py +0 -12
  12. {kgmodule_utils-0.2.3 → kgmodule_utils-0.3.0}/LICENSE +0 -0
  13. {kgmodule_utils-0.2.3 → kgmodule_utils-0.3.0}/README.md +0 -0
  14. {kgmodule_utils-0.2.3 → kgmodule_utils-0.3.0}/src/kg_utils/embed.py +0 -0
  15. {kgmodule_utils-0.2.3 → kgmodule_utils-0.3.0}/src/kg_utils/py.typed +0 -0
  16. {kgmodule_utils-0.2.3 → kgmodule_utils-0.3.0}/src/kg_utils/snapshots/__init__.py +0 -0
  17. {kgmodule_utils-0.2.3 → kgmodule_utils-0.3.0}/src/kg_utils/snapshots/manager.py +0 -0
  18. {kgmodule_utils-0.2.3 → kgmodule_utils-0.3.0}/src/kg_utils/snapshots/models.py +0 -0
  19. {kgmodule_utils-0.2.3 → kgmodule_utils-0.3.0}/src/kg_utils/types/__init__.py +0 -0
  20. {kgmodule_utils-0.2.3 → kgmodule_utils-0.3.0}/src/kg_utils/types/extractor.py +0 -0
  21. {kgmodule_utils-0.2.3 → kgmodule_utils-0.3.0}/src/kg_utils/types/module.py +0 -0
  22. {kgmodule_utils-0.2.3 → kgmodule_utils-0.3.0}/src/kg_utils/types/specs.py +0 -0
@@ -1,18 +1,24 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: kgmodule-utils
3
- Version: 0.2.3
4
- Summary: Shared types and snapshot infrastructure for the KGModule SDK
3
+ Version: 0.3.0
4
+ Summary: Shared types, graph store, semantic index, and pipeline base for the KGModule SDK
5
5
  License: Elastic-2.0
6
6
  License-File: LICENSE
7
7
  Keywords: knowledge-graph,kgmodule,sdk,types,snapshots
8
8
  Author: Eric G. Suchanek, PhD
9
9
  Author-email: suchanek@flux-frontiers.com
10
10
  Requires-Python: >=3.12,<3.14
11
- Classifier: Development Status :: 3 - Alpha
11
+ Classifier: Development Status :: 4 - Beta
12
12
  Classifier: Intended Audience :: Developers
13
13
  Classifier: Programming Language :: Python :: 3
14
14
  Classifier: Programming Language :: Python :: 3.12
15
15
  Classifier: Programming Language :: Python :: 3.13
16
+ Provides-Extra: semantic
17
+ Requires-Dist: lancedb (>=0.19.0) ; extra == "semantic"
18
+ Requires-Dist: numpy (>=1.24.0) ; extra == "semantic"
19
+ Requires-Dist: sentence-transformers (>=5.4.1) ; extra == "semantic"
20
+ Requires-Dist: torch (>=2.5.1) ; extra == "semantic"
21
+ Requires-Dist: transformers (>=4.40.0,<4.57) ; extra == "semantic"
16
22
  Project-URL: Repository, https://github.com/Flux-Frontiers/kg_utils
17
23
  Description-Content-Type: text/markdown
18
24
 
@@ -10,8 +10,8 @@ build-backend = "poetry.core.masonry.api"
10
10
 
11
11
  [project]
12
12
  name = "kgmodule-utils"
13
- version = "0.2.3"
14
- description = "Shared types and snapshot infrastructure for the KGModule SDK"
13
+ version = "0.3.0"
14
+ description = "Shared types, graph store, semantic index, and pipeline base for the KGModule SDK"
15
15
  readme = "README.md"
16
16
  license = { text = "Elastic-2.0" }
17
17
  authors = [
@@ -19,7 +19,7 @@ authors = [
19
19
  ]
20
20
  keywords = ["knowledge-graph", "kgmodule", "sdk", "types", "snapshots"]
21
21
  classifiers = [
22
- "Development Status :: 3 - Alpha",
22
+ "Development Status :: 4 - Beta",
23
23
  "Intended Audience :: Developers",
24
24
  "Programming Language :: Python :: 3",
25
25
  "Programming Language :: Python :: 3.12",
@@ -28,12 +28,28 @@ classifiers = [
28
28
  requires-python = ">=3.12,<3.14"
29
29
  dependencies = []
30
30
 
31
+ [project.optional-dependencies]
32
+ semantic = [
33
+ "lancedb>=0.19.0",
34
+ "numpy>=1.24.0",
35
+ "sentence-transformers>=5.4.1",
36
+ "torch>=2.5.1",
37
+ "transformers>=4.40.0,<4.57",
38
+ ]
39
+
31
40
  [project.urls]
32
41
  Repository = "https://github.com/Flux-Frontiers/kg_utils"
33
42
 
34
43
  [tool.poetry]
35
44
  packages = [{include = "kg_utils", from = "src"}]
36
45
 
46
+ [tool.poetry.group.kgdeps]
47
+ optional = true
48
+
49
+ [tool.poetry.group.kgdeps.dependencies]
50
+ pycode-kg = ">=0.18.1"
51
+ doc-kg = ">=0.15.2"
52
+
37
53
  [tool.poetry.group.dev]
38
54
  optional = true
39
55
 
@@ -72,6 +88,7 @@ module = [
72
88
  "sentence_transformers.*",
73
89
  "transformers.*",
74
90
  "numpy.*",
91
+ "lancedb",
75
92
  ]
76
93
  ignore_missing_imports = true
77
94
 
@@ -0,0 +1,20 @@
1
+ """kg_utils — Shared types, store, semantic index, and pipeline base for the KGModule SDK.
2
+
3
+ Sub-packages / modules:
4
+ kg_utils.types — NodeSpec, EdgeSpec, BuildStats, QueryResult, SnippetPack,
5
+ KGExtractor (abstract), KGModule (abstract interface).
6
+ kg_utils.store — GraphStore: SQLite-backed authoritative node/edge store.
7
+ kg_utils.semantic — Embedder, SentenceTransformerEmbedder, SemanticIndex, SeedHit.
8
+ kg_utils.pipeline — KGModule: concrete base class with full build/query/pack pipeline.
9
+ kg_utils.snapshots — Snapshot, SnapshotManager, SnapshotManifest, etc.
10
+ kg_utils.embed — Embedder protocol, DEFAULT_MODEL, KNOWN_MODELS,
11
+ kg_model_cache_dir(), resolve_model_path().
12
+ kg_utils.embedder — Concrete SentenceTransformerEmbedder, get_embedder(),
13
+ wrap_embedder(), load_sentence_transformer().
14
+
15
+ Optional extras
16
+ ---------------
17
+ pip install 'kgmodule-utils[semantic]' # lancedb + sentence-transformers
18
+ """
19
+
20
+ __version__ = "0.3.0"
@@ -99,29 +99,47 @@ def load_sentence_transformer(model_name: str = DEFAULT_MODEL) -> Any:
99
99
  from transformers import logging as hf_logging # pylint: disable=import-outside-toplevel
100
100
 
101
101
  hf_logging.set_verbosity_error()
102
- except ImportError:
102
+ # TQDM_DISABLE alone misses transformers' _tqdm_active gate
103
+ hf_logging.disable_progress_bar()
104
+ except (ImportError, ValueError):
103
105
  pass
104
106
 
105
107
  os.environ["TQDM_DISABLE"] = "1"
106
108
 
109
+ import torch # pylint: disable=import-outside-toplevel
110
+
111
+ if torch.cuda.is_available():
112
+ device = "cuda"
113
+ else:
114
+ try:
115
+ device = "mps" if torch.backends.mps.is_available() else "cpu"
116
+ except AttributeError:
117
+ device = "cpu"
118
+
107
119
  resolved = KNOWN_MODELS.get(model_name, model_name)
108
120
  trust_remote = "nomic-ai/" in resolved
109
121
  local_path = resolve_model_path(resolved)
110
122
 
111
123
  if local_path.exists():
112
- return SentenceTransformer(
124
+ model = SentenceTransformer(
113
125
  str(local_path),
114
126
  local_files_only=True,
115
127
  trust_remote_code=trust_remote,
128
+ device=device,
116
129
  )
117
- try:
118
- return SentenceTransformer(
119
- resolved,
120
- local_files_only=True,
121
- trust_remote_code=trust_remote,
122
- )
123
- except OSError:
124
- return SentenceTransformer(resolved, trust_remote_code=trust_remote)
130
+ else:
131
+ try:
132
+ model = SentenceTransformer(
133
+ resolved,
134
+ local_files_only=True,
135
+ trust_remote_code=trust_remote,
136
+ device=device,
137
+ )
138
+ except OSError:
139
+ model = SentenceTransformer(resolved, trust_remote_code=trust_remote, device=device)
140
+
141
+ model = model.to(device)
142
+ return model
125
143
 
126
144
 
127
145
  # ---------------------------------------------------------------------------
@@ -143,7 +161,8 @@ class SentenceTransformerEmbedder(Embedder):
143
161
  from transformers import logging as hf_logging # pylint: disable=import-outside-toplevel
144
162
 
145
163
  hf_logging.set_verbosity_error()
146
- except ImportError:
164
+ hf_logging.disable_progress_bar()
165
+ except (ImportError, ValueError):
147
166
  pass
148
167
 
149
168
  _prev = os.environ.get("TQDM_DISABLE")
@@ -157,7 +176,7 @@ class SentenceTransformerEmbedder(Embedder):
157
176
  os.environ["TQDM_DISABLE"] = _prev
158
177
 
159
178
  self.model_name: str = KNOWN_MODELS.get(model_name, model_name)
160
- # ST ≥5.4 renamed to get_embedding_dimension; ≤5.3 only had get_sentence_embedding_dimension.
179
+ # ST ≥5.4 uses get_embedding_dimension; ≤5.3 only had get_sentence_embedding_dimension.
161
180
  _dim_fn = getattr(self.model, "get_embedding_dimension", None) or getattr(
162
181
  self.model, "get_sentence_embedding_dimension", None
163
182
  )
@@ -0,0 +1,76 @@
1
+ """kg_utils/extractor.py — Abstract base class for KG extractors."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from abc import ABC, abstractmethod
6
+ from collections.abc import Iterator
7
+ from pathlib import Path
8
+ from typing import Any
9
+
10
+ from kg_utils.specs import EdgeSpec, NodeSpec
11
+
12
+
13
+ class KGExtractor(ABC):
14
+ """Abstract extraction protocol for any knowledge graph domain.
15
+
16
+ Subclass and implement :meth:`node_kinds`, :meth:`edge_kinds`, and
17
+ :meth:`extract`. The concrete :class:`~kg_utils.pipeline.KGModule`
18
+ infrastructure (:class:`~kg_utils.store.GraphStore`,
19
+ :class:`~kg_utils.semantic.SemanticIndex`, snapshot management) is
20
+ provided by the framework — you only implement domain-specific parsing.
21
+
22
+ :param repo_path: Absolute path to the repository or corpus root.
23
+ :param config: Optional domain-specific configuration dict.
24
+ """
25
+
26
+ def __init__(self, repo_path: Path, config: dict[str, Any] | None = None) -> None:
27
+ self.repo_path = Path(repo_path).resolve()
28
+ self.config = config or {}
29
+
30
+ @abstractmethod
31
+ def node_kinds(self) -> list[str]:
32
+ """Return canonical node kind names emitted by this extractor.
33
+
34
+ :return: List of node kind strings (e.g. ``['module', 'class', 'function']``).
35
+ """
36
+
37
+ @abstractmethod
38
+ def edge_kinds(self) -> list[str]:
39
+ """Return canonical edge relation types emitted by this extractor.
40
+
41
+ :return: List of edge relation strings (e.g. ``['CONTAINS', 'CALLS']``).
42
+ """
43
+
44
+ @abstractmethod
45
+ def extract(self) -> Iterator[NodeSpec | EdgeSpec]:
46
+ """Traverse the source and yield NodeSpec / EdgeSpec objects.
47
+
48
+ Implementations must be deterministic: the same source should produce
49
+ the same stream on every call.
50
+
51
+ :return: Iterator of :class:`NodeSpec` and :class:`EdgeSpec` objects.
52
+ """
53
+
54
+ def meaningful_node_kinds(self) -> list[str]:
55
+ """Return node kinds included in the vector index and coverage metrics.
56
+
57
+ Default: all of :meth:`node_kinds`. Override to exclude structural
58
+ stubs (e.g., unresolved import placeholders) from semantic indexing.
59
+
60
+ :return: Subset of node_kinds() to index semantically.
61
+ """
62
+ return self.node_kinds()
63
+
64
+ def coverage_metric(self, nodes: list[NodeSpec]) -> float:
65
+ """Compute a domain coverage quality metric.
66
+
67
+ Default: fraction of meaningful nodes with a non-empty docstring.
68
+
69
+ :param nodes: All extracted NodeSpec objects.
70
+ :return: Coverage score in [0.0, 1.0].
71
+ """
72
+ meaningful = [n for n in nodes if n.kind in self.meaningful_node_kinds()]
73
+ if not meaningful:
74
+ return 0.0
75
+ covered = sum(1 for n in meaningful if n.docstring.strip())
76
+ return covered / len(meaningful)
@@ -0,0 +1,91 @@
1
+ """kg_utils/module.py — Minimal abstract interface for KG modules.
2
+
3
+ For the concrete, production-grade base class with full build/query/pack
4
+ infrastructure, use :class:`kg_utils.pipeline.KGModule` instead.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from pathlib import Path
10
+ from typing import Any
11
+
12
+ from kg_utils.extractor import KGExtractor
13
+ from kg_utils.specs import QueryResult, SnippetPack
14
+
15
+
16
+ class KGModule:
17
+ """Base class for knowledge-graph modules.
18
+
19
+ Subclasses must implement :meth:`make_extractor`, :meth:`kind`,
20
+ and should override :meth:`build`, :meth:`query`, :meth:`stats`,
21
+ :meth:`pack`, and :meth:`analyze` with domain-specific logic.
22
+
23
+ :param repo_root: Absolute path to the repository or corpus root.
24
+ :param db_path: Path for the SQLite graph database.
25
+ :param lancedb_dir: Path for the LanceDB vector index directory.
26
+ :param config: Optional domain-specific configuration dict.
27
+ """
28
+
29
+ def __init__(
30
+ self,
31
+ repo_root: Path,
32
+ db_path: Path | None = None,
33
+ lancedb_dir: Path | None = None,
34
+ config: dict[str, Any] | None = None,
35
+ ) -> None:
36
+ self.repo_root = repo_root
37
+ self.db_path = db_path
38
+ self.lancedb_dir = lancedb_dir
39
+ self.config = config or {}
40
+
41
+ def make_extractor(self) -> KGExtractor:
42
+ """Return the domain extractor for this module.
43
+
44
+ :return: KGExtractor subclass instance.
45
+ """
46
+ raise NotImplementedError
47
+
48
+ def kind(self) -> str:
49
+ """Return the KGKind string for this module.
50
+
51
+ :return: Kind string (e.g. "code", "meta", "doc").
52
+ """
53
+ raise NotImplementedError
54
+
55
+ def build(self, wipe: bool = False) -> None:
56
+ """Build the knowledge graph index.
57
+
58
+ :param wipe: If True, delete existing index before building.
59
+ """
60
+ raise NotImplementedError
61
+
62
+ def query(self, q: str, k: int = 8, **kwargs: Any) -> QueryResult:
63
+ """Query the knowledge graph.
64
+
65
+ :param q: Natural-language query string.
66
+ :param k: Number of results to return.
67
+ :return: QueryResult with matched nodes and edges.
68
+ """
69
+ raise NotImplementedError
70
+
71
+ def stats(self) -> dict[str, Any]:
72
+ """Return statistics about the knowledge graph.
73
+
74
+ :return: Dict with keys like total_nodes, total_edges, etc.
75
+ """
76
+ raise NotImplementedError
77
+
78
+ def pack(self, q: str, **kwargs: Any) -> SnippetPack:
79
+ """Pack query results with source context.
80
+
81
+ :param q: Natural-language query string.
82
+ :return: SnippetPack with nodes, edges, and snippets.
83
+ """
84
+ raise NotImplementedError
85
+
86
+ def analyze(self) -> str:
87
+ """Run full analysis and return a Markdown report.
88
+
89
+ :return: Markdown-formatted analysis report.
90
+ """
91
+ raise NotImplementedError