knowledge-gateway 0.7.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
gateway/__init__.py ADDED
@@ -0,0 +1,7 @@
1
+ """knowledge-gateway: filesystem/git-native FastMCP knowledge gateway (vault + code-graph + convert)."""
2
from importlib.metadata import PackageNotFoundError, version

# Resolve the version from the installed distribution metadata when possible.
try:
    __version__ = version("knowledge-gateway")
except PackageNotFoundError:  # running from a source tree that is not installed
    # Fallback for uninstalled source trees. It must track the released version;
    # a stale value here misreports the running code in logs and bug reports.
    __version__ = "0.7.4"
gateway/__main__.py ADDED
@@ -0,0 +1,3 @@
1
from .server import main

# Guard the entry point: `python -m gateway` still runs with __name__ set to
# "__main__", but merely importing gateway.__main__ (documentation or test
# tooling) no longer starts the server as a side effect.
if __name__ == "__main__":
    main()
gateway/acl.py ADDED
@@ -0,0 +1,62 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+
5
+
6
class AccessDenied(Exception):
    """Signal that the presented token is not allowed to use a vault.

    The message is intentionally uninformative: a caller must not be able to
    distinguish a vault it is forbidden to touch from one that does not exist.
    """
9
+
10
+
11
@dataclass(frozen=True)
class TokenInfo:
    """Immutable record of what a single bearer token is allowed to do."""

    sub: str  # label of the token holder (build_registry defaults it to "unknown")
    vaults: frozenset[str]  # names of the vaults this token may access
    write: bool  # True when the token may perform write operations
    email: str = ""  # optional address; empty when the config omits it


def build_registry(tokens_cfg: dict) -> dict[str, TokenInfo]:
    """Build the token -> TokenInfo registry from the raw `tokens` config.

    Args:
        tokens_cfg: mapping of bearer token -> settings mapping with the
            optional keys `sub`, `vaults`, `write` and `email`. A falsy value
            (None, empty mapping) yields an empty registry.

    Returns:
        A dict keyed by the token string.

    Raises:
        ValueError: if the config or an entry is not a mapping, a token key is
            not a non-empty string, `vaults` is not a list of strings, or
            `write` is not a real boolean.
    """
    if tokens_cfg and not isinstance(tokens_cfg, dict):
        raise ValueError("tokens must be a mapping of token to settings")

    reg: dict[str, TokenInfo] = {}
    for token, meta in (tokens_cfg or {}).items():
        if not isinstance(token, str) or not token:
            raise ValueError("token key must be a non-empty string")
        # A bare `sometoken:` (None) or a scalar value would otherwise fail with
        # an AttributeError on .get(); report it like any other config error.
        # The token is a secret, so it is never echoed in the message.
        if not isinstance(meta, dict):
            raise ValueError("token entry must be a mapping (sub, vaults, write, email)")
        sub = meta.get("sub", "unknown")
        vaults = meta.get("vaults", [])
        write = meta.get("write", False)
        # Validate types explicitly: `vaults: myvault` (a str) would otherwise
        # become a frozenset of characters, and `write: "false"` (a str) would be
        # truthy via bool() and silently grant write access.
        if not isinstance(vaults, list) or not all(isinstance(x, str) for x in vaults):
            raise ValueError(f"token '{sub}': vaults must be a list of strings")
        if not isinstance(write, bool):
            raise ValueError(f"token '{sub}': write must be a boolean (true/false)")
        reg[token] = TokenInfo(
            sub=str(sub),
            vaults=frozenset(vaults),
            write=write,
            email=str(meta.get("email", "")),
        )
    return reg
41
+
42
+
43
def scopes_for(info: TokenInfo) -> list[str]:
    """Return the scope strings for a token.

    One `vault:<name>` entry per granted vault (sorted by name), followed by
    `write` when the token has write permission.
    """
    granted = ["vault:" + name for name in sorted(info.vaults)]
    if not info.write:
        return granted
    return granted + ["write"]
48
+
49
+
50
def allowed_vaults(scopes) -> set[str]:
    """Collect the vault names named by the `vault:<name>` entries in scopes."""
    prefix = "vault:"
    names: set[str] = set()
    for scope in scopes:
        if scope.startswith(prefix):
            # Everything after the first colon is the vault name.
            names.add(scope[len(prefix):])
    return names
52
+
53
+
54
def can_write(scopes) -> bool:
    """Report whether the exact scope `write` is present in scopes."""
    for scope in scopes:
        if scope == "write":
            return True
    return False
56
+
57
+
58
def authorize(scopes, vault: str, *, write: bool) -> None:
    """Check that scopes permit the requested access to vault.

    Returns None on success. Raises AccessDenied with `vault_forbidden` when the
    vault is not among the granted ones, or with `write_forbidden` when write
    access is requested but the `write` scope is absent.
    """
    granted = allowed_vaults(scopes)
    if vault not in granted:
        raise AccessDenied(f"vault_forbidden: {vault}")
    if not write:
        return
    if not can_write(scopes):
        raise AccessDenied(f"write_forbidden: {vault}")
@@ -0,0 +1,15 @@
1
+ """codegraph - build a queryable code/Ansible knowledge graph from a source tree.
2
+
3
+ Our own extractor (no third-party graph tool): stdlib `ast` for Python, PyYAML for
4
+ Ansible, and an optional generic tree-sitter pass for more languages (JS/TS/Go/
5
+ Terraform/bash/PowerShell/... via the [graph-all] extra). Output is a NetworkX
6
+ node-link `graph.json` the gateway serves read-only. AST-only: no LLM, no network.
7
+
8
+ Node: {id, label, type, file_type, source_file, source_location?, community?}
9
+ Edge: {source, target, relation, confidence(EXTRACTED|INFERRED|AMBIGUOUS), confidence_score?}
10
+ """
11
+ from __future__ import annotations
12
+
13
+ from .build import build_graph, SCHEMA_VERSION
14
+
15
+ __all__ = ["build_graph", "SCHEMA_VERSION"]
@@ -0,0 +1,89 @@
1
+ """Build a NetworkX code/Ansible graph from a source tree and return node-link JSON.
2
+
3
+ Passes: Ansible (PyYAML, repo-level) + Python (ast, per file) + optional broad
4
+ languages (tree-sitter, per file). Communities (greedy modularity) are baked onto
5
+ nodes. AST-only, read-only on the source tree. networkx is imported lazily so this
6
+ module is importable without the [graph] extra.
7
+ """
8
+ from __future__ import annotations
9
+
10
+ import os
11
+ from pathlib import Path
12
+
13
+ from . import extract_ansible, extract_python, treesitter
14
+
15
+ SCHEMA_VERSION = 1
16
+ _PRUNE = extract_ansible.PRUNE # one prune set shared by the Ansible pass and the file walk
17
+
18
+
19
def _iter_files(root: Path):
    """Yield the path of every file below root, skipping pruned directories."""
    for current, subdirs, names in os.walk(root):
        # Slice-assign so os.walk sees the filtered list and does not descend
        # into any directory whose name is in the prune set.
        kept = [name for name in subdirs if name not in _PRUNE]
        subdirs[:] = kept
        base = Path(current)
        yield from (base / name for name in names)
24
+
25
+
26
def build_graph(root, languages: list[str] | None = None) -> dict:
    """Build the code/Ansible graph for a source tree.

    Args:
        root: directory to scan; resolved to an absolute path first.
        languages: optional list of tree-sitter language names; when non-empty,
            files of other tree-sitter languages are skipped. It does not
            affect the Python or Ansible passes.

    Returns:
        NetworkX node-link data (edges under the `links` key) including a
        `graph` metadata block with counts and the schema version.

    Raises:
        ValueError: `graph_unavailable` when networkx cannot be imported.
        FileNotFoundError: `not_found` when root is not a directory.
    """
    try:
        import networkx as nx
        from networkx.algorithms.community import greedy_modularity_communities
    except ImportError as exc:
        # Chain explicitly so the real import failure stays visible as the cause.
        raise ValueError(
            "graph_unavailable: install the [graph] extra (networkx) to build graphs"
        ) from exc

    root = Path(root).resolve()
    if not root.is_dir():
        raise FileNotFoundError(f"not_found: {root}")

    # One repo-level Ansible fragment, then one fragment per source file.
    fragments: list[dict] = [extract_ansible.extract(root)]
    n_py = n_ts = 0
    for p in _iter_files(root):
        rel = p.relative_to(root).as_posix()
        if p.suffix == ".py":
            fragments.append(extract_python.extract(p, rel))
            n_py += 1
            continue
        suffix = p.suffix.lower()
        if suffix not in treesitter.LANG_EXTS:
            continue
        if languages and treesitter.EXT_LANG.get(suffix) not in languages:
            continue
        frag = treesitter.extract(p, rel)
        if frag["nodes"]:  # count only files that actually produced nodes
            n_ts += 1
            fragments.append(frag)

    G = nx.DiGraph()
    for frag in fragments:
        for n in frag["nodes"]:
            nid = n["id"]
            if nid in G:
                # Same id seen in several fragments: merge, never overwriting
                # an existing attribute with None.
                G.nodes[nid].update({k: v for k, v in n.items() if v is not None and k != "id"})
            else:
                G.add_node(nid, **{k: v for k, v in n.items() if k != "id"})
        for e in frag["edges"]:
            # add_edge creates missing endpoints itself, so edges that point at
            # ids no fragment declared still get a bare node.
            G.add_edge(e["source"], e["target"],
                       **{k: v for k, v in e.items() if k not in ("source", "target")})

    communities: list = []
    if G.number_of_nodes():
        try:
            communities = list(greedy_modularity_communities(G.to_undirected()))
        except Exception:
            # Best effort: a failure in community detection must not fail the
            # build; the graph is simply emitted without community labels.
            communities = []
    for cid, members in enumerate(communities):
        for nid in members:
            G.nodes[nid]["community"] = cid

    G.graph.update({
        "schema_version": SCHEMA_VERSION,
        "root": root.name,
        "files_python": n_py,
        "files_treesitter": n_ts,
        "treesitter_available": treesitter.AVAILABLE,
        "communities": len(communities),
        "node_count": G.number_of_nodes(),
        "edge_count": G.number_of_edges(),
    })
    return nx.node_link_data(G, edges="links")
@@ -0,0 +1,37 @@
1
+ """CLI: build a code graph from a source tree into a graph.json.
2
+
3
+ knowledge-gateway-graph <source> -o <vault>/.graph/<name>.json [--languages js ts ...]
4
+
5
+ Runs where the code is (the source tree may be outside any vault); writes a node-link
6
+ graph.json the gateway then serves read-only. AST-only, no network, no LLM.
7
+ """
8
+ from __future__ import annotations
9
+
10
+ import argparse
11
+ import json
12
+ import sys
13
+ from pathlib import Path
14
+
15
+
16
def main(argv: list[str] | None = None) -> int:
    """Command-line entry point: build a graph and write it as JSON.

    Parses `argv` (or the process arguments when None), builds the graph for the
    given source tree, writes it to the `--out` path (creating parent
    directories), prints a one-line summary and returns 0.
    """
    parser = argparse.ArgumentParser(
        prog="knowledge-gateway-graph",
        description="Build a code/Ansible knowledge graph (graph.json).",
    )
    parser.add_argument("source", help="source tree (code repo) to graph")
    parser.add_argument("-o", "--out", required=True, help="output graph.json path")
    parser.add_argument(
        "--languages",
        nargs="*",
        default=None,
        help="restrict the tree-sitter pass to these languages (default: all available)",
    )
    opts = parser.parse_args(argv)

    # Deferred so that argument parsing and --help work without networkx.
    from .build import build_graph  # imports networkx; needs the [graph] extra

    graph_data = build_graph(opts.source, languages=opts.languages)

    target = Path(opts.out)
    target.parent.mkdir(parents=True, exist_ok=True)
    payload = json.dumps(graph_data, ensure_ascii=False, indent=2)
    target.write_text(payload, encoding="utf-8")

    meta = graph_data.get("graph", {})
    summary = (
        f"built: {meta.get('node_count')} nodes, {meta.get('edge_count')} edges, "
        f"{meta.get('communities')} communities "
        f"(tree-sitter: {meta.get('treesitter_available')}) -> {target}"
    )
    print(summary)
    return 0


if __name__ == "__main__":  # pragma: no cover
    sys.exit(main())
@@ -0,0 +1,183 @@
1
+ """Ansible/YAML extractor - the relationships a generic AST tool cannot see.
2
+
3
+ Repo-level pass: walks playbooks, roles (tasks/handlers/meta), and filter_plugins,
4
+ emitting nodes for playbooks/plays/roles/tasksfiles/tasks/handlers/filters and edges
5
+ for uses_role, role_depends_on, include_role, include_tasks, notifies, has_tasks,
6
+ calls_filter (Ansible task -> Python filter plugin), and implemented_by (filter -> fn).
7
+ """
8
+ from __future__ import annotations
9
+
10
+ import ast
11
+ import re
12
+ from pathlib import Path
13
+
14
+ import yaml
15
+
16
# Task keys that pull in another tasks file (short and fully qualified names).
INCLUDE_TASKS = {
    "include_tasks",
    "import_tasks",
    "ansible.builtin.include_tasks",
    "ansible.builtin.import_tasks",
}
# Task keys that pull in a role (short and fully qualified names).
INCLUDE_ROLE = {
    "include_role",
    "import_role",
    "ansible.builtin.include_role",
    "ansible.builtin.import_role",
}
# Captures the identifier that follows a `|` in a string.
_FILTER_RE = re.compile(r"\|\s*([a-zA-Z_]\w*)")
# Directory names that are never scanned.
PRUNE = {
    ".git",
    "node_modules",
    ".venv",
    "venv",
    "__pycache__",
    ".graph",
    "dist",
    "build",
    ".pytest_cache",
    ".mypy_cache",
    ".ruff_cache",
    ".tox",
}


def _pruned(p, root) -> bool:
    """Return True when any component of p, relative to root, is a pruned name."""
    relative_parts = p.relative_to(root).parts
    return not PRUNE.isdisjoint(relative_parts)
27
+
28
+
29
def _safe_load(path: Path):
    """Read and parse a YAML file, returning None if anything goes wrong.

    The file is decoded as UTF-8 with undecodable bytes replaced. Any exception
    from reading or parsing is swallowed on purpose (best-effort extraction).
    """
    try:
        text = path.read_text(encoding="utf-8", errors="replace")
        parsed = yaml.safe_load(text)
    except Exception:
        return None
    return parsed
34
+
35
+
36
def _iter_strings(obj):
    """Yield every str found in obj, descending through dict values and lists.

    Dict keys are not yielded; values of any other type are ignored.
    """
    if isinstance(obj, str):
        yield obj
        return
    if isinstance(obj, dict):
        children = obj.values()
    elif isinstance(obj, list):
        children = obj
    else:
        return
    for child in children:
        yield from _iter_strings(child)
45
+
46
+
47
def extract(root: Path) -> dict:
    """Extract the Ansible-level graph fragment for the tree at root.

    Scans filter plugins, role meta/tasks/handlers files and top-level `*.yml`
    playbooks, and returns `{"nodes": [...], "edges": [...]}`. Files that cannot
    be read or parsed are skipped rather than failing the whole pass.
    """
    root = Path(root)
    nodes: dict[str, dict] = {}
    edges: list[dict] = []

    def node(nid, **attrs):
        # Create-or-merge: repeated calls only add attributes that are not None.
        cur = nodes.setdefault(nid, {"id": nid})
        cur.update({k: v for k, v in attrs.items() if v is not None})

    def edge(s, t, relation, confidence="EXTRACTED", score=None):
        # Both endpoints always exist as nodes, even if nothing else declares them.
        node(s)
        node(t)
        e = {"source": s, "target": t, "relation": relation, "confidence": confidence}
        if score is not None:
            e["confidence_score"] = score
        edges.append(e)

    # --- filter plugins: name -> function (what `| name` resolves to) ---
    filter_names: set[str] = set()
    for py in root.rglob("filter_plugins/*.py"):
        if _pruned(py, root):
            continue
        rel = py.relative_to(root).as_posix()
        try:
            tree = ast.parse(py.read_text(encoding="utf-8", errors="replace"))
        except (SyntaxError, ValueError, OSError):
            # ValueError: NUL bytes on Python < 3.12; OSError: unreadable file
            # or dangling symlink. One bad plugin must not abort the build.
            continue
        for f in tree.body:  # module-level functions only - avoid bogus method ids
            if isinstance(f, (ast.FunctionDef, ast.AsyncFunctionDef)):
                node(f"pyfunc:{rel}:{f.name}", label=f.name, type="function",
                     file_type="python", source_file=rel)
        for n in ast.walk(tree):
            if not (isinstance(n, ast.FunctionDef) and n.name == "filters"):
                continue
            # Every string key of any dict literal inside `filters` is treated
            # as a filter name; a bare-name value links it to that function.
            for d in ast.walk(n):
                if not isinstance(d, ast.Dict):
                    continue
                for k, v in zip(d.keys, d.values):
                    if isinstance(k, ast.Constant) and isinstance(k.value, str):
                        fname = k.value
                        filter_names.add(fname)
                        node(f"filter:{fname}", label=fname, type="ansible_filter",
                             file_type="python", source_file=rel)
                        if isinstance(v, ast.Name):
                            edge(f"filter:{fname}", f"pyfunc:{rel}:{v.id}", "implemented_by")

    def walk_tasks(tasks, owner, rel):
        if not isinstance(tasks, list):
            return
        for i, t in enumerate(tasks):
            if not isinstance(t, dict):
                continue
            for blk in ("block", "rescue", "always"):
                if blk in t:
                    walk_tasks(t[blk], owner, rel)
            name = t.get("name")
            # YAML may produce a non-string name (e.g. `name: 123`); coerce it so
            # slicing and labelling cannot raise.
            label = str(name) if name else "unnamed"
            tid = f"task:{rel}:{i}:" + label[:40]
            interesting = False
            for k in t:
                if k in INCLUDE_TASKS:
                    spec = t[k]
                    f = spec.get("file") if isinstance(spec, dict) else spec
                    if isinstance(f, str):
                        edge(owner, f"tasksfile:{(Path(rel).parent / f).as_posix()}", "include_tasks")
                        interesting = True
                if k in INCLUDE_ROLE:
                    spec = t[k]
                    rn = spec.get("name") if isinstance(spec, dict) else spec
                    if isinstance(rn, str):
                        edge(owner, f"role:{rn}", "include_role")
                        interesting = True
            nt = t.get("notify")
            # notify is a string or a list of strings; ignore any other shape.
            if isinstance(nt, str):
                notified = [nt]
            elif isinstance(nt, list):
                notified = nt
            else:
                notified = []
            for h in notified:
                if isinstance(h, str):
                    node(tid, label=label, type="task", file_type="ansible",
                         source_file=rel, source_location=f"#{i}")
                    edge(tid, f"handler:{h}", "notifies")
                    interesting = True
            used = {m for s in _iter_strings(t) for m in _FILTER_RE.findall(s) if m in filter_names}
            for fn in sorted(used):  # sorted: deterministic edge order across runs
                node(tid, label=label, type="task", file_type="ansible",
                     source_file=rel, source_location=f"#{i}")
                edge(tid, f"filter:{fn}", "calls_filter")
            # Only tasks that were materialised as nodes get attached to their owner.
            if (interesting or used) and tid in nodes:
                edge(owner, tid, "has_task")

    for meta in root.rglob("roles/*/meta/main.yml"):
        if _pruned(meta, root):
            continue
        rname = meta.parts[-3]  # roles/<rname>/meta/main.yml
        node(f"role:{rname}", label=rname, type="role", file_type="ansible",
             source_file=meta.relative_to(root).as_posix())
        data = _safe_load(meta) or {}
        deps = data.get("dependencies") if isinstance(data, dict) else None
        if isinstance(deps, list):  # a scalar here would be iterated char by char
            for dep in deps:
                dn = (dep.get("role") or dep.get("name")) if isinstance(dep, dict) else dep
                if isinstance(dn, str):
                    edge(f"role:{rname}", f"role:{dn}", "role_depends_on")

    for tf in root.rglob("roles/*/tasks/*.yml"):
        if _pruned(tf, root):
            continue
        rel = tf.relative_to(root).as_posix()
        rname = tf.parts[-3]  # roles/<rname>/tasks/<file>.yml
        node(f"tasksfile:{rel}", label=Path(rel).name, type="tasksfile",
             file_type="ansible", source_file=rel)
        node(f"role:{rname}", label=rname, type="role", file_type="ansible")
        edge(f"role:{rname}", f"tasksfile:{rel}", "has_tasks")
        walk_tasks(_safe_load(tf), f"tasksfile:{rel}", rel)

    for hf in root.rglob("roles/*/handlers/main.yml"):
        if _pruned(hf, root):
            continue
        hs = _safe_load(hf) or []
        rel = hf.relative_to(root).as_posix()
        for h in hs if isinstance(hs, list) else []:
            if isinstance(h, dict) and h.get("name"):
                node(f"handler:{h['name']}", label=h["name"], type="handler",
                     file_type="ansible", source_file=rel)

    # `playbook_*.yml` is also matched by `*.yml`; de-duplicate (keeping the
    # original ordering) so no playbook is parsed twice and no edge is doubled.
    playbooks = list(dict.fromkeys([*root.glob("playbook_*.yml"), *root.glob("*.yml")]))
    for pb in playbooks:
        plays = _safe_load(pb)
        if not isinstance(plays, list) or not any(
            isinstance(p, dict) and ("hosts" in p or "roles" in p or "tasks" in p) for p in plays
        ):
            continue  # not a playbook
        rel = pb.relative_to(root).as_posix()
        node(f"playbook:{rel}", label=pb.name, type="playbook", file_type="ansible", source_file=rel)
        for play in plays:
            if not isinstance(play, dict):
                continue
            roles = play.get("roles")
            for r in (roles if isinstance(roles, list) else []):
                rn = (r.get("role") or r.get("name")) if isinstance(r, dict) else r
                if isinstance(rn, str):
                    edge(f"playbook:{rel}", f"role:{rn}", "uses_role")
            for sec in ("tasks", "pre_tasks", "post_tasks", "handlers"):
                walk_tasks(play.get(sec) or [], f"playbook:{rel}", rel)

    return {"nodes": list(nodes.values()), "edges": edges}
@@ -0,0 +1,64 @@
1
+ """Python extractor (stdlib `ast`, no dependency): module/function/class/method nodes,
2
+ import edges, and within-file call edges (INFERRED).
3
+
4
+ Top-level functions get stable `pyfunc:<rel>:<name>` ids (these are what Ansible filter
5
+ plugins and call edges reference); methods are scoped as `pymethod:<rel>:<Class>.<name>`
6
+ so same-named functions/methods do not collide.
7
+ """
8
+ from __future__ import annotations
9
+
10
+ import ast
11
+ from pathlib import Path
12
+
13
_FUNC = (ast.FunctionDef, ast.AsyncFunctionDef)


def extract(path: Path, rel: str) -> dict:
    """Extract the graph fragment for one Python file.

    Args:
        path: file to read (decoded as UTF-8, undecodable bytes replaced).
        rel: path relative to the scanned root; used in every node id.

    Returns:
        `{"nodes": [...], "edges": [...]}` with a module node, top-level
        function/class nodes, method nodes, `defines`/`imports` edges and
        INFERRED `calls` edges between top-level functions of the same file.
        An empty fragment is returned when the file cannot be read or parsed.
    """
    try:
        tree = ast.parse(Path(path).read_text(encoding="utf-8", errors="replace"))
    except (SyntaxError, ValueError, OSError):
        # ValueError: NUL bytes on Python < 3.12. OSError: dangling symlink or
        # unreadable file. A single bad file must not abort the whole build.
        return {"nodes": [], "edges": []}

    nodes: list[dict] = []
    edges: list[dict] = []
    mod = f"module:{rel}"
    nodes.append({"id": mod, "label": rel, "type": "module", "file_type": "python", "source_file": rel})

    top_funcs: set[str] = set()
    for node in tree.body:  # module-level only -> stable, collision-free ids
        if isinstance(node, _FUNC):
            top_funcs.add(node.name)
            nid = f"pyfunc:{rel}:{node.name}"
            nodes.append({"id": nid, "label": node.name, "type": "function",
                          "file_type": "python", "source_file": rel, "source_location": f"L{node.lineno}"})
            edges.append({"source": mod, "target": nid, "relation": "defines", "confidence": "EXTRACTED"})
        elif isinstance(node, ast.ClassDef):
            cid = f"pyclass:{rel}:{node.name}"
            nodes.append({"id": cid, "label": node.name, "type": "class",
                          "file_type": "python", "source_file": rel, "source_location": f"L{node.lineno}"})
            edges.append({"source": mod, "target": cid, "relation": "defines", "confidence": "EXTRACTED"})
            for m in node.body:  # direct methods only; nested classes are not descended
                if isinstance(m, _FUNC):
                    mid = f"pymethod:{rel}:{node.name}.{m.name}"
                    nodes.append({"id": mid, "label": f"{node.name}.{m.name}", "type": "method",
                                  "file_type": "python", "source_file": rel, "source_location": f"L{m.lineno}"})
                    edges.append({"source": cid, "target": mid, "relation": "defines", "confidence": "EXTRACTED"})

    # Imports anywhere in the file; only the top-level package name is kept.
    for n in ast.walk(tree):
        if isinstance(n, ast.Import):
            for a in n.names:
                edges.append({"source": mod, "target": f"extmodule:{a.name.split('.')[0]}",
                              "relation": "imports", "confidence": "EXTRACTED"})
        elif isinstance(n, ast.ImportFrom) and n.module:  # `from . import x` has no module
            edges.append({"source": mod, "target": f"extmodule:{n.module.split('.')[0]}",
                          "relation": "imports", "confidence": "EXTRACTED"})

    # within-file calls between top-level functions (INFERRED)
    for fn in [n for n in tree.body if isinstance(n, _FUNC)]:
        caller = f"pyfunc:{rel}:{fn.name}"
        for c in ast.walk(fn):
            # Bare-name calls only; self-recursion is not recorded.
            if (isinstance(c, ast.Call) and isinstance(c.func, ast.Name)
                    and c.func.id in top_funcs and c.func.id != fn.name):
                edges.append({"source": caller, "target": f"pyfunc:{rel}:{c.func.id}",
                              "relation": "calls", "confidence": "INFERRED", "confidence_score": 0.85})
    return {"nodes": nodes, "edges": edges}
@@ -0,0 +1,84 @@
1
+ """Optional broad-language pass via tree-sitter-language-pack (the [graph-all] extra).
2
+
3
+ One dependency covers many languages (JS/TS/Go/Rust/Java/C#/PHP/Ruby/bash/PowerShell/
4
+ Terraform/...). If the package is not installed, AVAILABLE is False and the build simply
5
+ skips these languages - the Python (ast) and Ansible (yaml) passes need no tree-sitter.
6
+
7
+ Adding a language later = one line in EXT_LANG (data, not a rewrite).
8
+ """
9
+ from __future__ import annotations
10
+
11
+ from pathlib import Path
12
+
13
+ try: # the broad pass is opt-in; core install stays light
14
+ from tree_sitter_language_pack import get_parser
15
+ AVAILABLE = True
16
+ except Exception: # pragma: no cover - exercised only with the [graph-all] extra
17
+ AVAILABLE = False
18
+
19
+ # extension -> tree-sitter language name. Extend freely; coverage is data, not code.
20
+ EXT_LANG = {
21
+ ".js": "javascript", ".jsx": "javascript", ".mjs": "javascript", ".cjs": "javascript",
22
+ ".ts": "typescript", ".tsx": "tsx",
23
+ ".go": "go", ".rs": "rust", ".java": "java", ".cs": "c_sharp",
24
+ ".rb": "ruby", ".php": "php", ".sh": "bash", ".bash": "bash",
25
+ ".ps1": "powershell", ".psm1": "powershell",
26
+ ".tf": "hcl", ".tfvars": "hcl", ".hcl": "hcl",
27
+ ".c": "c", ".h": "c", ".cpp": "cpp", ".cc": "cpp", ".hpp": "cpp",
28
+ ".lua": "lua", ".kt": "kotlin", ".swift": "swift", ".scala": "scala",
29
+ }
30
+ LANG_EXTS = set(EXT_LANG)
31
+
32
+ # tree-sitter node types that denote a definition, across grammars (heuristic, language-agnostic).
33
+ _DEF_TYPES = {
34
+ "function_declaration", "function_definition", "function_item", "method_definition",
35
+ "method_declaration", "function", "method", "arrow_function", "func_literal",
36
+ "class_declaration", "class_definition", "class_specifier", "class", "struct_item",
37
+ "struct_specifier", "interface_declaration", "type_declaration", "module",
38
+ "block", # terraform/hcl resource/module blocks
39
+ }
40
# Child node types whose text is taken as the name of a definition.
_NAME_TYPES = (
    "identifier",
    "name",
    "type_identifier",
    "field_identifier",
    "constant",
    "word",
    "string_literal",
)


def _name(node, src: bytes) -> str | None:
    """Return the text of the first direct child that looks like a name.

    The text is sliced from src by the child's byte range, decoded as UTF-8
    (errors replaced), then stripped of double quotes and whitespace. Returns
    None when no direct child has a type listed in _NAME_TYPES.
    """
    first = next((child for child in node.children if child.type in _NAME_TYPES), None)
    if first is None:
        return None
    raw = src[first.start_byte:first.end_byte]
    return raw.decode("utf-8", "replace").strip('"').strip()
49
+
50
+
51
def extract(path: Path, rel: str) -> dict:
    """Extract a heuristic definition fragment for one file via tree-sitter.

    Returns an empty fragment when tree-sitter is unavailable, the extension is
    not mapped to a language, or loading/reading/parsing raises. Otherwise it
    returns a module node plus one node and one `defines` edge per named
    definition found anywhere in the syntax tree.
    """
    lang = EXT_LANG.get(path.suffix.lower()) if AVAILABLE else None
    if not lang:
        return {"nodes": [], "edges": []}
    try:
        parser = get_parser(lang)
        src = path.read_bytes()
        tree = parser.parse(src)
    except Exception:
        return {"nodes": [], "edges": []}

    module_id = f"module:{rel}"
    nodes: list[dict] = [
        {"id": module_id, "label": rel, "type": "module", "file_type": lang, "source_file": rel}
    ]
    edges: list[dict] = []

    emitted: set[str] = set()
    pending = [tree.root_node]
    while pending:  # explicit stack: depth-first walk without recursion
        cur = pending.pop()
        pending.extend(cur.children)
        if cur.type not in _DEF_TYPES:
            continue
        label = _name(cur, src)
        if not label:
            continue
        line = cur.start_point[0] + 1  # start_point row is zero-based
        nid = f"{lang}:{rel}:{label}:L{line}"
        if nid in emitted:
            continue
        emitted.add(nid)
        is_type = any(tag in cur.type for tag in ("class", "struct", "interface"))
        nodes.append({"id": nid, "label": label, "type": "class" if is_type else "function",
                      "file_type": lang, "source_file": rel, "source_location": f"L{line}"})
        edges.append({"source": module_id, "target": nid, "relation": "defines", "confidence": "EXTRACTED"})
    return {"nodes": nodes, "edges": edges}
gateway/config.py ADDED
@@ -0,0 +1,79 @@
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ from pathlib import Path
5
+
6
+ import yaml
7
+
8
+ from .vaults import Vault
9
+
10
+ DEFAULT_VAULTS_FILE = "vaults.yaml"
11
+ DEFAULT_TOKENS_FILE = "tokens.yaml"
12
+
13
+
14
def _base_dir() -> Path:
    """Return the directory one level above the one containing this module."""
    here = Path(__file__).resolve()
    return here.parents[1]
16
+
17
+
18
def _resolve(env_var: str, default_name: str) -> Path:
    """Locate a config file.

    A non-empty value of the environment variable `env_var` wins (user-expanded
    and resolved); otherwise `default_name` inside the base directory is used.
    """
    override = os.environ.get(env_var)
    if not override:
        return _base_dir() / default_name
    return Path(override).expanduser().resolve()
23
+
24
+
25
def load_vaults(path: Path | None = None) -> dict[str, Vault]:
    """Load and validate the vault definitions.

    Args:
        path: config file to read; when omitted, the file named by the
            KNOWLEDGE_GATEWAY_VAULTS environment variable or the default
            vaults file is used.

    Returns:
        Mapping of vault name -> Vault.

    Raises:
        FileNotFoundError: the config file does not exist.
        ValueError: the file is malformed, defines no vaults, a vault's
            subdir/repo_root/path are inconsistent, or two vault paths overlap.
    """
    cfg = path or _resolve("KNOWLEDGE_GATEWAY_VAULTS", DEFAULT_VAULTS_FILE)
    if not cfg.exists():
        raise FileNotFoundError(
            f"vaults config not found: {cfg} — copy vaults.example.yaml to vaults.yaml"
        )
    # Decode explicitly as UTF-8: the locale default would mis-read non-ASCII
    # names or paths on platforms whose locale encoding is not UTF-8.
    data = yaml.safe_load(cfg.read_text(encoding="utf-8")) or {}
    if not isinstance(data, dict):
        raise ValueError(f"{cfg}: top level must be a mapping with a 'vaults' key")
    specs = data.get("vaults") or {}
    if not isinstance(specs, dict):
        raise ValueError(f"{cfg}: 'vaults' must be a mapping of name to spec")
    vaults = {name: Vault.from_spec(name, spec) for name, spec in specs.items()}
    if not vaults:
        raise ValueError(f"no vaults defined in {cfg}")

    items = list(vaults.values())
    for v in items:
        if v.subdir == "." and v.repo_root != v.path:
            raise ValueError(
                f"vault '{v.name}': subdir '.' with repo_root != path would commit the whole repo"
            )
        if v.subdir.startswith("/") or ".." in Path(v.subdir).parts:
            raise ValueError(f"vault '{v.name}': subdir must be a relative path without '..'")
        # subdir must actually locate path under repo_root, else a commit scoped
        # to the subdir could touch a sibling tree (e.g. backend/) the vault
        # token was never granted.
        if (v.repo_root / v.subdir).resolve() != v.path:
            raise ValueError(
                f"vault '{v.name}': repo_root/subdir does not resolve to path"
            )
    # Pairwise check: no vault may equal, contain, or be contained in another.
    for i, a in enumerate(items):
        for b in items[i + 1:]:
            if a.path == b.path or a.path.is_relative_to(b.path) or b.path.is_relative_to(a.path):
                raise ValueError(
                    f"vault paths overlap ('{a.name}', '{b.name}') — grants would leak across them"
                )
    return vaults
61
+
62
+
63
def load_tokens(path: Path | None = None) -> dict:
    """Load the raw `tokens` mapping from the tokens config file.

    Args:
        path: config file to read; when omitted, the file named by the
            KNOWLEDGE_GATEWAY_TOKENS environment variable or the default
            tokens file is used.

    Returns:
        The value of the top-level `tokens` key, or an empty dict when it is
        missing or empty.

    Raises:
        FileNotFoundError: the config file does not exist.
        PermissionError: on POSIX, the file is accessible to group or others.
        ValueError: the file's top level is not a mapping.
    """
    cfg = path or _resolve("KNOWLEDGE_GATEWAY_TOKENS", DEFAULT_TOKENS_FILE)
    if not cfg.exists():
        raise FileNotFoundError(
            f"tokens config not found: {cfg} — copy tokens.example.yaml to tokens.yaml"
        )
    # Secrets file: refuse to load if it is group/world-accessible. The docs tell
    # admins to `chmod 0600`, but nothing enforced it — a 0644 tokens.yaml would
    # silently expose every bearer token to other local users.
    if os.name == "posix":
        mode = cfg.stat().st_mode & 0o777
        if mode & 0o077:
            raise PermissionError(
                f"{cfg} is group/world-accessible (mode {mode:#o}); run: chmod 0600 {cfg}"
            )
    # Decode explicitly as UTF-8 instead of relying on the locale default.
    data = yaml.safe_load(cfg.read_text(encoding="utf-8")) or {}
    if not isinstance(data, dict):
        # e.g. a top-level list; fail with a config error, not an AttributeError.
        raise ValueError(f"{cfg}: top level must be a mapping with a 'tokens' key")
    return data.get("tokens") or {}
gateway/convert.py ADDED
@@ -0,0 +1,25 @@
1
+ """Convert attachments (PDF / Office / images / HTML / ...) to Markdown via markitdown.
2
+
3
+ markitdown is an optional dependency (the [convert] extra); it is imported lazily so the
4
+ gateway core never requires it. Returns Markdown text; writes nothing.
5
+ """
6
+ from __future__ import annotations
7
+
8
+ from pathlib import Path
9
+
10
+ MAX_CONVERT_BYTES = 50 * 1024 * 1024
11
+
12
+
13
def to_markdown(path: Path) -> str:
    """Convert the document at path to Markdown text using markitdown.

    Args:
        path: file to convert; nothing is written.

    Returns:
        The converted Markdown, or an empty string when the converter yields
        no text.

    Raises:
        ValueError: `convert_unavailable` when markitdown is not installed,
            `too_large` when the file exceeds MAX_CONVERT_BYTES, or
            `convert_failed` when markitdown raises. The original exception is
            attached as the cause.
        OSError: if the file cannot be stat'ed (e.g. it does not exist).
    """
    try:
        from markitdown import MarkItDown
    except ImportError as exc:
        raise ValueError(
            "convert_unavailable: install the [convert] extra (markitdown) to convert documents"
        ) from exc
    p = Path(path)
    # Size gate before handing the file to the converter.
    if p.stat().st_size > MAX_CONVERT_BYTES:
        raise ValueError(f"too_large: {p.name} is over {MAX_CONVERT_BYTES // (1024 * 1024)} MiB")
    try:
        result = MarkItDown().convert(str(p))
    except Exception as e:
        # Explicit chaining keeps the converter's own traceback as the cause.
        raise ValueError(f"convert_failed: {p.name}: {e}") from e
    return result.text_content or ""