vigil-codeintel 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131) hide show
  1. vigil_codeintel-0.1.0.dist-info/METADATA +780 -0
  2. vigil_codeintel-0.1.0.dist-info/RECORD +131 -0
  3. vigil_codeintel-0.1.0.dist-info/WHEEL +5 -0
  4. vigil_codeintel-0.1.0.dist-info/entry_points.txt +3 -0
  5. vigil_codeintel-0.1.0.dist-info/licenses/LICENSE +21 -0
  6. vigil_codeintel-0.1.0.dist-info/top_level.txt +3 -0
  7. vigil_forensic/__init__.py +224 -0
  8. vigil_forensic/_git_utils.py +178 -0
  9. vigil_forensic/_shared.py +510 -0
  10. vigil_forensic/_stubs.py +156 -0
  11. vigil_forensic/gate_checks/__init__.py +1 -0
  12. vigil_forensic/gate_checks/_ast_helpers.py +629 -0
  13. vigil_forensic/gate_checks/_deployment_detector.py +573 -0
  14. vigil_forensic/gate_checks/atomic_write_checks.py +1143 -0
  15. vigil_forensic/gate_checks/authority_checks.py +95 -0
  16. vigil_forensic/gate_checks/boundary_breach_checks.py +202 -0
  17. vigil_forensic/gate_checks/broad_except_checks.py +301 -0
  18. vigil_forensic/gate_checks/broad_except_hidden_sentinel_checks.py +365 -0
  19. vigil_forensic/gate_checks/common.py +253 -0
  20. vigil_forensic/gate_checks/config_safety_checks.py +704 -0
  21. vigil_forensic/gate_checks/config_ssot_checks.py +78 -0
  22. vigil_forensic/gate_checks/conflict_checks.py +193 -0
  23. vigil_forensic/gate_checks/context_fallback_checks.py +697 -0
  24. vigil_forensic/gate_checks/context_health_checks.py +289 -0
  25. vigil_forensic/gate_checks/contract_shape_drift_checks.py +459 -0
  26. vigil_forensic/gate_checks/dirty_baseline_check.py +274 -0
  27. vigil_forensic/gate_checks/duplication_checks.py +387 -0
  28. vigil_forensic/gate_checks/embedded_string_checks.py +123 -0
  29. vigil_forensic/gate_checks/empty_output_checks.py +87 -0
  30. vigil_forensic/gate_checks/encoding_checks.py +847 -0
  31. vigil_forensic/gate_checks/export_completeness_checks.py +156 -0
  32. vigil_forensic/gate_checks/fallback_checks.py +41 -0
  33. vigil_forensic/gate_checks/file_proliferation_checks.py +171 -0
  34. vigil_forensic/gate_checks/fix_without_test_checks.py +69 -0
  35. vigil_forensic/gate_checks/forensic_cluster_runners/__init__.py +9 -0
  36. vigil_forensic/gate_checks/forensic_cluster_runners/_helpers.py +71 -0
  37. vigil_forensic/gate_checks/forensic_cluster_runners/advanced_checks.py +322 -0
  38. vigil_forensic/gate_checks/forensic_cluster_runners/core.py +273 -0
  39. vigil_forensic/gate_checks/forensic_cluster_runners/integrity_checks.py +203 -0
  40. vigil_forensic/gate_checks/forensic_cluster_runners/quality_checks.py +666 -0
  41. vigil_forensic/gate_checks/forensic_clusters/__init__.py +193 -0
  42. vigil_forensic/gate_checks/forensic_clusters/allowlist.py +426 -0
  43. vigil_forensic/gate_checks/forensic_clusters/allowlist_writer.py +302 -0
  44. vigil_forensic/gate_checks/forensic_clusters/api_protocol.py +231 -0
  45. vigil_forensic/gate_checks/forensic_clusters/async_quality.py +1156 -0
  46. vigil_forensic/gate_checks/forensic_clusters/code_style.py +808 -0
  47. vigil_forensic/gate_checks/forensic_clusters/core.py +319 -0
  48. vigil_forensic/gate_checks/forensic_clusters/data_quality.py +763 -0
  49. vigil_forensic/gate_checks/forensic_clusters/dead_code.py +480 -0
  50. vigil_forensic/gate_checks/forensic_clusters/edit_mutation.py +842 -0
  51. vigil_forensic/gate_checks/forensic_clusters/exception_boundary.py +240 -0
  52. vigil_forensic/gate_checks/forensic_clusters/legacy_debt.py +556 -0
  53. vigil_forensic/gate_checks/forensic_clusters/static_analysis.py +834 -0
  54. vigil_forensic/gate_checks/forensic_clusters/structural_quality.py +298 -0
  55. vigil_forensic/gate_checks/god_object_zones_checks.py +173 -0
  56. vigil_forensic/gate_checks/hallucination_checks.py +566 -0
  57. vigil_forensic/gate_checks/hunter_artifact_completeness_check.py +139 -0
  58. vigil_forensic/gate_checks/implementation_overfit_checks.py +380 -0
  59. vigil_forensic/gate_checks/import_integrity_checks.py +233 -0
  60. vigil_forensic/gate_checks/imports_in_function_checks.py +283 -0
  61. vigil_forensic/gate_checks/ml_checks.py +318 -0
  62. vigil_forensic/gate_checks/performance_checks.py +106 -0
  63. vigil_forensic/gate_checks/project_specific_runner.py +691 -0
  64. vigil_forensic/gate_checks/provider_capability_checks.py +73 -0
  65. vigil_forensic/gate_checks/refactor_completeness_checks.py +274 -0
  66. vigil_forensic/gate_checks/reliability_checks.py +389 -0
  67. vigil_forensic/gate_checks/reporting_checks.py +55 -0
  68. vigil_forensic/gate_checks/runtime_behavior_checks.py +220 -0
  69. vigil_forensic/gate_checks/security_injection_checks.py +332 -0
  70. vigil_forensic/gate_checks/semantic_intent_checks.py +139 -0
  71. vigil_forensic/gate_checks/size_complexity_checks.py +336 -0
  72. vigil_forensic/gate_checks/stuck_feature_flag_checks.py +354 -0
  73. vigil_forensic/gate_checks/syntax_validity_checks.py +217 -0
  74. vigil_forensic/gate_checks/temporal_freshness_checks.py +79 -0
  75. vigil_forensic/gate_checks/test_quality_checks.py +946 -0
  76. vigil_forensic/gate_checks/testing_checks.py +149 -0
  77. vigil_forensic/gate_checks/toctou_checks.py +367 -0
  78. vigil_forensic/gate_checks/type_checking_checks.py +316 -0
  79. vigil_forensic/gate_models.py +392 -0
  80. vigil_forensic/gate_packs/__init__.py +1 -0
  81. vigil_forensic/gate_packs/universal.py +179 -0
  82. vigil_forensic/gate_profile.json +31 -0
  83. vigil_forensic/gate_registry.py +21 -0
  84. vigil_forensic/language_profiles.py +219 -0
  85. vigil_forensic/meta_findings.py +207 -0
  86. vigil_forensic/self_audit.py +725 -0
  87. vigil_forensic/source_analysis.py +175 -0
  88. vigil_mapper/__init__.py +103 -0
  89. vigil_mapper/_ast_helpers_minimal.py +229 -0
  90. vigil_mapper/_extract_imports_impl.py +123 -0
  91. vigil_mapper/_file_count_guard.py +129 -0
  92. vigil_mapper/_git_utils.py +178 -0
  93. vigil_mapper/_runtime_ast.py +438 -0
  94. vigil_mapper/_runtime_dispatch.py +137 -0
  95. vigil_mapper/_seed_helpers.py +82 -0
  96. vigil_mapper/authority_builder.py +1102 -0
  97. vigil_mapper/cli_entry.py +731 -0
  98. vigil_mapper/conflict_builder.py +818 -0
  99. vigil_mapper/data_contract_builder.py +446 -0
  100. vigil_mapper/findings_builder.py +716 -0
  101. vigil_mapper/fingerprint.py +53 -0
  102. vigil_mapper/hotspot_builder.py +539 -0
  103. vigil_mapper/map_common.py +449 -0
  104. vigil_mapper/map_errors.py +55 -0
  105. vigil_mapper/map_models.py +431 -0
  106. vigil_mapper/map_models_ext.py +206 -0
  107. vigil_mapper/map_models_findings.py +130 -0
  108. vigil_mapper/map_storage.py +455 -0
  109. vigil_mapper/parse_cache.py +795 -0
  110. vigil_mapper/refactor_boundary_builder.py +266 -0
  111. vigil_mapper/runtime_builder.py +527 -0
  112. vigil_mapper/runtime_tracer.py +243 -0
  113. vigil_mapper/runtime_tracer_entry.py +199 -0
  114. vigil_mapper/semantic_diff.py +71 -0
  115. vigil_mapper/source_adapters/__init__.py +109 -0
  116. vigil_mapper/source_adapters/_base.py +264 -0
  117. vigil_mapper/source_adapters/_ir.py +156 -0
  118. vigil_mapper/source_adapters/_lexer.py +309 -0
  119. vigil_mapper/source_adapters/_patterns.py +212 -0
  120. vigil_mapper/source_adapters/_treesitter.py +182 -0
  121. vigil_mapper/source_adapters/go.py +553 -0
  122. vigil_mapper/source_adapters/java.py +541 -0
  123. vigil_mapper/source_adapters/javascript.py +626 -0
  124. vigil_mapper/source_adapters/python.py +325 -0
  125. vigil_mapper/source_adapters/typescript.py +749 -0
  126. vigil_mapper/structural_builder.py +586 -0
  127. vigil_mcp/__init__.py +1 -0
  128. vigil_mcp/_jobs.py +587 -0
  129. vigil_mcp/_paths.py +93 -0
  130. vigil_mcp/forensic_server.py +419 -0
  131. vigil_mcp/map_server.py +452 -0
@@ -0,0 +1,1102 @@
1
+ """Authority map builder -- reads seed file and auto-discovers writers via AST.
2
+
3
+ Generic tool: operates on any target project_dir.
4
+ Seed file: <project_dir>/.cortex/map_seeds/authority_domains.json
5
+
6
+ WITH a seed: each domain seed entry may carry ``target_file_patterns`` (glob
7
+ patterns). A writer is attributed to a domain only when at least one resolved
8
+ write-target path matches a pattern. Writers with unresolvable targets are
9
+ dropped from all domains. Empty/missing patterns -> no per-domain discovery.
10
+
11
+ WITHOUT a seed (out-of-box): every discovered write site is auto-surfaced as an
12
+ inferred per-writer ``AuthorityDomain`` (status="inferred", source="static_scan")
13
+ so the map is useful immediately. Each entry names the writer file plus its
14
+ write targets and operation kinds. Pure reads never produce an entry.
15
+
16
+ Write detection (Python AST): ``.write_text`` / ``.write_bytes`` / ``.save`` /
17
+ ``os.replace`` (method writes) and ``open(..., "w"/"a"/"x"/"+")`` / ``json.dump``
18
+ (function writes). Reads -- ``open(p)`` / ``open(p, "r")`` / ``.read_text()`` /
19
+ ``json.load`` / ``json.dumps`` -- are NOT writes. Non-Python writers (Go/Java/
20
+ JS/TS) are detected via adapter ``extract_writer_calls`` and surface the same way.
21
+ """
22
+ from __future__ import annotations
23
+
24
+ import ast
25
+ import fnmatch
26
+ import hashlib
27
+ import json
28
+ import logging
29
+ import re
30
+ from pathlib import Path
31
+ from typing import Any, NamedTuple, Sequence
32
+
33
+ from .map_common import classify_file_role, iter_py_files, iter_source_files, make_metadata
34
+ from .source_adapters import get_adapter_for_file
35
+ from .source_adapters._ir import AuthorityWriteCandidate
36
+ from .map_errors import MapIntegrityError
37
+ from .map_models import AuthorityDomain
38
+ from .map_storage import seeds_dir
39
+
40
# Names exported by ``from <module> import *``.
__all__ = ["build_authority_map"]

# Module-level logger, named after this module.
_log = logging.getLogger(__name__)

# File name of the authority-domains seed inside the project's seeds directory.
_SEED_FILENAME = "authority_domains.json"
# Attribute names treated as receiver-method writes, e.g. ``path.write_text(...)``.
_WRITE_METHOD_NAMES = frozenset({"write_text", "write_bytes", "save"})
# Sentinel recorded as the target when a write destination cannot be resolved.
_UNKNOWN_TARGET = "__unknown_target__"
47
+
48
+
49
def _open_mode_is_write(mode: str) -> bool:
    """Report whether an ``open()`` mode string can modify the target file.

    A mode counts as a write when it contains any of ``w``, ``a``, ``x`` or
    ``+``. Plain read modes such as ``"r"``, ``"rb"`` and ``"rt"`` contain none
    of these and are therefore reads; the ``b`` / ``t`` modifiers on their own
    never make a mode a write.

    Args:
        mode: The mode string as written in the ``open()`` call.

    Returns:
        True if the mode contains a mutating flag, otherwise False.
    """
    for flag in "wax+":
        if flag in mode:
            return True
    return False
58
+
59
# Provenance type constants for path tracking: each value names how the path
# string bound to a variable was obtained in the scanned source.
_PROVENANCE_PATH_CONSTRUCTOR = "path_constructor"  # Path(...), PurePath(...), etc.
_PROVENANCE_STRING_LITERAL = "string_literal"  # "literal_path"
_PROVENANCE_FUNCTION_PARAM = "function_parameter"  # def foo(target):
_PROVENANCE_UNKNOWN = "unknown"  # origin not determined (e.g. Path(<non-literal>))
64
+
65
+
66
+ # ---------------------------------------------------------------------------
67
+ # Write call tracking
68
+ # ---------------------------------------------------------------------------
69
+
70
+ class WriteCall(NamedTuple):
71
+ """Represents a single write call with provenance and location info."""
72
+ target: str # resolved target path or _UNKNOWN_TARGET
73
+ operation: str # "write_text" | "write_bytes" | "os.replace" | "save" | "unknown"
74
+ line: int | None # source line number of the call (or None if unavailable)
75
+ provenance: str # "path_constructor" | "string_literal" | "function_parameter" | "unknown"
76
+
77
+
78
+ # ---------------------------------------------------------------------------
79
+ # Glob matching — ** support (PurePath.match added ** only in Python 3.12)
80
+ # ---------------------------------------------------------------------------
81
+
82
def _match_glob_path(path: str, pattern: str) -> bool:
    """Match a path against a glob pattern, with ``**`` support.

    Backslashes in both arguments are converted to forward slashes first.
    Patterns without ``**`` are handed straight to ``fnmatch.fnmatch``; all
    others go through ``_match_double_star``.

    Args:
        path: Path to test.
        pattern: Glob pattern, optionally containing ``**``.

    Returns:
        True if the path matches the pattern.
    """
    norm_path = path.replace("\\", "/")
    norm_pattern = pattern.replace("\\", "/")
    if "**" in norm_pattern:
        return _match_double_star(norm_path, norm_pattern)
    return fnmatch.fnmatch(norm_path, norm_pattern)
89
+
90
+
91
def _match_double_star(path: str, pattern: str) -> bool:
    """Match ``path`` against ``pattern``, where ``**`` spans zero or more segments.

    The pattern is split at its first ``**``. The part before it must match the
    same number of leading path segments; the part after it is matched
    recursively against every suffix of the remaining segments.

    Args:
        path: Forward-slash separated path.
        pattern: Forward-slash separated glob pattern.

    Returns:
        True if the path matches the pattern.
    """
    head, marker, tail = pattern.partition("**")
    if not marker:
        # No ** left: ordinary fnmatch semantics apply.
        return fnmatch.fnmatch(path, pattern)

    head = head.rstrip("/")
    tail = tail.lstrip("/")
    segments = path.split("/")

    if head:
        depth = head.count("/") + 1
        if depth > len(segments):
            return False
        if not fnmatch.fnmatch("/".join(segments[:depth]), head):
            return False
        segments = segments[depth:]

    if not tail:
        # A trailing ** accepts whatever remains.
        return True

    # Try letting ** swallow 0, 1, ... up to all of the remaining segments.
    return any(
        _match_double_star("/".join(segments[start:]), tail)
        for start in range(len(segments) + 1)
    )
112
+
113
+
114
+ # ---------------------------------------------------------------------------
115
+ # AST helpers
116
+ # ---------------------------------------------------------------------------
117
+
118
+ def _extract_string_value(node: ast.expr | None) -> str | None:
119
+ if isinstance(node, ast.Constant) and isinstance(node.value, str):
120
+ return node.value
121
+ return None
122
+
123
+
124
def _normalize_target_path(target: str) -> str:
    """Map a temp/backup file path to the canonical path it stands for.

    Two forms are recognised, in this order:

    1. Unique temp names ``<base>.<hex-or-dash run of 8+ chars>.tmp`` -> ``<base>``.
       This is checked first; otherwise the plain ``.tmp`` rule below would
       consume the suffix and leave the unique id attached.
    2. A trailing ``.tmp`` / ``.bak`` / ``.backup`` / ``.temp`` on the file name.

    A file name that consists only of one of those suffixes (e.g. ``.tmp``) is
    returned unchanged, since stripping it would leave an empty name.

    Args:
        target: Path string of a write target.

    Returns:
        The normalised path string, or ``target`` itself when nothing applies.
    """
    # Unique temp form, e.g. state.0a1b2c3d4e5f.tmp -> state
    stripped = re.sub(r'\.[0-9a-f\-]{8,}\.tmp$', '', target)
    if stripped != target and stripped and not stripped.endswith(("/", "\\")):
        return stripped

    name = Path(target).name
    for suffix in (".tmp", ".bak", ".backup", ".temp"):
        # name != suffix: Path.with_name("") would raise ValueError.
        if name.endswith(suffix) and name != suffix:
            return str(Path(target).with_name(name[: -len(suffix)]))
    return target
136
+
137
+
138
def _is_plausible_path(s: str) -> bool:
    """Decide whether a string looks like a file path rather than arbitrary text.

    Rejected: empty strings, strings longer than 512 characters, and strings
    containing a line break. Accepted: anything whose final component is one of
    the well-known extensionless file names (Makefile, Dockerfile, Procfile,
    LICENSE, README), and otherwise anything containing ``/``, ``\\`` or ``.``.

    Args:
        s: Candidate string.

    Returns:
        True if the string is plausibly a path.
    """
    if not s:
        return False
    if len(s) > 512:
        return False
    if "\n" in s or "\r" in s:
        return False
    # Extensionless names that are still legitimate write targets.
    if Path(s).name in {"Makefile", "Dockerfile", "Procfile", "LICENSE", "README"}:
        return True
    # Everything else needs at least one path-like character.
    return any(marker in s for marker in ("/", "\\", "."))
154
+
155
+
156
def _module_prefix(rel_posix: str) -> str:
    """Return the first ``/``-separated component of a relative POSIX path."""
    first, _sep, _rest = rel_posix.partition("/")
    return first
159
+
160
+
161
def _safe_domain_name(target: str) -> str:
    """Convert a target path to a safe domain name of at most 40 characters.

    The name has the shape ``<parent>_<stem>_<hash>``. ``<parent>`` is the name
    of the immediate parent directory (omitted when there is none), ``<stem>``
    is the file name without its last suffix, and ``<hash>`` is a 4-hex-digit
    blake2s digest of the full target string, which is stable across
    processes. Every character outside ``[a-zA-Z0-9_]`` becomes ``_``.

    When the result would exceed 40 characters, the ``<parent>_<stem>`` part is
    shortened and the hash is always kept, so two long targets sharing a prefix
    still get distinct names.

    Args:
        target: Target path string.

    Returns:
        The sanitised domain name, at most 40 characters long.
    """
    max_len = 40
    p = Path(target)
    parts = []
    # Parent directory name disambiguates api/config.json from settings/config.yaml.
    if p.parent.name and p.parent.name not in (".", ""):
        parts.append(p.parent.name)
    parts.append(p.stem)

    # digest_size=2 -> 4 hex characters.
    target_hash = hashlib.blake2s(
        target.encode("utf-8"),
        digest_size=2,
    ).hexdigest()

    label = re.sub(r'[^a-zA-Z0-9_]', '_', "_".join(parts))
    # Reserve room for "_" + hash so truncation never removes the hash.
    budget = max_len - len(target_hash) - 1
    return "%s_%s" % (label[:budget], target_hash)
183
+
184
+
185
def _seed_covers_target(target: str, seed_domains: list[dict]) -> bool:
    """Return True if any seed domain's ``target_file_patterns`` matches ``target``.

    A domain whose ``target_file_patterns`` is missing, ``null`` or empty
    contributes no patterns. A single string is treated as one pattern rather
    than being iterated character by character, where a lone ``*`` character
    would match every target.

    Args:
        target: Write-target path to test.
        seed_domains: Domain definitions loaded from the seed file.

    Returns:
        True if at least one pattern of at least one domain matches.
    """
    for domain_def in seed_domains:
        patterns = domain_def.get("target_file_patterns") or ()
        if isinstance(patterns, str):
            patterns = (patterns,)
        if any(_match_glob_path(target, pattern) for pattern in patterns):
            return True
    return False
192
+
193
+
194
+ def _resolve_call_target(
195
+ call_node: ast.Call,
196
+ assignments: dict[str, str],
197
+ aliases: dict[str, str] | None = None,
198
+ ) -> str | None:
199
+ """Resolve the file-path target of a write call via AST analysis."""
200
+ if aliases is None:
201
+ aliases = {}
202
+ func = call_node.func
203
+ if not isinstance(func, ast.Attribute):
204
+ return None
205
+ receiver = func.value
206
+ # Path("literal").write_text(...) or path.with_suffix(...).write_text(...)
207
+ if isinstance(receiver, ast.Call) and isinstance(receiver.func, (ast.Name, ast.Attribute)):
208
+ fname = receiver.func.id if isinstance(receiver.func, ast.Name) else receiver.func.attr
209
+ if fname in ("Path", "PurePath", "PosixPath", "WindowsPath") and receiver.args:
210
+ return _extract_string_value(receiver.args[0])
211
+ # .with_suffix(...).write_text() or .with_name(...).write_text()
212
+ if fname in ("with_suffix", "with_name", "with_stem"):
213
+ inner = receiver.func.value if isinstance(receiver.func, ast.Attribute) else None
214
+ if isinstance(inner, ast.Name):
215
+ name = inner.id
216
+ resolved = name
217
+ visited: set[str] = {resolved}
218
+ for _ in range(8):
219
+ nxt = aliases.get(resolved)
220
+ if nxt is None or nxt in visited:
221
+ break
222
+ visited.add(nxt)
223
+ resolved = nxt
224
+ return assignments.get(resolved)
225
+ # name.write_text(...) with alias following
226
+ if isinstance(receiver, ast.Name):
227
+ name = receiver.id
228
+ resolved = name
229
+ visited: set[str] = {resolved}
230
+ for _ in range(8):
231
+ nxt = aliases.get(resolved)
232
+ if nxt is None or nxt in visited:
233
+ break
234
+ visited.add(nxt)
235
+ resolved = nxt
236
+ return assignments.get(resolved)
237
+ # self.attr.write_text(...)
238
+ if isinstance(receiver, ast.Attribute) and isinstance(receiver.value, ast.Name):
239
+ return assignments.get("%s.%s" % (receiver.value.id, receiver.attr))
240
+ return None
241
+
242
+
243
+ def _resolve_func_arg_target(
244
+ arg_node: ast.expr | None,
245
+ assignments: dict[str, str],
246
+ aliases: dict[str, str],
247
+ ) -> str | None:
248
+ """Resolve a path-like target from a positional call argument.
249
+
250
+ Handles a string literal, ``Path("literal")``, or a variable name that an
251
+ assignment resolved to a string/Path (with alias chaining). Returns None
252
+ when the target cannot be resolved.
253
+ """
254
+ if arg_node is None:
255
+ return None
256
+ # "literal"
257
+ lit = _extract_string_value(arg_node)
258
+ if lit is not None:
259
+ return lit
260
+ # Path("literal") / PurePath("literal")
261
+ if isinstance(arg_node, ast.Call) and isinstance(arg_node.func, (ast.Name, ast.Attribute)):
262
+ fname = arg_node.func.id if isinstance(arg_node.func, ast.Name) else arg_node.func.attr
263
+ if fname in ("Path", "PurePath", "PosixPath", "WindowsPath") and arg_node.args:
264
+ return _extract_string_value(arg_node.args[0])
265
+ # variable name -> resolve through aliases + assignments
266
+ if isinstance(arg_node, ast.Name):
267
+ resolved = arg_node.id
268
+ visited: set[str] = {resolved}
269
+ for _ in range(8):
270
+ nxt = aliases.get(resolved)
271
+ if nxt is None or nxt in visited:
272
+ break
273
+ visited.add(nxt)
274
+ resolved = nxt
275
+ return assignments.get(resolved)
276
+ return None
277
+
278
+
279
+ def _detect_func_write(
280
+ node: ast.Call,
281
+ assignments: dict[str, str],
282
+ aliases: dict[str, str],
283
+ ) -> tuple[str, str | None] | None:
284
+ """Detect ``open(path, "w")`` and ``json.dump(obj, fp)`` function-call writes.
285
+
286
+ These are plain function calls (``ast.Name`` / module ``ast.Attribute``),
287
+ NOT receiver-method calls, so the standard ``_WRITE_METHOD_NAMES`` scan
288
+ misses them. Returns ``(operation, target_or_None)`` for a write, else None.
289
+
290
+ - ``open(path, mode)`` is a write only when ``mode`` mutates the target
291
+ (see :func:`_open_mode_is_write`). A bare ``open(p)`` / ``open(p, "r")``
292
+ is a READ → returns None (precision guard).
293
+ - ``json.dump(obj, fp)`` writes to ``fp``; ``json.dumps`` (returns a string)
294
+ and ``json.load`` / ``json.loads`` (reads) are NOT writes.
295
+ """
296
+ func = node.func
297
+
298
+ # open(...) — builtin name (assume builtin; shadowing is rare and out of scope)
299
+ if isinstance(func, ast.Name) and func.id == "open":
300
+ mode = "r" # open() default
301
+ if len(node.args) >= 2:
302
+ lit = _extract_string_value(node.args[1])
303
+ if lit is not None:
304
+ mode = lit
305
+ else:
306
+ for kw in node.keywords:
307
+ if kw.arg == "mode":
308
+ lit = _extract_string_value(kw.value)
309
+ if lit is not None:
310
+ mode = lit
311
+ if not _open_mode_is_write(mode):
312
+ return None
313
+ target = _resolve_func_arg_target(
314
+ node.args[0] if node.args else None, assignments, aliases
315
+ )
316
+ return ("open_write", target)
317
+
318
+ # json.dump(obj, fp, ...) — module attribute call. Target = fp (2nd arg),
319
+ # best-effort (usually a file handle variable). dumps/load/loads excluded.
320
+ if isinstance(func, ast.Attribute) and func.attr == "dump":
321
+ receiver = func.value
322
+ if isinstance(receiver, ast.Name) and receiver.id == "json":
323
+ fp_node = node.args[1] if len(node.args) >= 2 else None
324
+ target = _resolve_func_arg_target(fp_node, assignments, aliases)
325
+ return ("json_dump", target)
326
+
327
+ return None
328
+
329
+
330
def _collect_assignments(tree: ast.AST) -> tuple[dict[str, tuple[str, str]], dict[str, str]]:
    """Collect path-like assignments and alias relations from an AST.

    Two passes are made over the whole tree; scopes are not distinguished.

    1. Every positional parameter of every ``def`` is recorded with an empty
       path and provenance ``function_parameter``.
    2. Single-target assignments to ``name`` or ``obj.attr`` then overwrite
       those entries: a string literal is recorded as ``string_literal``; a
       ``Path``/``PurePath``/``PosixPath``/``WindowsPath`` call is recorded as
       ``path_constructor`` when its first argument is a string literal and
       as ``("", "unknown")`` otherwise; ``x = y.with_suffix(...)`` (also
       with_name/with_stem) records the alias ``x -> y``.

    Args:
        tree: Parsed module (or any AST subtree).

    Returns:
        ``(assignments_typed, aliases)`` where ``assignments_typed`` maps a
        name to ``(path_string, provenance_type)`` and ``aliases`` maps a name
        to the name it was derived from.
    """
    assignments_typed: dict[str, tuple[str, str]] = {}
    aliases: dict[str, str] = {}

    # PASS 1: function parameters (lower precedence).
    # NOTE(review): async functions and keyword-only / positional-only
    # parameters are not recorded here -- confirm whether that is intended.
    for node in ast.walk(tree):
        if not isinstance(node, ast.FunctionDef):
            continue
        for arg in node.args.args:
            assignments_typed[arg.arg] = ("", _PROVENANCE_FUNCTION_PARAM)

    # PASS 2: assignments (higher precedence, overwrite same-named parameters).
    for node in ast.walk(tree):
        if not isinstance(node, ast.Assign) or len(node.targets) != 1:
            continue
        target = node.targets[0]
        if isinstance(target, ast.Name):
            key = target.id
        elif isinstance(target, ast.Attribute) and isinstance(target.value, ast.Name):
            key = "%s.%s" % (target.value.id, target.attr)
        else:
            continue

        value = node.value
        if isinstance(value, ast.Constant) and isinstance(value.value, str):
            assignments_typed[key] = (value.value, _PROVENANCE_STRING_LITERAL)
            continue
        if not (isinstance(value, ast.Call) and isinstance(value.func, (ast.Name, ast.Attribute))):
            continue

        callee = value.func
        fname = callee.id if isinstance(callee, ast.Name) else callee.attr
        if fname in ("Path", "PurePath", "PosixPath", "WindowsPath") and value.args:
            val = _extract_string_value(value.args[0])
            if val is not None:
                assignments_typed[key] = (val, _PROVENANCE_PATH_CONSTRUCTOR)
            else:
                assignments_typed[key] = ("", _PROVENANCE_UNKNOWN)
        elif fname in ("with_suffix", "with_name", "with_stem"):
            # Only method calls have a receiver. A bare-name call such as
            # ``with_name(p)`` is an ast.Name, which has no ``.value``.
            if isinstance(callee, ast.Attribute) and isinstance(callee.value, ast.Name):
                aliases[key] = callee.value.id
    return assignments_typed, aliases
380
+
381
+
382
+ def _scan_write_calls(
383
+ tree: ast.AST,
384
+ assignments_typed: dict[str, tuple[str, str]],
385
+ aliases: dict[str, str] | None = None,
386
+ ) -> list[WriteCall]:
387
+ """Return WriteCall objects for each write call found in tree.
388
+
389
+ Args:
390
+ tree: AST tree to scan
391
+ assignments_typed: {var_name: (target_path, provenance_type)}
392
+ aliases: {var_name: alias_var_name} for .with_suffix chains
393
+
394
+ Returns:
395
+ list[WriteCall] with target, operation, line, and provenance
396
+ """
397
+ if aliases is None:
398
+ aliases = {}
399
+
400
+ # Flatten assignments for target resolution
401
+ assignments = {k: v[0] for k, v in assignments_typed.items()}
402
+
403
+ calls: list[WriteCall] = []
404
+ for node in ast.walk(tree):
405
+ if not isinstance(node, ast.Call):
406
+ continue
407
+
408
+ line_no = node.lineno if hasattr(node, 'lineno') else None
409
+
410
+ # Function-call writes: open(path, "w"), json.dump(obj, fp).
411
+ # These are NOT receiver-method calls, so handle them before the
412
+ # ast.Attribute gate below. Reads (open(p)/open(p,"r")) return None.
413
+ func_write = _detect_func_write(node, assignments, aliases)
414
+ if func_write is not None:
415
+ operation, resolved = func_write
416
+ if resolved is not None and not _is_plausible_path(resolved):
417
+ resolved = _UNKNOWN_TARGET
418
+ target = resolved if resolved is not None else _UNKNOWN_TARGET
419
+ provenance = _PROVENANCE_UNKNOWN
420
+ for var_name, (path, prov_type) in assignments_typed.items():
421
+ if path == target and target != _UNKNOWN_TARGET:
422
+ provenance = prov_type
423
+ break
424
+ calls.append(WriteCall(target=target, operation=operation, line=line_no, provenance=provenance))
425
+ continue
426
+
427
+ if not isinstance(node.func, ast.Attribute):
428
+ continue
429
+
430
+ # Standard write methods: path.write_text(), path.save(), etc.
431
+ if node.func.attr in _WRITE_METHOD_NAMES:
432
+ operation = node.func.attr
433
+ resolved = _resolve_call_target(node, assignments, aliases)
434
+ if resolved is not None and not _is_plausible_path(resolved):
435
+ resolved = _UNKNOWN_TARGET
436
+ target = resolved if resolved is not None else _UNKNOWN_TARGET
437
+
438
+ # Determine provenance from assignments_typed
439
+ provenance = _PROVENANCE_UNKNOWN
440
+ for var_name, (path, prov_type) in assignments_typed.items():
441
+ if path == target and target != _UNKNOWN_TARGET:
442
+ provenance = prov_type
443
+ break
444
+
445
+ calls.append(WriteCall(target=target, operation=operation, line=line_no, provenance=provenance))
446
+ continue
447
+
448
+ # os.replace(src, dst) — dst is second positional arg
449
+ if (node.func.attr == "replace" and
450
+ isinstance(node.func.value, ast.Name) and
451
+ node.func.value.id == "os" and
452
+ len(node.args) >= 2):
453
+ operation = "os.replace"
454
+ dst_node = node.args[1]
455
+ if isinstance(dst_node, ast.Name):
456
+ name = dst_node.id
457
+ resolved = name
458
+ visited: set[str] = {resolved}
459
+ for _ in range(8):
460
+ nxt = aliases.get(resolved)
461
+ if nxt is None or nxt in visited:
462
+ break
463
+ visited.add(nxt)
464
+ resolved = nxt
465
+ target = assignments.get(resolved, _UNKNOWN_TARGET)
466
+
467
+ # Determine provenance
468
+ provenance = _PROVENANCE_UNKNOWN
469
+ for var_name, (path, prov_type) in assignments_typed.items():
470
+ if path == target and target != _UNKNOWN_TARGET:
471
+ provenance = prov_type
472
+ break
473
+
474
+ calls.append(WriteCall(target=target, operation=operation, line=line_no, provenance=provenance))
475
+
476
+ return calls
477
+
478
+
479
+ # ---------------------------------------------------------------------------
480
+ # Per-file scan
481
+ # ---------------------------------------------------------------------------
482
+
483
def _scan_writers(
    project_dir: Path,
    include_roots: Sequence[str] | None,
) -> dict[str, list[WriteCall]]:
    """Return a mapping writer_rel_posix -> list[WriteCall] for Python files.

    Files are enumerated with ``iter_py_files`` and scanned in a
    ThreadPoolExecutor of at most 8 workers. A file is skipped (with a debug
    log) when it cannot be read or parsed, when it contains no write-shaped
    call, or when it cannot be made relative to ``project_dir``.

    Args:
        project_dir: Project root; resolved before use.
        include_roots: Passed through to ``iter_py_files``.

    Returns:
        Dict keyed by the writer's project-relative POSIX path, sorted by key.
    """
    from concurrent.futures import ThreadPoolExecutor, as_completed
    import os as _os

    project_dir = project_dir.resolve()
    py_files = sorted(iter_py_files(project_dir, include_roots))

    def _is_candidate(node: ast.AST) -> bool:
        # Coarse "worth scanning?" gate only; mode and target precision are
        # decided later in _scan_write_calls.
        if not isinstance(node, ast.Call):
            return False
        func = node.func
        if isinstance(func, ast.Name):
            return func.id == "open"  # mode filtered later
        if isinstance(func, ast.Attribute):
            if func.attr in _WRITE_METHOD_NAMES:
                return True
            if func.attr == "dump" and isinstance(func.value, ast.Name) and func.value.id == "json":
                return True
            if (func.attr == "replace"
                    and isinstance(func.value, ast.Name)
                    and func.value.id == "os"):
                return True
        return False

    def _scan_one(py_file: Path) -> tuple[str, list[WriteCall]] | None:
        try:
            source = py_file.read_text(encoding="utf-8", errors="replace")
            tree = ast.parse(source, filename=str(py_file))
        except (OSError, SyntaxError, ValueError, RecursionError) as exc:
            # ValueError: source containing null bytes on older interpreters.
            # RecursionError: pathologically nested source. Either would
            # otherwise surface through future.result() and abort the scan.
            _log.debug("_scan_writers: skipping %s: %s", py_file, exc)
            return None
        # Cheap pre-filter: skip files with no candidate write-shaped call.
        if not any(_is_candidate(node) for node in ast.walk(tree)):
            return None
        assignments_typed, aliases = _collect_assignments(tree)
        write_calls = _scan_write_calls(tree, assignments_typed, aliases)
        if not write_calls:
            return None
        try:
            rel = py_file.resolve().relative_to(project_dir).as_posix()
        except ValueError:
            _log.debug("_scan_writers: cannot relativize %s", py_file)
            return None
        return rel, write_calls

    raw: dict[str, list[WriteCall]] = {}
    max_workers = min(8, (_os.cpu_count() or 4))
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = {pool.submit(_scan_one, f): f for f in py_files}
        for future in as_completed(futures):
            out = future.result()
            if out is not None:
                rel, write_calls = out
                raw[rel] = write_calls

    # Sort for determinism (as_completed order is arbitrary).
    result = dict(sorted(raw.items()))
    _log.debug("_scan_writers: found %d writer file(s)", len(result))
    return result
553
+
554
+
555
+ # ---------------------------------------------------------------------------
556
+ # Classification + domain matching
557
+ # ---------------------------------------------------------------------------
558
+
559
def _classify_writer(rel_path: str, allowed_writers: tuple[str, ...]) -> str:
    """Label a writer ``"canonical_write"`` if it is an allowed writer, else ``"illegal_write"``."""
    if rel_path in allowed_writers:
        return "canonical_write"
    return "illegal_write"
561
+
562
+
563
def _writer_matches_domain(targets: list[str], patterns: tuple[str, ...]) -> bool:
    """Return True if any resolved target matches any of the domain's patterns.

    Targets equal to the unknown-target sentinel are ignored.

    Args:
        targets: Write-target paths of one writer.
        patterns: Glob patterns of one domain.

    Returns:
        True on the first (target, pattern) pair that matches, otherwise False.
    """
    return any(
        _match_glob_path(target, pattern)
        for target in targets
        if target != _UNKNOWN_TARGET
        for pattern in patterns
    )
572
+
573
+
574
+ # ---------------------------------------------------------------------------
575
+ # Seed loading
576
+ # ---------------------------------------------------------------------------
577
+
578
def _load_seed(project_dir: Path) -> list[dict] | None:
    """Load the authority-domains seed for a project.

    Args:
        project_dir: Project root; the seed is read from
            ``seeds_dir(project_dir) / _SEED_FILENAME``.

    Returns:
        The seed's ``domains`` list (empty if the key is absent), or None when
        the seed file does not exist.

    Raises:
        MapIntegrityError: If the file cannot be read or decoded as JSON, the
            top level is not an object, ``schema_version`` is missing or
            unparseable or has a major version above 1, or ``domains`` is not
            a list.
    """
    seed_path = seeds_dir(project_dir) / _SEED_FILENAME
    if not seed_path.exists():
        _log.info("build_authority_map: no seed at %s, returning empty map", seed_path)
        return None

    try:
        text = seed_path.read_text(encoding="utf-8")
        payload = json.loads(text)
    except (json.JSONDecodeError, OSError, UnicodeDecodeError) as exc:
        raise MapIntegrityError(
            "authority seed corrupt (JSON parse failed): %s -- %s" % (seed_path, exc)
        ) from exc

    if not isinstance(payload, dict):
        raise MapIntegrityError(
            "authority seed must be a JSON object, got %s" % type(payload).__name__
        )

    declared_version = payload.get("schema_version")
    if declared_version is None:
        raise MapIntegrityError(
            "authority seed missing required field 'schema_version' in %s" % seed_path
        )

    # Only the major component (text before the first ".") is checked.
    try:
        major_text = str(declared_version).split(".")[0]
        major = int(major_text)
    except (ValueError, IndexError) as exc:
        raise MapIntegrityError(
            "authority seed has unparseable schema_version %r in %s" % (declared_version, seed_path)
        ) from exc
    if major > 1:
        raise MapIntegrityError(
            "authority seed schema_version %r has major version %d > 1 -- "
            "upgrade the builder to read this seed" % (declared_version, major)
        )

    domain_defs = payload.get("domains", [])
    if not isinstance(domain_defs, list):
        raise MapIntegrityError(
            "authority seed 'domains' must be a list, got %s" % type(domain_defs).__name__
        )
    _log.debug("_load_seed: loaded %d domain(s) from %s", len(domain_defs), seed_path)
    return domain_defs
617
+
618
+
619
+ # ---------------------------------------------------------------------------
620
+ # Auto-discovery: collect targets and infer domains (seed-free)
621
+ # ---------------------------------------------------------------------------
622
+
623
+ def _collect_auto_write_targets(
624
+ writers_map: dict[str, list[WriteCall]],
625
+ adapter_candidates: dict[str, list],
626
+ ) -> tuple[dict[str, list[str]], dict[str, list[tuple[str, WriteCall | None]]]]:
627
+ """Collect target -> [writer_rel_posix] mapping and WriteCall tracking.
628
+
629
+ Returns:
630
+ (target_to_writers, target_to_write_calls) where:
631
+ - target_to_writers: target -> [writer_rel_posix] (backward compat)
632
+ - target_to_write_calls: target -> [(writer_rel, WriteCall|None), ...]
633
+ WriteCall is None for non-Python adapter writers (no AST info).
634
+
635
+ target keys are normalized (tmp/bak stripped).
636
+ Only non-UNKNOWN targets included.
637
+ Result is sorted for determinism.
638
+ """
639
+ target_to_writers: dict[str, list[str]] = {}
640
+ target_to_write_calls: dict[str, list[tuple[str, WriteCall | None]]] = {}
641
+
642
+ # Python AST writers
643
+ for writer_rel, write_calls in sorted(writers_map.items()):
644
+ for write_call in write_calls:
645
+ target = write_call.target
646
+ if target == _UNKNOWN_TARGET:
647
+ continue
648
+ base = _normalize_target_path(target)
649
+ target_to_writers.setdefault(base, []).append(writer_rel)
650
+ target_to_write_calls.setdefault(base, []).append((writer_rel, write_call))
651
+
652
+ # Non-Python adapter writers (TS/JS/Go/Java etc.)
653
+ for writer_rel, candidates in sorted(adapter_candidates.items()):
654
+ for candidate in candidates:
655
+ if not candidate.target_hint:
656
+ continue
657
+ base = _normalize_target_path(candidate.target_hint)
658
+ target_to_writers.setdefault(base, []).append(writer_rel)
659
+ # No WriteCall object available for adapters
660
+ target_to_write_calls.setdefault(base, []).append((writer_rel, None))
661
+
662
+ # Deduplicate and sort writers lists for determinism
663
+ target_to_writers_result = {
664
+ target: sorted(dict.fromkeys(writers))
665
+ for target, writers in sorted(target_to_writers.items())
666
+ }
667
+
668
+ # Deduplicate WriteCall entries (keep first occurrence per writer_rel)
669
+ target_to_write_calls_result = {}
670
+ for target in sorted(target_to_write_calls.keys()):
671
+ # Keep only first WriteCall per writer_rel for this target
672
+ seen_writers: dict[str, tuple[str, WriteCall | None]] = {}
673
+ for writer_rel, write_call in target_to_write_calls[target]:
674
+ if writer_rel not in seen_writers:
675
+ seen_writers[writer_rel] = (writer_rel, write_call)
676
+ target_to_write_calls_result[target] = list(seen_writers.values())
677
+
678
+ return target_to_writers_result, target_to_write_calls_result
679
+
680
+
681
+ def _auto_discover_domains(
682
+ target_to_writers: dict[str, list[str]],
683
+ target_to_write_calls: dict[str, list[tuple[str, WriteCall | None]]],
684
+ seed_domains: list[dict],
685
+ ) -> list[dict]:
686
+ """Find shared-write clusters not covered by seed.
687
+
688
+ Returns synthetic domain defs for build_authority_map() merge loop.
689
+ Only includes groups with 2+ writers from DIFFERENT module prefixes.
690
+ Seed-covered targets are skipped.
691
+ Test-only shared writes (all writers non-production) are skipped.
692
+
693
+ Args:
694
+ target_to_writers: target -> [writer_rel] mapping (backward compat usage)
695
+ target_to_write_calls: target -> [(writer_rel, WriteCall|None)] mapping
696
+ seed_domains: list of seed domain definitions
697
+ """
698
+ auto_domains = []
699
+ for target, writers in target_to_writers.items():
700
+ if len(writers) < 2:
701
+ continue
702
+ # Must come from different module prefixes
703
+ prefixes = {_module_prefix(w) for w in writers}
704
+ if len(prefixes) < 2:
705
+ continue
706
+ # Skip if any seed domain already covers this target
707
+ if _seed_covers_target(target, seed_domains):
708
+ continue
709
+ # Skip if all writers are non-production (test/fixture/generated)
710
+ roles = {classify_file_role(w) for w in writers}
711
+ if "production" not in roles:
712
+ continue
713
+ auto_domains.append({
714
+ "_auto": True,
715
+ "authority_domain": f"shared_write:{_safe_domain_name(target)}",
716
+ "canonical_owner": "",
717
+ "allowed_writers": [],
718
+ "target_file_patterns": [target],
719
+ "_shared_target": target,
720
+ "_all_writers": writers,
721
+ "_write_calls": target_to_write_calls.get(target, []),
722
+ })
723
+ return auto_domains
724
+
725
+
726
+ def _safe_writer_domain_name(writer_rel: str) -> str:
727
+ """Stable, filesystem-safe domain name for a writer file (no-seed mode)."""
728
+ digest = hashlib.blake2s(writer_rel.encode("utf-8"), digest_size=2).hexdigest()
729
+ stem = Path(writer_rel).stem
730
+ raw = "%s_%s" % (stem, digest)
731
+ return "auto_discovered:" + re.sub(r"[^a-zA-Z0-9_]", "_", raw)[:48]
732
+
733
+
734
def _build_no_seed_writer_domains(
    writers_map: dict[str, list[WriteCall]],
    adapter_candidates: dict[str, list[AuthorityWriteCandidate]],
) -> list[AuthorityDomain]:
    """Turn every discovered writer file into one inferred AuthorityDomain.

    Intended for the case where no seed file exists. Each writer file becomes
    its own domain: the file is the ``canonical_owner`` and sole allowed
    writer, and ``writers_detected`` holds one JSON-encoded record per write
    site (Python WriteCall or adapter candidate) with its target, operation,
    line, provenance and file role.

    A write site whose target is unknown (``_UNKNOWN_TARGET`` or an empty
    adapter hint) is still recorded, with ``"target": ""``. Files that yield
    no write-site records are skipped.
    """
    meta = make_metadata(source="static_scan", confidence=0.5, status="inferred")

    def _entry_order(entry: dict) -> tuple:
        # Deterministic ordering inside a domain: line, then target, then op.
        return (entry.get("line") or 0, entry.get("target", ""), entry.get("operation", ""))

    inferred: list[AuthorityDomain] = []

    # Union of Python and adapter writer files, in sorted order.
    for writer_rel in sorted(writers_map.keys() | adapter_candidates.keys()):
        entries: list[dict] = []

        # Python AST write calls
        for call in writers_map.get(writer_rel, []):
            raw_target = "" if call.target == _UNKNOWN_TARGET else call.target
            entries.append({
                "location": writer_rel,
                "kind": "write",
                "target": _normalize_target_path(raw_target) if raw_target else "",
                "operation": call.operation,
                "line": call.line,
                "provenance": call.provenance,
                "file_role": classify_file_role(writer_rel),
            })

        # Non-Python adapter candidates
        for cand in adapter_candidates.get(writer_rel, []):
            raw_target = cand.target_hint or ""
            entries.append({
                "location": writer_rel,
                "kind": cand.write_kind,
                "target": _normalize_target_path(raw_target) if raw_target else "",
                "operation": cand.write_kind,
                "line": cand.line,
                "provenance": _PROVENANCE_UNKNOWN,
                "file_role": classify_file_role(writer_rel),
            })

        if not entries:
            continue

        ordered = sorted(entries, key=_entry_order)
        # Distinct, non-empty normalized targets become the domain's patterns.
        patterns = sorted({entry["target"] for entry in entries if entry["target"]})

        inferred.append(AuthorityDomain(
            authority_domain=_safe_writer_domain_name(writer_rel),
            canonical_owner=writer_rel,
            allowed_writers=(writer_rel,),
            derived_readers=(),
            cache_layers=(),
            freshness_sla="immediate",
            invalidation_rule="unknown",
            drift_policy="observe",
            writers_detected=tuple(json.dumps(entry, sort_keys=True) for entry in ordered),
            last_drift_events=(),
            target_file_patterns=tuple(patterns),
            source=meta["source"],
            evidence=tuple(meta["evidence"]),
            confidence=meta["confidence"],
            freshness=meta["freshness"],
            status=meta["status"],
        ))

    return inferred
818
+
819
+
820
+ # ---------------------------------------------------------------------------
821
+ # Non-Python adapter writer collection (L7a)
822
+ # ---------------------------------------------------------------------------
823
+
824
def _collect_adapter_writer_candidates(
    project_dir: Path,
    include_roots: Sequence[str] | None,
) -> dict[str, list[AuthorityWriteCandidate]]:
    """Gather write candidates from non-Python source files.

    Walks ``iter_source_files`` and, for each file whose adapter exists, is
    not the Python adapter, and advertises ``supports_authority_writes``,
    asks the adapter for its writer calls. Files that cannot be read, whose
    adapter raises, that yield no candidates, or that cannot be expressed
    relative to the project root are skipped (each skip is debug-logged,
    except the "no candidates" case).

    Returns:
        Mapping of project-relative POSIX path -> candidates for that file.
    """
    root = project_dir.resolve()
    by_file: dict[str, list[AuthorityWriteCandidate]] = {}

    for src_file in iter_source_files(root, include_roots=include_roots):
        adapter = get_adapter_for_file(src_file)
        eligible = (
            adapter is not None
            and adapter.language != "python"
            and getattr(adapter, "supports_authority_writes", False)
        )
        if not eligible:
            continue

        try:
            content = src_file.read_text(encoding="utf-8", errors="replace")
        except OSError as exc:
            _log.debug("_collect_adapter_writer_candidates: skipping %s: %s", src_file, exc)
            continue

        # Adapter extraction is best-effort: any failure skips just this file.
        try:
            candidates = adapter.extract_writer_calls(content, src_file)  # type: ignore[attr-defined]
        except Exception as exc:  # noqa: BLE001
            _log.debug(
                "_collect_adapter_writer_candidates: error in %s for %s: %s",
                adapter.language, src_file, exc,
            )
            continue

        if candidates:
            try:
                rel = src_file.resolve().relative_to(root).as_posix()
            except ValueError:
                _log.debug("_collect_adapter_writer_candidates: cannot relativize %s", src_file)
            else:
                by_file[rel] = candidates

    _log.debug(
        "_collect_adapter_writer_candidates: found %d non-Python writer file(s)", len(by_file)
    )
    return by_file
868
+
869
+
870
+ # ---------------------------------------------------------------------------
871
+ # Public API
872
+ # ---------------------------------------------------------------------------
873
+
874
def _seed_list_field(domain_def: dict, field: str, authority_domain: str) -> tuple:
    """Return a seed domain's list-valued *field* as a tuple (empty if absent).

    Raises MapIntegrityError when the field is present but is not a list.
    A bare ``tuple(value)`` would silently split a string into characters and
    would let ``null`` escape as a plain TypeError.
    """
    value = domain_def.get(field, [])
    if not isinstance(value, (list, tuple)):
        raise MapIntegrityError(
            "authority seed domain %r field %r must be a list, got %s"
            % (authority_domain, field, type(value).__name__)
        )
    return tuple(value)


def build_authority_map(
    project_dir: Path,
    include_roots: Sequence[str] | None = None,
    parse_cache: Any | None = None,
) -> list[AuthorityDomain]:
    """Build authority map for a target project.

    Reads seed from <project_dir>/.cortex/map_seeds/authority_domains.json.
    Each domain's ``target_file_patterns`` controls which writers are attributed
    to it via write-target matching. Missing patterns -> no writer attribution
    for that domain.

    Also performs seed-free auto-discovery: shared write targets reported by
    ``_auto_discover_domains`` become inferred domains.

    When NO seed file exists, every discovered write site is additionally
    surfaced as an inferred per-writer domain. With a seed present this step
    is skipped.

    Args:
        project_dir: project root; resolved before use.
        include_roots: forwarded to the writer scan and the adapter scan.
        parse_cache: accepted for API uniformity only; it is never used here.

    Returns:
        Seed domains first (seed order), then auto-discovered shared-write
        domains, then (no-seed mode only) per-writer domains.

    Raises:
        MapIntegrityError: the seed is corrupt or has an incompatible version,
            a domain entry is not a dict or lacks ``authority_domain``, or one
            of its list fields (``allowed_writers``, ``target_file_patterns``,
            ``derived_readers``, ``cache_layers``) is not a list.
    """
    project_dir = Path(project_dir).resolve()
    _log.info("build_authority_map: starting for %s", project_dir)
    # parse_cache is accepted for API uniformity with other builders.
    # _scan_writers uses ThreadPoolExecutor internally, and ParseCacheL1 is
    # not thread-safe, so the cache is not passed into the threaded scan path.
    if parse_cache is not None:
        _log.debug("build_authority_map: parse_cache provided but not used in threaded _scan_writers")
    domains_raw = _load_seed(project_dir)
    no_seed = domains_raw is None  # no seed file at all -> auto-surface mode
    seed_list: list[dict] = domains_raw or []

    # ALWAYS scan writers (not gated behind seed)
    _log.info("build_authority_map: scanning writers via AST in %s", project_dir)
    writers_map = _scan_writers(project_dir, include_roots)
    _log.debug("build_authority_map: %d Python writer file(s)", len(writers_map))

    # L7a: collect non-Python write candidates via adapter dispatch
    adapter_candidates = _collect_adapter_writer_candidates(project_dir, include_roots)
    _log.debug(
        "build_authority_map: %d non-Python writer file(s)", len(adapter_candidates)
    )

    # Collect shared write targets (Python + non-Python)
    target_to_writers, target_to_write_calls = _collect_auto_write_targets(writers_map, adapter_candidates)

    metadata = make_metadata(source="seed + static_scan", confidence=0.85, status="observed")
    results: list[AuthorityDomain] = []

    # Provenance ranking used to pick the representative write call per file.
    # Built once: it does not depend on the domain or the writer.
    # Priority: path_constructor > string_literal > function_parameter > unknown
    provenance_rank = {
        _PROVENANCE_PATH_CONSTRUCTOR: 3,
        _PROVENANCE_STRING_LITERAL: 2,
        _PROVENANCE_FUNCTION_PARAM: 1,
        _PROVENANCE_UNKNOWN: 0,
    }

    for domain_def in seed_list:
        if not isinstance(domain_def, dict):
            raise MapIntegrityError(
                "authority seed domain entry must be a dict, got %s" % type(domain_def).__name__
            )
        authority_domain = str(domain_def.get("authority_domain", ""))
        if not authority_domain:
            raise MapIntegrityError(
                "authority seed domain entry missing 'authority_domain' field: %r" % (domain_def,)
            )

        # List-valued seed fields are validated before being frozen to tuples.
        allowed_writers: tuple[str, ...] = _seed_list_field(
            domain_def, "allowed_writers", authority_domain
        )
        target_file_patterns: tuple[str, ...] = _seed_list_field(
            domain_def, "target_file_patterns", authority_domain
        )
        derived_readers = _seed_list_field(domain_def, "derived_readers", authority_domain)
        cache_layers = _seed_list_field(domain_def, "cache_layers", authority_domain)

        if not target_file_patterns:
            _log.info(
                "build_authority_map: domain=%s has no target_file_patterns -- "
                "skipping auto-discovery",
                authority_domain,
            )

        # A writer file is reported at most once per domain.
        seen_locations: set[str] = set()
        writers_detected_dicts: list[dict] = []
        if target_file_patterns:
            # Python AST writers
            for writer_path, write_calls in sorted(writers_map.items()):
                if writer_path in seen_locations:
                    continue
                targets = [wc.target for wc in write_calls]
                if _writer_matches_domain(targets, target_file_patterns):
                    kind = _classify_writer(writer_path, allowed_writers)
                    # NOTE(review): the representative call is chosen among ALL
                    # of the file's write calls, not only those whose target
                    # matched this domain -- confirm this is intended.
                    best_wc = max(
                        write_calls,
                        key=lambda wc: provenance_rank.get(wc.provenance, -1),
                    )
                    writers_detected_dicts.append({
                        "location": writer_path,
                        "kind": kind,
                        "file_role": classify_file_role(writer_path),
                        "operation": best_wc.operation,
                        "line": best_wc.line,
                        "provenance": best_wc.provenance,
                    })
                    seen_locations.add(writer_path)

            # Non-Python adapter writers (L7a)
            for writer_path, aw_candidates in sorted(adapter_candidates.items()):
                if writer_path in seen_locations:
                    continue
                # target_hint values act as targets for domain matching; empty
                # hints are treated as unknown and do not contribute.
                synthetic_targets = [
                    c.target_hint for c in aw_candidates if c.target_hint
                ]
                if not synthetic_targets:
                    continue
                if _writer_matches_domain(synthetic_targets, target_file_patterns):
                    kind = _classify_writer(writer_path, allowed_writers)
                    writers_detected_dicts.append({
                        "location": writer_path,
                        "kind": kind,
                        "file_role": classify_file_role(writer_path),
                    })
                    seen_locations.add(writer_path)

        domain = AuthorityDomain(
            authority_domain=authority_domain,
            canonical_owner=str(domain_def.get("canonical_owner", "")),
            allowed_writers=allowed_writers,
            derived_readers=derived_readers,
            cache_layers=cache_layers,
            freshness_sla=str(domain_def.get("freshness_sla", "immediate")),
            invalidation_rule=str(domain_def.get("invalidation_rule", "")),
            drift_policy=str(domain_def.get("drift_policy", "fail_close")),
            writers_detected=tuple(json.dumps(w, sort_keys=True) for w in writers_detected_dicts),
            last_drift_events=(),
            target_file_patterns=target_file_patterns,
            source=metadata["source"],
            evidence=tuple(metadata["evidence"]),
            confidence=metadata["confidence"],
            freshness=metadata["freshness"],
            status=metadata["status"],
        )
        results.append(domain)
        _log.debug(
            "build_authority_map: domain=%s patterns=%d writers_detected=%d",
            authority_domain, len(target_file_patterns), len(writers_detected_dicts),
        )

    # --- Auto-discovered domains (seed-free) ---
    auto_domains = _auto_discover_domains(target_to_writers, target_to_write_calls, seed_list)
    auto_metadata = make_metadata(source="auto_scan", confidence=0.6, status="inferred")

    for ad in auto_domains:
        # writer -> WriteCall (None for adapter writers) for quick lookup
        write_calls_by_writer: dict[str, WriteCall | None] = dict(ad["_write_calls"])

        writers_detected_list = []
        for w in ad["_all_writers"]:
            write_call = write_calls_by_writer.get(w)

            if write_call is not None:
                # Python AST writer with full WriteCall information
                provenance = write_call.provenance
                operation = write_call.operation
                line = write_call.line
            else:
                # Non-Python adapter writer (no AST info available)
                provenance = _PROVENANCE_UNKNOWN
                operation = "unknown"
                line = None

            writers_detected_list.append({
                "location": w,
                "kind": "shared_write",
                "target": ad["_shared_target"],
                "module_prefix": _module_prefix(w),
                "file_role": classify_file_role(w),
                "operation": operation,
                "line": line,
                "provenance": provenance,
            })

        writers_detected = tuple(
            json.dumps(w, sort_keys=True)
            for w in writers_detected_list
        )
        results.append(AuthorityDomain(
            authority_domain=ad["authority_domain"],
            canonical_owner="",
            allowed_writers=(),
            derived_readers=(),
            cache_layers=(),
            freshness_sla="immediate",
            invalidation_rule="unknown",
            drift_policy="observe",
            writers_detected=writers_detected,
            last_drift_events=(),
            target_file_patterns=tuple(ad["target_file_patterns"]),
            source=auto_metadata["source"],
            evidence=tuple(auto_metadata["evidence"]),
            confidence=auto_metadata["confidence"],
            freshness=auto_metadata["freshness"],
            status=auto_metadata["status"],
        ))
        _log.debug(
            "build_authority_map: auto domain=%s target=%s writers_detected=%d",
            ad["authority_domain"], ad["_shared_target"], len(ad["_all_writers"]),
        )

    # --- No-seed auto-surface (out-of-box) ---
    # Without a seed the per-domain loop above never runs and the shared-write
    # heuristic only catches multi-writer targets. Surface every discovered
    # writer (Python + adapter) as an inferred per-writer domain instead.
    # With a seed these are NOT added, to avoid double-surfacing writers that
    # are already attributed to seed domains.
    no_seed_count = 0
    if no_seed:
        no_seed_domains = _build_no_seed_writer_domains(writers_map, adapter_candidates)
        results.extend(no_seed_domains)
        no_seed_count = len(no_seed_domains)

    _log.info(
        "build_authority_map: completed %d domain(s) (seed=%d auto=%d no_seed=%d), %d writer file(s) scanned",
        len(results), sum(1 for r in results if r.status == "observed"),
        len(auto_domains), no_seed_count, len(writers_map),
    )
    return results