vigil-codeintel 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vigil_codeintel-0.1.0.dist-info/METADATA +780 -0
- vigil_codeintel-0.1.0.dist-info/RECORD +131 -0
- vigil_codeintel-0.1.0.dist-info/WHEEL +5 -0
- vigil_codeintel-0.1.0.dist-info/entry_points.txt +3 -0
- vigil_codeintel-0.1.0.dist-info/licenses/LICENSE +21 -0
- vigil_codeintel-0.1.0.dist-info/top_level.txt +3 -0
- vigil_forensic/__init__.py +224 -0
- vigil_forensic/_git_utils.py +178 -0
- vigil_forensic/_shared.py +510 -0
- vigil_forensic/_stubs.py +156 -0
- vigil_forensic/gate_checks/__init__.py +1 -0
- vigil_forensic/gate_checks/_ast_helpers.py +629 -0
- vigil_forensic/gate_checks/_deployment_detector.py +573 -0
- vigil_forensic/gate_checks/atomic_write_checks.py +1143 -0
- vigil_forensic/gate_checks/authority_checks.py +95 -0
- vigil_forensic/gate_checks/boundary_breach_checks.py +202 -0
- vigil_forensic/gate_checks/broad_except_checks.py +301 -0
- vigil_forensic/gate_checks/broad_except_hidden_sentinel_checks.py +365 -0
- vigil_forensic/gate_checks/common.py +253 -0
- vigil_forensic/gate_checks/config_safety_checks.py +704 -0
- vigil_forensic/gate_checks/config_ssot_checks.py +78 -0
- vigil_forensic/gate_checks/conflict_checks.py +193 -0
- vigil_forensic/gate_checks/context_fallback_checks.py +697 -0
- vigil_forensic/gate_checks/context_health_checks.py +289 -0
- vigil_forensic/gate_checks/contract_shape_drift_checks.py +459 -0
- vigil_forensic/gate_checks/dirty_baseline_check.py +274 -0
- vigil_forensic/gate_checks/duplication_checks.py +387 -0
- vigil_forensic/gate_checks/embedded_string_checks.py +123 -0
- vigil_forensic/gate_checks/empty_output_checks.py +87 -0
- vigil_forensic/gate_checks/encoding_checks.py +847 -0
- vigil_forensic/gate_checks/export_completeness_checks.py +156 -0
- vigil_forensic/gate_checks/fallback_checks.py +41 -0
- vigil_forensic/gate_checks/file_proliferation_checks.py +171 -0
- vigil_forensic/gate_checks/fix_without_test_checks.py +69 -0
- vigil_forensic/gate_checks/forensic_cluster_runners/__init__.py +9 -0
- vigil_forensic/gate_checks/forensic_cluster_runners/_helpers.py +71 -0
- vigil_forensic/gate_checks/forensic_cluster_runners/advanced_checks.py +322 -0
- vigil_forensic/gate_checks/forensic_cluster_runners/core.py +273 -0
- vigil_forensic/gate_checks/forensic_cluster_runners/integrity_checks.py +203 -0
- vigil_forensic/gate_checks/forensic_cluster_runners/quality_checks.py +666 -0
- vigil_forensic/gate_checks/forensic_clusters/__init__.py +193 -0
- vigil_forensic/gate_checks/forensic_clusters/allowlist.py +426 -0
- vigil_forensic/gate_checks/forensic_clusters/allowlist_writer.py +302 -0
- vigil_forensic/gate_checks/forensic_clusters/api_protocol.py +231 -0
- vigil_forensic/gate_checks/forensic_clusters/async_quality.py +1156 -0
- vigil_forensic/gate_checks/forensic_clusters/code_style.py +808 -0
- vigil_forensic/gate_checks/forensic_clusters/core.py +319 -0
- vigil_forensic/gate_checks/forensic_clusters/data_quality.py +763 -0
- vigil_forensic/gate_checks/forensic_clusters/dead_code.py +480 -0
- vigil_forensic/gate_checks/forensic_clusters/edit_mutation.py +842 -0
- vigil_forensic/gate_checks/forensic_clusters/exception_boundary.py +240 -0
- vigil_forensic/gate_checks/forensic_clusters/legacy_debt.py +556 -0
- vigil_forensic/gate_checks/forensic_clusters/static_analysis.py +834 -0
- vigil_forensic/gate_checks/forensic_clusters/structural_quality.py +298 -0
- vigil_forensic/gate_checks/god_object_zones_checks.py +173 -0
- vigil_forensic/gate_checks/hallucination_checks.py +566 -0
- vigil_forensic/gate_checks/hunter_artifact_completeness_check.py +139 -0
- vigil_forensic/gate_checks/implementation_overfit_checks.py +380 -0
- vigil_forensic/gate_checks/import_integrity_checks.py +233 -0
- vigil_forensic/gate_checks/imports_in_function_checks.py +283 -0
- vigil_forensic/gate_checks/ml_checks.py +318 -0
- vigil_forensic/gate_checks/performance_checks.py +106 -0
- vigil_forensic/gate_checks/project_specific_runner.py +691 -0
- vigil_forensic/gate_checks/provider_capability_checks.py +73 -0
- vigil_forensic/gate_checks/refactor_completeness_checks.py +274 -0
- vigil_forensic/gate_checks/reliability_checks.py +389 -0
- vigil_forensic/gate_checks/reporting_checks.py +55 -0
- vigil_forensic/gate_checks/runtime_behavior_checks.py +220 -0
- vigil_forensic/gate_checks/security_injection_checks.py +332 -0
- vigil_forensic/gate_checks/semantic_intent_checks.py +139 -0
- vigil_forensic/gate_checks/size_complexity_checks.py +336 -0
- vigil_forensic/gate_checks/stuck_feature_flag_checks.py +354 -0
- vigil_forensic/gate_checks/syntax_validity_checks.py +217 -0
- vigil_forensic/gate_checks/temporal_freshness_checks.py +79 -0
- vigil_forensic/gate_checks/test_quality_checks.py +946 -0
- vigil_forensic/gate_checks/testing_checks.py +149 -0
- vigil_forensic/gate_checks/toctou_checks.py +367 -0
- vigil_forensic/gate_checks/type_checking_checks.py +316 -0
- vigil_forensic/gate_models.py +392 -0
- vigil_forensic/gate_packs/__init__.py +1 -0
- vigil_forensic/gate_packs/universal.py +179 -0
- vigil_forensic/gate_profile.json +31 -0
- vigil_forensic/gate_registry.py +21 -0
- vigil_forensic/language_profiles.py +219 -0
- vigil_forensic/meta_findings.py +207 -0
- vigil_forensic/self_audit.py +725 -0
- vigil_forensic/source_analysis.py +175 -0
- vigil_mapper/__init__.py +103 -0
- vigil_mapper/_ast_helpers_minimal.py +229 -0
- vigil_mapper/_extract_imports_impl.py +123 -0
- vigil_mapper/_file_count_guard.py +129 -0
- vigil_mapper/_git_utils.py +178 -0
- vigil_mapper/_runtime_ast.py +438 -0
- vigil_mapper/_runtime_dispatch.py +137 -0
- vigil_mapper/_seed_helpers.py +82 -0
- vigil_mapper/authority_builder.py +1102 -0
- vigil_mapper/cli_entry.py +731 -0
- vigil_mapper/conflict_builder.py +818 -0
- vigil_mapper/data_contract_builder.py +446 -0
- vigil_mapper/findings_builder.py +716 -0
- vigil_mapper/fingerprint.py +53 -0
- vigil_mapper/hotspot_builder.py +539 -0
- vigil_mapper/map_common.py +449 -0
- vigil_mapper/map_errors.py +55 -0
- vigil_mapper/map_models.py +431 -0
- vigil_mapper/map_models_ext.py +206 -0
- vigil_mapper/map_models_findings.py +130 -0
- vigil_mapper/map_storage.py +455 -0
- vigil_mapper/parse_cache.py +795 -0
- vigil_mapper/refactor_boundary_builder.py +266 -0
- vigil_mapper/runtime_builder.py +527 -0
- vigil_mapper/runtime_tracer.py +243 -0
- vigil_mapper/runtime_tracer_entry.py +199 -0
- vigil_mapper/semantic_diff.py +71 -0
- vigil_mapper/source_adapters/__init__.py +109 -0
- vigil_mapper/source_adapters/_base.py +264 -0
- vigil_mapper/source_adapters/_ir.py +156 -0
- vigil_mapper/source_adapters/_lexer.py +309 -0
- vigil_mapper/source_adapters/_patterns.py +212 -0
- vigil_mapper/source_adapters/_treesitter.py +182 -0
- vigil_mapper/source_adapters/go.py +553 -0
- vigil_mapper/source_adapters/java.py +541 -0
- vigil_mapper/source_adapters/javascript.py +626 -0
- vigil_mapper/source_adapters/python.py +325 -0
- vigil_mapper/source_adapters/typescript.py +749 -0
- vigil_mapper/structural_builder.py +586 -0
- vigil_mcp/__init__.py +1 -0
- vigil_mcp/_jobs.py +587 -0
- vigil_mcp/_paths.py +93 -0
- vigil_mcp/forensic_server.py +419 -0
- vigil_mcp/map_server.py +452 -0
|
@@ -0,0 +1,1102 @@
|
|
|
1
|
+
"""Authority map builder -- reads seed file and auto-discovers writers via AST.
|
|
2
|
+
|
|
3
|
+
Generic tool: operates on any target project_dir.
|
|
4
|
+
Seed file: <project_dir>/.cortex/map_seeds/authority_domains.json
|
|
5
|
+
|
|
6
|
+
WITH a seed: each domain seed entry may carry ``target_file_patterns`` (glob
|
|
7
|
+
patterns). A writer is attributed to a domain only when at least one resolved
|
|
8
|
+
write-target path matches a pattern. Writers with unresolvable targets are
|
|
9
|
+
dropped from all domains. Empty/missing patterns -> no per-domain discovery.
|
|
10
|
+
|
|
11
|
+
WITHOUT a seed (out-of-box): every discovered write site is auto-surfaced as an
|
|
12
|
+
inferred per-writer ``AuthorityDomain`` (status="inferred", source="static_scan")
|
|
13
|
+
so the map is useful immediately. Each entry names the writer file plus its
|
|
14
|
+
write targets and operation kinds. Pure reads never produce an entry.
|
|
15
|
+
|
|
16
|
+
Write detection (Python AST): ``.write_text`` / ``.write_bytes`` / ``.save`` /
|
|
17
|
+
``os.replace`` (method writes) and ``open(..., "w"/"a"/"x"/"+")`` / ``json.dump``
|
|
18
|
+
(function writes). Reads -- ``open(p)`` / ``open(p, "r")`` / ``.read_text()`` /
|
|
19
|
+
``json.load`` / ``json.dumps`` -- are NOT writes. Non-Python writers (Go/Java/
|
|
20
|
+
JS/TS) are detected via adapter ``extract_writer_calls`` and surface the same way.
|
|
21
|
+
"""
|
|
22
|
+
from __future__ import annotations
|
|
23
|
+
|
|
24
|
+
import ast
|
|
25
|
+
import fnmatch
|
|
26
|
+
import hashlib
|
|
27
|
+
import json
|
|
28
|
+
import logging
|
|
29
|
+
import re
|
|
30
|
+
from pathlib import Path
|
|
31
|
+
from typing import Any, NamedTuple, Sequence
|
|
32
|
+
|
|
33
|
+
from .map_common import classify_file_role, iter_py_files, iter_source_files, make_metadata
|
|
34
|
+
from .source_adapters import get_adapter_for_file
|
|
35
|
+
from .source_adapters._ir import AuthorityWriteCandidate
|
|
36
|
+
from .map_errors import MapIntegrityError
|
|
37
|
+
from .map_models import AuthorityDomain
|
|
38
|
+
from .map_storage import seeds_dir
|
|
39
|
+
|
|
40
|
+
__all__ = ["build_authority_map"]
|
|
41
|
+
|
|
42
|
+
_log = logging.getLogger(__name__)
|
|
43
|
+
|
|
44
|
+
_SEED_FILENAME = "authority_domains.json"
|
|
45
|
+
_WRITE_METHOD_NAMES = frozenset({"write_text", "write_bytes", "save"})
|
|
46
|
+
_UNKNOWN_TARGET = "__unknown_target__"
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def _open_mode_is_write(mode: str) -> bool:
|
|
50
|
+
"""True iff an open() mode string mutates the target.
|
|
51
|
+
|
|
52
|
+
Write modes contain 'w', 'a', 'x' (create/truncate/append) or '+'
|
|
53
|
+
(read-update / write-update — both can write). A bare ``open(p)`` defaults
|
|
54
|
+
to ``"r"``; ``"r"`` / ``"rb"`` / ``"rt"`` are pure READS → not writes.
|
|
55
|
+
The ``b``/``t`` flags are binary/text modifiers and do not imply a write.
|
|
56
|
+
"""
|
|
57
|
+
return any(ch in mode for ch in ("w", "a", "x", "+"))
|
|
58
|
+
|
|
59
|
+
# Provenance type constants for path tracking
|
|
60
|
+
_PROVENANCE_PATH_CONSTRUCTOR = "path_constructor" # Path(...), PurePath(...), etc.
|
|
61
|
+
_PROVENANCE_STRING_LITERAL = "string_literal" # "literal_path"
|
|
62
|
+
_PROVENANCE_FUNCTION_PARAM = "function_parameter" # def foo(target):
|
|
63
|
+
_PROVENANCE_UNKNOWN = "unknown"
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
# ---------------------------------------------------------------------------
|
|
67
|
+
# Write call tracking
|
|
68
|
+
# ---------------------------------------------------------------------------
|
|
69
|
+
|
|
70
|
+
class WriteCall(NamedTuple):
    """A single detected write site: target, operation kind, line, provenance."""

    # Resolved target path, or the _UNKNOWN_TARGET sentinel when unresolved.
    target: str
    # Operation label: a write-method name ("write_text", "write_bytes",
    # "save"), "os.replace", or a function-call write ("open_write", "json_dump").
    operation: str
    # Source line of the call node; None when the node carries no line number.
    line: int | None
    # One of the _PROVENANCE_* labels: "path_constructor", "string_literal",
    # "function_parameter" or "unknown".
    provenance: str
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
# ---------------------------------------------------------------------------
|
|
79
|
+
# Glob matching — ** support (PurePath.match treats ** like *; PurePath.full_match, which handles **, needs Python 3.13+)
|
|
80
|
+
# ---------------------------------------------------------------------------
|
|
81
|
+
|
|
82
|
+
def _match_glob_path(path: str, pattern: str) -> bool:
|
|
83
|
+
"""Match forward-slash path against glob pattern supporting **."""
|
|
84
|
+
path = path.replace("\\", "/")
|
|
85
|
+
pattern = pattern.replace("\\", "/")
|
|
86
|
+
if "**" not in pattern:
|
|
87
|
+
return fnmatch.fnmatch(path, pattern)
|
|
88
|
+
return _match_double_star(path, pattern)
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def _match_double_star(path: str, pattern: str) -> bool:
|
|
92
|
+
"""Recursive ** expansion: ** matches zero or more path segments."""
|
|
93
|
+
if "**" not in pattern:
|
|
94
|
+
return fnmatch.fnmatch(path, pattern)
|
|
95
|
+
idx = pattern.find("**")
|
|
96
|
+
prefix = pattern[:idx].rstrip("/")
|
|
97
|
+
rest = pattern[idx + 2:].lstrip("/")
|
|
98
|
+
path_parts = path.split("/")
|
|
99
|
+
if prefix:
|
|
100
|
+
n = len(prefix.split("/"))
|
|
101
|
+
if len(path_parts) < n:
|
|
102
|
+
return False
|
|
103
|
+
if not fnmatch.fnmatch("/".join(path_parts[:n]), prefix):
|
|
104
|
+
return False
|
|
105
|
+
path_parts = path_parts[n:]
|
|
106
|
+
if not rest:
|
|
107
|
+
return True
|
|
108
|
+
for i in range(len(path_parts) + 1):
|
|
109
|
+
if _match_double_star("/".join(path_parts[i:]), rest):
|
|
110
|
+
return True
|
|
111
|
+
return False
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
# ---------------------------------------------------------------------------
|
|
115
|
+
# AST helpers
|
|
116
|
+
# ---------------------------------------------------------------------------
|
|
117
|
+
|
|
118
|
+
def _extract_string_value(node: ast.expr | None) -> str | None:
|
|
119
|
+
if isinstance(node, ast.Constant) and isinstance(node.value, str):
|
|
120
|
+
return node.value
|
|
121
|
+
return None
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def _normalize_target_path(target: str) -> str:
|
|
125
|
+
"""Strip .tmp/.bak/.backup/.temp suffixes → canonical base target."""
|
|
126
|
+
name = Path(target).name
|
|
127
|
+
for suffix in (".tmp", ".bak", ".backup", ".temp"):
|
|
128
|
+
if name.endswith(suffix):
|
|
129
|
+
return str(Path(target).with_name(name[: -len(suffix)]))
|
|
130
|
+
# Also strip uuid-based suffixes: state.abc123.tmp → state
|
|
131
|
+
# Pattern: name.<hex/uuid>.<ext_or_tmp>
|
|
132
|
+
stripped = re.sub(r'\.[0-9a-f\-]{8,}\.tmp$', '', target)
|
|
133
|
+
if stripped != target:
|
|
134
|
+
return stripped
|
|
135
|
+
return target
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
def _is_plausible_path(s: str) -> bool:
|
|
139
|
+
"""True iff s looks like a file path (not a multi-line string or code snippet).
|
|
140
|
+
|
|
141
|
+
Bare filenames (Makefile, Dockerfile, LICENSE, Procfile, README) are valid.
|
|
142
|
+
"""
|
|
143
|
+
if not s or len(s) > 512:
|
|
144
|
+
return False
|
|
145
|
+
if '\n' in s or '\r' in s:
|
|
146
|
+
return False
|
|
147
|
+
# Bare filenames that are valid write targets
|
|
148
|
+
if Path(s).name in {"Makefile", "Dockerfile", "Procfile", "LICENSE", "README"}:
|
|
149
|
+
return True
|
|
150
|
+
# Otherwise must contain at least one path-like character
|
|
151
|
+
if '/' not in s and '\\' not in s and '.' not in s:
|
|
152
|
+
return False
|
|
153
|
+
return True
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def _module_prefix(rel_posix: str) -> str:
|
|
157
|
+
"""First path component = top-level module/package."""
|
|
158
|
+
return rel_posix.split("/")[0]
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
def _safe_domain_name(target: str) -> str:
|
|
162
|
+
"""Convert target path to safe domain name (parent_stem_hash, max 40 chars).
|
|
163
|
+
|
|
164
|
+
Includes parent directory to avoid collisions: api/config.json vs settings/config.yaml.
|
|
165
|
+
Uses stable blake2s hash (deterministic across processes) for collision avoidance.
|
|
166
|
+
"""
|
|
167
|
+
p = Path(target)
|
|
168
|
+
parts = []
|
|
169
|
+
# Add parent directory name if present
|
|
170
|
+
if p.parent.name and p.parent.name not in (".", ""):
|
|
171
|
+
parts.append(p.parent.name)
|
|
172
|
+
# Add filename stem
|
|
173
|
+
parts.append(p.stem)
|
|
174
|
+
# Include first 4 chars of stable blake2s hash for collision avoidance
|
|
175
|
+
target_hash = hashlib.blake2s(
|
|
176
|
+
target.encode("utf-8"),
|
|
177
|
+
digest_size=2,
|
|
178
|
+
).hexdigest()
|
|
179
|
+
parts.append(target_hash)
|
|
180
|
+
raw = "_".join(parts)
|
|
181
|
+
# Sanitize and truncate
|
|
182
|
+
return re.sub(r'[^a-zA-Z0-9_]', '_', raw)[:40]
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
def _seed_covers_target(target: str, seed_domains: list[dict]) -> bool:
|
|
186
|
+
"""True if any seed domain's target_file_patterns matches this target."""
|
|
187
|
+
for domain_def in seed_domains:
|
|
188
|
+
for pattern in domain_def.get("target_file_patterns", []):
|
|
189
|
+
if _match_glob_path(target, pattern):
|
|
190
|
+
return True
|
|
191
|
+
return False
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
def _resolve_call_target(
|
|
195
|
+
call_node: ast.Call,
|
|
196
|
+
assignments: dict[str, str],
|
|
197
|
+
aliases: dict[str, str] | None = None,
|
|
198
|
+
) -> str | None:
|
|
199
|
+
"""Resolve the file-path target of a write call via AST analysis."""
|
|
200
|
+
if aliases is None:
|
|
201
|
+
aliases = {}
|
|
202
|
+
func = call_node.func
|
|
203
|
+
if not isinstance(func, ast.Attribute):
|
|
204
|
+
return None
|
|
205
|
+
receiver = func.value
|
|
206
|
+
# Path("literal").write_text(...) or path.with_suffix(...).write_text(...)
|
|
207
|
+
if isinstance(receiver, ast.Call) and isinstance(receiver.func, (ast.Name, ast.Attribute)):
|
|
208
|
+
fname = receiver.func.id if isinstance(receiver.func, ast.Name) else receiver.func.attr
|
|
209
|
+
if fname in ("Path", "PurePath", "PosixPath", "WindowsPath") and receiver.args:
|
|
210
|
+
return _extract_string_value(receiver.args[0])
|
|
211
|
+
# .with_suffix(...).write_text() or .with_name(...).write_text()
|
|
212
|
+
if fname in ("with_suffix", "with_name", "with_stem"):
|
|
213
|
+
inner = receiver.func.value if isinstance(receiver.func, ast.Attribute) else None
|
|
214
|
+
if isinstance(inner, ast.Name):
|
|
215
|
+
name = inner.id
|
|
216
|
+
resolved = name
|
|
217
|
+
visited: set[str] = {resolved}
|
|
218
|
+
for _ in range(8):
|
|
219
|
+
nxt = aliases.get(resolved)
|
|
220
|
+
if nxt is None or nxt in visited:
|
|
221
|
+
break
|
|
222
|
+
visited.add(nxt)
|
|
223
|
+
resolved = nxt
|
|
224
|
+
return assignments.get(resolved)
|
|
225
|
+
# name.write_text(...) with alias following
|
|
226
|
+
if isinstance(receiver, ast.Name):
|
|
227
|
+
name = receiver.id
|
|
228
|
+
resolved = name
|
|
229
|
+
visited: set[str] = {resolved}
|
|
230
|
+
for _ in range(8):
|
|
231
|
+
nxt = aliases.get(resolved)
|
|
232
|
+
if nxt is None or nxt in visited:
|
|
233
|
+
break
|
|
234
|
+
visited.add(nxt)
|
|
235
|
+
resolved = nxt
|
|
236
|
+
return assignments.get(resolved)
|
|
237
|
+
# self.attr.write_text(...)
|
|
238
|
+
if isinstance(receiver, ast.Attribute) and isinstance(receiver.value, ast.Name):
|
|
239
|
+
return assignments.get("%s.%s" % (receiver.value.id, receiver.attr))
|
|
240
|
+
return None
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
def _resolve_func_arg_target(
|
|
244
|
+
arg_node: ast.expr | None,
|
|
245
|
+
assignments: dict[str, str],
|
|
246
|
+
aliases: dict[str, str],
|
|
247
|
+
) -> str | None:
|
|
248
|
+
"""Resolve a path-like target from a positional call argument.
|
|
249
|
+
|
|
250
|
+
Handles a string literal, ``Path("literal")``, or a variable name that an
|
|
251
|
+
assignment resolved to a string/Path (with alias chaining). Returns None
|
|
252
|
+
when the target cannot be resolved.
|
|
253
|
+
"""
|
|
254
|
+
if arg_node is None:
|
|
255
|
+
return None
|
|
256
|
+
# "literal"
|
|
257
|
+
lit = _extract_string_value(arg_node)
|
|
258
|
+
if lit is not None:
|
|
259
|
+
return lit
|
|
260
|
+
# Path("literal") / PurePath("literal")
|
|
261
|
+
if isinstance(arg_node, ast.Call) and isinstance(arg_node.func, (ast.Name, ast.Attribute)):
|
|
262
|
+
fname = arg_node.func.id if isinstance(arg_node.func, ast.Name) else arg_node.func.attr
|
|
263
|
+
if fname in ("Path", "PurePath", "PosixPath", "WindowsPath") and arg_node.args:
|
|
264
|
+
return _extract_string_value(arg_node.args[0])
|
|
265
|
+
# variable name -> resolve through aliases + assignments
|
|
266
|
+
if isinstance(arg_node, ast.Name):
|
|
267
|
+
resolved = arg_node.id
|
|
268
|
+
visited: set[str] = {resolved}
|
|
269
|
+
for _ in range(8):
|
|
270
|
+
nxt = aliases.get(resolved)
|
|
271
|
+
if nxt is None or nxt in visited:
|
|
272
|
+
break
|
|
273
|
+
visited.add(nxt)
|
|
274
|
+
resolved = nxt
|
|
275
|
+
return assignments.get(resolved)
|
|
276
|
+
return None
|
|
277
|
+
|
|
278
|
+
|
|
279
|
+
def _detect_func_write(
|
|
280
|
+
node: ast.Call,
|
|
281
|
+
assignments: dict[str, str],
|
|
282
|
+
aliases: dict[str, str],
|
|
283
|
+
) -> tuple[str, str | None] | None:
|
|
284
|
+
"""Detect ``open(path, "w")`` and ``json.dump(obj, fp)`` function-call writes.
|
|
285
|
+
|
|
286
|
+
These are plain function calls (``ast.Name`` / module ``ast.Attribute``),
|
|
287
|
+
NOT receiver-method calls, so the standard ``_WRITE_METHOD_NAMES`` scan
|
|
288
|
+
misses them. Returns ``(operation, target_or_None)`` for a write, else None.
|
|
289
|
+
|
|
290
|
+
- ``open(path, mode)`` is a write only when ``mode`` mutates the target
|
|
291
|
+
(see :func:`_open_mode_is_write`). A bare ``open(p)`` / ``open(p, "r")``
|
|
292
|
+
is a READ → returns None (precision guard).
|
|
293
|
+
- ``json.dump(obj, fp)`` writes to ``fp``; ``json.dumps`` (returns a string)
|
|
294
|
+
and ``json.load`` / ``json.loads`` (reads) are NOT writes.
|
|
295
|
+
"""
|
|
296
|
+
func = node.func
|
|
297
|
+
|
|
298
|
+
# open(...) — builtin name (assume builtin; shadowing is rare and out of scope)
|
|
299
|
+
if isinstance(func, ast.Name) and func.id == "open":
|
|
300
|
+
mode = "r" # open() default
|
|
301
|
+
if len(node.args) >= 2:
|
|
302
|
+
lit = _extract_string_value(node.args[1])
|
|
303
|
+
if lit is not None:
|
|
304
|
+
mode = lit
|
|
305
|
+
else:
|
|
306
|
+
for kw in node.keywords:
|
|
307
|
+
if kw.arg == "mode":
|
|
308
|
+
lit = _extract_string_value(kw.value)
|
|
309
|
+
if lit is not None:
|
|
310
|
+
mode = lit
|
|
311
|
+
if not _open_mode_is_write(mode):
|
|
312
|
+
return None
|
|
313
|
+
target = _resolve_func_arg_target(
|
|
314
|
+
node.args[0] if node.args else None, assignments, aliases
|
|
315
|
+
)
|
|
316
|
+
return ("open_write", target)
|
|
317
|
+
|
|
318
|
+
# json.dump(obj, fp, ...) — module attribute call. Target = fp (2nd arg),
|
|
319
|
+
# best-effort (usually a file handle variable). dumps/load/loads excluded.
|
|
320
|
+
if isinstance(func, ast.Attribute) and func.attr == "dump":
|
|
321
|
+
receiver = func.value
|
|
322
|
+
if isinstance(receiver, ast.Name) and receiver.id == "json":
|
|
323
|
+
fp_node = node.args[1] if len(node.args) >= 2 else None
|
|
324
|
+
target = _resolve_func_arg_target(fp_node, assignments, aliases)
|
|
325
|
+
return ("json_dump", target)
|
|
326
|
+
|
|
327
|
+
return None
|
|
328
|
+
|
|
329
|
+
|
|
330
|
+
def _collect_assignments(tree: ast.AST) -> tuple[dict[str, tuple[str, str]], dict[str, str]]:
|
|
331
|
+
"""Return (assignments_typed, aliases).
|
|
332
|
+
|
|
333
|
+
assignments_typed: name -> (string-path, provenance_type)
|
|
334
|
+
- Provenance types: path_constructor, string_literal, function_parameter, unknown
|
|
335
|
+
aliases: name -> other_name (for .with_suffix/.with_name/.with_stem chains)
|
|
336
|
+
|
|
337
|
+
First pass: extract function parameters (lower precedence)
|
|
338
|
+
Second pass: extract assignments (higher precedence, overwrites params)
|
|
339
|
+
"""
|
|
340
|
+
assignments_typed: dict[str, tuple[str, str]] = {}
|
|
341
|
+
aliases: dict[str, str] = {}
|
|
342
|
+
|
|
343
|
+
# PASS 1: Extract function parameters
|
|
344
|
+
for node in ast.walk(tree):
|
|
345
|
+
if not isinstance(node, ast.FunctionDef):
|
|
346
|
+
continue
|
|
347
|
+
for arg in node.args.args:
|
|
348
|
+
# Function parameter: store with empty path, provenance_type = function_parameter
|
|
349
|
+
assignments_typed[arg.arg] = ("", _PROVENANCE_FUNCTION_PARAM)
|
|
350
|
+
|
|
351
|
+
# PASS 2: Extract assignments (overwrites function params if same name)
|
|
352
|
+
for node in ast.walk(tree):
|
|
353
|
+
if not isinstance(node, ast.Assign) or len(node.targets) != 1:
|
|
354
|
+
continue
|
|
355
|
+
target = node.targets[0]
|
|
356
|
+
if isinstance(target, ast.Name):
|
|
357
|
+
key = target.id
|
|
358
|
+
elif isinstance(target, ast.Attribute) and isinstance(target.value, ast.Name):
|
|
359
|
+
key = "%s.%s" % (target.value.id, target.attr)
|
|
360
|
+
else:
|
|
361
|
+
continue
|
|
362
|
+
value = node.value
|
|
363
|
+
if isinstance(value, ast.Constant) and isinstance(value.value, str):
|
|
364
|
+
# String literal assignment
|
|
365
|
+
assignments_typed[key] = (value.value, _PROVENANCE_STRING_LITERAL)
|
|
366
|
+
elif isinstance(value, ast.Call) and isinstance(value.func, (ast.Name, ast.Attribute)):
|
|
367
|
+
fname = value.func.id if isinstance(value.func, ast.Name) else value.func.attr
|
|
368
|
+
if fname in ("Path", "PurePath", "PosixPath", "WindowsPath") and value.args:
|
|
369
|
+
val = _extract_string_value(value.args[0])
|
|
370
|
+
if val is not None:
|
|
371
|
+
# Path constructor
|
|
372
|
+
assignments_typed[key] = (val, _PROVENANCE_PATH_CONSTRUCTOR)
|
|
373
|
+
else:
|
|
374
|
+
assignments_typed[key] = ("", _PROVENANCE_UNKNOWN)
|
|
375
|
+
elif fname in ("with_suffix", "with_name", "with_stem"):
|
|
376
|
+
receiver = value.func.value
|
|
377
|
+
if isinstance(receiver, ast.Name):
|
|
378
|
+
aliases[key] = receiver.id
|
|
379
|
+
return assignments_typed, aliases
|
|
380
|
+
|
|
381
|
+
|
|
382
|
+
def _scan_write_calls(
|
|
383
|
+
tree: ast.AST,
|
|
384
|
+
assignments_typed: dict[str, tuple[str, str]],
|
|
385
|
+
aliases: dict[str, str] | None = None,
|
|
386
|
+
) -> list[WriteCall]:
|
|
387
|
+
"""Return WriteCall objects for each write call found in tree.
|
|
388
|
+
|
|
389
|
+
Args:
|
|
390
|
+
tree: AST tree to scan
|
|
391
|
+
assignments_typed: {var_name: (target_path, provenance_type)}
|
|
392
|
+
aliases: {var_name: alias_var_name} for .with_suffix chains
|
|
393
|
+
|
|
394
|
+
Returns:
|
|
395
|
+
list[WriteCall] with target, operation, line, and provenance
|
|
396
|
+
"""
|
|
397
|
+
if aliases is None:
|
|
398
|
+
aliases = {}
|
|
399
|
+
|
|
400
|
+
# Flatten assignments for target resolution
|
|
401
|
+
assignments = {k: v[0] for k, v in assignments_typed.items()}
|
|
402
|
+
|
|
403
|
+
calls: list[WriteCall] = []
|
|
404
|
+
for node in ast.walk(tree):
|
|
405
|
+
if not isinstance(node, ast.Call):
|
|
406
|
+
continue
|
|
407
|
+
|
|
408
|
+
line_no = node.lineno if hasattr(node, 'lineno') else None
|
|
409
|
+
|
|
410
|
+
# Function-call writes: open(path, "w"), json.dump(obj, fp).
|
|
411
|
+
# These are NOT receiver-method calls, so handle them before the
|
|
412
|
+
# ast.Attribute gate below. Reads (open(p)/open(p,"r")) return None.
|
|
413
|
+
func_write = _detect_func_write(node, assignments, aliases)
|
|
414
|
+
if func_write is not None:
|
|
415
|
+
operation, resolved = func_write
|
|
416
|
+
if resolved is not None and not _is_plausible_path(resolved):
|
|
417
|
+
resolved = _UNKNOWN_TARGET
|
|
418
|
+
target = resolved if resolved is not None else _UNKNOWN_TARGET
|
|
419
|
+
provenance = _PROVENANCE_UNKNOWN
|
|
420
|
+
for var_name, (path, prov_type) in assignments_typed.items():
|
|
421
|
+
if path == target and target != _UNKNOWN_TARGET:
|
|
422
|
+
provenance = prov_type
|
|
423
|
+
break
|
|
424
|
+
calls.append(WriteCall(target=target, operation=operation, line=line_no, provenance=provenance))
|
|
425
|
+
continue
|
|
426
|
+
|
|
427
|
+
if not isinstance(node.func, ast.Attribute):
|
|
428
|
+
continue
|
|
429
|
+
|
|
430
|
+
# Standard write methods: path.write_text(), path.save(), etc.
|
|
431
|
+
if node.func.attr in _WRITE_METHOD_NAMES:
|
|
432
|
+
operation = node.func.attr
|
|
433
|
+
resolved = _resolve_call_target(node, assignments, aliases)
|
|
434
|
+
if resolved is not None and not _is_plausible_path(resolved):
|
|
435
|
+
resolved = _UNKNOWN_TARGET
|
|
436
|
+
target = resolved if resolved is not None else _UNKNOWN_TARGET
|
|
437
|
+
|
|
438
|
+
# Determine provenance from assignments_typed
|
|
439
|
+
provenance = _PROVENANCE_UNKNOWN
|
|
440
|
+
for var_name, (path, prov_type) in assignments_typed.items():
|
|
441
|
+
if path == target and target != _UNKNOWN_TARGET:
|
|
442
|
+
provenance = prov_type
|
|
443
|
+
break
|
|
444
|
+
|
|
445
|
+
calls.append(WriteCall(target=target, operation=operation, line=line_no, provenance=provenance))
|
|
446
|
+
continue
|
|
447
|
+
|
|
448
|
+
# os.replace(src, dst) — dst is second positional arg
|
|
449
|
+
if (node.func.attr == "replace" and
|
|
450
|
+
isinstance(node.func.value, ast.Name) and
|
|
451
|
+
node.func.value.id == "os" and
|
|
452
|
+
len(node.args) >= 2):
|
|
453
|
+
operation = "os.replace"
|
|
454
|
+
dst_node = node.args[1]
|
|
455
|
+
if isinstance(dst_node, ast.Name):
|
|
456
|
+
name = dst_node.id
|
|
457
|
+
resolved = name
|
|
458
|
+
visited: set[str] = {resolved}
|
|
459
|
+
for _ in range(8):
|
|
460
|
+
nxt = aliases.get(resolved)
|
|
461
|
+
if nxt is None or nxt in visited:
|
|
462
|
+
break
|
|
463
|
+
visited.add(nxt)
|
|
464
|
+
resolved = nxt
|
|
465
|
+
target = assignments.get(resolved, _UNKNOWN_TARGET)
|
|
466
|
+
|
|
467
|
+
# Determine provenance
|
|
468
|
+
provenance = _PROVENANCE_UNKNOWN
|
|
469
|
+
for var_name, (path, prov_type) in assignments_typed.items():
|
|
470
|
+
if path == target and target != _UNKNOWN_TARGET:
|
|
471
|
+
provenance = prov_type
|
|
472
|
+
break
|
|
473
|
+
|
|
474
|
+
calls.append(WriteCall(target=target, operation=operation, line=line_no, provenance=provenance))
|
|
475
|
+
|
|
476
|
+
return calls
|
|
477
|
+
|
|
478
|
+
|
|
479
|
+
# ---------------------------------------------------------------------------
|
|
480
|
+
# Per-file scan
|
|
481
|
+
# ---------------------------------------------------------------------------
|
|
482
|
+
|
|
483
|
+
def _scan_writers(
    project_dir: Path,
    include_roots: Sequence[str] | None,
) -> dict[str, list[WriteCall]]:
    """Return mapping writer_rel_posix -> list[WriteCall].

    Every file yielded by ``iter_py_files`` is read as UTF-8 (undecodable
    bytes replaced), parsed, and scanned for write calls on a thread pool of
    at most 8 workers. Files that cannot be read or parsed, files with no
    write calls, and files that are not under ``project_dir`` are left out.
    The returned dict is ordered by relative path.
    """
    from concurrent.futures import ThreadPoolExecutor, as_completed
    import os as _os

    project_dir = project_dir.resolve()
    py_files = sorted(iter_py_files(project_dir, include_roots))

    def _is_candidate(node: ast.AST) -> bool:
        # Coarse "could this be a write?" test. Mode and target precision is
        # decided later in _scan_write_calls.
        if not isinstance(node, ast.Call):
            return False
        func = node.func
        if isinstance(func, ast.Name):
            return func.id == "open"  # mode filtered later
        if isinstance(func, ast.Attribute):
            if func.attr in _WRITE_METHOD_NAMES:
                return True
            if isinstance(func.value, ast.Name):
                owner = func.value.id
                if func.attr == "dump" and owner == "json":
                    return True
                if func.attr == "replace" and owner == "os":
                    return True
        return False

    def _scan_one(py_file: Path) -> tuple[str, list[WriteCall]] | None:
        try:
            source = py_file.read_text(encoding="utf-8", errors="replace")
            tree = ast.parse(source, filename=str(py_file))
        except (OSError, SyntaxError, ValueError, RecursionError) as exc:
            # ValueError: source containing null bytes. RecursionError:
            # pathologically nested source. Either would otherwise escape
            # through future.result() and abort the scan of every file.
            _log.debug("_scan_writers: skipping %s: %s", py_file, exc)
            return None

        # Skip alias/assignment collection for files with no write-shaped call.
        if not any(_is_candidate(node) for node in ast.walk(tree)):
            return None

        assignments_typed, aliases = _collect_assignments(tree)
        write_calls = _scan_write_calls(tree, assignments_typed, aliases)
        if not write_calls:
            return None
        try:
            rel = py_file.resolve().relative_to(project_dir).as_posix()
        except ValueError:
            _log.debug("_scan_writers: cannot relativize %s", py_file)
            return None
        return rel, write_calls

    raw: dict[str, list[WriteCall]] = {}
    max_workers = min(8, (_os.cpu_count() or 4))
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = {pool.submit(_scan_one, f): f for f in py_files}
        for future in as_completed(futures):
            out = future.result()
            if out is not None:
                rel, write_calls = out
                raw[rel] = write_calls

    # Sort for determinism (as_completed order is arbitrary).
    result = dict(sorted(raw.items()))
    _log.debug("_scan_writers: found %d writer file(s)", len(result))
    return result
|
|
553
|
+
|
|
554
|
+
|
|
555
|
+
# ---------------------------------------------------------------------------
|
|
556
|
+
# Classification + domain matching
|
|
557
|
+
# ---------------------------------------------------------------------------
|
|
558
|
+
|
|
559
|
+
def _classify_writer(rel_path: str, allowed_writers: tuple[str, ...]) -> str:
|
|
560
|
+
return "canonical_write" if rel_path in allowed_writers else "illegal_write"
|
|
561
|
+
|
|
562
|
+
|
|
563
|
+
def _writer_matches_domain(targets: list[str], patterns: tuple[str, ...]) -> bool:
|
|
564
|
+
"""True if any resolved (non-unknown) target matches any domain pattern."""
|
|
565
|
+
for target in targets:
|
|
566
|
+
if target == _UNKNOWN_TARGET:
|
|
567
|
+
continue
|
|
568
|
+
for pattern in patterns:
|
|
569
|
+
if _match_glob_path(target, pattern):
|
|
570
|
+
return True
|
|
571
|
+
return False
|
|
572
|
+
|
|
573
|
+
|
|
574
|
+
# ---------------------------------------------------------------------------
|
|
575
|
+
# Seed loading
|
|
576
|
+
# ---------------------------------------------------------------------------
|
|
577
|
+
|
|
578
|
+
def _load_seed(project_dir: Path) -> list[dict] | None:
    """Read and validate the authority-domains seed file.

    Returns:
        The seed's "domains" list (empty list when the key is absent), or
        None when no seed file exists.

    Raises:
        MapIntegrityError: the file cannot be read or decoded as JSON, the
            top-level value is not an object, "schema_version" is missing or
            unparseable, its major version exceeds 1, or "domains" is not a
            list.
    """
    seed_path = seeds_dir(project_dir) / _SEED_FILENAME
    if not seed_path.exists():
        _log.info("build_authority_map: no seed at %s, returning empty map", seed_path)
        return None

    try:
        text = seed_path.read_text(encoding="utf-8")
        payload = json.loads(text)
    except (json.JSONDecodeError, OSError, UnicodeDecodeError) as exc:
        raise MapIntegrityError(
            "authority seed corrupt (JSON parse failed): %s -- %s" % (seed_path, exc)
        ) from exc

    if not isinstance(payload, dict):
        raise MapIntegrityError(
            "authority seed must be a JSON object, got %s" % type(payload).__name__
        )

    version = payload.get("schema_version")
    if version is None:
        raise MapIntegrityError(
            "authority seed missing required field 'schema_version' in %s" % seed_path
        )

    # Only the leading component ("1" in "1.4") decides compatibility.
    try:
        major = int(str(version).partition(".")[0])
    except (ValueError, IndexError) as exc:
        raise MapIntegrityError(
            "authority seed has unparseable schema_version %r in %s" % (version, seed_path)
        ) from exc
    if major > 1:
        raise MapIntegrityError(
            "authority seed schema_version %r has major version %d > 1 -- "
            "upgrade the builder to read this seed" % (version, major)
        )

    seed_domains = payload.get("domains", [])
    if not isinstance(seed_domains, list):
        raise MapIntegrityError(
            "authority seed 'domains' must be a list, got %s" % type(seed_domains).__name__
        )

    _log.debug("_load_seed: loaded %d domain(s) from %s", len(seed_domains), seed_path)
    return seed_domains
|
|
617
|
+
|
|
618
|
+
|
|
619
|
+
# ---------------------------------------------------------------------------
|
|
620
|
+
# Auto-discovery: collect targets and infer domains (seed-free)
|
|
621
|
+
# ---------------------------------------------------------------------------
|
|
622
|
+
|
|
623
|
+
def _collect_auto_write_targets(
|
|
624
|
+
writers_map: dict[str, list[WriteCall]],
|
|
625
|
+
adapter_candidates: dict[str, list],
|
|
626
|
+
) -> tuple[dict[str, list[str]], dict[str, list[tuple[str, WriteCall | None]]]]:
|
|
627
|
+
"""Collect target -> [writer_rel_posix] mapping and WriteCall tracking.
|
|
628
|
+
|
|
629
|
+
Returns:
|
|
630
|
+
(target_to_writers, target_to_write_calls) where:
|
|
631
|
+
- target_to_writers: target -> [writer_rel_posix] (backward compat)
|
|
632
|
+
- target_to_write_calls: target -> [(writer_rel, WriteCall|None), ...]
|
|
633
|
+
WriteCall is None for non-Python adapter writers (no AST info).
|
|
634
|
+
|
|
635
|
+
target keys are normalized (tmp/bak stripped).
|
|
636
|
+
Only non-UNKNOWN targets included.
|
|
637
|
+
Result is sorted for determinism.
|
|
638
|
+
"""
|
|
639
|
+
target_to_writers: dict[str, list[str]] = {}
|
|
640
|
+
target_to_write_calls: dict[str, list[tuple[str, WriteCall | None]]] = {}
|
|
641
|
+
|
|
642
|
+
# Python AST writers
|
|
643
|
+
for writer_rel, write_calls in sorted(writers_map.items()):
|
|
644
|
+
for write_call in write_calls:
|
|
645
|
+
target = write_call.target
|
|
646
|
+
if target == _UNKNOWN_TARGET:
|
|
647
|
+
continue
|
|
648
|
+
base = _normalize_target_path(target)
|
|
649
|
+
target_to_writers.setdefault(base, []).append(writer_rel)
|
|
650
|
+
target_to_write_calls.setdefault(base, []).append((writer_rel, write_call))
|
|
651
|
+
|
|
652
|
+
# Non-Python adapter writers (TS/JS/Go/Java etc.)
|
|
653
|
+
for writer_rel, candidates in sorted(adapter_candidates.items()):
|
|
654
|
+
for candidate in candidates:
|
|
655
|
+
if not candidate.target_hint:
|
|
656
|
+
continue
|
|
657
|
+
base = _normalize_target_path(candidate.target_hint)
|
|
658
|
+
target_to_writers.setdefault(base, []).append(writer_rel)
|
|
659
|
+
# No WriteCall object available for adapters
|
|
660
|
+
target_to_write_calls.setdefault(base, []).append((writer_rel, None))
|
|
661
|
+
|
|
662
|
+
# Deduplicate and sort writers lists for determinism
|
|
663
|
+
target_to_writers_result = {
|
|
664
|
+
target: sorted(dict.fromkeys(writers))
|
|
665
|
+
for target, writers in sorted(target_to_writers.items())
|
|
666
|
+
}
|
|
667
|
+
|
|
668
|
+
# Deduplicate WriteCall entries (keep first occurrence per writer_rel)
|
|
669
|
+
target_to_write_calls_result = {}
|
|
670
|
+
for target in sorted(target_to_write_calls.keys()):
|
|
671
|
+
# Keep only first WriteCall per writer_rel for this target
|
|
672
|
+
seen_writers: dict[str, tuple[str, WriteCall | None]] = {}
|
|
673
|
+
for writer_rel, write_call in target_to_write_calls[target]:
|
|
674
|
+
if writer_rel not in seen_writers:
|
|
675
|
+
seen_writers[writer_rel] = (writer_rel, write_call)
|
|
676
|
+
target_to_write_calls_result[target] = list(seen_writers.values())
|
|
677
|
+
|
|
678
|
+
return target_to_writers_result, target_to_write_calls_result
|
|
679
|
+
|
|
680
|
+
|
|
681
|
+
def _auto_discover_domains(
|
|
682
|
+
target_to_writers: dict[str, list[str]],
|
|
683
|
+
target_to_write_calls: dict[str, list[tuple[str, WriteCall | None]]],
|
|
684
|
+
seed_domains: list[dict],
|
|
685
|
+
) -> list[dict]:
|
|
686
|
+
"""Find shared-write clusters not covered by seed.
|
|
687
|
+
|
|
688
|
+
Returns synthetic domain defs for build_authority_map() merge loop.
|
|
689
|
+
Only includes groups with 2+ writers from DIFFERENT module prefixes.
|
|
690
|
+
Seed-covered targets are skipped.
|
|
691
|
+
Test-only shared writes (all writers non-production) are skipped.
|
|
692
|
+
|
|
693
|
+
Args:
|
|
694
|
+
target_to_writers: target -> [writer_rel] mapping (backward compat usage)
|
|
695
|
+
target_to_write_calls: target -> [(writer_rel, WriteCall|None)] mapping
|
|
696
|
+
seed_domains: list of seed domain definitions
|
|
697
|
+
"""
|
|
698
|
+
auto_domains = []
|
|
699
|
+
for target, writers in target_to_writers.items():
|
|
700
|
+
if len(writers) < 2:
|
|
701
|
+
continue
|
|
702
|
+
# Must come from different module prefixes
|
|
703
|
+
prefixes = {_module_prefix(w) for w in writers}
|
|
704
|
+
if len(prefixes) < 2:
|
|
705
|
+
continue
|
|
706
|
+
# Skip if any seed domain already covers this target
|
|
707
|
+
if _seed_covers_target(target, seed_domains):
|
|
708
|
+
continue
|
|
709
|
+
# Skip if all writers are non-production (test/fixture/generated)
|
|
710
|
+
roles = {classify_file_role(w) for w in writers}
|
|
711
|
+
if "production" not in roles:
|
|
712
|
+
continue
|
|
713
|
+
auto_domains.append({
|
|
714
|
+
"_auto": True,
|
|
715
|
+
"authority_domain": f"shared_write:{_safe_domain_name(target)}",
|
|
716
|
+
"canonical_owner": "",
|
|
717
|
+
"allowed_writers": [],
|
|
718
|
+
"target_file_patterns": [target],
|
|
719
|
+
"_shared_target": target,
|
|
720
|
+
"_all_writers": writers,
|
|
721
|
+
"_write_calls": target_to_write_calls.get(target, []),
|
|
722
|
+
})
|
|
723
|
+
return auto_domains
|
|
724
|
+
|
|
725
|
+
|
|
726
|
+
def _safe_writer_domain_name(writer_rel: str) -> str:
|
|
727
|
+
"""Stable, filesystem-safe domain name for a writer file (no-seed mode)."""
|
|
728
|
+
digest = hashlib.blake2s(writer_rel.encode("utf-8"), digest_size=2).hexdigest()
|
|
729
|
+
stem = Path(writer_rel).stem
|
|
730
|
+
raw = "%s_%s" % (stem, digest)
|
|
731
|
+
return "auto_discovered:" + re.sub(r"[^a-zA-Z0-9_]", "_", raw)[:48]
|
|
732
|
+
|
|
733
|
+
|
|
734
|
+
def _build_no_seed_writer_domains(
    writers_map: dict[str, list[WriteCall]],
    adapter_candidates: dict[str, list[AuthorityWriteCandidate]],
) -> list[AuthorityDomain]:
    """Turn every discovered writer file into its own inferred domain.

    Intended for the no-seed case.  Each writer file (Python AST results and
    adapter candidates combined, visited in sorted order) yields one
    AuthorityDomain whose canonical owner and only allowed writer is that
    file.  ``writers_detected`` holds one JSON-encoded record per write site
    (location, kind, target, operation, line, provenance, file_role), and
    ``target_file_patterns`` holds the sorted distinct resolved targets.

    Write sites with an unknown/empty target are still recorded, with an
    empty "target" field; they just contribute no pattern.  Metadata comes
    from ``make_metadata(source="static_scan", confidence=0.5,
    status="inferred")``.
    """
    meta = make_metadata(source="static_scan", confidence=0.5, status="inferred")
    inferred: list[AuthorityDomain] = []

    def _entry_order(entry: dict):
        # Deterministic order inside one domain: line, then target, then operation
        return (entry.get("line") or 0, entry.get("target", ""), entry.get("operation", ""))

    # union of all writer files (Python AST + non-Python adapter), sorted
    for writer_rel in sorted(set(writers_map).union(adapter_candidates)):
        entries: list[dict] = []
        seen_targets: list[str] = []

        # Python AST write calls
        for call in writers_map.get(writer_rel, []):
            resolved = "" if call.target == _UNKNOWN_TARGET else call.target
            if resolved:
                resolved = _normalize_target_path(resolved)
                seen_targets.append(resolved)
            entries.append({
                "location": writer_rel,
                "kind": "write",
                "target": resolved,
                "operation": call.operation,
                "line": call.line,
                "provenance": call.provenance,
                "file_role": classify_file_role(writer_rel),
            })

        # Non-Python adapter candidates (Go/Java/JS/TS)
        for cand in adapter_candidates.get(writer_rel, []):
            resolved = cand.target_hint or ""
            if resolved:
                resolved = _normalize_target_path(resolved)
                seen_targets.append(resolved)
            entries.append({
                "location": writer_rel,
                "kind": cand.write_kind,
                "target": resolved,
                "operation": cand.write_kind,
                "line": cand.line,
                "provenance": _PROVENANCE_UNKNOWN,
                "file_role": classify_file_role(writer_rel),
            })

        if not entries:
            continue

        entries.sort(key=_entry_order)
        patterns = tuple(sorted({t for t in seen_targets if t}))
        encoded_entries = tuple(json.dumps(entry, sort_keys=True) for entry in entries)

        inferred.append(AuthorityDomain(
            authority_domain=_safe_writer_domain_name(writer_rel),
            canonical_owner=writer_rel,
            allowed_writers=(writer_rel,),
            derived_readers=(),
            cache_layers=(),
            freshness_sla="immediate",
            invalidation_rule="unknown",
            drift_policy="observe",
            writers_detected=encoded_entries,
            last_drift_events=(),
            target_file_patterns=patterns,
            source=meta["source"],
            evidence=tuple(meta["evidence"]),
            confidence=meta["confidence"],
            freshness=meta["freshness"],
            status=meta["status"],
        ))

    return inferred
|
|
818
|
+
|
|
819
|
+
|
|
820
|
+
# ---------------------------------------------------------------------------
|
|
821
|
+
# Non-Python adapter writer collection (L7a)
|
|
822
|
+
# ---------------------------------------------------------------------------
|
|
823
|
+
|
|
824
|
+
def _collect_adapter_writer_candidates(
    project_dir: Path,
    include_roots: Sequence[str] | None,
) -> dict[str, list[AuthorityWriteCandidate]]:
    """Gather write candidates from non-Python source files.

    Walks ``iter_source_files`` and, for each file whose adapter is not the
    Python one and advertises ``supports_authority_writes``, asks the adapter
    to ``extract_writer_calls``.  Files that cannot be read, whose adapter
    raises, that produce no candidates, or that cannot be expressed relative
    to ``project_dir`` are skipped (with a debug log where applicable).

    Returns:
        Mapping of project-relative POSIX path -> candidates for that file.
    """
    found: dict[str, list[AuthorityWriteCandidate]] = {}
    project_dir = project_dir.resolve()

    for src_file in iter_source_files(project_dir, include_roots=include_roots):
        adapter = get_adapter_for_file(src_file)
        # Python is covered by the AST pass; other adapters must opt in.
        if (
            adapter is None
            or adapter.language == "python"
            or not getattr(adapter, "supports_authority_writes", False)
        ):
            continue

        try:
            text = src_file.read_text(encoding="utf-8", errors="replace")
        except OSError as exc:
            _log.debug("_collect_adapter_writer_candidates: skipping %s: %s", src_file, exc)
            continue

        # Adapter failures are logged and skipped so one file cannot stop the walk.
        try:
            candidates = adapter.extract_writer_calls(text, src_file)  # type: ignore[attr-defined]
        except Exception as exc:  # noqa: BLE001
            _log.debug(
                "_collect_adapter_writer_candidates: error in %s for %s: %s",
                adapter.language, src_file, exc,
            )
            continue
        if not candidates:
            continue

        try:
            rel = src_file.resolve().relative_to(project_dir).as_posix()
        except ValueError:
            _log.debug("_collect_adapter_writer_candidates: cannot relativize %s", src_file)
            continue
        found[rel] = candidates

    _log.debug(
        "_collect_adapter_writer_candidates: found %d non-Python writer file(s)", len(found)
    )
    return found
|
|
868
|
+
|
|
869
|
+
|
|
870
|
+
# ---------------------------------------------------------------------------
|
|
871
|
+
# Public API
|
|
872
|
+
# ---------------------------------------------------------------------------
|
|
873
|
+
|
|
874
|
+
def build_authority_map(
    project_dir: Path,
    include_roots: Sequence[str] | None = None,
    parse_cache: Any | None = None,
) -> list[AuthorityDomain]:
    """Build authority map for a target project.

    Reads seed from <project_dir>/.cortex/map_seeds/authority_domains.json.
    Each domain's ``target_file_patterns`` controls which writers are attributed
    to it via AST-resolved write-target matching. Missing patterns -> no
    auto-discovery for that domain.

    For a Python writer attributed to a seed domain, the reported
    operation/line/provenance come from the highest-provenance write call
    *whose target matches the domain's patterns* -- never from an unrelated
    write elsewhere in the same file.

    Also performs seed-free auto-discovery: detects shared write targets
    (2+ writers from different module prefixes) and creates inferred domains.

    When NO seed file exists, additionally auto-surfaces every discovered write
    site as an inferred per-writer domain (out-of-box usefulness). With a seed
    present this step is skipped to preserve the structured behaviour.

    Args:
        project_dir: Project root; resolved before use.
        include_roots: Forwarded to the Python and adapter scanners.
        parse_cache: Accepted for API uniformity; not used by this builder.

    Returns:
        Seed domains first (seed order), then auto-discovered shared-write
        domains, then -- only without a seed -- per-writer domains.  Empty
        list only if no seed file exists AND no write sites were found.

    Raises:
        MapIntegrityError: if seed is corrupt, has incompatible version, or
            contains a domain entry that is not a dict / lacks
            'authority_domain'.
    """
    project_dir = Path(project_dir).resolve()
    _log.info("build_authority_map: starting for %s", project_dir)
    # parse_cache is accepted for API uniformity with other builders.
    # _scan_writers uses ThreadPoolExecutor internally, and ParseCacheL1 is
    # not thread-safe, so the cache is not passed into the threaded scan path.
    if parse_cache is not None:
        _log.debug("build_authority_map: parse_cache provided but not used in threaded _scan_writers")
    domains_raw = _load_seed(project_dir)
    no_seed = domains_raw is None  # no seed file at all -> auto-surface mode
    seed_list: list[dict] = domains_raw or []

    # ALWAYS scan writers (not gated behind seed anymore)
    _log.info("build_authority_map: scanning writers via AST in %s", project_dir)
    writers_map = _scan_writers(project_dir, include_roots)
    _log.debug("build_authority_map: %d Python writer file(s)", len(writers_map))

    # L7a: collect non-Python write candidates via adapter dispatch
    adapter_candidates = _collect_adapter_writer_candidates(project_dir, include_roots)
    _log.debug(
        "build_authority_map: %d non-Python writer file(s)", len(adapter_candidates)
    )

    # Collect shared write targets (Python + non-Python)
    target_to_writers, target_to_write_calls = _collect_auto_write_targets(writers_map, adapter_candidates)

    metadata = make_metadata(source="seed + static_scan", confidence=0.85, status="observed")
    results: list[AuthorityDomain] = []

    # Ranking used to pick the most significant write call per writer.
    # Priority: path_constructor > string_literal > function_parameter > unknown
    # Built once here; it does not depend on the domain or writer.
    prov_priority = {
        _PROVENANCE_PATH_CONSTRUCTOR: 3,
        _PROVENANCE_STRING_LITERAL: 2,
        _PROVENANCE_FUNCTION_PARAM: 1,
        _PROVENANCE_UNKNOWN: 0,
    }

    for domain_def in seed_list:
        if not isinstance(domain_def, dict):
            raise MapIntegrityError(
                "authority seed domain entry must be a dict, got %s" % type(domain_def).__name__
            )
        authority_domain = str(domain_def.get("authority_domain", ""))
        if not authority_domain:
            raise MapIntegrityError(
                "authority seed domain entry missing 'authority_domain' field: %r" % domain_def
            )

        allowed_writers: tuple[str, ...] = tuple(domain_def.get("allowed_writers", []))
        target_file_patterns: tuple[str, ...] = tuple(domain_def.get("target_file_patterns", []))

        if not target_file_patterns:
            _log.info(
                "build_authority_map: domain=%s has no target_file_patterns -- "
                "skipping auto-discovery",
                authority_domain,
            )

        seen_locations: set[str] = set()
        writers_detected_dicts: list[dict] = []
        if target_file_patterns:
            # Python AST writers
            for writer_path, write_calls in sorted(writers_map.items()):
                if writer_path in seen_locations:
                    continue
                # Keep only the calls that actually hit this domain, so the
                # reported operation/line cannot come from a write to some
                # unrelated file in the same module.
                matching_calls = [
                    wc for wc in write_calls
                    if _writer_matches_domain([wc.target], target_file_patterns)
                ]
                if not matching_calls:
                    continue
                kind = _classify_writer(writer_path, allowed_writers)
                # max() returns the first maximal element, so ties resolve to
                # the earliest matching call in scan order.
                best_wc = max(matching_calls, key=lambda wc: prov_priority.get(wc.provenance, -1))
                writers_detected_dicts.append({
                    "location": writer_path,
                    "kind": kind,
                    "file_role": classify_file_role(writer_path),
                    "operation": best_wc.operation,
                    "line": best_wc.line,
                    "provenance": best_wc.provenance,
                })
                seen_locations.add(writer_path)

            # Non-Python adapter writers (L7a)
            for writer_path, aw_candidates in sorted(adapter_candidates.items()):
                if writer_path in seen_locations:
                    continue
                # Use target_hint values as synthetic targets for domain matching.
                # Empty hints are treated as unknown targets (same as Python's
                # _UNKNOWN_TARGET) and do not contribute to domain matching.
                synthetic_targets = [
                    c.target_hint for c in aw_candidates if c.target_hint
                ]
                if not synthetic_targets:
                    continue
                if _writer_matches_domain(synthetic_targets, target_file_patterns):
                    kind = _classify_writer(writer_path, allowed_writers)
                    writers_detected_dicts.append({
                        "location": writer_path,
                        "kind": kind,
                        "file_role": classify_file_role(writer_path),
                    })
                    seen_locations.add(writer_path)

        domain = AuthorityDomain(
            authority_domain=authority_domain,
            canonical_owner=str(domain_def.get("canonical_owner", "")),
            allowed_writers=allowed_writers,
            derived_readers=tuple(domain_def.get("derived_readers", [])),
            cache_layers=tuple(domain_def.get("cache_layers", [])),
            freshness_sla=str(domain_def.get("freshness_sla", "immediate")),
            invalidation_rule=str(domain_def.get("invalidation_rule", "")),
            drift_policy=str(domain_def.get("drift_policy", "fail_close")),
            writers_detected=tuple(json.dumps(w, sort_keys=True) for w in writers_detected_dicts),
            last_drift_events=(),
            target_file_patterns=target_file_patterns,
            source=metadata["source"],
            evidence=tuple(metadata["evidence"]),
            confidence=metadata["confidence"],
            freshness=metadata["freshness"],
            status=metadata["status"],
        )
        results.append(domain)
        _log.debug(
            "build_authority_map: domain=%s patterns=%d writers_detected=%d",
            authority_domain, len(target_file_patterns), len(writers_detected_dicts),
        )

    # --- Auto-discovered domains (seed-free) ---
    auto_domains = _auto_discover_domains(target_to_writers, target_to_write_calls, seed_list)
    auto_metadata = make_metadata(source="auto_scan", confidence=0.6, status="inferred")

    for ad in auto_domains:
        writers_detected_list = []
        # Build mapping of writer -> WriteCall for quick lookup
        write_calls_by_writer: dict[str, WriteCall | None] = {}
        for writer_rel, write_call in ad["_write_calls"]:
            write_calls_by_writer[writer_rel] = write_call

        for w in ad["_all_writers"]:
            # Look up WriteCall for this writer
            write_call = write_calls_by_writer.get(w)

            if write_call is not None:
                # Python AST writer with full WriteCall information
                provenance = write_call.provenance
                operation = write_call.operation
                line = write_call.line
            else:
                # Non-Python adapter writer (no AST info available)
                provenance = _PROVENANCE_UNKNOWN
                operation = "unknown"
                line = None

            writers_detected_list.append({
                "location": w,
                "kind": "shared_write",
                "target": ad["_shared_target"],
                "module_prefix": _module_prefix(w),
                "file_role": classify_file_role(w),
                "operation": operation,
                "line": line,
                "provenance": provenance,
            })

        writers_detected = tuple(
            json.dumps(w, sort_keys=True)
            for w in writers_detected_list
        )
        results.append(AuthorityDomain(
            authority_domain=ad["authority_domain"],
            canonical_owner="",
            allowed_writers=(),
            derived_readers=(),
            cache_layers=(),
            freshness_sla="immediate",
            invalidation_rule="unknown",
            drift_policy="observe",
            writers_detected=writers_detected,
            last_drift_events=(),
            target_file_patterns=tuple(ad["target_file_patterns"]),
            source=auto_metadata["source"],
            evidence=tuple(auto_metadata["evidence"]),
            confidence=auto_metadata["confidence"],
            freshness=auto_metadata["freshness"],
            status=auto_metadata["status"],
        ))
        _log.debug(
            "build_authority_map: auto domain=%s target=%s writers_detected=%d",
            ad["authority_domain"], ad["_shared_target"], len(ad["_all_writers"]),
        )

    # --- No-seed auto-surface (out-of-box) ---
    # When NO seed file exists, the per-domain loop above never runs and the
    # shared-write heuristic only catches multi-writer targets, so most projects
    # got an empty authority map. Surface every discovered writer (Python +
    # adapter) as an inferred per-writer domain so the map is useful immediately.
    # When a seed exists we keep the structured behaviour and do NOT add these
    # (avoids double-surfacing writers already attributed to seed domains).
    no_seed_count = 0
    if no_seed:
        no_seed_domains = _build_no_seed_writer_domains(writers_map, adapter_candidates)
        results.extend(no_seed_domains)
        no_seed_count = len(no_seed_domains)

    _log.info(
        "build_authority_map: completed %d domain(s) (seed=%d auto=%d no_seed=%d), %d writer file(s) scanned",
        len(results), sum(1 for r in results if r.status == "observed"),
        len(auto_domains), no_seed_count, len(writers_map),
    )
    return results
|