dotscope 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (114) hide show
  1. dotscope/.scope +63 -0
  2. dotscope/__init__.py +3 -0
  3. dotscope/absorber.py +390 -0
  4. dotscope/assertions.py +128 -0
  5. dotscope/ast_analyzer.py +2 -0
  6. dotscope/backtest.py +2 -0
  7. dotscope/bench.py +141 -0
  8. dotscope/budget.py +3 -0
  9. dotscope/cache.py +2 -0
  10. dotscope/check/__init__.py +1 -0
  11. dotscope/check/acknowledge.py +2 -0
  12. dotscope/check/checker.py +3 -0
  13. dotscope/check/checks/__init__.py +1 -0
  14. dotscope/check/checks/antipattern.py +2 -0
  15. dotscope/check/checks/boundary.py +2 -0
  16. dotscope/check/checks/contracts.py +3 -0
  17. dotscope/check/checks/direction.py +2 -0
  18. dotscope/check/checks/intent.py +2 -0
  19. dotscope/check/checks/stability.py +2 -0
  20. dotscope/check/constraints.py +2 -0
  21. dotscope/check/models.py +15 -0
  22. dotscope/cli.py +1447 -0
  23. dotscope/composer.py +147 -0
  24. dotscope/constants.py +45 -0
  25. dotscope/context.py +60 -0
  26. dotscope/counterfactual.py +180 -0
  27. dotscope/debug.py +220 -0
  28. dotscope/discovery.py +104 -0
  29. dotscope/formatter.py +157 -0
  30. dotscope/graph.py +3 -0
  31. dotscope/health.py +212 -0
  32. dotscope/help.py +204 -0
  33. dotscope/history.py +6 -0
  34. dotscope/hooks.py +2 -0
  35. dotscope/ingest.py +858 -0
  36. dotscope/intent.py +618 -0
  37. dotscope/lessons.py +223 -0
  38. dotscope/matcher.py +104 -0
  39. dotscope/mcp_server.py +1081 -0
  40. dotscope/models/.scope +45 -0
  41. dotscope/models/__init__.py +7 -0
  42. dotscope/models/core.py +288 -0
  43. dotscope/models/history.py +73 -0
  44. dotscope/models/intent.py +213 -0
  45. dotscope/models/passes.py +58 -0
  46. dotscope/models/state.py +250 -0
  47. dotscope/models.py +9 -0
  48. dotscope/near_miss.py +3 -0
  49. dotscope/onboarding.py +2 -0
  50. dotscope/parser.py +387 -0
  51. dotscope/passes/.scope +105 -0
  52. dotscope/passes/__init__.py +1 -0
  53. dotscope/passes/ast_analyzer.py +508 -0
  54. dotscope/passes/backtest.py +198 -0
  55. dotscope/passes/budget_allocator.py +164 -0
  56. dotscope/passes/convention_compliance.py +40 -0
  57. dotscope/passes/convention_discovery.py +247 -0
  58. dotscope/passes/convention_parser.py +223 -0
  59. dotscope/passes/graph_builder.py +299 -0
  60. dotscope/passes/history_miner.py +336 -0
  61. dotscope/passes/incremental.py +149 -0
  62. dotscope/passes/lang/__init__.py +38 -0
  63. dotscope/passes/lang/_base.py +20 -0
  64. dotscope/passes/lang/_treesitter.py +93 -0
  65. dotscope/passes/lang/go.py +333 -0
  66. dotscope/passes/lang/javascript.py +348 -0
  67. dotscope/passes/lazy.py +152 -0
  68. dotscope/passes/semantic_diff.py +160 -0
  69. dotscope/passes/sentinel/__init__.py +1 -0
  70. dotscope/passes/sentinel/acknowledge.py +222 -0
  71. dotscope/passes/sentinel/checker.py +383 -0
  72. dotscope/passes/sentinel/checks/__init__.py +1 -0
  73. dotscope/passes/sentinel/checks/antipattern.py +84 -0
  74. dotscope/passes/sentinel/checks/boundary.py +46 -0
  75. dotscope/passes/sentinel/checks/contracts.py +148 -0
  76. dotscope/passes/sentinel/checks/convention.py +54 -0
  77. dotscope/passes/sentinel/checks/direction.py +71 -0
  78. dotscope/passes/sentinel/checks/intent.py +207 -0
  79. dotscope/passes/sentinel/checks/stability.py +66 -0
  80. dotscope/passes/sentinel/checks/voice.py +108 -0
  81. dotscope/passes/sentinel/constraints.py +472 -0
  82. dotscope/passes/sentinel/line_filter.py +88 -0
  83. dotscope/passes/sentinel/models.py +15 -0
  84. dotscope/passes/virtual.py +239 -0
  85. dotscope/passes/voice.py +162 -0
  86. dotscope/passes/voice_defaults.py +28 -0
  87. dotscope/passes/voice_discovery.py +245 -0
  88. dotscope/paths.py +32 -0
  89. dotscope/progress.py +44 -0
  90. dotscope/regression.py +147 -0
  91. dotscope/resolver.py +203 -0
  92. dotscope/scanner.py +246 -0
  93. dotscope/sessions.py +2 -0
  94. dotscope/storage/.scope +64 -0
  95. dotscope/storage/__init__.py +1 -0
  96. dotscope/storage/cache.py +114 -0
  97. dotscope/storage/claude_hooks.py +119 -0
  98. dotscope/storage/git_hooks.py +277 -0
  99. dotscope/storage/incremental_state.py +61 -0
  100. dotscope/storage/mcp_config.py +98 -0
  101. dotscope/storage/near_miss.py +183 -0
  102. dotscope/storage/onboarding.py +150 -0
  103. dotscope/storage/session_manager.py +195 -0
  104. dotscope/storage/timing.py +84 -0
  105. dotscope/timing.py +2 -0
  106. dotscope/tokens.py +53 -0
  107. dotscope/utility.py +123 -0
  108. dotscope/virtual.py +3 -0
  109. dotscope/visibility.py +664 -0
  110. dotscope-0.1.0.dist-info/METADATA +50 -0
  111. dotscope-0.1.0.dist-info/RECORD +114 -0
  112. dotscope-0.1.0.dist-info/WHEEL +4 -0
  113. dotscope-0.1.0.dist-info/entry_points.txt +3 -0
  114. dotscope-0.1.0.dist-info/licenses/LICENSE +21 -0
dotscope/ingest.py ADDED
@@ -0,0 +1,858 @@
1
+ """Ingest: reverse-engineer .scope files from an existing codebase.
2
+
3
+ Orchestrates graph analysis, git history mining, and doc absorption
4
+ to produce complete .scope files for every detected module boundary.
5
+
6
+ This is how dotscope enters any codebase — not by asking humans to write
7
+ .scope files, but by inferring them from signals already in the code.
8
+ """
9
+
10
+
11
+ import os
12
+ import sys
13
+ from pathlib import Path
14
+ from typing import List, Optional, Set, Tuple
15
+
16
+ from .absorber import AbsorptionResult, absorb_docs
17
+ from .context import parse_context
18
+ from .graph import DependencyGraph, ModuleBoundary, build_graph, transitive_dependents
19
+ from .history import HistoryAnalysis, analyze_history
20
+ from .models.core import ScopeConfig, ScopesIndex, ScopeEntry
21
+ from .models.passes import IngestPlan, PlannedScope # noqa: F401
22
+ from .models.state import BacktestReport
23
+ from .parser import serialize_scope
24
+ from .tokens import estimate_scope_tokens
25
+
26
+
27
def ingest(
    root: str,
    mine_history: bool = True,
    absorb: bool = True,
    synthesize: bool = True,
    max_commits: int = 500,
    dry_run: bool = False,
    quiet: bool = False,
    voice_override: Optional[str] = None,
) -> IngestPlan:
    """Ingest a codebase and produce .scope files.

    Pipeline:
        1. Build dependency graph → detect module boundaries
        2. Mine git history → change coupling, hotspots, implicit contracts
        3. Absorb existing docs → README, docstrings, signal comments
           (3b: convention discovery, 3c: voice discovery; both only when
           the graph exposes API data)
        4. Synthesize .scope files → combine all signals into scope configs
           (4b: append virtual, cross-cutting scopes)
        5. Backtest the planned scopes against git history and auto-correct
        6. Optionally write files and caches to disk

    Args:
        root: Repository root (converted to an absolute path).
        mine_history: Whether to analyze git history. Also gates backtesting.
        absorb: Whether to absorb existing documentation.
        synthesize: Whether to use LLM for context synthesis (falls back to
            template). Forwarded to ``synthesize_scope`` as ``use_llm``.
        max_commits: Max git commits to analyze; backtesting uses at most 50.
        dry_run: If True, plan but don't write files, conventions, voice
            config, or caches.
        quiet: Passed to ``ProgressEmitter`` to control progress output.
        voice_override: Passed to ``detect_codebase_maturity``.

    Returns:
        The populated ``IngestPlan`` (returned for dry runs as well).
    """
    root = os.path.abspath(root)
    plan = IngestPlan(root=root)

    from .progress import ProgressEmitter
    progress = ProgressEmitter(quiet=quiet)

    # Step 1: Dependency graph
    progress.start("building dependency graph")
    graph = build_graph(root)
    plan.graph = graph
    plan.total_repo_files = len(graph.files)
    # Token estimate is summed one file at a time over every file in the graph.
    plan.total_repo_tokens = sum(
        estimate_scope_tokens([os.path.join(root, p)])
        for p in graph.files
    )
    from .graph import format_graph_summary
    plan.graph_summary = format_graph_summary(graph)
    edge_count = sum(len(n.imports) for n in graph.files.values())
    progress.finish(f"{len(graph.files)} files, {edge_count} edges, {len(graph.modules)} modules")

    # Step 2: Git history
    # Start from an empty analysis so later steps can read `history`
    # unconditionally even when mining is disabled.
    history = HistoryAnalysis()
    if mine_history:
        progress.start(f"mining git history ({max_commits} commits)")
        history = analyze_history(root, max_commits=max_commits)
        from .history import format_history_summary
        plan.history_summary = format_history_summary(history)
        contracts = len(history.implicit_contracts)
        progress.finish(f"{history.commits_analyzed} commits, {contracts} contracts")
    else:
        progress.skip("mining git history", "disabled")
    plan.history = history

    # Step 3: Doc absorption (with AST data if available)
    docs = AbsorptionResult()
    if absorb:
        progress.start("absorbing documentation")
        docs = absorb_docs(root, apis=graph.apis if graph.apis else None)
        progress.finish(f"{len(docs.fragments)} fragments")
    else:
        progress.skip("absorbing documentation", "disabled")

    # Step 3b: Discover conventions from structural patterns
    if graph.apis:
        progress.start("discovering conventions")
        from .passes.convention_discovery import discover_conventions
        from .passes.convention_parser import parse_conventions
        from .passes.convention_compliance import compute_compliance
        discovered = discover_conventions(graph.apis, graph, history)
        if discovered:
            nodes = parse_conventions(graph.apis, discovered)
            for conv in discovered:
                conv.compliance = compute_compliance(conv, nodes, graph.apis)
            # Only conventions with at least 50% compliance are kept.
            viable = [c for c in discovered if c.compliance >= 0.5]
            plan.discovered_conventions = viable
            if viable and not dry_run:
                from .intent import save_conventions
                save_conventions(root, viable)
            progress.finish(f"{len(viable) if discovered else 0} patterns")
        else:
            progress.finish("0 patterns")

    # Step 3c: Voice discovery
    if graph.apis:
        progress.start("discovering voice")
        from .passes.voice_discovery import detect_codebase_maturity, discover_voice
        from .passes.voice_defaults import prescriptive_defaults
        maturity = detect_codebase_maturity(graph.apis, history, voice_override)
        # "new" codebases get prescriptive defaults; anything else is
        # discovered from the existing code.
        if maturity == "new":
            discovered_voice = prescriptive_defaults()
        else:
            discovered_voice = discover_voice(graph.apis, root)
        if not dry_run:
            from .intent import save_voice_config
            save_voice_config(root, discovered_voice)
        progress.finish(f"{maturity} mode")

    # Step 4: Synthesize scope files
    progress.start("generating scopes")
    for module in graph.modules:
        # synthesize_scope returns None when the module already has a .scope.
        planned = synthesize_scope(module, graph, history, docs, root, synthesize)
        if planned:
            plan.scopes.append(planned)

    # Step 4b: Detect virtual (cross-cutting) scopes
    from .virtual import detect_virtual_scopes
    virtual_scopes = detect_virtual_scopes(graph)
    plan.virtual_scopes = virtual_scopes
    for vs in virtual_scopes:
        # The virtual directory name is carved out of the description: the
        # text before the first "(", then the part after the last ":".
        plan.scopes.append(PlannedScope(
            directory=f"virtual/{vs.description.split('(')[0].strip().split(':')[-1].strip()}",
            config=vs,
            confidence=0.7,
            signals=["graph: cross-cutting hub detection"],
        ))
    real_count = len([s for s in plan.scopes if not s.directory.startswith("virtual/")])
    virtual_count = len(virtual_scopes)
    progress.finish(f"{real_count} scopes, {virtual_count} virtual")

    # Step 5: Backtest against git history and auto-correct
    if mine_history and plan.scopes:
        progress.start(f"backtesting ({min(max_commits, 50)} commits)")
        from .backtest import backtest_scopes, auto_correct_scope, format_backtest_report

        configs = [ps.config for ps in plan.scopes]
        report = backtest_scopes(root, configs, n_commits=min(max_commits, 50))

        # Auto-correct: up to 2 rounds
        for correction_round in range(2):
            any_corrected = False
            # Indexing plan.scopes by position assumes report.results is in
            # the same order as `configs` -- TODO confirm in backtest_scopes.
            for i, result in enumerate(report.results):
                if result.recall < 1.0 and result.missing_includes:
                    updated, changed = auto_correct_scope(
                        plan.scopes[i].config, result, root
                    )
                    if changed:
                        plan.scopes[i].config = updated
                        plan.scopes[i].signals.append(
                            f"backtest: auto-corrected {len(result.missing_includes)} missing include(s)"
                        )
                        any_corrected = True

            if not any_corrected:
                break

            # Re-run backtest after corrections
            configs = [ps.config for ps in plan.scopes]
            report = backtest_scopes(root, configs, n_commits=min(max_commits, 50))

        plan.backtest_summary = format_backtest_report(report)
        plan.backtest_report = report
        progress.finish(f"{report.overall_recall:.0%} recall")
    elif not mine_history:
        progress.skip("backtesting", "no history")

    # Build .scopes index
    plan.index = _build_index(plan.scopes, plan.total_repo_tokens)

    # Step 6: Write to disk
    if not dry_run:
        _write_scopes(plan)
        # Cache structured data for MCP server
        from .cache import cache_ingest_data
        cache_ingest_data(root, history=plan.history, graph=plan.graph)
        # Cache invariants for enforcement
        _cache_invariants(root, plan.history)
        # Reset incremental state + remove needs_full_ingest marker
        # Best-effort: any failure here is deliberately ignored so a
        # successful ingest is not reported as failed.
        try:
            from .storage.incremental_state import reset_incremental_state
            reset_incremental_state(root)
            marker = os.path.join(root, ".dotscope", "needs_full_ingest")
            if os.path.exists(marker):
                os.remove(marker)
        except Exception:
            pass

    return plan
211
+
212
+
213
def synthesize_scope(
    module: ModuleBoundary,
    graph: DependencyGraph,
    history: HistoryAnalysis,
    docs: AbsorptionResult,
    root: str,
    use_llm: bool,
) -> Optional[PlannedScope]:
    """Synthesize a single .scope file from all available signals.

    Args:
        module: Module boundary to describe; its ``directory``, ``files``,
            ``cohesion``, ``external_deps`` and ``depended_on_by`` are read.
        graph: Dependency graph used for import and transitive lookups.
        history: Git history analysis (couplings, contracts, stability,
            recent summaries).
        docs: Absorbed documentation fragments.
        root: Repository root used to build on-disk paths.
        use_llm: Accepted for interface compatibility; not referenced in
            this function body.

    Returns:
        A ``PlannedScope`` for the module, or ``None`` if the module
        directory already contains a ``.scope`` file.
    """
    directory = module.directory
    scope_path = os.path.join(root, directory, ".scope")

    # Skip if .scope already exists
    if os.path.exists(scope_path):
        return None

    # Human-readable provenance notes explaining where each piece came from.
    signals = []

    # --- Description ---
    file_count = len(module.files)
    # Detect primary language
    langs = {}
    for f in module.files:
        ext = os.path.splitext(f)[1]
        langs[ext] = langs.get(ext, 0) + 1
    # Most frequent extension wins; empty string when the module has no files.
    primary_ext = max(langs, key=langs.get) if langs else ""
    lang_names = {
        ".py": "Python", ".js": "JavaScript", ".ts": "TypeScript",
        ".go": "Go", ".rs": "Rust", ".rb": "Ruby", ".java": "Java",
    }
    lang = lang_names.get(primary_ext, "")

    description = f"{directory} module"
    if lang:
        description = f"{directory} -- {lang} module ({file_count} files)"

    signals.append(f"graph: {file_count} files, cohesion {module.cohesion:.0%}")

    # --- Includes ---
    includes = [f"{directory}/"]

    # Add cross-module dependencies detected from imports
    for dep in module.external_deps:
        dep_dir = os.path.join(root, dep)
        if os.path.isdir(dep_dir):
            # Find specific files imported, not the whole directory
            imported_files = _find_cross_module_imports(module, dep, graph)
            for imp_file in imported_files:
                if imp_file not in includes:
                    includes.append(imp_file)

    # Add change-coupled files from other modules
    for coupling in history.change_couplings:
        for f in [coupling.file_a, coupling.file_b]:
            if f.startswith(directory + "/"):
                # `other` is the opposite end of the coupling pair.
                other = coupling.file_b if f == coupling.file_a else coupling.file_a
                # Only strong (>= 70%) couplings that cross the module
                # boundary are pulled in.
                if not other.startswith(directory + "/") and coupling.coupling_strength >= 0.7:
                    if other not in includes:
                        includes.append(other)
                        signals.append(f"history: {other} coupled at {coupling.coupling_strength:.0%}")

    # --- Excludes ---
    excludes = _default_excludes(directory, module.files)

    # --- Context (priority: contracts → stability → docs → deps → recent → transitive) ---
    context_parts = []

    # 1. Implicit contracts FIRST — the thing nobody documented
    relevant_contracts = [
        ic for ic in history.implicit_contracts
        if ic.trigger_file.startswith(directory + "/")
        or ic.coupled_file.startswith(directory + "/")
    ]
    if relevant_contracts:
        context_parts.append("## Implicit Contracts (from git history)")
        # At most five contracts are listed; the signal reports the full count.
        for ic in relevant_contracts[:5]:
            context_parts.append(f"- {ic.description}")
        signals.append(f"history: {len(relevant_contracts)} implicit contracts")

    # 2. Stability profiles — which files are fragile
    stability_lines = []
    for f in module.files:
        fh = history.file_histories.get(f)
        # Files with fewer than 3 commits are left out.
        if fh and fh.stability and fh.commit_count >= 3:
            lines_info = f", {fh.total_lines_changed} lines" if fh.total_lines_changed else ""
            stability_lines.append(
                f"- {os.path.basename(f)}: {fh.stability} ({fh.commit_count} commits{lines_info})"
            )
    if stability_lines:
        context_parts.append("## Stability")
        context_parts.extend(stability_lines[:10])

    # 3. Absorbed docs — READMEs, docstrings, signal comments
    doc_context = docs.synthesize_context(directory, max_chars=1500)
    if doc_context:
        context_parts.append(doc_context)
        signals.append(f"docs: absorbed {len(docs.for_module(directory))} fragments")

    # 4. Dependencies + structural
    if module.external_deps:
        context_parts.append("## Dependencies")
        context_parts.append(f"This module imports from: {', '.join(module.external_deps)}")
    if module.depended_on_by:
        context_parts.append(f"This module is used by: {', '.join(module.depended_on_by)}")
        context_parts.append("Changes here may affect downstream consumers.")

    # 5. Recent changes
    recent = history.recent_summaries.get(directory, [])
    if recent:
        context_parts.append("## Recent Changes")
        for msg in recent[:5]:
            context_parts.append(f"- {msg}")

    # 6. Transitive dependency chain (if deeper than 1 hop)
    from .graph import transitive_deps as _transitive_deps
    deep_deps: Set[str] = set()
    for f in module.files:
        for dep in _transitive_deps(graph, f):
            dep_parts = dep.split("/")
            # Keep only dependencies whose first path segment differs from
            # this module's directory.
            if len(dep_parts) > 1 and dep_parts[0] != directory:
                deep_deps.add(dep)
    if deep_deps:
        # Anything living under a directly imported module counts as direct;
        # the remainder is reachable only through intermediaries.
        direct: Set[str] = set()
        for dep in module.external_deps:
            direct.update(d for d in deep_deps if d.startswith(dep + "/"))
        transitive_only = deep_deps - direct
        if transitive_only:
            context_parts.append("## Transitive Dependencies")
            for dep in sorted(transitive_only)[:5]:
                context_parts.append(f"- {dep} (indirect)")

    # 7. NEVER TODO. If empty, synthesize from graph structure.
    if not context_parts:
        context_parts.append(
            f"{directory} module -- {file_count} files, "
            f"cohesion {module.cohesion:.0%}, "
            f"{len(module.external_deps)} external dependencies."
        )

    context_str = "\n".join(context_parts)
    context = parse_context(context_str)

    # --- Related ---
    related = []
    for dep in module.external_deps:
        scope_candidate = f"{dep}/.scope"
        related.append(scope_candidate)
    for dep_by in module.depended_on_by:
        scope_candidate = f"{dep_by}/.scope"
        if scope_candidate not in related:
            related.append(scope_candidate)

    # --- Tags ---
    tags = [directory.lower()]
    if module.external_deps:
        # Only the first three external dependencies become tags.
        tags.extend(d.lower() for d in module.external_deps[:3])

    # --- Token estimate ---
    full_paths = [os.path.join(root, f) for f in module.files]
    token_est = estimate_scope_tokens(full_paths)

    # --- Confidence ---
    # Cohesion is the baseline; docs and contracts each add 0.1, capped at 1.0.
    confidence = module.cohesion
    if doc_context:
        confidence = min(confidence + 0.1, 1.0)
    if relevant_contracts:
        confidence = min(confidence + 0.1, 1.0)

    config = ScopeConfig(
        path=scope_path,
        description=description,
        includes=includes,
        excludes=excludes,
        context=context,
        related=related,
        owners=[],
        tags=tags,
        tokens_estimate=token_est,
    )

    return PlannedScope(
        directory=directory,
        config=config,
        confidence=confidence,
        signals=signals,
    )
399
+
400
+
401
def _find_cross_module_imports(
    module: ModuleBoundary, dep_module: str, graph: DependencyGraph
) -> List[str]:
    """Return the sorted, de-duplicated imports of ``module`` that live under ``dep_module``.

    Files of ``module`` that have no node in ``graph.files`` are skipped.
    """
    prefix = dep_module + "/"
    known_nodes = (graph.files.get(path) for path in module.files)
    hits = {
        target
        for node in known_nodes
        if node
        for target in node.imports
        if target.startswith(prefix)
    }
    return sorted(hits)
414
+
415
+
416
def _default_excludes(directory: str, files: List[str]) -> List[str]:
    """Generate sensible excludes for a module.

    Args:
        directory: Module directory, used as the prefix of generated patterns.
        files: Module file paths of the form ``directory/subdir/file``; the
            second path segment is treated as the sub-directory name.

    Returns:
        The two common Python cache patterns, followed by one
        ``directory/subdir/`` entry per fixture-, mock- or migration-style
        sub-directory, in sorted order.
    """
    # Common patterns
    excludes = [f"{directory}/__pycache__/", "*.pyc"]

    # Sub-directory names (compared case-insensitively) that hold test data
    # or migrations rather than module logic.
    noise_dirs = {
        "fixtures", "fixture", "testdata", "test_data", "mocks",
        "migrations", "migrate",
    }

    # Detect test/fixture/migration directories
    subdirs = set()
    for f in files:
        parts = f.split("/")
        if len(parts) > 2:  # directory/subdir/file
            subdirs.add(parts[1])

    # Sort before emitting: set iteration order for strings changes between
    # interpreter runs, which would make the generated excludes unstable.
    excludes.extend(
        f"{directory}/{subdir}/"
        for subdir in sorted(subdirs)
        if subdir.lower() in noise_dirs
    )
    return excludes
439
+
440
+
441
def _scope_entry(planned: PlannedScope) -> ScopeEntry:
    """Build the ``.scopes`` index entry for one planned scope.

    Keywords start with the scope's tags and are extended with the words of
    its description (lower-cased, surrounding punctuation stripped, longer
    than two characters, no duplicates), capped at 15 in total.
    """
    keywords = list(planned.config.tags)
    # Add words from description
    for word in planned.config.description.split():
        word = word.lower().strip("—()-,.")
        if len(word) > 2 and word not in keywords:
            keywords.append(word)

    return ScopeEntry(
        name=planned.directory,
        path=f"{planned.directory}/.scope",
        keywords=keywords[:15],  # Cap at 15
        description=planned.config.description,
    )


def _build_index(
    scopes: List[PlannedScope], total_repo_tokens: int = 0,
) -> ScopesIndex:
    """Build a .scopes index from planned scopes.

    Args:
        scopes: Planned scopes; each is keyed by its ``directory``. If two
            share a directory, the later one wins.
        total_repo_tokens: Whole-repository token estimate stored on the index.

    Returns:
        A version-1 ``ScopesIndex`` with the default resolution settings.
    """
    return ScopesIndex(
        version=1,
        scopes={ps.directory: _scope_entry(ps) for ps in scopes},
        defaults={"max_tokens": 8000, "include_related": False},
        total_repo_tokens=total_repo_tokens,
    )


def append_to_index(root: str, planned: PlannedScope) -> None:
    """Append a single scope entry to the .scopes index on disk.

    Loads the existing index under ``root`` (creating an empty version-1
    index when none is found), adds or replaces the entry keyed by
    ``planned.directory``, and rewrites ``<root>/.scopes``.
    """
    from .discovery import load_index
    index = load_index(root)
    if index is None:
        index = ScopesIndex(version=1, scopes={}, defaults={"max_tokens": 8000, "include_related": False})

    index.scopes[planned.directory] = _scope_entry(planned)

    index_path = os.path.join(root, ".scopes")
    # Serialize before opening so a serialization error cannot truncate
    # an existing index file.
    content = _serialize_index(index)
    with open(index_path, "w", encoding="utf-8") as f:
        f.write(content)
495
+
496
+
497
def _write_scopes(plan: IngestPlan) -> None:
    """Write all planned .scope files and the .scopes index to disk.

    Existing files are never overwritten: a planned scope whose ``.scope``
    already exists is skipped, and the ``.scopes`` index is written only
    when the plan has an index and no index file exists yet. Progress
    reporting is handled by the caller.
    """
    for ps in plan.scopes:
        scope_path = os.path.join(plan.root, ps.directory, ".scope")
        # Don't overwrite existing
        if os.path.exists(scope_path):
            continue

        os.makedirs(os.path.dirname(scope_path), exist_ok=True)
        # Serialize first so a failure cannot leave an empty .scope behind,
        # which later runs would treat as "already exists".
        content = serialize_scope(ps.config)
        with open(scope_path, "w", encoding="utf-8") as f:
            f.write(content)

    # Write .scopes index (only if it doesn't exist)
    index_path = os.path.join(plan.root, ".scopes")
    if plan.index and not os.path.exists(index_path):
        content = _serialize_index(plan.index)
        with open(index_path, "w", encoding="utf-8") as f:
            f.write(content)
522
+
523
+
524
def _serialize_index(index: ScopesIndex) -> str:
    """Serialize a ScopesIndex to .scopes YAML format.

    Layout: ``version``, optional ``total_repo_tokens`` (omitted when
    falsy), a ``scopes`` mapping sorted by scope name with each scope's
    ``path`` and ``keywords`` nested beneath it, and a ``defaults`` mapping.
    Booleans in ``defaults`` are written as ``true``/``false``.

    Returns:
        The document text, terminated by a single newline.
    """
    lines = [f"version: {index.version}"]
    if index.total_repo_tokens:
        lines.append(f"total_repo_tokens: {index.total_repo_tokens}")
    lines.extend(["", "scopes:"])

    for name, entry in sorted(index.scopes.items()):
        # Fields must be indented deeper than the scope name, otherwise they
        # parse as siblings of the scope instead of belonging to it.
        lines.append(f"  {name}:")
        lines.append(f"    path: {entry.path}")
        kw_str = ", ".join(entry.keywords)
        lines.append(f"    keywords: [{kw_str}]")

    lines.append("")
    lines.append("defaults:")
    for k, v in index.defaults.items():
        # bool must be checked explicitly: str(True) is "True", not YAML "true".
        if isinstance(v, bool):
            lines.append(f"  {k}: {'true' if v else 'false'}")
        else:
            lines.append(f"  {k}: {v}")

    return "\n".join(lines) + "\n"
546
+
547
+
548
def _use_unicode() -> bool:
    """Check if stdout can handle Unicode (emoji, box-drawing).

    Returns:
        True when ``sys.stdout`` reports a UTF-8/16/32 (or UTF-8-SIG)
        encoding, ignoring case, hyphens and underscores; False otherwise,
        including when stdout has no ``encoding`` attribute or it is empty.
    """
    enc = getattr(sys.stdout, "encoding", None) or ""
    # Normalize spelling variants such as "UTF-8", "utf8" and "utf_8".
    normalized = enc.lower().replace("-", "").replace("_", "")
    return normalized in ("utf8", "utf16", "utf32", "utf8sig")
555
+
556
+
557
# Glyph sets: Unicode vs ASCII-safe fallbacks
# Both tables define the same keys so callers can index either one
# interchangeably.
_GLYPHS_UNICODE = {
    "discoveries": "\u26a1 Discoveries",  # section heading
    "validation": "\U0001f4ca Validation",  # section heading
    "created": "\U0001f4c1 Created",  # section heading
    "bar_full": "\u2588",  # filled cell of a recall bar
    "bar_empty": "\u2591",  # empty cell of a recall bar
    "arrow": "\u2192",
    "dash": "\u2014",
    "attention": "\u2190 needs attention",  # suffix for low-recall scopes
}
_GLYPHS_ASCII = {
    "discoveries": ">> Discoveries",
    "validation": ">> Validation",
    "created": ">> Created",
    "bar_full": "#",
    "bar_empty": ".",
    "arrow": "->",
    "dash": "--",
    "attention": "<- needs attention",
}


def _glyphs() -> dict:
    """Return the appropriate glyph set for the current terminal.

    Chooses the Unicode table when ``_use_unicode()`` is true and the
    ASCII table otherwise. The module-level dict itself is returned, not
    a copy.
    """
    return _GLYPHS_UNICODE if _use_unicode() else _GLYPHS_ASCII
583
+
584
+
585
def format_ingest_report(plan: IngestPlan) -> str:
    """Render the ingest report: header, discoveries, validation, files created.

    The discoveries and validation sections appear only when their extractor
    returns at least one line. Virtual scopes are left out of the module and
    file counts.
    """
    glyphs = _glyphs()
    real_scopes = [
        scope for scope in plan.scopes
        if not scope.directory.startswith("virtual/")
    ]

    # --- Header ---
    report = [
        f"dotscope scanned {plan.total_repo_files} files "
        f"across {len(real_scopes)} modules.",
        "",
    ]

    # --- Sections 1 and 2: Discoveries, then Validation ---
    sections = (
        ("discoveries", _extract_discoveries),
        ("validation", _extract_validation),
    )
    for heading_key, extract in sections:
        body = extract(plan, glyphs)
        if body:
            report.append(glyphs[heading_key])
            report.append("")
            report.extend(body)

    # --- Section 3: Files created ---
    report += [
        f"{glyphs['created']} {len(real_scopes)} .scope files + .scopes index",
        "",
        " Try it: dotscope resolve <module>",
        " See it: dotscope resolve <module> --json --budget 4000",
        " Trust it: dotscope backtest --commits 500",
    ]

    return "\n".join(report)
621
+
622
+
623
+ # ---------------------------------------------------------------------------
624
+ # Discovery extraction helpers
625
+ # ---------------------------------------------------------------------------
626
+
627
+
628
def _is_cross_module(file_a: str, file_b: str) -> bool:
    """Report whether two paths sit under different, non-empty top-level directories.

    A path without a "/" has no top-level directory and never counts as
    cross-module.
    """
    def top_level(path: str) -> str:
        head, slash, _rest = path.partition("/")
        return head if slash else ""

    first, second = top_level(file_a), top_level(file_b)
    if not first or not second:
        return False
    return first != second
633
+
634
+
635
def _find_hub_discoveries(
    graph: DependencyGraph,
) -> List[Tuple[str, int, int, int]]:
    """Find files with high import fan-in across multiple modules.

    A file qualifies when it has at least 3 importers and those importers
    span at least 2 distinct top-level directories.

    Returns: [(path, importer_count, directory_count, blast_radius)],
    ordered by importer_count, highest first.
    """
    hubs = []
    for path, node in graph.files.items():
        importers = node.imported_by
        if not importers:
            continue

        # Top-level directory of every importer that is not at the repo root.
        importer_parts = (Path(importer).parts for importer in importers)
        top_dirs: Set[str] = {
            parts[0] for parts in importer_parts if len(parts) > 1
        }

        if len(importers) < 3 or len(top_dirs) < 2:
            continue

        dependents = transitive_dependents(graph, path)
        blast_radius = len(dependents) + 1  # +1 for the file itself
        hubs.append((path, len(importers), len(top_dirs), blast_radius))

    hubs.sort(key=lambda hub: hub[1], reverse=True)
    return hubs
663
+
664
+
665
# Directories an engineer expects to be stable
_EXPECTED_STABLE = {
    "config", "configs", "settings", "constants",
    "migrations", "fixtures", "static",
}


def _find_volatility_surprises(
    history: HistoryAnalysis,
) -> List[Tuple[str, "FileHistory"]]:
    """Files classified volatile that live in directories expected to be stable.

    A file counts when its stability is ``"volatile"`` and its first path
    segment (case-insensitive) is in ``_EXPECTED_STABLE``. The first entry
    of ``history.hotspots`` is added as well when it has at least 10
    commits and is not already listed.

    Returns:
        ``(path, file_history)`` pairs ordered by ``total_lines_changed``,
        largest first.
    """
    # No runtime import of FileHistory is needed: it appears only in the
    # string annotation above, which is not evaluated here.
    surprises = []
    for path, fh in history.file_histories.items():
        if fh.stability != "volatile":
            continue
        parts = path.split("/")
        if len(parts) > 1 and parts[0].lower() in _EXPECTED_STABLE:
            surprises.append((path, fh))

    # Also include the repo's most-changed file if high churn
    if history.hotspots:
        top_path, _top_churn = history.hotspots[0]
        top_fh = history.file_histories.get(top_path)
        if top_fh and top_fh.commit_count >= 10:
            if not any(p == top_path for p, _ in surprises):
                # Inserted first so it precedes equal-churn entries after the
                # stable sort below.
                surprises.insert(0, (top_path, top_fh))

    surprises.sort(key=lambda x: -x[1].total_lines_changed)
    return surprises
696
+
697
+
698
def _extract_discoveries(plan: IngestPlan, g: Optional[dict] = None) -> List[str]:
    """Extract surprising findings from history, graph, and docs.

    Builds up to three report sections from ``plan.history`` and
    ``plan.graph``: cross-module implicit contracts, cross-cutting import
    hubs, and volatility surprises.

    Args:
        plan: Ingest plan; ``history``, ``graph`` and ``scopes`` are read.
        g: Glyph table to render with; defaults to ``_glyphs()``.

    Returns:
        Report lines (each non-empty section ends with a blank line), or an
        empty list when nothing qualifies.
    """
    if g is None:
        g = _glyphs()
    lines: List[str] = []
    history = plan.history
    graph = plan.graph

    # --- Hidden dependencies (cross-module implicit contracts) ---
    if history and history.implicit_contracts:
        # Keep contracts that span two top-level directories with at least
        # 65% confidence.
        cross_module = [
            ic for ic in history.implicit_contracts
            if _is_cross_module(ic.trigger_file, ic.coupled_file)
            and ic.confidence >= 0.65
        ]
        if cross_module:
            lines.append(
                f" Hidden dependencies "
                f"(from {history.commits_analyzed} commits of git history):"
            )
            for ic in cross_module[:5]:
                trigger = os.path.basename(ic.trigger_file)
                coupled = os.path.basename(ic.coupled_file)
                # Identical basenames would be ambiguous, so fall back to
                # the full paths.
                if trigger == coupled:
                    trigger = ic.trigger_file
                    coupled = ic.coupled_file
                lines.append(
                    f" {trigger} {g['arrow']} {coupled}"
                    f" {ic.confidence:.0%} co-change, undocumented"
                )
            lines.append("")

    # --- Cross-cutting hubs (from graph analysis) ---
    if graph:
        hubs = _find_hub_discoveries(graph)
        if hubs:
            # Only the top three hubs are reported.
            for hub_path, importer_count, dir_count, blast_radius in hubs[:3]:
                lines.append(" Cross-cutting hub:")
                lines.append(
                    f" {hub_path} is imported by "
                    f"{importer_count} files across {dir_count} modules"
                )
                # Mention the transitive reach only when it exceeds the
                # direct importer count.
                if blast_radius > importer_count:
                    lines.append(
                        f" A change here affects "
                        f"{blast_radius} files transitively"
                    )
                lines.append("")

    # --- Volatility surprises ---
    if history and history.file_histories:
        surprises = _find_volatility_surprises(history)
        if surprises:
            lines.append(" Volatility surprise:")
            for path, fh in surprises[:3]:
                lines.append(
                    f" {path} {g['dash']} {fh.commit_count} commits, "
                    f"{fh.total_lines_changed} lines changed"
                )
            # Annotate if top file has no scope covering it
            if surprises:
                top_path = surprises[0][0]
                has_scope = any(
                    top_path.startswith(s.directory + "/")
                    for s in plan.scopes
                )
                if not has_scope:
                    lines.append(
                        " Most changed file in the repo. "
                        "No .scope context exists for it."
                    )
            lines.append("")

    return lines
772
+
773
+
774
def _extract_validation(plan: IngestPlan, g: Optional[dict] = None) -> List[str]:
    """Summarize backtest results: overall recall, token reduction, per-scope bars.

    Returns an empty list when the plan has no backtest report or the report
    covers zero commits. ``g`` is the glyph table; ``_glyphs()`` is used when
    it is None.
    """
    glyphs = _glyphs() if g is None else g
    report = plan.backtest_report

    if not report or report.total_commits == 0:
        return []

    out: List[str] = [
        f" Backtested against {report.total_commits} recent commits:",
        f" Overall recall: {report.overall_recall:.0%} {glyphs['dash']} "
        f"scopes would have given agents the right files",
    ]

    # Token reduction ratio: average real-scope size versus the whole repo.
    concrete = [
        scope for scope in plan.scopes
        if not scope.directory.startswith("virtual/")
    ]
    if plan.total_repo_tokens > 0 and concrete:
        token_total = sum(scope.config.tokens_estimate or 0 for scope in concrete)
        mean_tokens = token_total / max(len(concrete), 1)
        saved_pct = (1 - mean_tokens / plan.total_repo_tokens) * 100
        out.append(
            f" Token reduction: {saved_pct:.0f}% {glyphs['dash']} "
            f"from ~{plan.total_repo_tokens:,} to "
            f"~{int(mean_tokens):,} average per resolution"
        )

    out.append("")

    # Per-scope recall bars, ten cells wide; scopes with no commits are skipped.
    for entry in report.results:
        label = os.path.basename(os.path.dirname(entry.scope_path))
        if entry.total_commits == 0:
            continue
        cells = int(entry.recall * 10)
        meter = glyphs["bar_full"] * cells + glyphs["bar_empty"] * (10 - cells)
        if entry.recall < 0.8:
            flag = f" {glyphs['attention']}"
        else:
            flag = ""
        out.append(
            f" {label:<12} {meter} {entry.recall:.0%} recall{flag}"
        )

    return out
823
+
824
+
825
def _cache_invariants(root: str, history: Optional[HistoryAnalysis]) -> None:
    """Write ``<root>/.dotscope/invariants.json`` from the history analysis.

    The file holds the implicit contracts, an empty ``function_co_changes``
    placeholder, and a per-file stability classification with commit count.
    Nothing is written when ``history`` is falsy.
    """
    if not history:
        return

    import json

    cache_dir = os.path.join(root, ".dotscope")
    os.makedirs(cache_dir, exist_ok=True)

    payload = {
        "contracts": [
            {
                "trigger_file": contract.trigger_file,
                "coupled_file": contract.coupled_file,
                "confidence": contract.confidence,
                "description": contract.description,
            }
            for contract in history.implicit_contracts
        ],
        "function_co_changes": {},  # Populated when function-level data available
        "file_stabilities": {
            file_path: {
                "classification": record.stability,
                "commit_count": record.commit_count,
            }
            for file_path, record in history.file_histories.items()
        },
    }

    target = os.path.join(cache_dir, "invariants.json")
    with open(target, "w", encoding="utf-8") as out:
        json.dump(payload, out, indent=2)