livepilot 1.23.3 → 1.23.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. package/CHANGELOG.md +93 -0
  2. package/README.md +106 -8
  3. package/m4l_device/LivePilot_Analyzer.amxd +0 -0
  4. package/m4l_device/livepilot_bridge.js +1 -1
  5. package/mcp_server/__init__.py +1 -1
  6. package/mcp_server/atlas/cross_pack_chain.py +658 -0
  7. package/mcp_server/atlas/demo_story.py +700 -0
  8. package/mcp_server/atlas/extract_chain.py +786 -0
  9. package/mcp_server/atlas/macro_fingerprint.py +554 -0
  10. package/mcp_server/atlas/overlays.py +95 -3
  11. package/mcp_server/atlas/pack_aware_compose.py +1255 -0
  12. package/mcp_server/atlas/preset_resolver.py +238 -0
  13. package/mcp_server/atlas/tools.py +1001 -31
  14. package/mcp_server/atlas/transplant.py +1177 -0
  15. package/mcp_server/mix_engine/state_builder.py +44 -1
  16. package/mcp_server/runtime/capability_state.py +34 -3
  17. package/mcp_server/server.py +45 -24
  18. package/mcp_server/tools/agent_os.py +33 -9
  19. package/mcp_server/tools/analyzer.py +38 -7
  20. package/mcp_server/tools/browser.py +20 -1
  21. package/mcp_server/tools/devices.py +78 -11
  22. package/mcp_server/tools/perception.py +5 -1
  23. package/mcp_server/tools/tracks.py +39 -2
  24. package/mcp_server/user_corpus/__init__.py +48 -0
  25. package/mcp_server/user_corpus/manifest.py +142 -0
  26. package/mcp_server/user_corpus/plugin_engine/__init__.py +39 -0
  27. package/mcp_server/user_corpus/plugin_engine/detector.py +579 -0
  28. package/mcp_server/user_corpus/plugin_engine/manual.py +347 -0
  29. package/mcp_server/user_corpus/plugin_engine/research.py +247 -0
  30. package/mcp_server/user_corpus/runner.py +261 -0
  31. package/mcp_server/user_corpus/scanner.py +115 -0
  32. package/mcp_server/user_corpus/scanners/__init__.py +18 -0
  33. package/mcp_server/user_corpus/scanners/adg.py +79 -0
  34. package/mcp_server/user_corpus/scanners/als.py +144 -0
  35. package/mcp_server/user_corpus/scanners/amxd.py +374 -0
  36. package/mcp_server/user_corpus/scanners/plugin_preset.py +202 -0
  37. package/mcp_server/user_corpus/tools.py +904 -0
  38. package/mcp_server/user_corpus/wizard.py +224 -0
  39. package/package.json +2 -2
  40. package/remote_script/LivePilot/__init__.py +1 -1
  41. package/remote_script/LivePilot/browser.py +7 -2
  42. package/requirements.txt +3 -3
  43. package/server.json +2 -2
@@ -0,0 +1,904 @@
1
+ """MCP tool wrappers for the user corpus builder + plugin knowledge engine.
2
+
3
+ Phase 1 tools (file scanner — see USER_CORPUS_GUIDE.md):
4
+ corpus_init — create ~/.livepilot/atlas-overlays/user/ + manifest.yaml
5
+ corpus_add_source — register a directory + scanner type
6
+ corpus_remove_source — unregister
7
+ corpus_scan — run scans (all sources, or one)
8
+ corpus_status — what's in the manifest + freshness
9
+ corpus_list_scanners — show available scanner types
10
+
11
+ Phase 2 tools (plugin knowledge engine — see PLUGIN_KNOWLEDGE_ENGINE.md):
12
+ corpus_detect_plugins — Phase 2.1+2.2: enumerate installed VST3/AU/VST2/AAX
13
+ corpus_canonicalize_plugins — dedupe inventory by vendor+name, prefer VST3
14
+ corpus_cluster_plugins — group by vendor → cluster manifest for efficient research
15
+ corpus_discover_manuals — Phase 2.3+2.4: find + extract local PDFs/HTML/etc.
16
+ corpus_research_targets — Phase 3: emit WebSearch task packet for the agent
17
+ corpus_emit_synthesis_briefs — Phase 4: emit sonnet-subagent brief for identity.yaml
18
+ corpus_trim_plugin_identity — slim a plugin's identity.yaml to lean overlay-required form
19
+ """
20
+
21
+ from __future__ import annotations
22
+
23
+ import json
24
+ import re
25
+ from pathlib import Path
26
+
27
+ from fastmcp import Context
28
+
29
+ from ..server import mcp
30
+ from .manifest import (
31
+ DEFAULT_MANIFEST_PATH,
32
+ DEFAULT_OUTPUT_ROOT,
33
+ Manifest,
34
+ Source,
35
+ init_default_manifest,
36
+ load_manifest,
37
+ save_manifest,
38
+ )
39
+ from .runner import run_scan
40
+ from .scanner import list_scanners
41
+
42
+ from .plugin_engine.detector import (
43
+ DetectedPlugin, default_plugin_dir, detect_installed_plugins,
44
+ )
45
+ from .plugin_engine.manual import (
46
+ discover_manuals_for_plugin, extract_manual_text,
47
+ )
48
+ from .plugin_engine.research import (
49
+ build_research_targets, build_synthesis_brief,
50
+ )
51
+
52
+
53
+ # ─── Vendor canonicalization helpers (used by corpus_canonicalize_plugins) ───
54
+ # Strips common vendor-suffix words ("DSP", "LLC", "GmbH", etc.) so different
55
+ # spellings of the same vendor ("Valhalla DSP, LLC" + "Valhalladsp") collapse
56
+ # to a stable canonical key.
57
+
58
+ _VENDOR_SUFFIX_WORDS = (
59
+ "dsp", "llc", "inc", "gmbh", "ltd", "sas", "sa", "srl", "sl", "co", "corp",
60
+ "corporation", "technologies", "technology", "sound", "sounds", "software",
61
+ "audio", "labs", "lab", "studios", "studio", "records", "productions",
62
+ "industries",
63
+ )
64
+
65
+
66
+ def _canon_vendor(vendor: str) -> str:
67
+ """Canonical vendor key: lowercase + alphanum-only + iteratively strip
68
+ suffix words at the END of the string (regardless of word boundary).
69
+
70
+ "Valhalla DSP, LLC" → "valhalla"
71
+ "Valhalladsp" → "valhalla"
72
+ "u-he" → "uhe"
73
+ """
74
+ if not vendor:
75
+ return "unknown"
76
+ cleaned = re.sub(r"[^a-z0-9]+", "", vendor.lower())
77
+ changed = True
78
+ while changed:
79
+ changed = False
80
+ for s in _VENDOR_SUFFIX_WORDS:
81
+ if cleaned.endswith(s) and len(cleaned) > len(s):
82
+ cleaned = cleaned[: -len(s)]
83
+ changed = True
84
+ break
85
+ return cleaned[:24] or "unknown"
86
+
87
+
88
+ def _vendor_score(v: str | None) -> float:
89
+ """Higher score = prettier vendor display string. Used to pick the best
90
+ representation across multi-format detections.
91
+
92
+ Prefers strings with proper case + spaces ("Valhalla DSP, LLC") over
93
+ bundle-id derivatives ("Valhalladsp")."""
94
+ if not v:
95
+ return -1
96
+ return (
97
+ sum(1 for c in v if c.isupper())
98
+ + v.count(" ") * 2
99
+ + v.count(",")
100
+ + len(v) * 0.05
101
+ )
102
+
103
+
104
+ # ─── corpus_init ─────────────────────────────────────────────────────────────
105
+
106
+
107
@mcp.tool()
def corpus_setup_wizard(ctx: Context) -> dict:
    """First-run setup — survey the user's filesystem for sensible scan candidates
    and return an approval packet for the agent to walk through with the user.

    Does NOT scan anything. Returns:
      - candidates: list of {category, path, file_count, sample_filenames,
        description, recommended_default}
      - plugin_detection_offer: separate prompt for installed-plugin detection
      - instructions: how the agent should proceed (ask each, then add approved)
      - do_not_scan: paths that require explicit per-folder opt-in (e.g. .als projects)

    Categories surfaced (when present on this machine):
      - user_library_racks — ~/Music/Ableton/User Library/Presets/*.adg
      - max_devices — ~/Documents/Max <N>/Max for Live Devices/*.amxd
      - plugin_presets — ~/Library/Audio/Presets/*.{aupreset,vstpreset,...}
      - samples_advisory — sample folders (scanner not yet implemented)

    Personal .als project folders are NEVER auto-suggested (privacy-sensitive).
    """
    # Imported lazily so the wizard module only loads when this tool runs.
    from .wizard import build_setup_proposal

    proposal = build_setup_proposal()
    return proposal
129
+
130
+
131
@mcp.tool()
def corpus_init(ctx: Context) -> dict:
    """Initialize the user-corpus output directory + manifest.yaml.

    Creates ``~/.livepilot/atlas-overlays/user/`` (if missing) and writes a
    default manifest if one doesn't already exist. Idempotent: safe to call
    multiple times — preserves an existing manifest.

    Returns
    -------
    {manifest_path, output_root, sources, scanners_available, created: bool}
    """
    # Record existence BEFORE init so "created" reflects what this call did.
    already_there = DEFAULT_MANIFEST_PATH.exists()
    manifest = init_default_manifest(DEFAULT_MANIFEST_PATH)
    next_step = (
        "Add sources with corpus_add_source(...) "
        "then run corpus_scan() to build the corpus."
    )
    return {
        "manifest_path": str(DEFAULT_MANIFEST_PATH),
        "output_root": str(manifest.output_root),
        "sources": [src.id for src in manifest.sources],
        "scanners_available": list_scanners(),
        "created": not already_there,
        "next_step": next_step,
    }
156
+
157
+
158
+ # ─── corpus_add_source ───────────────────────────────────────────────────────
159
+
160
+
161
@mcp.tool()
def corpus_add_source(
    ctx: Context,
    source_id: str,
    type: str,
    path: str,
    recursive: bool = True,
    # Fixed annotation: was `list = None`, which contradicts the None default
    # (and confuses schema derivation). `| None` is safe here — the file uses
    # `from __future__ import annotations`.
    exclude_globs: list | None = None,
) -> dict:
    """Register a new scan source in the user manifest.

    Parameters
    ----------
    source_id : unique short identifier, e.g. "my-projects". Used in
        entity_id slugs and the namespace (user.<source_id>).
    type : scanner type_id. Run corpus_list_scanners to see options.
        Built-ins: "als", "adg", "amxd", "plugin-preset".
    path : filesystem path to scan. May contain ``~``.
    recursive : descend into subdirectories. Default True.
    exclude_globs : list of glob patterns to skip (e.g. ["*Backup*"]).
    """
    # Validate scanner type first so the error can list the real options.
    if type not in list_scanners():
        return {
            "error": f"Unknown scanner type '{type}'.",
            "available_types": list_scanners(),
            "status": "error",
        }
    resolved = Path(path).expanduser()
    if not resolved.exists():
        return {
            "error": f"Path does not exist: {resolved}",
            "status": "error",
        }
    manifest = load_manifest(DEFAULT_MANIFEST_PATH)
    # Duplicate ids are rejected rather than silently redefined.
    if manifest.find_source(source_id):
        return {
            "error": f"Source id '{source_id}' already exists.",
            "hint": "Use corpus_remove_source first if you want to redefine it.",
            "status": "error",
        }
    src = Source(
        id=source_id,
        type=type,
        # Store the raw (possibly ~-prefixed) path; resolution happens at use.
        path=path,
        recursive=recursive,
        exclude_globs=exclude_globs or [],
    )
    manifest.add_source(src)
    save_manifest(manifest, DEFAULT_MANIFEST_PATH)
    return {
        "added": {
            "source_id": source_id,
            "type": type,
            "path": str(resolved),
            "recursive": recursive,
        },
        "manifest_sources": [s.id for s in manifest.sources],
        "next_step": "Run corpus_scan() to index, or corpus_scan(source_id=...) to scan only this source.",
    }
220
+
221
+
222
+ # ─── corpus_remove_source ────────────────────────────────────────────────────
223
+
224
+
225
@mcp.tool()
def corpus_remove_source(ctx: Context, source_id: str) -> dict:
    """Remove a source from the manifest.

    Does NOT delete previously-written sidecars under the output root — those
    persist until you remove them manually. This makes redefining a source
    safe: removed → add → scan, no data loss.
    """
    manifest = load_manifest(DEFAULT_MANIFEST_PATH)
    if not manifest.remove_source(source_id):
        return {
            "error": f"No source with id '{source_id}' in manifest.",
            "available_sources": [s.id for s in manifest.sources],
            "status": "error",
        }
    save_manifest(manifest, DEFAULT_MANIFEST_PATH)
    note = (
        "Sidecars under the output root are NOT auto-deleted. "
        "If you want a clean re-scan, remove the per-source subdirectory manually."
    )
    return {
        "removed": source_id,
        "remaining_sources": [s.id for s in manifest.sources],
        "note": note,
    }
250
+
251
+
252
+ # ─── corpus_scan ─────────────────────────────────────────────────────────────
253
+
254
+
255
@mcp.tool()
def corpus_scan(ctx: Context, source_id: str = "") -> dict:
    """Run scans on the user corpus.

    Parameters
    ----------
    source_id : optional. If non-empty, scan ONLY that source. Otherwise scan
        every source in the manifest.

    Returns
    -------
    {
      sources: [{source_id, type_id, files_scanned, files_skipped, files_errored,
                 errors, elapsed_sec}, ...],
      total_scanned, total_skipped, total_errored,
      output_root,
    }
    """
    manifest = load_manifest(DEFAULT_MANIFEST_PATH)
    if not manifest.sources:
        return {
            "error": "No sources in manifest.",
            "hint": "Add sources via corpus_add_source(...) first.",
            "status": "error",
        }
    if source_id and not manifest.find_source(source_id):
        return {
            "error": f"No source with id '{source_id}'.",
            "available_sources": [s.id for s in manifest.sources],
            "status": "error",
        }

    result = run_scan(
        manifest,
        only_source_id=source_id or None,
        update_manifest_path=DEFAULT_MANIFEST_PATH,
    )

    def _summary(sr) -> dict:
        # Per-source report; error list capped at 20 for readability.
        return {
            "source_id": sr.source_id,
            "type_id": sr.type_id,
            "files_scanned": sr.files_scanned,
            "files_skipped": sr.files_skipped,
            "files_errored": sr.files_errored,
            "errors": sr.errors[:20],
            "elapsed_sec": sr.elapsed_sec,
        }

    return {
        "sources": [_summary(sr) for sr in result.sources],
        "total_scanned": result.total_scanned,
        "total_skipped": result.total_skipped,
        "total_errored": result.total_errored,
        "output_root": str(manifest.output_root),
        "next_step": (
            "Restart the MCP server to load the new sidecars into the overlay "
            "index, then query via extension_atlas_search(namespace='user.<source_id>')."
        ),
    }
315
+
316
+
317
+ # ─── corpus_status ───────────────────────────────────────────────────────────
318
+
319
+
320
@mcp.tool()
def corpus_status(ctx: Context) -> dict:
    """Report manifest contents + freshness for each source."""
    manifest = load_manifest(DEFAULT_MANIFEST_PATH)

    def _entry(src) -> dict:
        # One status row per manifest source, with resolved-path existence.
        location = src.resolved_path
        return {
            "source_id": src.id,
            "type": src.type,
            "path": str(location),
            "exists": location.exists(),
            "recursive": src.recursive,
            "exclude_globs": src.exclude_globs,
            "last_scanned": src.last_scanned,
            "file_count": src.file_count,
        }

    return {
        "manifest_path": str(DEFAULT_MANIFEST_PATH),
        "output_root": str(manifest.output_root),
        "sources": [_entry(src) for src in manifest.sources],
        "scanners_available": list_scanners(),
    }
343
+
344
+
345
+ # ─── corpus_list_scanners ────────────────────────────────────────────────────
346
+
347
+
348
@mcp.tool()
def corpus_detect_plugins(
    ctx: Context,
    # Fixed annotation: was `list = None` (annotation/default mismatch).
    formats: list | None = None,
    persist: bool = True,
) -> dict:
    """Phase 2.1 + 2.2 — detect installed VST3 / AU / VST2 / AAX / LV2 plugins
    and extract identity metadata (vendor, version, unique_id, format) from
    each plugin's bundle without needing a DAW host.

    Parameters
    ----------
    formats : optional list of formats to restrict to, e.g. ["VST3", "AU"].
        Default = all formats found at OS-standard paths.
    persist : write the detected inventory to
        ~/.livepilot/atlas-overlays/user/plugins/_inventory.json (default True)

    Returns
    -------
    {plugins: [...], totals: {...}, inventory_path}
    """
    # Normalize [] → None so "no restriction" is a single value downstream.
    fmts = formats or None
    detected = detect_installed_plugins(formats=fmts)

    plugins_serialized = [p.to_dict() for p in detected]

    # Per-format tally for the totals block.
    by_format: dict[str, int] = {}
    for p in detected:
        by_format[p.format] = by_format.get(p.format, 0) + 1

    inventory_path = None
    if persist:
        plugins_dir = DEFAULT_OUTPUT_ROOT / "plugins"
        plugins_dir.mkdir(parents=True, exist_ok=True)
        inventory_path = plugins_dir / "_inventory.json"
        inventory_path.write_text(
            json.dumps({
                "schema_version": 1,
                "plugins": plugins_serialized,
                "totals": by_format,
            }, indent=2, default=str),
            encoding="utf-8",
        )

    return {
        "plugins": plugins_serialized,
        "totals": {"all": len(detected), "by_format": by_format},
        "inventory_path": str(inventory_path) if inventory_path else None,
        "search_paths": [str(p) for p, _ in default_plugin_dir()],
    }
398
+
399
+
400
@mcp.tool()
def corpus_discover_manuals(
    ctx: Context,
    plugin_id: str = "",
    extract: bool = True,
    persist: bool = True,
) -> dict:
    """Phase 2.3 + 2.4 — find local manual files for a detected plugin and
    extract their text.

    Parameters
    ----------
    plugin_id : the plugin to search for (must already be in _inventory.json
        from corpus_detect_plugins). Required.
    extract : also extract text from the top candidate (default True)
    persist : write extracted text to
        ~/.livepilot/atlas-overlays/user/plugins/<plugin_id>/manual.txt

    Returns
    -------
    {plugin_id, candidates, extraction: {...}, manual_path}
    """
    if not plugin_id:
        return {
            "error": "plugin_id is required",
            "hint": "Run corpus_detect_plugins first, then pass a plugin_id from the result.",
            "status": "error",
        }
    inventory_path = DEFAULT_OUTPUT_ROOT / "plugins" / "_inventory.json"
    if not inventory_path.exists():
        return {
            "error": "No plugin inventory found.",
            "hint": "Run corpus_detect_plugins first.",
            "status": "error",
        }
    inventory = json.loads(inventory_path.read_text(encoding="utf-8"))
    # .get() instead of ["plugin_id"]: a malformed inventory entry must not
    # raise KeyError out of the tool — it just won't match.
    matches = [p for p in inventory.get("plugins", []) if p.get("plugin_id") == plugin_id]
    if not matches:
        return {
            "error": f"plugin_id '{plugin_id}' not found in inventory.",
            "available_count": len(inventory.get("plugins", [])),
            "status": "error",
        }
    plugin_dict = matches[0]
    # Filter to declared dataclass fields so extra/forward-compat JSON keys
    # don't break construction.
    plugin = DetectedPlugin(**{k: v for k, v in plugin_dict.items() if k in DetectedPlugin.__dataclass_fields__})

    candidates = discover_manuals_for_plugin(plugin)
    candidates_serialized = [
        {
            "path": c.path,
            "extension": c.extension,
            "size_kb": c.size_kb,
            "name_score": c.name_score,
            "location_score": c.location_score,
        }
        for c in candidates
    ]

    extraction_result = None
    manual_path = None
    if extract and candidates:
        # Candidates arrive ranked; extract only the best one.
        top = candidates[0]
        extraction = extract_manual_text(plugin, top)
        extraction_result = {
            "source_path": extraction.source_path,
            "source_kind": extraction.source_kind,
            "char_count": extraction.char_count,
            "page_count": extraction.page_count,
            "section_count": len(extraction.sections),
            "section_titles": [s["title"] for s in extraction.sections][:30],
            "truncated": extraction.truncated,
            "error": extraction.error,
        }
        if persist and extraction.text and not extraction.error:
            plugin_dir = DEFAULT_OUTPUT_ROOT / "plugins" / plugin.plugin_id
            plugin_dir.mkdir(parents=True, exist_ok=True)
            manual_path = plugin_dir / "manual.txt"
            manual_path.write_text(extraction.text, encoding="utf-8")
            (plugin_dir / "manual_sections.json").write_text(
                json.dumps(extraction.sections, indent=2),
                encoding="utf-8",
            )

    return {
        "plugin_id": plugin_id,
        "plugin_identity": {
            "name": plugin.name, "vendor": plugin.vendor,
            "format": plugin.format, "version": plugin.version,
        },
        "candidates": candidates_serialized,
        "candidates_total": len(candidates),
        "extraction": extraction_result,
        "manual_path": str(manual_path) if manual_path else None,
    }
494
+
495
+
496
@mcp.tool()
def corpus_canonicalize_plugins(
    ctx: Context,
    # Fixed annotations: were `list = None` (annotation/default mismatch).
    skip_vendors: list | None = None,
    skip_name_prefixes: list | None = None,
) -> dict:
    """Dedupe the plugin inventory by canonical vendor + name; prefer VST3 as
    primary format; pick the prettiest vendor string across formats. Writes
    `plugins/_canonical.json` next to `_inventory.json`.

    The canonical inventory is what efficient Phase 3+4 research consumes —
    instead of running research separately on the AU and VST3 versions of the
    same plugin, the canonicalized record represents BOTH formats with
    `formats_available: [AU, VST3]` so a single identity.yaml covers both.

    Filtering:
      skip_vendors       — list of vendor names to drop (default: ["Apple",
                           "Splice"]) — exclude system AUs + utility apps
      skip_name_prefixes — list of name-prefix patterns to drop (default:
                           ["Splice"]) — exclude installer/helper plugins

    Returns
    -------
    {canonical_count, formats_distribution, top_vendors, canonical_path}
    """
    inventory_path = DEFAULT_OUTPUT_ROOT / "plugins" / "_inventory.json"
    if not inventory_path.exists():
        return {
            "error": "No inventory found. Run corpus_detect_plugins first.",
            "status": "error",
        }
    skip_vendors_lower = {v.lower() for v in (skip_vendors or ["Apple", "Splice"])}
    skip_prefixes_lower = [p.lower() for p in (skip_name_prefixes or ["Splice"])]

    inv = json.loads(inventory_path.read_text(encoding="utf-8"))
    raw_plugins = inv.get("plugins", [])

    # Group raw plugins by (canonical_vendor, normalized_name)
    groups: dict = {}
    for p in raw_plugins:
        v_lc = (p.get("vendor") or "").lower()
        # Missing vendor or a skipped vendor: drop the entry entirely.
        if not v_lc or v_lc in skip_vendors_lower:
            continue
        nm = (p.get("name") or "").lower()
        if any(nm.startswith(pre) for pre in skip_prefixes_lower):
            continue
        # Strip parenthetical role qualifiers so "Drambo" + "Drambo (Fx)" merge
        nm_norm = (
            nm.replace(" (instrument)", "").replace(" (instr)", "")
            .replace(" (fx)", "").replace(" (midi fx)", "")
            .replace(" (8 outs)", "").strip()
        )
        key = (_canon_vendor(p.get("vendor") or ""), nm_norm)
        groups.setdefault(key, []).append(p)

    canonical: list[dict] = []
    for (canon_v, name), entries in groups.items():
        # Sort: VST3 first, then AU, then others; within format, prettiest vendor
        entries.sort(key=lambda p: (
            0 if p.get("format") == "VST3" else 1 if p.get("format") == "AU" else 2,
            -_vendor_score(p.get("vendor") or ""),
        ))
        # Copy so mutations below never touch the raw inventory record.
        primary = dict(entries[0])
        # Pretty vendor across all entries (not just primary's)
        pretty_vendor = max(
            (e.get("vendor") for e in entries if e.get("vendor")),
            key=_vendor_score,
        )
        formats = sorted({e.get("format") for e in entries if e.get("format")})
        # Stable plugin_id: canon_v + primary's name
        primary["vendor"] = pretty_vendor
        primary["plugin_id"] = re.sub(
            r"[^a-z0-9]+", "-", f"{canon_v}-{primary.get('name', '')}".lower(),
        ).strip("-")
        primary["formats_available"] = formats
        canonical.append(primary)

    canonical.sort(key=lambda d: (d.get("vendor", ""), d.get("name", "")))

    canonical_path = DEFAULT_OUTPUT_ROOT / "plugins" / "_canonical.json"
    canonical_path.write_text(
        json.dumps(canonical, indent=2, default=str),
        encoding="utf-8",
    )

    # Report stats
    formats_dist: dict[str, int] = {}
    vendor_counts: dict[str, int] = {}
    for d in canonical:
        for f in d.get("formats_available") or []:
            formats_dist[f] = formats_dist.get(f, 0) + 1
        v = d.get("vendor") or "unknown"
        vendor_counts[v] = vendor_counts.get(v, 0) + 1

    top_vendors = sorted(vendor_counts.items(), key=lambda kv: -kv[1])[:15]
    return {
        "canonical_count": len(canonical),
        "raw_count": len(raw_plugins),
        "skipped_vendors": sorted(skip_vendors_lower),
        "skipped_name_prefixes": sorted(skip_prefixes_lower),
        "formats_distribution": formats_dist,
        "top_vendors": [{"vendor": v, "count": n} for v, n in top_vendors],
        "canonical_path": str(canonical_path),
    }
600
+
601
+
602
@mcp.tool()
def corpus_cluster_plugins(
    ctx: Context,
    min_cluster_size: int = 2,
) -> dict:
    """Group canonical plugins by vendor; return a cluster manifest the agent
    uses to dispatch Phase 3+4 research efficiently.

    For each cluster (vendor with >= min_cluster_size plugins) the agent runs
    ONE shared WebSearch pass + writes N identity yamls — vs N independent
    research passes for singletons. Cluster research lowers per-plugin token
    cost by 3-5x for vendors documented as a coherent product line.

    Returns
    -------
    {
      clusters: [{vendor, plugin_count, plugin_ids: [...]}],
      singletons: [{plugin_id, vendor, name}],
      total_plugins, cluster_count, singleton_count,
      identity_yaml_status: {covered: [ids], missing: [ids]},
    }
    """
    canonical_path = DEFAULT_OUTPUT_ROOT / "plugins" / "_canonical.json"
    if not canonical_path.exists():
        return {
            "error": "No canonical inventory. Run corpus_canonicalize_plugins first.",
            "status": "error",
        }
    canonical = json.loads(canonical_path.read_text(encoding="utf-8"))

    # Bucket every canonical record under its vendor display string.
    by_vendor: dict[str, list[dict]] = {}
    for record in canonical:
        by_vendor.setdefault(record.get("vendor") or "unknown", []).append(record)

    clusters: list[dict] = []
    singletons: list[dict] = []
    plugins_root = DEFAULT_OUTPUT_ROOT / "plugins"
    covered: list[str] = []
    missing: list[str] = []

    for vendor, records in sorted(by_vendor.items()):
        if len(records) >= min_cluster_size:
            clusters.append({
                "vendor": vendor,
                "plugin_count": len(records),
                "plugin_ids": [r.get("plugin_id") for r in records],
                "plugin_names": [r.get("name") for r in records],
                "research_strategy": (
                    "Run ONE shared WebSearch pass on this vendor's product line, "
                    "cache to _shared_research/<vendor-slug>/, then write N lean "
                    "identity yamls citing the shared cache."
                ),
            })
        else:
            singletons.extend(
                {
                    "plugin_id": r.get("plugin_id"),
                    "vendor": vendor,
                    "name": r.get("name"),
                }
                for r in records
            )
        # Independently of clustering, track which plugins already have an
        # identity.yaml on disk.
        for r in records:
            pid = r.get("plugin_id") or ""
            if (plugins_root / pid / "identity.yaml").exists():
                covered.append(pid)
            else:
                missing.append(pid)

    # Sort clusters by size descending — largest research-payoff first
    clusters.sort(key=lambda c: -c["plugin_count"])

    return {
        "clusters": clusters,
        "singletons": singletons,
        "total_plugins": len(canonical),
        "cluster_count": len(clusters),
        "singleton_count": len(singletons),
        "identity_yaml_status": {
            "covered": covered,
            "missing": missing,
            "covered_count": len(covered),
            "missing_count": len(missing),
        },
        "next_step": (
            "For clusters: dispatch one sonnet subagent per cluster with the "
            "plugin_ids list, pointing each at corpus_emit_synthesis_briefs(plugin_ids=[...]). "
            "For singletons: batch ~5-7 per agent (no shared theme). Cap parallel "
            "agent count at 8."
        ),
    }
693
+
694
+
695
@mcp.tool()
def corpus_trim_plugin_identity(
    ctx: Context,
    plugin_id: str = "",
    research_priority: str = "low",
) -> dict:
    """Slim a plugin's identity.yaml to the lean overlay-required shape.

    Use when the user explicitly deprioritizes a plugin ("don't waste tokens
    on this one") OR when post-processing a Phase 4 batch where some plugins
    received deeper research than the user wants persisted. Keeps the file
    queryable via atlas_search but drops the long key_techniques /
    parameter_glossary / comparable_plugins sections.

    Result preserves: entity_id, entity_type, name, description, tags (with
    `research-priority:<level>` appended), artists, plugin_id, vendor, format,
    formats_available, sonic_fingerprint (capped at 400 chars).

    Result drops: reach_for, avoid, key_techniques, parameter_glossary,
    comparable_plugins, genre_affinity, producer_anchors, cache_provenance.

    Parameters
    ----------
    plugin_id : the plugin to trim. Required.
    research_priority : tag value: "low" / "medium" / "skip". Default "low".
    """
    if not plugin_id:
        return {"error": "plugin_id is required", "status": "error"}
    yaml_path = DEFAULT_OUTPUT_ROOT / "plugins" / plugin_id / "identity.yaml"
    if not yaml_path.exists():
        return {
            "error": f"No identity.yaml for plugin_id '{plugin_id}'",
            "status": "error",
        }
    # Lazy import: yaml is only needed by this tool.
    import yaml as _yaml
    data = _yaml.safe_load(yaml_path.read_text(encoding="utf-8"))
    if not isinstance(data, dict):
        return {"error": "identity.yaml is not a dict", "status": "error"}

    bytes_before = yaml_path.stat().st_size

    # Keep the core overlay-search signal; drop genre tags and append the
    # research-priority marker.
    kept_tags = [t for t in (data.get("tags") or []) if not t.startswith("genre:")][:8]
    kept_tags = kept_tags + [f"research-priority:{research_priority}"]

    lean = {
        "entity_id": data.get("entity_id", plugin_id),
        "entity_type": data.get("entity_type", "installed_plugin"),
        "name": data.get("name", ""),
        "description": (data.get("description") or "")[:200],
        "tags": kept_tags,
        "artists": data.get("artists") or [],
        "plugin_id": data.get("entity_id", plugin_id),
        "vendor": data.get("vendor", ""),
        "format": data.get("format", ""),
        "formats_available": data.get("formats_available") or [],
        "sonic_fingerprint": (data.get("sonic_fingerprint") or "")[:400],
        "research_priority": research_priority,
        "schema_version": data.get("schema_version", 1),
    }
    yaml_path.write_text(
        _yaml.dump(lean, sort_keys=False, default_flow_style=False, width=200, allow_unicode=True),
        encoding="utf-8",
    )
    return {
        "plugin_id": plugin_id,
        "size_before_bytes": bytes_before,
        "size_after_bytes": yaml_path.stat().st_size,
        "research_priority": research_priority,
        "yaml_path": str(yaml_path),
    }
766
+
767
+
768
@mcp.tool()
def corpus_research_targets(
    ctx: Context,
    plugin_id: str = "",
) -> dict:
    """Phase 3 — emit a structured WebSearch task packet for the agent to fulfill.

    This tool does NOT call the web. It returns the queries + cache locations
    + instructions; the Claude agent uses WebSearch + WebFetch + sonnet
    subagents to fulfill them.

    Returns
    -------
    The Phase 3 packet — see PLUGIN_KNOWLEDGE_ENGINE.md §"Research target packet".
    """
    if not plugin_id:
        return {"error": "plugin_id is required", "status": "error"}
    inventory_path = DEFAULT_OUTPUT_ROOT / "plugins" / "_inventory.json"
    if not inventory_path.exists():
        return {
            "error": "No inventory; run corpus_detect_plugins first.",
            "status": "error",
        }
    inventory = json.loads(inventory_path.read_text(encoding="utf-8"))
    # .get() instead of ["plugin_id"]: malformed inventory entries must not
    # raise KeyError out of the tool.
    matches = [p for p in inventory.get("plugins", []) if p.get("plugin_id") == plugin_id]
    if not matches:
        return {"error": f"plugin_id '{plugin_id}' not in inventory.", "status": "error"}
    plugin = DetectedPlugin(**{k: v for k, v in matches[0].items() if k in DetectedPlugin.__dataclass_fields__})

    # Look for an extracted local manual to inform the packet
    manual_path = DEFAULT_OUTPUT_ROOT / "plugins" / plugin_id / "manual.txt"
    sections_path = DEFAULT_OUTPUT_ROOT / "plugins" / plugin_id / "manual_sections.json"
    local_manual = None
    if manual_path.exists():
        from .plugin_engine.manual import ManualExtraction
        sections = []
        if sections_path.exists():
            try:
                sections = json.loads(sections_path.read_text(encoding="utf-8"))
            except json.JSONDecodeError:
                # Corrupt sections sidecar: proceed with the text alone.
                pass
        manual_text = manual_path.read_text(encoding="utf-8", errors="ignore")
        local_manual = ManualExtraction(
            plugin_id=plugin_id, source_path=str(manual_path),
            source_kind=manual_path.suffix.lstrip(".") or "txt",
            text=manual_text,
            # Count characters of the decoded text, not st_size: byte size
            # overstates chars for multi-byte UTF-8 and ignores decode drops.
            char_count=len(manual_text),
            sections=sections,
        )

    return build_research_targets(plugin, local_manual)
818
+
819
+
820
@mcp.tool()
def corpus_emit_synthesis_briefs(
    ctx: Context,
    # Fixed annotation: was `list = None` (annotation/default mismatch).
    plugin_ids: list | None = None,
) -> dict:
    """Phase 4 — emit sonnet-subagent briefs for plugin identity synthesis.

    For each requested plugin, builds a self-contained brief that an agent
    dispatches to a sonnet subagent (via the Agent tool) which writes one
    identity.yaml at the brief's output_path.

    Parameters
    ----------
    plugin_ids : list of plugin_ids to emit briefs for. If empty, emits for
        every plugin in the inventory.

    Returns
    -------
    {briefs: [{plugin_id, brief, output_path}, ...], total}
    """
    inventory_path = DEFAULT_OUTPUT_ROOT / "plugins" / "_inventory.json"
    if not inventory_path.exists():
        return {"error": "Run corpus_detect_plugins first.", "status": "error"}
    inventory = json.loads(inventory_path.read_text(encoding="utf-8"))
    all_plugins = inventory.get("plugins", [])

    # None = no filter (emit for everything); a set gives O(1) membership.
    target_ids = set(plugin_ids) if plugin_ids else None
    briefs = []
    for plugin_dict in all_plugins:
        pid = plugin_dict.get("plugin_id")
        if target_ids is not None and pid not in target_ids:
            continue
        plugin = DetectedPlugin(**{k: v for k, v in plugin_dict.items() if k in DetectedPlugin.__dataclass_fields__})

        # Read any extracted manual + research cache that exist
        manual_path = DEFAULT_OUTPUT_ROOT / "plugins" / pid / "manual.txt"
        local_manual = None
        if manual_path.exists():
            from .plugin_engine.manual import ManualExtraction
            manual_text = manual_path.read_text(encoding="utf-8", errors="ignore")
            local_manual = ManualExtraction(
                plugin_id=pid, source_path=str(manual_path),
                source_kind="txt",
                text=manual_text,
                # len(text), not st_size: byte size is not a char count for
                # multi-byte UTF-8 (and errors="ignore" may drop bytes).
                char_count=len(manual_text),
                sections=[],
            )
        research_root = DEFAULT_OUTPUT_ROOT / "plugins" / pid / "research"

        brief = build_synthesis_brief(plugin, local_manual, research_root if research_root.exists() else None)
        briefs.append({
            "plugin_id": pid,
            "brief": brief,
            "output_path": brief["output_path"],
        })

    return {
        "briefs": briefs,
        "total": len(briefs),
        "instruction": (
            "For each brief: dispatch one sonnet subagent (Agent tool with "
            "subagent_type='general-purpose', model='sonnet') passing the brief "
            "as context. The subagent reads brief['synthesis_inputs'] and writes "
            "the YAML at brief['output_path']. Cap parallel subagents at ~5 to "
            "avoid main-context bloat."
        ),
    }
886
+
887
+
888
@mcp.tool()
def corpus_list_scanners(ctx: Context) -> dict:
    """Enumerate registered scanner types and their supported file extensions.

    Useful for discovering what content types the corpus builder can handle on
    this install. Custom scanners registered by the user via @register_scanner
    show up here too.
    """
    from .scanner import SCANNERS

    catalog = {
        type_id: {
            "extensions": scanner_cls.file_extensions,
            "output_subdir": scanner_cls.output_subdir,
            "schema_version": scanner_cls.schema_version,
        }
        for type_id, scanner_cls in SCANNERS.items()
    }
    return {"scanners": catalog, "count": len(catalog)}