ummaya 0.2.3 → 0.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (111)
  1. package/README.md +2 -1
  2. package/npm-shrinkwrap.json +2 -2
  3. package/package.json +1 -1
  4. package/prompts/manifest.yaml +2 -2
  5. package/prompts/session_guidance_v1.md +3 -1
  6. package/prompts/system_v1.md +8 -7
  7. package/pyproject.toml +2 -7
  8. package/src/ummaya/context/builder.py +17 -11
  9. package/src/ummaya/engine/engine.py +27 -7
  10. package/src/ummaya/engine/query.py +20 -0
  11. package/src/ummaya/evidence/__init__.py +25 -0
  12. package/src/ummaya/evidence/__main__.py +7 -0
  13. package/src/ummaya/evidence/models.py +58 -0
  14. package/src/ummaya/evidence/runner.py +308 -0
  15. package/src/ummaya/evidence/task_registry.py +264 -0
  16. package/src/ummaya/ipc/frame_schema.py +47 -0
  17. package/src/ummaya/ipc/stdio.py +1287 -54
  18. package/src/ummaya/llm/client.py +132 -56
  19. package/src/ummaya/llm/reasoning.py +84 -0
  20. package/src/ummaya/tools/discovery_bridge.py +17 -1
  21. package/src/ummaya/tools/executor.py +32 -12
  22. package/src/ummaya/tools/geocoding/kakao_client.py +1 -2
  23. package/src/ummaya/tools/kma/apihub_catalog.py +984 -1
  24. package/src/ummaya/tools/kma/apihub_structured_adapter.py +86 -6
  25. package/src/ummaya/tools/kma/apihub_url_adapter.py +593 -0
  26. package/src/ummaya/tools/kma/apihub_url_catalog.py +296 -0
  27. package/src/ummaya/tools/location_adapters.py +8 -6
  28. package/src/ummaya/tools/manifest_metadata.py +16 -3
  29. package/src/ummaya/tools/mvp_surface.py +2 -2
  30. package/src/ummaya/tools/nmc/emergency_search.py +8 -6
  31. package/src/ummaya/tools/register_all.py +9 -0
  32. package/src/ummaya/tools/resolve_location.py +4 -4
  33. package/src/ummaya/tools/search.py +664 -18
  34. package/src/ummaya/tools/verified_data_go_kr/_manifest.py +115 -25
  35. package/src/ummaya/tools/verified_data_go_kr/airkorea_air_quality.py +109 -4
  36. package/src/ummaya/tools/verified_data_go_kr/nmc_aed_site.py +108 -2
  37. package/src/ummaya/tools/verified_data_go_kr/pps_bid_public_info.py +174 -9
  38. package/src/ummaya/tools/verified_data_go_kr/tago_bus_arrival.py +66 -3
  39. package/src/ummaya/tools/verified_data_go_kr/tago_bus_location.py +12 -2
  40. package/src/ummaya/tools/verified_data_go_kr/tago_bus_route.py +8 -2
  41. package/src/ummaya/tools/verified_data_go_kr/tago_bus_route_station.py +114 -0
  42. package/src/ummaya/tools/verified_data_go_kr/tago_bus_station.py +14 -3
  43. package/src/ummaya/tools/verify_canonical_map.py +21 -0
  44. package/tui/package.json +1 -2
  45. package/tui/src/QueryEngine.ts +4 -0
  46. package/tui/src/cli/handlers/auth.ts +1 -1
  47. package/tui/src/cli/handlers/mcp.tsx +3 -3
  48. package/tui/src/cli/print.ts +69 -18
  49. package/tui/src/cli/update.ts +13 -13
  50. package/tui/src/commands/copy/index.ts +1 -1
  51. package/tui/src/commands/cost/cost.ts +2 -2
  52. package/tui/src/commands/init-verifiers.ts +5 -5
  53. package/tui/src/commands/init.ts +30 -30
  54. package/tui/src/commands/insights.ts +43 -43
  55. package/tui/src/commands/install-github-app/install-github-app.tsx +2 -2
  56. package/tui/src/commands/install-github-app/setupGitHubActions.ts +3 -3
  57. package/tui/src/commands/install.tsx +5 -5
  58. package/tui/src/commands/mcp/addCommand.ts +5 -5
  59. package/tui/src/commands/mcp/xaaIdpCommand.ts +2 -2
  60. package/tui/src/commands/plugin/ManageMarketplaces.tsx +2 -2
  61. package/tui/src/commands/reasoning/index.ts +13 -0
  62. package/tui/src/commands/reasoning/reasoning.tsx +177 -0
  63. package/tui/src/commands/thinkback/thinkback.tsx +3 -3
  64. package/tui/src/commands.ts +2 -0
  65. package/tui/src/components/Messages.tsx +2 -1
  66. package/tui/src/components/Spinner.tsx +2 -2
  67. package/tui/src/components/design-system/LoadingState.tsx +2 -2
  68. package/tui/src/ipc/codec.ts +26 -0
  69. package/tui/src/ipc/frames.generated.ts +398 -303
  70. package/tui/src/ipc/llmClient.ts +130 -51
  71. package/tui/src/ipc/llmTypes.ts +16 -1
  72. package/tui/src/ipc/schema/frame.schema.json +1 -3475
  73. package/tui/src/main.tsx +3 -0
  74. package/tui/src/query.ts +467 -2
  75. package/tui/src/screens/REPL.tsx +3 -3
  76. package/tui/src/services/api/claude.ts +48 -18
  77. package/tui/src/services/api/client.ts +33 -12
  78. package/tui/src/services/api/ummaya.ts +70 -16
  79. package/tui/src/skills/bundled/stuck.ts +12 -12
  80. package/tui/src/state/AppStateStore.ts +7 -0
  81. package/tui/src/tools/AdapterTool/AdapterTool.ts +590 -7
  82. package/tui/src/tools/LookupPrimitive/LookupPrimitive.ts +43 -17
  83. package/tui/src/tools/LookupPrimitive/prompt.ts +7 -6
  84. package/tui/src/tools/ResolveLocationPrimitive/ResolveLocationPrimitive.ts +40 -19
  85. package/tui/src/tools/SubmitPrimitive/SubmitPrimitive.ts +25 -9
  86. package/tui/src/tools/VerifyPrimitive/VerifyPrimitive.ts +25 -9
  87. package/tui/src/tools/_shared/citizenUserText.ts +49 -0
  88. package/tui/src/tools/_shared/directPublicDataGuard.ts +362 -0
  89. package/tui/src/tools/_shared/kmaAnalysisGuard.ts +197 -0
  90. package/tui/src/tools/_shared/kmaAviationGuard.ts +70 -0
  91. package/tui/src/tools/_shared/locationInputRepair.ts +112 -0
  92. package/tui/src/tools/_shared/nmcAedGuard.ts +234 -0
  93. package/tui/src/tools/_shared/protectedCheckGuard.ts +207 -0
  94. package/tui/src/tools/_shared/rootPrimitiveInput.ts +67 -0
  95. package/tui/src/tools/_shared/textToolCallGuard.ts +91 -0
  96. package/tui/src/tools/_shared/toolChoiceRepair.ts +866 -0
  97. package/tui/src/utils/attachments.ts +1 -1
  98. package/tui/src/utils/kExaoneReasoning.ts +138 -0
  99. package/tui/src/utils/messages.ts +1 -0
  100. package/tui/src/utils/multiToolLayout.ts +13 -0
  101. package/tui/src/utils/processUserInput/processSlashCommand.tsx +2 -2
  102. package/tui/src/utils/processUserInput/processUserInput.ts +26 -0
  103. package/tui/src/utils/settings/applySettingsChange.ts +4 -0
  104. package/tui/src/utils/settings/types.ts +9 -3
  105. package/tui/src/utils/stats.ts +1 -1
  106. package/uv.lock +1 -15
  107. package/assets/copilot-gate-logo.svg +0 -58
  108. package/assets/govon-logo.svg +0 -40
  109. package/src/ummaya/eval/__init__.py +0 -5
  110. package/src/ummaya/eval/retrieval.py +0 -713
  111. package/tui/src/utils/messageStream.ts +0 -186
@@ -1,713 +0,0 @@
1
- # SPDX-License-Identifier: Apache-2.0
2
- """BM25 retrieval quality evaluation harness — T039.
3
-
4
- CLI entry point::
5
-
6
- python -m ummaya.eval.retrieval eval/retrieval_queries.yaml
7
-
8
- Loads the seed adapter registry, runs each query through lookup(mode="search"),
9
- computes recall@1 and recall@5, and writes a JSON report to
10
- .eval-artifacts/retrieval.json.
11
-
12
- Exit codes:
13
- 0 — pass (recall@5 >= 0.80)
14
- 1 — warn (0.60 <= recall@5 < 0.80)
15
- 2 — fail (recall@5 < 0.60)
16
-
17
- Extended gate exit codes (run_extended_gate / --backend flag):
18
- 0 — pass (recall@5 >= 0.80, sc_01_status is not PENDING_#22)
19
- 1 — warn (0.60 <= recall@5 < 0.80)
20
- 2 — PENDING_#22 (registry_size < 8 or no Phase-3 adapters detected)
21
- 2 — fail (recall@5 < 0.60, when sc_01 is not PENDING)
22
-
23
- NOTE: As of Stage 2a, only ``koroad_accident_hazard_search`` is registered.
24
- The other 3 seed adapters (kma_forecast_fetch, hira_hospital_search,
25
- nmc_emergency_search) land in Stage 3. When fewer than 4 adapters are
26
- registered, recall@5 will be artificially high for queries targeting KOROAD
27
- and zero for the others — the JSON report emits a WARN in that case.
28
- """
29
-
30
- from __future__ import annotations
31
-
32
- import asyncio
33
- import contextlib
34
- import json
35
- import logging
36
- import os
37
- import sys
38
- from collections.abc import Iterator
39
- from datetime import UTC, datetime
40
- from pathlib import Path
41
- from typing import Any
42
-
43
- import yaml
44
-
# Module-level logger; every diagnostic in this harness goes through it.
logger = logging.getLogger(__name__)


# ---------------------------------------------------------------------------
# Report schema (Pydantic-free to avoid import overhead in a CLI entrypoint)
# ---------------------------------------------------------------------------

# The minimum number of distinct seed adapters we expect.  When the registry
# holds fewer than this, ``_build_warnings`` adds an "artificially inflated
# recall" warning to the report.
_EXPECTED_ADAPTER_COUNT = 4

# The adapter IDs that should be registered for a complete eval run.
# ``_evaluate`` subtracts the registered tool ids from this set to list the
# seed adapters that are missing.
_SEED_ADAPTER_IDS: frozenset[str] = frozenset(
    {
        "koroad_accident_hazard_search",
        "kma_forecast_fetch",
        "hira_hospital_search",
        "nmc_emergency_search",
    }
)

# Minimum registry size required for a meaningful A/B eval (SC-001, FR-013).
# Until Epic #22 lands (≥ 4 new adapters), the combined registry will be < 8.
_SC01_MIN_REGISTRY_SIZE = 8

# The four seed-adapter prefixes. When every registered tool_id starts with
# one of these, no Phase-3 adapters are present and SC-01 is PENDING_#22.
_SEED_PREFIXES: tuple[str, ...] = ("koroad_", "kma_", "hira_", "nmc_")
72
-
73
-
@contextlib.contextmanager
def _backend_env_overlay(backend: str) -> Iterator[None]:
    """Temporarily set ``UMMAYA_RETRIEVAL_BACKEND`` to *backend*.

    On exit (normal or via an exception) the variable is put back exactly as
    it was: restored to its earlier value, or removed again if it had not been
    set.  Callers therefore never leave the override behind in the process
    environment.

    Args:
        backend: One of ``bm25``, ``dense``, ``hybrid``.

    Yields:
        None
    """
    env_var = "UMMAYA_RETRIEVAL_BACKEND"
    saved = os.environ.get(env_var)
    os.environ[env_var] = backend
    try:
        yield
    finally:
        if saved is not None:
            # The variable existed before we touched it: put the old value back.
            os.environ[env_var] = saved
        else:
            # The variable was absent before: make it absent again.
            os.environ.pop(env_var, None)
98
-
99
-
@contextlib.contextmanager
def _eager_cold_start_overlay() -> Iterator[None]:
    """Pin ``UMMAYA_RETRIEVAL_COLD_START`` to ``"eager"`` inside the block.

    The harness queries the registry immediately after building it, so the
    lazy production default brings no boot-time benefit here (FR-011 /
    NFR-BootBudget).  Per the original author's notes, eager cold-start is
    what lets dense-load failures surface synchronously at build time, which
    is the contract ``tests/retrieval/test_fail_open.py`` locks in.

    The previous value of the variable (or its absence) is reinstated when the
    block exits, so registries built outside this overlay keep whatever
    setting the environment had.

    Yields:
        None
    """
    env_var = "UMMAYA_RETRIEVAL_COLD_START"
    saved = os.environ.get(env_var)
    os.environ[env_var] = "eager"
    try:
        yield
    finally:
        if saved is not None:
            # Restore the caller's original setting.
            os.environ[env_var] = saved
        else:
            # Nothing was set before; leave nothing behind.
            os.environ.pop(env_var, None)
124
-
125
-
def _compute_sc01_status(registry: object) -> tuple[str, str]:
    """Determine SC-01 status for the current registry.

    Args:
        registry: Object supporting ``len()`` and an ``all_tools()`` method
            whose items expose an ``id`` attribute.

    Returns:
        ``(status, reason)`` where status is ``"PENDING_#22"`` or
        ``"EVALUATED"``.  ``reason`` is the empty string when evaluated.

    The PENDING_#22 status is emitted when:
      1. ``registry_size < _SC01_MIN_REGISTRY_SIZE`` — not enough adapters for
         a meaningful A/B comparison between BM25 and hybrid backends, OR
      2. Every registered tool_id starts with one of the four seed-adapter
         prefixes — meaning no Phase-3 adapters from Epic #22 are present.

    When either condition holds, SC-01 MUST NOT be marked green (FR-013).
    """
    registry_size = len(registry)  # type: ignore[arg-type]

    if registry_size < _SC01_MIN_REGISTRY_SIZE:
        # Build the message from the constant so the text cannot drift from
        # the threshold that is actually enforced.
        return (
            "PENDING_#22",
            f"registry_size < {_SC01_MIN_REGISTRY_SIZE} "
            f"(require >= {_SC01_MIN_REGISTRY_SIZE} adapters from #22 for meaningful A/B); "
            f"current registry_size={registry_size}",
        )

    tool_ids: list[str] = [t.id for t in registry.all_tools()]  # type: ignore[attr-defined]
    # ``str.startswith`` accepts a tuple of prefixes, so one call per id is
    # enough to test against every seed prefix.
    if tool_ids and all(tid.startswith(_SEED_PREFIXES) for tid in tool_ids):
        return (
            "PENDING_#22",
            "no Phase-3 adapter ids detected — awaiting #22",
        )

    return ("EVALUATED", "")
159
-
160
-
def _load_queries(yaml_path: Path) -> list[dict[str, Any]]:
    """Load and validate the queries YAML file.

    Every validation failure is logged and terminates the process with exit
    code 2, so callers never see a raw parser or type error.

    Args:
        yaml_path: Path to the retrieval_queries.yaml file.

    Returns:
        List of query dicts, each with at least 'query' and
        'expected_tool_id' ('id' is optional).

    Raises:
        SystemExit: If the file is missing, cannot be parsed as YAML, lacks a
            top-level 'queries' list, or contains an entry that is not a
            mapping with the required fields.
    """
    if not yaml_path.exists():
        logger.error("Queries file not found: %s", yaml_path)
        sys.exit(2)

    try:
        with yaml_path.open(encoding="utf-8") as fh:
            data = yaml.safe_load(fh)
    except yaml.YAMLError as exc:
        # A syntactically broken file is "malformed" too; report it the same
        # way as a structural problem instead of leaking a traceback.
        logger.error("Could not parse YAML in %s: %s", yaml_path, exc)
        sys.exit(2)

    if not isinstance(data, dict) or "queries" not in data:
        logger.error("Invalid YAML structure in %s — expected top-level 'queries' key", yaml_path)
        sys.exit(2)

    queries = data["queries"]
    if not isinstance(queries, list):
        # e.g. ``queries:`` with no items parses to None, which is not iterable.
        logger.error("Invalid YAML structure in %s — 'queries' must be a list", yaml_path)
        sys.exit(2)

    for entry in queries:
        # The isinstance guard matters: on a plain string, ``"query" in entry``
        # would be a substring test and could wrongly accept the entry.
        if not isinstance(entry, dict) or "query" not in entry or "expected_tool_id" not in entry:
            logger.error("Query entry missing required fields: %r", entry)
            sys.exit(2)

    return queries
191
-
192
-
def _build_registry() -> tuple[object, object]:
    """Create a tool registry and executor and register the 4 seed adapters.

    Each adapter module is registered on its own through
    ``_try_register_adapter`` so that one failing module does not prevent the
    others from registering.  ``register_all_tools()`` is deliberately not
    used because it may fail when geocoding or composite modules are not yet
    implemented.

    Seed adapter modules, in registration order:
      - ummaya.tools.koroad.accident_hazard_search
      - ummaya.tools.kma.forecast_fetch
      - ummaya.tools.hira.hospital_search
      - ummaya.tools.nmc.emergency_search

    Returns:
        (registry, executor) tuple ready for search.
    """
    from ummaya.tools.executor import ToolExecutor
    from ummaya.tools.registry import ToolRegistry

    seed_modules = (
        "ummaya.tools.koroad.accident_hazard_search",
        "ummaya.tools.kma.forecast_fetch",
        "ummaya.tools.hira.hospital_search",
        "ummaya.tools.nmc.emergency_search",
    )

    # Construction and every registration run under the eager cold-start
    # overlay.  Per the original notes, each register() call rebuilds the
    # retriever and dense-load failures must surface synchronously at build
    # time; code outside this block keeps the ambient (lazy) default.
    with _eager_cold_start_overlay():
        registry = ToolRegistry()
        executor = ToolExecutor(registry)

        # Best-effort: failures are logged inside the helper, not raised.
        for module_path in seed_modules:
            _try_register_adapter(
                module_path,
                "register",
                registry,
                executor,
                requires_executor=True,
            )

    return registry, executor
254
-
255
-
def _try_register_adapter(
    module_path: str,
    fn_name: str,
    registry: object,
    executor: object,
    requires_executor: bool,
) -> None:
    """Import a module and invoke its registration function, best-effort.

    Any exception raised while importing, looking up the function, or calling
    it is logged as a warning and swallowed, so one broken adapter does not
    abort the caller.

    Args:
        module_path: Dotted module path to import.
        fn_name: Name of the registration function in the module.
        registry: ToolRegistry instance.
        executor: ToolExecutor instance.
        requires_executor: If True, call register(registry, executor),
            else call register(registry).
    """
    import importlib

    # Decide the positional arguments up front so the call site is uniform.
    call_args = (registry, executor) if requires_executor else (registry,)

    try:
        register_fn = getattr(importlib.import_module(module_path), fn_name)
        register_fn(*call_args)
        logger.info("Registered adapter from %s", module_path)
    except Exception as exc:
        logger.warning("Failed to register adapter from %s: %s", module_path, exc)
285
-
286
-
async def _run_query(
    query: str,
    registry: object,
    top_k: int = 5,
) -> list[str]:
    """Execute one search-mode lookup and return the ranked tool ids.

    Args:
        query: Natural-language query string.
        registry: Populated ToolRegistry.
        top_k: Maximum number of results to fetch.

    Returns:
        Ordered list of tool_id strings (rank 1 first).  Empty when the
        lookup result carries no ``candidates`` attribute.
    """
    from ummaya.tools.lookup import lookup
    from ummaya.tools.models import LookupSearchInput

    search_input = LookupSearchInput(mode="search", query=query, top_k=top_k)
    outcome = await lookup(search_input, registry=registry)

    # Only a search-style result exposes ``candidates``; anything else is
    # treated as "no matches".
    if not hasattr(outcome, "candidates"):
        return []
    return [candidate.tool_id for candidate in outcome.candidates]
312
-
313
-
def _compute_recall(
    ranked: list[str],
    expected: str,
    at_k: int,
) -> int:
    """Return 1 when *expected* is among the first *at_k* items of *ranked*, else 0."""
    top_slice = ranked[:at_k]
    return int(expected in top_slice)
321
-
322
-
def _build_warnings(
    registry: object,
    missing_adapters: list[str],
) -> list[str]:
    """Assemble the ``warnings`` entries for the JSON report.

    A single warning is produced when the registry holds fewer adapters than
    ``_EXPECTED_ADAPTER_COUNT``; otherwise the list is empty.

    Args:
        registry: The populated ToolRegistry.
        missing_adapters: Seed adapter IDs that were not found in the registry.

    Returns:
        List of warning strings.
    """
    registry_size = len(registry)  # type: ignore[arg-type]

    # Enough adapters registered: nothing to warn about.
    if registry_size >= _EXPECTED_ADAPTER_COUNT:
        return []

    inflated_recall_note = (
        f"Registry has {registry_size} adapter(s); expected {_EXPECTED_ADAPTER_COUNT}. "
        "recall@5 is artificially inflated for registered adapters and zero for "
        f"missing adapters: {missing_adapters}. "
        "Stage 3 will register the remaining adapters."
    )
    return [inflated_recall_note]
348
-
349
-
async def _evaluate(
    queries: list[dict[str, Any]],
    registry: object,
) -> dict[str, Any]:
    """Score every query against the registry and build the report dict.

    Each query is searched with ``top_k=5``; a hit at rank 1 and a hit within
    the top 5 are counted both overall and per expected tool id.

    Args:
        queries: Loaded query entries from the YAML file.
        registry: Populated ToolRegistry.

    Returns:
        Report dict with ``total_queries``, ``recall_at_1``, ``recall_at_5``
        (both rounded to 4 places), ``per_adapter``, ``registry_size``,
        ``warnings`` and an ISO-8601 UTC ``timestamp``.
    """
    total = len(queries)
    overall_top1 = 0
    overall_top5 = 0

    # Keyed by expected tool id: {"total": n, "hits_at_1": n, "hits_at_5": n}
    tallies: dict[str, dict[str, int]] = {}

    for entry in queries:
        text: str = entry["query"]
        target: str = entry["expected_tool_id"]

        ranked = await _run_query(text, registry, top_k=5)
        top1 = _compute_recall(ranked, target, at_k=1)
        top5 = _compute_recall(ranked, target, at_k=5)

        overall_top1 += top1
        overall_top5 += top5

        tally = tallies.setdefault(target, {"total": 0, "hits_at_1": 0, "hits_at_5": 0})
        tally["total"] += 1
        tally["hits_at_1"] += top1
        tally["hits_at_5"] += top5

        logger.debug(
            "Query %s (%r): expected=%s ranked=%s hit@1=%d hit@5=%d",
            entry.get("id", "?"),
            text[:40],
            target,
            ranked[:5],
            top1,
            top5,
        )

    # Guard the division so an empty query list reports 0.0 instead of raising.
    recall_at_1 = overall_top1 / total if total else 0.0
    recall_at_5 = overall_top5 / total if total else 0.0

    # Seed adapters that never made it into the registry.
    present_ids: set[str] = {tool.id for tool in registry.all_tools()}  # type: ignore[attr-defined]
    missing_adapters = sorted(_SEED_ADAPTER_IDS - present_ids)

    # Per-adapter recall, derived from the raw tallies.
    per_adapter_report: dict[str, dict[str, object]] = {
        tool_id: {
            "total_queries": tally["total"],
            "hits_at_1": tally["hits_at_1"],
            "hits_at_5": tally["hits_at_5"],
            "recall_at_1": tally["hits_at_1"] / tally["total"] if tally["total"] > 0 else 0.0,
            "recall_at_5": tally["hits_at_5"] / tally["total"] if tally["total"] > 0 else 0.0,
        }
        for tool_id, tally in tallies.items()
    }

    return {
        "total_queries": total,
        "recall_at_1": round(recall_at_1, 4),
        "recall_at_5": round(recall_at_5, 4),
        "per_adapter": per_adapter_report,
        "registry_size": len(registry),  # type: ignore[arg-type]
        "warnings": _build_warnings(registry, missing_adapters),
        "timestamp": datetime.now(UTC).isoformat(),
    }
427
-
428
-
def _write_report(report: dict[str, Any], output_path: Path) -> None:
    """Serialise *report* as indented UTF-8 JSON at *output_path*.

    Missing parent directories are created first.  Non-ASCII characters are
    written as-is rather than escaped.
    """
    target_dir = output_path.parent
    target_dir.mkdir(parents=True, exist_ok=True)

    with output_path.open("w", encoding="utf-8") as sink:
        json.dump(report, sink, indent=2, ensure_ascii=False)

    logger.info("Report written to %s", output_path)
435
-
436
-
def _exit_code(recall_at_5: float) -> int:
    """Map a recall@5 value onto the process exit code.

    Returns:
        0 — pass (>= 0.80)
        1 — warn ([0.60, 0.80))
        2 — fail (< 0.60)
    """
    # Thresholds are checked from highest to lowest; the first one met wins.
    for threshold, code in ((0.80, 0), (0.60, 1)):
        if recall_at_5 >= threshold:
            return code
    return 2
450
-
451
-
# Unique marker meaning "the caller did not pass report_path at all", which
# must be told apart from an explicit ``report_path=None`` (skip the write).
_REPORT_PATH_DEFAULT = object()


def run_extended_gate(
    *,
    backend: str | None = None,
    queries_path: Path | None = None,
    report_path: object = _REPORT_PATH_DEFAULT,
    registry: object | None = None,
) -> dict[str, Any]:
    """Run the retrieval gate with optional backend selection and SC status.

    On top of the baseline ``_evaluate()`` report this adds:
      - ``sc_01_status`` / ``sc_01_reason`` from ``_compute_sc01_status``.
      - ``sc_02_status``, currently always ``"PENDING_ADVERSARIAL_EVAL"``.

    The baseline fields (``total_queries``, ``recall_at_1``, ``recall_at_5``,
    ``per_adapter``, ``registry_size``, ``warnings``, ``timestamp``) come from
    ``_evaluate()`` unchanged, which is what the ``test_retrieval_gate.py``
    contract relies on.

    SC-01 PENDING_#22 conditions (FR-013, T032):
      - ``registry_size < 8`` — not enough adapters from Epic #22.
      - All tool_ids start with a seed prefix — no Phase-3 adapters detected.

    Args:
        backend: Retrieval backend to activate (``bm25``, ``dense``,
            ``hybrid``).  ``None`` leaves ``UMMAYA_RETRIEVAL_BACKEND`` as the
            environment has it.  Ignored when ``registry`` is supplied, since
            no registry is built in that case.
        queries_path: Path to the queries YAML file.  Defaults to
            ``eval/retrieval_queries.yaml`` four directory levels above this
            module.
        report_path: ``Path`` to write the JSON report to, or ``None`` to skip
            writing.  When omitted, ``.eval-artifacts/retrieval_extended.json``
            is used.
        registry: Pre-built registry to score (for testing).  When ``None``,
            one is built with ``_build_registry()``.

    Returns:
        Report dict with the baseline fields plus the SC-status fields.  The
        caller decides what exit code, if any, follows from ``sc_01_status``.

    Raises:
        TypeError: If ``report_path`` is neither a ``Path``, ``None``, nor
            omitted.
    """
    # Validate the report destination first so a wrong type (e.g. ``str``)
    # fails here, before any registry is built, not deep in the JSON writer.
    effective_report_path: Path | None
    if report_path is _REPORT_PATH_DEFAULT:
        effective_report_path = Path(".eval-artifacts/retrieval_extended.json")
    elif report_path is None or isinstance(report_path, Path):
        effective_report_path = report_path
    else:
        raise TypeError(
            f"report_path must be pathlib.Path, None, or omitted (got {type(report_path).__name__})"
        )

    if queries_path is None:
        queries_path = (
            Path(__file__).parent.parent.parent.parent / "eval" / "retrieval_queries.yaml"
        )

    # The backend overlay wraps registry construction only; per the original
    # notes the env var is read when the registry binds its retriever, so it
    # is not kept active during scoring.
    if registry is not None:
        built_registry = registry
    elif backend is None:
        built_registry, _ = _build_registry()
    else:
        with _backend_env_overlay(backend):
            built_registry, _ = _build_registry()

    queries = _load_queries(queries_path)
    report: dict[str, Any] = asyncio.run(_evaluate(queries, built_registry))

    # SC-01 depends on what is registered, not on which backend ran.
    status, reason = _compute_sc01_status(built_registry)
    report["sc_01_status"] = status
    report["sc_01_reason"] = reason

    # SC-02 is a placeholder until the adversarial evaluation exists.
    report["sc_02_status"] = "PENDING_ADVERSARIAL_EVAL"

    if effective_report_path is not None:
        _write_report(report, effective_report_path)

    return report
553
-
554
-
def main(argv: list[str] | None = None) -> None:
    """CLI entry point for retrieval evaluation.

    This function always terminates the process via ``sys.exit``.

    Usage (baseline, backward-compatible)::

        python -m ummaya.eval.retrieval eval/retrieval_queries.yaml

    Usage (extended gate with backend selection)::

        python -m ummaya.eval.retrieval \\
            --backend hybrid \\
            --queries eval/retrieval_queries.yaml \\
            --report .eval-artifacts/retrieval_extended.json

    Extended mode is selected when the FIRST argument starts with ``--``
    (any of ``--backend``, ``--queries``, ``--report``).  It runs
    ``run_extended_gate()`` and the exit code follows the extended scheme:
        0 — pass (recall@5 >= 0.80, sc_01 is not PENDING_#22)
        1 — warn (0.60 <= recall@5 < 0.80)
        2 — PENDING_#22 (registry too small / no Phase-3 adapters)
        2 — fail (recall@5 < 0.60)

    Otherwise the legacy positional-arg path runs: only the first argument is
    used (as the queries file) and the report is written to
    ``.eval-artifacts/retrieval.json``.  With no arguments at all, a usage
    error is logged and the process exits with code 2.

    Args:
        argv: Argument list (default: sys.argv[1:]).
    """
    import argparse

    # Logs go to stderr so stdout carries only the one-line summary.
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)s %(name)s: %(message)s",
        stream=sys.stderr,
    )

    raw_args = argv if argv is not None else sys.argv[1:]

    # Detect extended mode: a leading flag (first arg starts with "--")
    # triggers argparse.  The legacy positional mode (first arg is a yaml
    # path) still works so existing scripts / tests are not broken.
    if raw_args and raw_args[0].startswith("--"):
        parser = argparse.ArgumentParser(
            prog="ummaya.eval.retrieval",
            description="Retrieval quality evaluation harness (extended gate).",
        )
        parser.add_argument(
            "--backend",
            choices=["bm25", "dense", "hybrid"],
            default=None,
            help=(
                "Retrieval backend to activate. "
                "Defaults to honouring UMMAYA_RETRIEVAL_BACKEND "
                "(or bm25 when that env var is unset)."
            ),
        )
        parser.add_argument(
            "--queries",
            type=Path,
            default=None,
            help="Path to retrieval_queries.yaml (default: eval/retrieval_queries.yaml).",
        )
        parser.add_argument(
            "--report",
            type=Path,
            default=None,
            help=(
                "Path to write the JSON report (default: .eval-artifacts/retrieval_extended.json)."
            ),
        )
        parsed = parser.parse_args(raw_args)

        # Used for the log line only; the value forwarded to the gate below is
        # the raw ``parsed.backend`` (possibly ``None``).
        effective_backend = parsed.backend or os.environ.get("UMMAYA_RETRIEVAL_BACKEND", "bm25")
        logger.info("Running extended gate with backend=%s", effective_backend)
        # Only forward ``report_path`` when the operator actually passed
        # ``--report``. If we always forwarded ``parsed.report`` (which is
        # ``None`` when omitted), ``run_extended_gate`` would interpret that
        # as "skip writing" and silently drop the default artifact at
        # ``.eval-artifacts/retrieval_extended.json``.
        #
        # Likewise for ``--backend``: when the operator omits it, we forward
        # ``None`` so ``run_extended_gate`` honours the ambient
        # ``UMMAYA_RETRIEVAL_BACKEND`` env var rather than force-overlaying a
        # CLI default that would silently override an operator's export.
        gate_kwargs: dict[str, Any] = {
            "backend": parsed.backend,
            "queries_path": parsed.queries,
        }
        if parsed.report is not None:
            gate_kwargs["report_path"] = parsed.report
        report = run_extended_gate(**gate_kwargs)

        recall5 = report["recall_at_5"]
        recall1 = report["recall_at_1"]
        sc01 = report.get("sc_01_status", "EVALUATED")

        if report.get("warnings"):
            for w in report["warnings"]:
                logger.warning("WARN: %s", w)

        # PENDING_#22 takes precedence over the recall thresholds: the run
        # exits 2 regardless of the recall value.
        if sc01 == "PENDING_#22":
            reason = report.get("sc_01_reason", "")
            logger.warning("SC-01 PENDING_#22: %s", reason)
            print(  # noqa: T201
                f"[PENDING_#22] recall@5={recall5:.2%} recall@1={recall1:.2%} "
                f"total={report['total_queries']} registry={report['registry_size']} "
                f"sc_01={sc01}"
            )
            sys.exit(2)

        code = _exit_code(float(recall5))
        status = {0: "PASS", 1: "WARN", 2: "FAIL"}[code]
        print(  # noqa: T201
            f"[{status}] recall@5={recall5:.2%} recall@1={recall1:.2%} "
            f"total={report['total_queries']} registry={report['registry_size']} "
            f"sc_01={sc01}"
        )
        sys.exit(code)

    # Legacy positional mode — byte-identical to the pre-T031 behaviour.
    if not raw_args:
        logger.error("Usage: python -m ummaya.eval.retrieval <queries.yaml>")
        sys.exit(2)

    # Only the first positional argument is consulted; anything after it is
    # ignored in this mode.
    queries_path = Path(raw_args[0])
    output_path = Path(".eval-artifacts/retrieval.json")

    logger.info("Loading queries from %s", queries_path)
    queries = _load_queries(queries_path)
    logger.info("Loaded %d queries", len(queries))

    logger.info("Building tool registry...")
    registry, _ = _build_registry()
    logger.info("Registry size: %d adapters", len(registry))  # type: ignore[arg-type]

    logger.info("Running BM25 retrieval evaluation...")
    report = asyncio.run(_evaluate(queries, registry))

    _write_report(report, output_path)

    recall5 = report["recall_at_5"]
    recall1 = report["recall_at_1"]

    if report["warnings"]:
        for w in report["warnings"]:
            logger.warning("WARN: %s", w)

    code = _exit_code(float(recall5))
    status = {0: "PASS", 1: "WARN", 2: "FAIL"}[code]

    # Single-line stdout summary (only print() allowed per spec)
    print(  # noqa: T201
        f"[{status}] recall@5={recall5:.2%} recall@1={recall1:.2%} "
        f"total={report['total_queries']} registry={report['registry_size']}"
    )

    sys.exit(code)


# Run the CLI only when executed as a script / via ``python -m``.
if __name__ == "__main__":
    main()