minima-cli 0.4.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (161) hide show
  1. minima/__init__.py +5 -0
  2. minima/api/__init__.py +1 -0
  3. minima/api/auth.py +39 -0
  4. minima/api/errors.py +40 -0
  5. minima/api/routers/__init__.py +1 -0
  6. minima/api/routers/calibration.py +50 -0
  7. minima/api/routers/feedback.py +279 -0
  8. minima/api/routers/health.py +50 -0
  9. minima/api/routers/models.py +42 -0
  10. minima/api/routers/recommend.py +66 -0
  11. minima/api/routers/savings.py +55 -0
  12. minima/api/routers/strategies.py +33 -0
  13. minima/catalog/__init__.py +1 -0
  14. minima/catalog/data/capability_priors.json +210 -0
  15. minima/catalog/data/model_aliases.json +12 -0
  16. minima/catalog/merge.py +69 -0
  17. minima/catalog/refresh.py +54 -0
  18. minima/catalog/sources/__init__.py +1 -0
  19. minima/catalog/sources/litellm.py +19 -0
  20. minima/catalog/sources/openrouter.py +25 -0
  21. minima/catalog/store.py +86 -0
  22. minima/config.py +288 -0
  23. minima/deps.py +35 -0
  24. minima/llm/__init__.py +1 -0
  25. minima/llm/anthropic.py +106 -0
  26. minima/llm/base.py +196 -0
  27. minima/llm/gemini.py +124 -0
  28. minima/llm/registry.py +54 -0
  29. minima/logging.py +28 -0
  30. minima/main.py +109 -0
  31. minima/memory/__init__.py +1 -0
  32. minima/memory/adapter.py +572 -0
  33. minima/memory/keys.py +83 -0
  34. minima/memory/records.py +190 -0
  35. minima/memory/threadpool.py +41 -0
  36. minima/metrics/__init__.py +1 -0
  37. minima/metrics/calibration.py +415 -0
  38. minima/metrics/report.py +116 -0
  39. minima/metrics/savings.py +98 -0
  40. minima/recommender/__init__.py +1 -0
  41. minima/recommender/_pg_pool.py +38 -0
  42. minima/recommender/_redis_client.py +32 -0
  43. minima/recommender/aggregate.py +157 -0
  44. minima/recommender/classify.py +165 -0
  45. minima/recommender/decisionlog.py +505 -0
  46. minima/recommender/durablerefs.py +312 -0
  47. minima/recommender/engine.py +997 -0
  48. minima/recommender/escalation.py +83 -0
  49. minima/recommender/propensity.py +189 -0
  50. minima/recommender/recstore.py +368 -0
  51. minima/recommender/score.py +318 -0
  52. minima/recommender/types.py +166 -0
  53. minima/schemas/__init__.py +1 -0
  54. minima/schemas/common.py +73 -0
  55. minima/schemas/feedback.py +34 -0
  56. minima/schemas/models_catalog.py +36 -0
  57. minima/schemas/recommend.py +104 -0
  58. minima/schemas/savings.py +39 -0
  59. minima/schemas/strategies.py +57 -0
  60. minima/schemas/workflow.py +43 -0
  61. minima/seeding/__init__.py +1 -0
  62. minima/seeding/items.py +42 -0
  63. minima/seeding/llmrouterbench.py +232 -0
  64. minima/seeding/routerbench.py +141 -0
  65. minima/seeding/run_seed.py +56 -0
  66. minima/seeding/synthetic.py +70 -0
  67. minima/tenancy/__init__.py +8 -0
  68. minima/tenancy/context.py +37 -0
  69. minima/tenancy/passthrough.py +110 -0
  70. minima/version.py +3 -0
  71. minima_cli-0.4.9.dist-info/METADATA +275 -0
  72. minima_cli-0.4.9.dist-info/RECORD +161 -0
  73. minima_cli-0.4.9.dist-info/WHEEL +4 -0
  74. minima_cli-0.4.9.dist-info/entry_points.txt +5 -0
  75. minima_cli-0.4.9.dist-info/licenses/LICENSE +295 -0
  76. minima_client/__init__.py +19 -0
  77. minima_client/autocapture.py +101 -0
  78. minima_client/client.py +301 -0
  79. minima_client/errors.py +23 -0
  80. minima_harness/LICENSE_PI +32 -0
  81. minima_harness/__init__.py +16 -0
  82. minima_harness/agent/__init__.py +72 -0
  83. minima_harness/agent/agent.py +276 -0
  84. minima_harness/agent/events.py +124 -0
  85. minima_harness/agent/loop.py +311 -0
  86. minima_harness/agent/state.py +79 -0
  87. minima_harness/agent/tools.py +97 -0
  88. minima_harness/ai/__init__.py +66 -0
  89. minima_harness/ai/compat.py +71 -0
  90. minima_harness/ai/errors.py +96 -0
  91. minima_harness/ai/events.py +117 -0
  92. minima_harness/ai/openrouter_catalog.py +153 -0
  93. minima_harness/ai/provider_catalog.py +299 -0
  94. minima_harness/ai/provider_quirks.py +37 -0
  95. minima_harness/ai/providers/__init__.py +75 -0
  96. minima_harness/ai/providers/_common.py +48 -0
  97. minima_harness/ai/providers/anthropic.py +290 -0
  98. minima_harness/ai/providers/base.py +65 -0
  99. minima_harness/ai/providers/faux.py +173 -0
  100. minima_harness/ai/providers/google.py +221 -0
  101. minima_harness/ai/providers/openai_compat.py +278 -0
  102. minima_harness/ai/registry.py +184 -0
  103. minima_harness/ai/stream.py +82 -0
  104. minima_harness/ai/tools.py +51 -0
  105. minima_harness/ai/types.py +204 -0
  106. minima_harness/ai/usage.py +41 -0
  107. minima_harness/minima/__init__.py +40 -0
  108. minima_harness/minima/cache.py +102 -0
  109. minima_harness/minima/config.py +85 -0
  110. minima_harness/minima/goals.py +226 -0
  111. minima_harness/minima/judge.py +144 -0
  112. minima_harness/minima/mapping.py +147 -0
  113. minima_harness/minima/meter.py +143 -0
  114. minima_harness/minima/router.py +220 -0
  115. minima_harness/minima/runtime.py +544 -0
  116. minima_harness/minima/signals.py +195 -0
  117. minima_harness/session/__init__.py +14 -0
  118. minima_harness/session/format.py +35 -0
  119. minima_harness/session/store.py +236 -0
  120. minima_harness/tasks/__init__.py +17 -0
  121. minima_harness/tasks/task_set.py +78 -0
  122. minima_harness/tools/__init__.py +7 -0
  123. minima_harness/tools/_io.py +34 -0
  124. minima_harness/tools/bash.py +70 -0
  125. minima_harness/tools/builtin.py +23 -0
  126. minima_harness/tools/edit.py +50 -0
  127. minima_harness/tools/find.py +38 -0
  128. minima_harness/tools/grep.py +73 -0
  129. minima_harness/tools/ls.py +35 -0
  130. minima_harness/tools/read.py +38 -0
  131. minima_harness/tools/tasks.py +75 -0
  132. minima_harness/tools/write.py +36 -0
  133. minima_harness/tui/__init__.py +3 -0
  134. minima_harness/tui/analytics.py +111 -0
  135. minima_harness/tui/app.py +1927 -0
  136. minima_harness/tui/bridge.py +103 -0
  137. minima_harness/tui/cli.py +227 -0
  138. minima_harness/tui/clipboard.py +60 -0
  139. minima_harness/tui/commands.py +49 -0
  140. minima_harness/tui/compaction.py +17 -0
  141. minima_harness/tui/config_cli.py +141 -0
  142. minima_harness/tui/config_store.py +237 -0
  143. minima_harness/tui/context.py +93 -0
  144. minima_harness/tui/customize.py +95 -0
  145. minima_harness/tui/diff.py +53 -0
  146. minima_harness/tui/editor.py +43 -0
  147. minima_harness/tui/extensions.py +84 -0
  148. minima_harness/tui/extra_models.py +52 -0
  149. minima_harness/tui/history.py +71 -0
  150. minima_harness/tui/mubit.py +295 -0
  151. minima_harness/tui/overlays.py +593 -0
  152. minima_harness/tui/packages.py +59 -0
  153. minima_harness/tui/run_modes.py +66 -0
  154. minima_harness/tui/theme.py +77 -0
  155. minima_harness/tui/welcome.py +83 -0
  156. minima_harness/tui/widgets/__init__.py +3 -0
  157. minima_harness/tui/widgets/banner.py +38 -0
  158. minima_harness/tui/widgets/editor.py +83 -0
  159. minima_harness/tui/widgets/footer.py +73 -0
  160. minima_harness/tui/widgets/messages.py +151 -0
  161. minima_harness/tui/widgets/status.py +57 -0
@@ -0,0 +1,104 @@
1
+ """Schemas for the per-call recommendation endpoint."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pydantic import BaseModel, ConfigDict, Field
6
+
7
+ from minima.schemas.common import Constraints, DecisionBasis, Difficulty, TaskInput, TaskType
8
+
9
+
10
class RecommendRequest(BaseModel):
    # Request body for the per-call recommendation endpoint.

    # The task to get a recommendation for (shape defined in minima.schemas.common).
    task: TaskInput
    # Validated to the closed range [0, 10]; defaults to the midpoint.
    cost_quality_tradeoff: float = Field(
        5.0, ge=0, le=10, description="0 = cheapest acceptable, 10 = highest quality"
    )
    # default_factory builds a fresh Constraints instance per request.
    constraints: Constraints = Field(default_factory=Constraints)
    # Free-form label only; per its description it carries no auth meaning.
    user_id: str | None = Field(
        None, description="optional within-org actor label (NOT a tenant/auth boundary)"
    )
    namespace: str | None = Field(
        None,
        description=(
            "optional within-org sub-scope (team/project/env), namespaced under your org. "
            "The tenant boundary is your Minima API key -> your Mubit instance, not this field."
        ),
    )
    # Validated to 1..64; defaults to 8.
    max_candidates: int = Field(8, ge=1, le=64)
    # Both flags default to True; their effect is implemented outside this module.
    allow_llm_escalation: bool = True
    explain: bool = True
    baseline_model_id: str | None = Field(
        None,
        description=(
            "the model you would have used without Minima; powers the vs_declared_default "
            "savings baseline in GET /v1/savings"
        ),
    )
36
+
37
+
38
class EvidenceRef(BaseModel):
    """A recalled past outcome that informed a candidate's score."""

    # Empty protected_namespaces turns off pydantic's protected-namespace check;
    # this class declares a ``model_``-prefixed field (``model_id``).
    model_config = ConfigDict(protected_namespaces=())

    # Required (``...``): there is no default.
    entry_id: str = Field(..., description="QueryEvidence.id (used for outcome attribution)")
    reference_id: str | None = None
    model_id: str
    # Required; no range constraint is declared on the similarity score.
    score: float = Field(..., description="retrieval similarity")
    # Both values below are required and validated to [0, 1].
    knowledge_confidence: float = Field(..., ge=0, le=1)
    observed_success: float = Field(..., ge=0, le=1)
    is_stale: bool = False
50
+
51
+
52
class RankedModel(BaseModel):
    # One candidate model with its predicted success, cost estimate and score.

    # Empty protected_namespaces turns off pydantic's protected-namespace check;
    # this class declares a ``model_``-prefixed field (``model_id``).
    model_config = ConfigDict(protected_namespaces=())

    model_id: str
    provider: str
    # Required; validated to [0, 1].
    predicted_success: float = Field(..., ge=0, le=1)
    # Required; validated to be non-negative.
    est_cost_usd: float = Field(..., ge=0)
    # Named cost components; the key set is not constrained here.
    est_cost_breakdown: dict[str, float] = Field(default_factory=dict)
    score: float = Field(..., description="final objective score; sorting key")
    rationale: str = ""
    # Defaults to the ``prior`` member of DecisionBasis.
    decision_basis: DecisionBasis = DecisionBasis.prior
    evidence: list[EvidenceRef] = Field(default_factory=list)
    supports_prompt_caching: bool = False
    context_window: int = 0
    est_latency_ms: float | None = Field(
        None, description="observed latency percentile from similar past outcomes (ms)"
    )
    latency_basis: str = Field("", description='e.g. "observed_p75"; empty without evidence')
    # Optional cost band; each bound is validated non-negative when present.
    # NOTE(review): nothing here enforces est_cost_low <= est_cost_high.
    est_cost_low: float | None = Field(
        None, ge=0, description="low end of the data-grounded predictable cost band ($)"
    )
    est_cost_high: float | None = Field(
        None, ge=0, description="high end of the data-grounded predictable cost band ($)"
    )
    cost_band_basis: str = Field(
        "", description='e.g. "observed_p25_p75" | "rescaled_p25_p75"; empty without a band'
    )
    # Validated to [0, 1]; defaults to 0.0.
    success_interval_width: float = Field(
        0.0, ge=0, le=1, description="95% credible-interval width of predicted_success"
    )
82
+
83
+
84
class RecommendResponse(BaseModel):
    # Response body for the per-call recommendation endpoint.

    recommendation_id: str
    # The chosen model (required).
    recommended_model: RankedModel
    # Candidate list; defaults to empty.
    ranked: list[RankedModel] = Field(default_factory=list)
    fallback_model: RankedModel | None = None
    # Required; validated to [0, 1].
    confidence: float = Field(..., ge=0, le=1)
    decision_basis: DecisionBasis
    threshold_used: float
    # Task type and difficulty as classified for this request.
    classified_task_type: TaskType
    classified_difficulty: Difficulty
    catalog_version: str
    catalog_stale: bool = False
    latency_ms: int = 0
    warnings: list[str] = Field(default_factory=list)
    selection_policy: str = Field(
        "argmin", description='"argmin" | "epsilon_softmax" (per-org opt-in exploration)'
    )
    recommended_actions: list[str] = Field(
        default_factory=list,
        description="near-free cost-saving actions to apply (e.g. enable_prompt_cache)",
    )
@@ -0,0 +1,39 @@
1
+ """Schemas for the savings and calibration reporting endpoints.
2
+
3
+ The payload bodies reuse the metrics dataclasses directly (pydantic v2 validates and
4
+ serializes stdlib dataclasses), so the report shape has exactly one definition.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from pydantic import BaseModel, Field
10
+
11
+ from minima.metrics.calibration import CalibrationReport, CusumFlag
12
+ from minima.metrics.savings import SavingsSummary
13
+
14
+
15
class SavingsGroup(BaseModel):
    # One grouped slice of a savings report: a group key plus its own summary.
    key: str
    # Reuses the metrics dataclass directly, as the module docstring explains.
    summary: SavingsSummary
    # Numeric health values keyed by name; defaults to an empty dict.
    health: dict[str, float | int] = Field(default_factory=dict)
19
+
20
+
21
class SavingsResponse(BaseModel):
    # Response body for the savings reporting endpoint.
    org_id: str
    # NOTE(review): ``since`` looks like a start timestamp and ``days`` a window
    # length, but neither unit is stated here; confirm against the router.
    since: float
    days: float
    namespace: str | None = None
    # Overall summary for the request.
    summary: SavingsSummary
    health: dict[str, float | int] = Field(default_factory=dict)
    # Optional grouping: the group_by label and the per-group entries (empty by default).
    group_by: str | None = None
    groups: list[SavingsGroup] = Field(default_factory=list)
30
+
31
+
32
class CalibrationResponse(BaseModel):
    # Response body for the calibration reporting endpoint.
    org_id: str
    # NOTE(review): ``since`` looks like a start timestamp and ``days`` a window
    # length, but neither unit is stated here; confirm against the router.
    since: float
    days: float
    namespace: str | None = None
    health: dict[str, float | int] = Field(default_factory=dict)
    # Both lists reuse the metrics dataclasses from minima.metrics.calibration.
    reports: list[CalibrationReport] = Field(default_factory=list)
    drift_flags: list[CusumFlag] = Field(default_factory=list)
@@ -0,0 +1,57 @@
1
+ """Schemas for the surfaced-strategies endpoint.
2
+
3
+ ``reflect()`` promotes accumulated outcomes/lessons into validated rules; Mubit's
4
+ ``surface_strategies`` clusters those lessons into ``EmergentStrategy`` summaries.
5
+ This endpoint exposes them so callers can see *why* a namespace routes the way it does.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from collections.abc import Mapping
11
+ from typing import Any
12
+
13
+ from pydantic import BaseModel, Field
14
+
15
+
16
def _g(d: Mapping[str, Any], *keys: str, default: Any = None) -> Any:
    """Look up the first of ``keys`` that is present in ``d`` with a non-``None`` value.

    Keys are tried in the order given, so callers list the snake_case and
    camelCase spellings of one field. A key whose value is ``None`` counts as
    missing. ``default`` is returned when no key qualifies.
    """
    hits = (d[name] for name in keys if name in d and d[name] is not None)
    return next(hits, default)
22
+
23
+
24
class Strategy(BaseModel):
    # One surfaced strategy summary. Every field has a default, so an empty
    # input dict still produces a valid instance.
    strategy_id: str = ""
    description: str = ""
    supporting_lesson_count: int = 0
    avg_confidence: float = 0.0
    avg_reinforcement: float = 0.0
    dominant_lesson_type: str = ""
    dominant_scope: str = ""
    lesson_ids: list[str] = Field(default_factory=list)

    @classmethod
    def from_emergent(cls, d: Mapping[str, Any]) -> Strategy:
        """Parse a Mubit ``EmergentStrategy`` dict (snake_case or camelCase).

        Each field is read through ``_g`` with its snake_case key first and its
        camelCase key second, then coerced with ``str``/``int``/``float``.
        Missing or ``None`` values fall back to the same defaults the model
        declares. A value that cannot be coerced (e.g. a non-numeric count)
        raises from the conversion call.
        """
        return cls(
            strategy_id=str(_g(d, "strategy_id", "strategyId", default="")),
            description=str(_g(d, "description", default="")),
            supporting_lesson_count=int(
                _g(d, "supporting_lesson_count", "supportingLessonCount", default=0)
            ),
            avg_confidence=float(_g(d, "avg_confidence", "avgConfidence", default=0.0)),
            avg_reinforcement=float(_g(d, "avg_reinforcement", "avgReinforcement", default=0.0)),
            dominant_lesson_type=str(
                _g(d, "dominant_lesson_type", "dominantLessonType", default="")
            ),
            dominant_scope=str(_g(d, "dominant_scope", "dominantScope", default="")),
            # ``or []`` also covers a present-but-falsy value before iterating.
            lesson_ids=[str(x) for x in (_g(d, "lesson_ids", "lessonIds", default=[]) or [])],
        )
51
+
52
+
53
class StrategiesResponse(BaseModel):
    # Response body for the surfaced-strategies endpoint.
    namespace: str | None = None
    # Required; its meaning is defined by the caller, not in this module.
    lane: str
    strategies: list[Strategy] = Field(default_factory=list)
    # NOTE(review): presumably len(strategies), but nothing here derives it; the
    # caller must set it.
    count: int = 0
@@ -0,0 +1,43 @@
1
+ """Schemas for the multi-step workflow recommendation endpoint."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pydantic import BaseModel, Field
6
+
7
+ from minima.schemas.common import Constraints, TaskInput
8
+ from minima.schemas.recommend import RecommendResponse
9
+
10
+
11
class WorkflowStep(BaseModel):
    # One step of a multi-step workflow request.

    # Required and non-empty (min_length=1).
    step_id: str = Field(..., min_length=1)
    task: TaskInput
    constraints: Constraints | None = Field(
        None, description="per-step override; merged over global"
    )
    # NOTE(review): presumably step_ids of earlier steps; no validation here
    # checks that they exist or are acyclic.
    depends_on: list[str] = Field(default_factory=list)
18
+
19
+
20
class WorkflowRequest(BaseModel):
    # Request body for the multi-step workflow recommendation endpoint.

    # At least one step is required (min_length=1).
    steps: list[WorkflowStep] = Field(..., min_length=1)
    # Validated to [0, 10]; defaults to the midpoint.
    cost_quality_tradeoff: float = Field(5.0, ge=0, le=10)
    # Request-wide constraints; a step may carry its own override.
    constraints: Constraints = Field(default_factory=Constraints)
    user_id: str | None = Field(
        None, description="optional within-org actor label (NOT a tenant/auth boundary)"
    )
    namespace: str | None = Field(
        None, description="optional within-org sub-scope; tenant boundary is the Minima key"
    )
    allow_llm_escalation: bool = True
31
+
32
+
33
class StepRecommendation(BaseModel):
    # Pairs a step id with the full per-call recommendation produced for it.
    step_id: str
    recommendation: RecommendResponse
36
+
37
+
38
class WorkflowResponse(BaseModel):
    # Response body for the multi-step workflow recommendation endpoint.
    workflow_recommendation_id: str
    steps: list[StepRecommendation]
    # Both totals are required and validated non-negative.
    total_est_cost_usd: float = Field(..., ge=0)
    total_est_cost_if_all_premium: float = Field(..., ge=0)
    # Required; validated to [0, 1].
    confidence: float = Field(..., ge=0, le=1)
@@ -0,0 +1 @@
1
+ """Cold-start seeding of Mubit memory (offline import)."""
@@ -0,0 +1,42 @@
1
+ """Build Mubit batch_insert items from outcome records.
2
+
3
+ ``metadata_json`` must be a JSON *string* on the batch_insert wire (unlike remember(),
4
+ which JSON-encodes a dict for you). ``embedding: []`` lets the server embed on ingest.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import json
10
+ import time
11
+ from collections.abc import Sequence
12
+ from dataclasses import dataclass
13
+
14
+ from minima.memory.records import OutcomeRecord
15
+
16
+
17
@dataclass(slots=True)
class SeedItem:
    """One seed entry: an id, its text content, its outcome record and env tags.

    ``build_item`` turns an instance into a batch_insert item dict.
    """

    item_id: str
    content: str  # emitted as the item's "text"
    record: OutcomeRecord  # serialized through ``to_metadata()`` by build_item
    env_tags: list[str]
23
+
24
+
25
def build_item(seed: SeedItem, source: str = "system") -> dict:
    """Convert a seed into one batch_insert item dict.

    Side effect: when the seed's record has no ``recorded_at``, it is set on
    the record in place to the current time before serialization.

    Args:
        seed: The seed entry to convert.
        source: Value stored under the item's ``"source"`` key.

    Returns:
        A dict with keys ``item_id``, ``text``, ``metadata_json`` (a JSON
        string), ``source``, ``embedding`` (always an empty list) and
        ``env_tags`` (a copy of the seed's tags).
    """
    record = seed.record
    # Seeds carry the time they were seeded rather than the dataset's own date,
    # so the explicit seed-vs-live weighting (source_dataset) governs them
    # instead of a benchmark's publication age.
    if record.recorded_at is None:
        record.recorded_at = time.time()
    metadata_json = json.dumps(record.to_metadata())
    return dict(
        item_id=seed.item_id,
        text=seed.content,
        metadata_json=metadata_json,
        source=source,
        embedding=[],
        env_tags=list(seed.env_tags),
    )
39
+
40
+
41
def chunked(items: Sequence[dict], size: int) -> list[list[dict]]:
    """Split ``items`` into consecutive lists of at most ``size`` elements.

    Order is preserved and the final chunk may be shorter. An empty input
    yields an empty list.

    Raises:
        ValueError: If ``size`` is less than 1. A zero step already made
            ``range`` raise ValueError; a negative step produced an empty
            range and silently dropped every item, so it is rejected too.
    """
    if size < 1:
        raise ValueError(f"size must be a positive integer, got {size!r}")
    return [list(items[start : start + size]) for start in range(0, len(items), size)]
@@ -0,0 +1,232 @@
1
+ """Fetch + raw-load LLMRouterBench / OpenRouterBench (offline benchmark).
2
+
3
+ LLMRouterBench (HF dataset ``NPULH/LLMRouterBench``; code repo ``ynulihao/LLMRouterBench``,
4
+ Findings@ACL 2026) ships as ONE gzipped tarball ``bench-release.tar.gz`` (~1.28 GB) whose
5
+ members are::
6
+
7
+ bench-release/<dataset>/<model_name>/<dataset>-<model_name>-<YYYYMMDD_HHMMSS>.json
8
+
9
+ Each JSON is one ``(dataset, split, model)`` run: top-level aggregate fields
10
+ (``dataset_name``, ``split``, ``model_name``, ``demo``, ``data_fingerprint`` …) plus a
11
+ ``records`` list. Each record carries::
12
+
13
+ index, origin_query, prompt, prompt_tokens, completion_tokens,
14
+ cost, score, prediction, ground_truth, raw_output
15
+
16
+ - ``score`` is 0.0/1.0 correctness (a few datasets use a graded [0,1] judge score).
17
+ - ``cost`` is the *real* per-call USD cost — BUT open models run locally report ``0.0``
18
+ (handle this when building the cost axis; see Phase 2 of the plan and guard V2 — the
19
+ router must *decide* on independent market prices, not on this column it is *scored* on).
20
+
21
+ We stream records straight out of the tarball (no multi-GB extraction) and drop the bulky
22
+ ``raw_output`` by default. This module is **Phase 1** of
23
+ ``docs/PLAN/LLMRouterBench-H1-setup.md`` — fetch + raw load only. The wide-DataFrame pivot
24
+ the eval harness consumes (``<model>`` score + ``<model>|total_cost`` columns) is **Phase 3**
25
+ and builds on :func:`iter_raw_records`.
26
+
27
+ Requires the ``seed`` extra (``huggingface-hub``/``pandas``); raises a clear error otherwise.
28
+ """
29
+
30
+ from __future__ import annotations
31
+
32
+ import json
33
+ import re
34
+ import tarfile
35
+ from collections import defaultdict
36
+ from collections.abc import Iterator, Sequence
37
+ from typing import Any
38
+
39
# Hugging Face dataset repo and the single tarball it ships (see module docstring).
_REPO_ID = "NPULH/LLMRouterBench"
_TARBALL = "bench-release.tar.gz"
# First path segment of every member inside the tarball.
_ROOT = "bench-release"
# Captures the trailing ``YYYYMMDD_HHMMSS`` run timestamp of a member file name.
_TS_RE = re.compile(r"(\d{8}_\d{6})\.json$")
# Cost-column suffix the eval harness keys on. MUST match
# ``minima.seeding.routerbench.detect_model_columns`` so the wide frame this module emits is
# consumed by the same code path as RouterBench.
_COST_SUFFIX = "|total_cost"

# The per-record fields we keep. We drop the two bulky ones the eval never needs:
# ``raw_output`` and ``prediction`` (the latter is a full role/content conversation in the
# real data, despite schema.py declaring it ``str`` — confirmed by inspecting the bytes).
_RECORD_FIELDS = (
    "index", "origin_query", "prompt", "prompt_tokens",
    "completion_tokens", "cost", "score", "ground_truth",
)
55
+
56
+
57
def _require_seed_extra():
    """Fail fast with an install hint when ``huggingface_hub`` cannot be imported.

    Raises:
        RuntimeError: If importing ``hf_hub_download`` fails; the original
            ImportError is chained as the cause.
    """
    hint = "LLMRouterBench needs the 'seed' extra: `uv sync --extra seed`."
    try:
        from huggingface_hub import hf_hub_download  # noqa: F401
    except ImportError as missing:  # pragma: no cover
        raise RuntimeError(hint) from missing
64
+
65
+
66
def download_tarball() -> str:
    """Fetch ``bench-release.tar.gz`` through the Hugging Face cache and return its path.

    Repeat calls are cheap: ``hf_hub_download`` hands back the already-cached
    file instead of downloading again. The ~1.28 GB file is stored in the HF
    cache, not in the repo.

    Raises:
        RuntimeError: If the ``seed`` extra is not installed.
    """
    _require_seed_extra()
    from huggingface_hub import hf_hub_download

    local_path = hf_hub_download(repo_id=_REPO_ID, filename=_TARBALL, repo_type="dataset")
    return local_path
76
+
77
+
78
def _split_member(name: str) -> tuple[str, str, str, str] | None:
    """Break a tarball member path into ``(dataset, subset, model, timestamp)``.

    The release contains two layouts::

        bench-release/<dataset>/<model>/<…ts>.json            (4 segments, subset="")
        bench-release/<dataset>/<subset>/<model>/<…ts>.json   (5 segments)

    The dataset is always the segment right after the root and the model is
    always the directory holding the file; whatever lies between them (e.g.
    ``valid``, ``subset_500``) is joined with ``/`` as the subset. Reading a
    fixed third segment as the model would mistake the subset for the model on
    the longer layout.

    Returns:
        The 4-tuple, with ``timestamp`` set to ``""`` when the file name has no
        ``YYYYMMDD_HHMMSS`` suffix, or ``None`` when the path has fewer than
        four segments, is not under the release root, or is not a ``.json``
        file.
    """
    segments = name.split("/")
    if len(segments) < 4:
        return None
    root, dataset, *between, model, filename = segments
    if root != _ROOT or not filename.endswith(".json"):
        return None
    stamp = _TS_RE.search(filename)
    timestamp = stamp.group(1) if stamp else ""
    return dataset, "/".join(between), model, timestamp
99
+
100
+
101
def list_release_contents(tarball_path: str | None = None) -> dict[str, Any]:
    """Inventory the tarball WITHOUT parsing record bodies (one decompression pass).

    Args:
        tarball_path: Local path to the release tarball. When omitted (or
            empty) the tarball is fetched via :func:`download_tarball`.

    Returns:
        A dict with these keys:

        - ``"datasets"``: sorted dataset names.
        - ``"models"``: sorted model names.
        - ``"latest"``: ``{(dataset, subset, model): member name}``, keeping only
          the newest-timestamp file per key so duplicate re-runs are
          de-duplicated, mirroring the authors' ``BaselineDataLoader``
          behaviour. ``subset`` is ``""`` for members without a subset directory.
        - ``"models_by_dataset"``: ``{dataset: sorted model names}``.
        - ``"file_count"``: number of recognised ``.json`` members, counting
          duplicates.
    """
    path = tarball_path or download_tarball()
    latest: dict[tuple[str, str, str], tuple[str, str]] = {}  # (ds, subset, model) -> (ts, name)
    file_count = 0
    # "r|gz" reads the archive as a forward-only stream: one decompression pass.
    with tarfile.open(path, mode="r|gz") as tf:
        for m in tf:
            if not m.isfile():
                continue
            parsed = _split_member(m.name)
            if parsed is None:
                continue
            ds, subset, model, ts = parsed
            file_count += 1
            key = (ds, subset, model)
            # Timestamps are fixed-width YYYYMMDD_HHMMSS strings, so plain string
            # comparison orders them chronologically.
            if key not in latest or ts > latest[key][0]:
                latest[key] = (ts, m.name)
    datasets = sorted({k[0] for k in latest})
    models = sorted({k[2] for k in latest})
    by_dataset: dict[str, set[str]] = defaultdict(set)
    for ds, _subset, model in latest:
        by_dataset[ds].add(model)
    return {
        "datasets": datasets,
        "models": models,
        "latest": {k: v[1] for k, v in latest.items()},
        "models_by_dataset": {k: sorted(v) for k, v in by_dataset.items()},
        "file_count": file_count,
    }
136
+
137
+
138
def iter_raw_records(
    tarball_path: str | None = None,
    *,
    datasets: set[str] | None = None,
    models: set[str] | None = None,
    limit: int | None = None,
    skip_demo: bool = True,
) -> Iterator[dict[str, Any]]:
    """Stream flattened per-(prompt, model) records out of the tarball.

    Each yielded dict has the keys ``dataset_id``, ``split``, ``model_name`` plus
    :data:`_RECORD_FIELDS` (the bulky ``raw_output`` and ``prediction`` are dropped). A
    field missing from a record is yielded as ``None``.

    Args:
        tarball_path: Local path to the release tarball. When omitted the tarball is
            fetched via :func:`download_tarball`, which needs the ``seed`` extra. A local
            path needs only the standard library.
        datasets: If given, keep only members whose dataset directory is in this set.
        models: If given, keep only members whose model directory is in this set. Both
            filters match on the member path, so unwanted files are skipped before JSON
            parsing.
        limit: Maximum number of records to yield. ``None`` means no cap; zero or a
            negative value yields nothing.
        skip_demo: Skip files whose top-level ``demo`` flag is truthy.

    NOTE: this reads every ``.json`` member; for a pristine release there is exactly one
    file per ``(dataset, model)`` so no de-duplication is needed. If a release ever ships
    duplicate timestamped re-runs, restrict to :func:`list_release_contents`'s ``latest``
    set first.
    """
    # Checked before any work: the cap used to be tested only after a yield, so
    # limit=0 still produced one record (and opened or downloaded the tarball).
    if limit is not None and limit <= 0:
        return
    path = tarball_path or download_tarball()
    emitted = 0
    # "r|gz" reads the archive as a forward-only stream (no multi-GB extraction), so
    # each member must be consumed while it is the current one.
    with tarfile.open(path, mode="r|gz") as tf:
        for m in tf:
            if not m.isfile():
                continue
            parsed = _split_member(m.name)
            if parsed is None:
                continue
            ds, _subset, model, _ts = parsed
            if datasets is not None and ds not in datasets:
                continue
            if models is not None and model not in models:
                continue
            fh = tf.extractfile(m)
            if fh is None:
                continue
            data = json.load(fh)
            if skip_demo and data.get("demo", False):
                continue
            # Prefer the names recorded in the file; fall back to the path segments.
            dataset_id = data.get("dataset_name", ds)
            split = data.get("split", "")
            model_name = data.get("model_name", model)
            for rec in data.get("records", []):
                row = {"dataset_id": dataset_id, "split": split, "model_name": model_name}
                for f in _RECORD_FIELDS:
                    row[f] = rec.get(f)
                yield row
                emitted += 1
                if limit is not None and emitted >= limit:
                    return
189
+
190
+
191
def load_llmrouterbench_df(
    candidates: Sequence[str],
    datasets: Sequence[str],
    tarball_path: str | None = None,
):
    """Pivot the long per-(prompt, model) records into the WIDE DataFrame the eval consumes.

    This is Phase 3 of ``docs/PLAN/LLMRouterBench-H1-setup.md``. Output: one row per
    ``(dataset, question index)``, with columns ``prompt``, ``eval_name`` (= dataset id), and
    for each candidate model ``m`` a score column ``m`` and a cost column ``m|total_cost`` —
    i.e. exactly the contract ``routerbench.detect_model_columns`` / ``harness.prepare_rows``
    expect, so the eval reuses all its machinery unchanged.

    Keyed by ``(dataset_id, index)`` — the question's identity — rather than prompt text, so it
    is robust to repeated/boilerplate prompt strings within a dataset. Questions where any
    candidate lacks a usable ``(score, cost)`` are dropped (the harness would drop them anyway).

    Args:
        candidates: Model names to include; each becomes a score and a cost column.
        datasets: Dataset names to include.
        tarball_path: Local path to the release tarball; downloaded when omitted.

    Returns:
        A ``pandas.DataFrame`` with one row per fully-covered question.

    Raises:
        RuntimeError: If the ``seed`` extra (``huggingface-hub``/``pandas``) is missing.
    """
    _require_seed_extra()
    # pandas belongs to the same extra; report its absence with the same clear error
    # instead of a bare ImportError.
    try:
        import pandas as pd
    except ImportError as exc:  # pragma: no cover
        raise RuntimeError(
            "LLMRouterBench needs the 'seed' extra: `uv sync --extra seed`."
        ) from exc

    cand = list(candidates)
    want_ds = set(datasets)
    # (dataset_id, index) -> {"prompt", "scores": {model: score}, "costs": {model: cost}}
    bucket: dict[tuple[str, Any], dict[str, Any]] = {}
    for r in iter_raw_records(tarball_path, datasets=want_ds, models=set(cand)):
        key = (r["dataset_id"], r["index"])
        slot = bucket.get(key)
        if slot is None:
            # The prompt of the first record seen for a question is the one kept.
            slot = {"prompt": r["prompt"], "scores": {}, "costs": {}}
            bucket[key] = slot
        slot["scores"][r["model_name"]] = r["score"]
        slot["costs"][r["model_name"]] = r["cost"]

    rows: list[dict[str, Any]] = []
    for (dataset_id, _idx), slot in bucket.items():
        # Keep a question only when every candidate has both a score and a cost.
        if any(slot["scores"].get(m) is None or slot["costs"].get(m) is None for m in cand):
            continue
        row: dict[str, Any] = {"prompt": slot["prompt"], "eval_name": dataset_id}
        for m in cand:
            row[m] = slot["scores"][m]
            row[f"{m}{_COST_SUFFIX}"] = slot["costs"][m]
        rows.append(row)
    return pd.DataFrame(rows)
@@ -0,0 +1,141 @@
1
+ """Load RouterBench (offline benchmark) into Minima outcome records.
2
+
3
+ RouterBench (``withmartian/routerbench``) ships as pickled pandas frames
4
+ (``routerbench_0shot.pkl``), not a ``datasets``-loadable layout — so it is fetched with
5
+ ``hf_hub_download`` + ``pandas.read_pickle``. Each row is a prompt; per model there is a
6
+ ``<model>`` correctness column (0/1) and a ``<model>|total_cost`` column (real USD for
7
+ that call). We emit one outcome record per (prompt, model). Requires the ``seed`` extra
8
+ (``datasets``/``huggingface-hub``/``pandas``); raises a clear error otherwise.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ from typing import Any
14
+
15
+ from minima.memory.keys import build_content, task_cluster, task_fingerprint
16
+ from minima.memory.records import OutcomeRecord
17
+ from minima.seeding.items import SeedItem
18
+
19
# Hugging Face dataset repo and the pickle file published for each split.
_REPO_ID = "withmartian/routerbench"
_SPLIT_FILES = {"0shot": "routerbench_0shot.pkl", "5shot": "routerbench_5shot.pkl"}
# A model's cost column is its score column name plus this suffix.
_COST_SUFFIX = "|total_cost"

# Substring -> task type. ``_task_type_for`` returns the value of the FIRST key
# found inside the lower-cased eval name, so key order matters when several
# keys could match the same name.
_EVAL_TO_TASK_TYPE = {
    "mbpp": "code",
    "humaneval": "code",
    "code-llama": "code",
    "gsm8k": "reasoning",
    "grade-school-math": "reasoning",
    "math": "reasoning",
    "mmlu": "qa",
    "arc": "qa",
    "hellaswag": "reasoning",
    "winogrande": "reasoning",
    "rag": "rag",
    "mt-bench": "other",
    "mtbench": "other",
}
38
+
39
+
40
def _reverse_aliases(aliases: dict[str, list[str]]) -> dict[str, str]:
    """Invert ``{canonical: [alias, ...]}`` into ``{alias: canonical}``.

    When one alias is listed under several canonical ids, the id that comes
    last in ``aliases`` wins.
    """
    return {
        alias: canonical
        for canonical, alias_list in aliases.items()
        for alias in alias_list
    }
46
+
47
+
48
def _task_type_for(eval_name: str) -> str:
    """Map a benchmark eval name to a task type by substring match.

    The name is lower-cased (a falsy name is treated as ``""``) and the keys of
    ``_EVAL_TO_TASK_TYPE`` are tried in their declared order; the first key
    contained in the name decides the result. Names matching no key map to
    ``"other"``.
    """
    lowered = (eval_name or "").lower()
    matches = (kind for marker, kind in _EVAL_TO_TASK_TYPE.items() if marker in lowered)
    return next(matches, "other")
54
+
55
+
56
def detect_model_columns(columns: list[str]) -> dict[str, str]:
    """Return {score_column: cost_column} for every model with a cost sibling.

    A cost column is any string column ending in the cost suffix; its score
    column is the same name with the suffix removed. Cost columns without a
    matching score column are ignored, as are non-string column labels.

    Pairs are returned in the order their cost columns appear in ``columns``,
    so callers that iterate the result behave the same on every run. Iterating
    a set of strings instead would follow the per-process string hash seed.
    """
    known = set(columns)  # O(1) membership instead of scanning the list per column
    pairs: dict[str, str] = {}
    for cost_col in columns:
        if not (isinstance(cost_col, str) and cost_col.endswith(_COST_SUFFIX)):
            continue
        score_col = cost_col[: -len(_COST_SUFFIX)]
        if score_col in known:
            pairs[score_col] = cost_col
    return pairs
65
+
66
+
67
def load_routerbench_df(split: str = "0shot") -> Any:
    """Fetch the RouterBench pickle for ``split`` and load it as a pandas DataFrame.

    Args:
        split: ``"0shot"`` or ``"5shot"``. Any other value falls back to the
            0-shot file rather than raising.

    Raises:
        RuntimeError: If ``pandas`` or ``huggingface_hub`` cannot be imported.
    """
    try:
        import pandas as pd
        from huggingface_hub import hf_hub_download
    except ImportError as missing:  # pragma: no cover
        raise RuntimeError(
            "RouterBench needs the 'seed' extra: `uv sync --extra seed`. "
            "For a network-free smoke test use `--dataset synthetic`."
        ) from missing

    default_file = _SPLIT_FILES["0shot"]
    chosen_file = _SPLIT_FILES.get(split, default_file)
    local_path = hf_hub_download(repo_id=_REPO_ID, filename=chosen_file, repo_type="dataset")
    # NOTE(review): unpickling executes whatever the file encodes, so this trusts
    # the upstream dataset repo; run it only against a source you trust.
    return pd.read_pickle(local_path)
81
+
82
+
83
def load_records(limit: int, aliases: dict[str, list[str]], split: str = "0shot") -> list[SeedItem]:
    """Turn RouterBench rows into seed items, one per (prompt, model) pair.

    Args:
        limit: Maximum number of items to return. The cap is enforced per item,
            so the result never exceeds it; the last prompt may therefore be
            covered by only some of the models.
        aliases: ``{canonical model id: [dataset column names]}``; a score column
            listed as an alias is recorded under its canonical id, otherwise
            under the column name itself.
        split: RouterBench split passed to :func:`load_routerbench_df`.

    Returns:
        Up to ``limit`` seed items, in row order.

    Raises:
        RuntimeError: If no ``<model>`` / ``<model>|total_cost`` column pair is
            found, or if the ``seed`` extra is missing.
    """
    df = load_routerbench_df(split)
    columns = list(df.columns)
    model_columns = detect_model_columns(columns)
    if not model_columns:
        raise RuntimeError(
            f"could not detect RouterBench model columns in {columns[:10]}...; "
            "the dataset schema may have changed."
        )

    reverse = _reverse_aliases(aliases)
    # Fall back to the first column when the frame has no "prompt" column.
    prompt_col = "prompt" if "prompt" in columns else columns[0]
    out: list[SeedItem] = []

    for row_index, row in enumerate(df.itertuples(index=False)):
        if len(out) >= limit:
            break
        rowd = dict(zip(columns, row, strict=False))
        prompt = str(rowd.get(prompt_col, "")).strip()
        if not prompt:
            continue
        task_type = _task_type_for(str(rowd.get("eval_name", "")))
        # The dataset carries no difficulty label, so every row is seeded as "medium".
        difficulty = "medium"
        fingerprint = task_fingerprint(prompt)
        cluster = task_cluster(task_type, difficulty)
        content = build_content(task_type, difficulty, prompt)

        for score_col, cost_col in model_columns.items():
            # Re-check inside the row: one row adds an item per model, so checking
            # only between rows let the result overshoot ``limit``.
            if len(out) >= limit:
                break
            quality = _to_float(rowd.get(score_col))
            if quality is None:
                continue
            model_id = reverse.get(score_col, score_col)
            record = OutcomeRecord(
                model_id=model_id,
                task_type=task_type,
                difficulty=difficulty,
                task_fingerprint=fingerprint,
                task_cluster=cluster,
                cost_usd=_to_float(rowd.get(cost_col)) or 0.0,
                quality_score=max(0.0, min(1.0, quality)),
                outcome="success" if quality >= 0.5 else "failure",
                source_dataset="routerbench",
            )
            out.append(
                SeedItem(
                    # The id uses the dataset's own column name, not the canonical alias.
                    item_id=f"rb-{row_index}-{score_col}",
                    content=content,
                    record=record,
                    env_tags=["seed:routerbench"],
                )
            )
    return out
135
+
136
+
137
+ def _to_float(value: Any) -> float | None:
138
+ try:
139
+ return float(value)
140
+ except (TypeError, ValueError):
141
+ return None