minima-cli 0.4.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- minima/__init__.py +5 -0
- minima/api/__init__.py +1 -0
- minima/api/auth.py +39 -0
- minima/api/errors.py +40 -0
- minima/api/routers/__init__.py +1 -0
- minima/api/routers/calibration.py +50 -0
- minima/api/routers/feedback.py +279 -0
- minima/api/routers/health.py +50 -0
- minima/api/routers/models.py +42 -0
- minima/api/routers/recommend.py +66 -0
- minima/api/routers/savings.py +55 -0
- minima/api/routers/strategies.py +33 -0
- minima/catalog/__init__.py +1 -0
- minima/catalog/data/capability_priors.json +210 -0
- minima/catalog/data/model_aliases.json +12 -0
- minima/catalog/merge.py +69 -0
- minima/catalog/refresh.py +54 -0
- minima/catalog/sources/__init__.py +1 -0
- minima/catalog/sources/litellm.py +19 -0
- minima/catalog/sources/openrouter.py +25 -0
- minima/catalog/store.py +86 -0
- minima/config.py +288 -0
- minima/deps.py +35 -0
- minima/llm/__init__.py +1 -0
- minima/llm/anthropic.py +106 -0
- minima/llm/base.py +196 -0
- minima/llm/gemini.py +124 -0
- minima/llm/registry.py +54 -0
- minima/logging.py +28 -0
- minima/main.py +109 -0
- minima/memory/__init__.py +1 -0
- minima/memory/adapter.py +572 -0
- minima/memory/keys.py +83 -0
- minima/memory/records.py +190 -0
- minima/memory/threadpool.py +41 -0
- minima/metrics/__init__.py +1 -0
- minima/metrics/calibration.py +415 -0
- minima/metrics/report.py +116 -0
- minima/metrics/savings.py +98 -0
- minima/recommender/__init__.py +1 -0
- minima/recommender/_pg_pool.py +38 -0
- minima/recommender/_redis_client.py +32 -0
- minima/recommender/aggregate.py +157 -0
- minima/recommender/classify.py +165 -0
- minima/recommender/decisionlog.py +505 -0
- minima/recommender/durablerefs.py +312 -0
- minima/recommender/engine.py +997 -0
- minima/recommender/escalation.py +83 -0
- minima/recommender/propensity.py +189 -0
- minima/recommender/recstore.py +368 -0
- minima/recommender/score.py +318 -0
- minima/recommender/types.py +166 -0
- minima/schemas/__init__.py +1 -0
- minima/schemas/common.py +73 -0
- minima/schemas/feedback.py +34 -0
- minima/schemas/models_catalog.py +36 -0
- minima/schemas/recommend.py +104 -0
- minima/schemas/savings.py +39 -0
- minima/schemas/strategies.py +57 -0
- minima/schemas/workflow.py +43 -0
- minima/seeding/__init__.py +1 -0
- minima/seeding/items.py +42 -0
- minima/seeding/llmrouterbench.py +232 -0
- minima/seeding/routerbench.py +141 -0
- minima/seeding/run_seed.py +56 -0
- minima/seeding/synthetic.py +70 -0
- minima/tenancy/__init__.py +8 -0
- minima/tenancy/context.py +37 -0
- minima/tenancy/passthrough.py +110 -0
- minima/version.py +3 -0
- minima_cli-0.4.9.dist-info/METADATA +275 -0
- minima_cli-0.4.9.dist-info/RECORD +161 -0
- minima_cli-0.4.9.dist-info/WHEEL +4 -0
- minima_cli-0.4.9.dist-info/entry_points.txt +5 -0
- minima_cli-0.4.9.dist-info/licenses/LICENSE +295 -0
- minima_client/__init__.py +19 -0
- minima_client/autocapture.py +101 -0
- minima_client/client.py +301 -0
- minima_client/errors.py +23 -0
- minima_harness/LICENSE_PI +32 -0
- minima_harness/__init__.py +16 -0
- minima_harness/agent/__init__.py +72 -0
- minima_harness/agent/agent.py +276 -0
- minima_harness/agent/events.py +124 -0
- minima_harness/agent/loop.py +311 -0
- minima_harness/agent/state.py +79 -0
- minima_harness/agent/tools.py +97 -0
- minima_harness/ai/__init__.py +66 -0
- minima_harness/ai/compat.py +71 -0
- minima_harness/ai/errors.py +96 -0
- minima_harness/ai/events.py +117 -0
- minima_harness/ai/openrouter_catalog.py +153 -0
- minima_harness/ai/provider_catalog.py +299 -0
- minima_harness/ai/provider_quirks.py +37 -0
- minima_harness/ai/providers/__init__.py +75 -0
- minima_harness/ai/providers/_common.py +48 -0
- minima_harness/ai/providers/anthropic.py +290 -0
- minima_harness/ai/providers/base.py +65 -0
- minima_harness/ai/providers/faux.py +173 -0
- minima_harness/ai/providers/google.py +221 -0
- minima_harness/ai/providers/openai_compat.py +278 -0
- minima_harness/ai/registry.py +184 -0
- minima_harness/ai/stream.py +82 -0
- minima_harness/ai/tools.py +51 -0
- minima_harness/ai/types.py +204 -0
- minima_harness/ai/usage.py +41 -0
- minima_harness/minima/__init__.py +40 -0
- minima_harness/minima/cache.py +102 -0
- minima_harness/minima/config.py +85 -0
- minima_harness/minima/goals.py +226 -0
- minima_harness/minima/judge.py +144 -0
- minima_harness/minima/mapping.py +147 -0
- minima_harness/minima/meter.py +143 -0
- minima_harness/minima/router.py +220 -0
- minima_harness/minima/runtime.py +544 -0
- minima_harness/minima/signals.py +195 -0
- minima_harness/session/__init__.py +14 -0
- minima_harness/session/format.py +35 -0
- minima_harness/session/store.py +236 -0
- minima_harness/tasks/__init__.py +17 -0
- minima_harness/tasks/task_set.py +78 -0
- minima_harness/tools/__init__.py +7 -0
- minima_harness/tools/_io.py +34 -0
- minima_harness/tools/bash.py +70 -0
- minima_harness/tools/builtin.py +23 -0
- minima_harness/tools/edit.py +50 -0
- minima_harness/tools/find.py +38 -0
- minima_harness/tools/grep.py +73 -0
- minima_harness/tools/ls.py +35 -0
- minima_harness/tools/read.py +38 -0
- minima_harness/tools/tasks.py +75 -0
- minima_harness/tools/write.py +36 -0
- minima_harness/tui/__init__.py +3 -0
- minima_harness/tui/analytics.py +111 -0
- minima_harness/tui/app.py +1927 -0
- minima_harness/tui/bridge.py +103 -0
- minima_harness/tui/cli.py +227 -0
- minima_harness/tui/clipboard.py +60 -0
- minima_harness/tui/commands.py +49 -0
- minima_harness/tui/compaction.py +17 -0
- minima_harness/tui/config_cli.py +141 -0
- minima_harness/tui/config_store.py +237 -0
- minima_harness/tui/context.py +93 -0
- minima_harness/tui/customize.py +95 -0
- minima_harness/tui/diff.py +53 -0
- minima_harness/tui/editor.py +43 -0
- minima_harness/tui/extensions.py +84 -0
- minima_harness/tui/extra_models.py +52 -0
- minima_harness/tui/history.py +71 -0
- minima_harness/tui/mubit.py +295 -0
- minima_harness/tui/overlays.py +593 -0
- minima_harness/tui/packages.py +59 -0
- minima_harness/tui/run_modes.py +66 -0
- minima_harness/tui/theme.py +77 -0
- minima_harness/tui/welcome.py +83 -0
- minima_harness/tui/widgets/__init__.py +3 -0
- minima_harness/tui/widgets/banner.py +38 -0
- minima_harness/tui/widgets/editor.py +83 -0
- minima_harness/tui/widgets/footer.py +73 -0
- minima_harness/tui/widgets/messages.py +151 -0
- minima_harness/tui/widgets/status.py +57 -0
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
"""Schemas for the per-call recommendation endpoint."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
6
|
+
|
|
7
|
+
from minima.schemas.common import Constraints, DecisionBasis, Difficulty, TaskInput, TaskType
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class RecommendRequest(BaseModel):
    """Request body for the per-call recommendation endpoint."""

    # The task a model should be recommended for.
    task: TaskInput
    # Bounded dial in [0, 10]; see the field description for the two extremes.
    cost_quality_tradeoff: float = Field(
        default=5.0,
        ge=0,
        le=10,
        description="0 = cheapest acceptable, 10 = highest quality",
    )
    # A fresh default ``Constraints`` instance is built per request.
    constraints: Constraints = Field(default_factory=Constraints)
    user_id: str | None = Field(
        default=None,
        description="optional within-org actor label (NOT a tenant/auth boundary)",
    )
    namespace: str | None = Field(
        default=None,
        description=(
            "optional within-org sub-scope (team/project/env), namespaced under your org. "
            "The tenant boundary is your Minima API key -> your Mubit instance, not this field."
        ),
    )
    # Validated to lie between 1 and 64 inclusive.
    max_candidates: int = Field(default=8, ge=1, le=64)
    allow_llm_escalation: bool = True
    explain: bool = True
    baseline_model_id: str | None = Field(
        default=None,
        description=(
            "the model you would have used without Minima; powers the vs_declared_default "
            "savings baseline in GET /v1/savings"
        ),
    )
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class EvidenceRef(BaseModel):
    """A recalled past outcome that informed a candidate's score."""

    # ``model_id`` begins with pydantic's protected ``model_`` prefix; clearing the
    # protected namespaces lets the field be declared without a namespace warning.
    model_config = ConfigDict(protected_namespaces=())

    entry_id: str = Field(
        ...,
        description="QueryEvidence.id (used for outcome attribution)",
    )
    reference_id: str | None = None
    model_id: str
    score: float = Field(
        ...,
        description="retrieval similarity",
    )
    # Both values below are validated to lie in [0, 1].
    knowledge_confidence: float = Field(..., ge=0, le=1)
    observed_success: float = Field(..., ge=0, le=1)
    is_stale: bool = False
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
class RankedModel(BaseModel):
    """One candidate model with its predicted success, cost estimate and final score."""

    # ``model_id`` begins with pydantic's protected ``model_`` prefix; clearing the
    # protected namespaces lets the field be declared without a namespace warning.
    model_config = ConfigDict(protected_namespaces=())

    model_id: str
    provider: str
    predicted_success: float = Field(..., ge=0, le=1)
    est_cost_usd: float = Field(..., ge=0)
    est_cost_breakdown: dict[str, float] = Field(default_factory=dict)
    score: float = Field(
        ...,
        description="final objective score; sorting key",
    )
    rationale: str = ""
    decision_basis: DecisionBasis = DecisionBasis.prior
    evidence: list[EvidenceRef] = Field(default_factory=list)
    supports_prompt_caching: bool = False
    context_window: int = 0

    # Latency estimate; ``None`` / empty basis when there is nothing to report.
    est_latency_ms: float | None = Field(
        default=None,
        description="observed latency percentile from similar past outcomes (ms)",
    )
    latency_basis: str = Field(
        default="",
        description='e.g. "observed_p75"; empty without evidence',
    )

    # Optional cost band; both ends are non-negative when present.
    est_cost_low: float | None = Field(
        default=None,
        ge=0,
        description="low end of the data-grounded predictable cost band ($)",
    )
    est_cost_high: float | None = Field(
        default=None,
        ge=0,
        description="high end of the data-grounded predictable cost band ($)",
    )
    cost_band_basis: str = Field(
        default="",
        description='e.g. "observed_p25_p75" | "rescaled_p25_p75"; empty without a band',
    )

    success_interval_width: float = Field(
        default=0.0,
        ge=0,
        le=1,
        description="95% credible-interval width of predicted_success",
    )
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
class RecommendResponse(BaseModel):
    """Response body for the per-call recommendation endpoint."""

    recommendation_id: str
    # The chosen model, the candidate list, and an optional fallback.
    recommended_model: RankedModel
    ranked: list[RankedModel] = Field(default_factory=list)
    fallback_model: RankedModel | None = None

    # Validated to lie in [0, 1].
    confidence: float = Field(..., ge=0, le=1)
    decision_basis: DecisionBasis
    threshold_used: float

    # Classification that was applied to the submitted task.
    classified_task_type: TaskType
    classified_difficulty: Difficulty

    catalog_version: str
    catalog_stale: bool = False
    latency_ms: int = 0
    warnings: list[str] = Field(default_factory=list)
    selection_policy: str = Field(
        default="argmin",
        description='"argmin" | "epsilon_softmax" (per-org opt-in exploration)',
    )
    recommended_actions: list[str] = Field(
        description="near-free cost-saving actions to apply (e.g. enable_prompt_cache)",
        default_factory=list,
    )
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
"""Schemas for the savings and calibration reporting endpoints.
|
|
2
|
+
|
|
3
|
+
The payload bodies reuse the metrics dataclasses directly (pydantic v2 validates and
|
|
4
|
+
serializes stdlib dataclasses), so the report shape has exactly one definition.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from pydantic import BaseModel, Field
|
|
10
|
+
|
|
11
|
+
from minima.metrics.calibration import CalibrationReport, CusumFlag
|
|
12
|
+
from minima.metrics.savings import SavingsSummary
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class SavingsGroup(BaseModel):
    """Savings summary for a single group, identified by ``key``."""

    # Label of the group this summary belongs to.
    key: str
    # Reuses the metrics dataclass directly rather than redefining its shape.
    summary: SavingsSummary
    # Numeric health indicators; empty when none are supplied.
    health: dict[str, float | int] = Field(
        default_factory=dict,
    )
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class SavingsResponse(BaseModel):
    """Response body for the savings reporting endpoint."""

    org_id: str
    # Reporting window as supplied by the caller of this schema.
    since: float
    days: float
    namespace: str | None = None

    # Overall summary; reuses the metrics dataclass directly.
    summary: SavingsSummary
    health: dict[str, float | int] = Field(
        default_factory=dict,
    )

    # Optional breakdown: the grouping dimension and one entry per group.
    group_by: str | None = None
    groups: list[SavingsGroup] = Field(
        default_factory=list,
    )
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class CalibrationResponse(BaseModel):
    """Response body for the calibration reporting endpoint."""

    org_id: str
    # Reporting window as supplied by the caller of this schema.
    since: float
    days: float
    namespace: str | None = None

    health: dict[str, float | int] = Field(
        default_factory=dict,
    )
    # Both lists reuse the metrics dataclasses directly and default to empty.
    reports: list[CalibrationReport] = Field(
        default_factory=list,
    )
    drift_flags: list[CusumFlag] = Field(
        default_factory=list,
    )
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
"""Schemas for the surfaced-strategies endpoint.
|
|
2
|
+
|
|
3
|
+
``reflect()`` promotes accumulated outcomes/lessons into validated rules; Mubit's
|
|
4
|
+
``surface_strategies`` clusters those lessons into ``EmergentStrategy`` summaries.
|
|
5
|
+
This endpoint exposes them so callers can see *why* a namespace routes the way it does.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from collections.abc import Mapping
|
|
11
|
+
from typing import Any
|
|
12
|
+
|
|
13
|
+
from pydantic import BaseModel, Field
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _g(d: Mapping[str, Any], *keys: str, default: Any = None) -> Any:
|
|
17
|
+
"""First present key among snake_case/camelCase variants."""
|
|
18
|
+
for k in keys:
|
|
19
|
+
if k in d and d[k] is not None:
|
|
20
|
+
return d[k]
|
|
21
|
+
return default
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class Strategy(BaseModel):
    """A surfaced strategy summary; every field defaults to an empty/zero value."""

    strategy_id: str = ""
    description: str = ""
    supporting_lesson_count: int = 0
    avg_confidence: float = 0.0
    avg_reinforcement: float = 0.0
    dominant_lesson_type: str = ""
    dominant_scope: str = ""
    lesson_ids: list[str] = Field(default_factory=list)

    @classmethod
    def from_emergent(cls, d: Mapping[str, Any]) -> Strategy:
        """Parse a Mubit ``EmergentStrategy`` dict (snake_case or camelCase).

        Each field is looked up under its snake_case name first and its camelCase
        name second, then coerced to the declared type. Missing or ``None`` values
        fall back to the field's empty/zero default.
        """
        # Built in declaration order so coercion errors surface in the same order
        # as the fields are listed.
        values: dict[str, Any] = {}
        values["strategy_id"] = str(_g(d, "strategy_id", "strategyId", default=""))
        values["description"] = str(_g(d, "description", default=""))
        values["supporting_lesson_count"] = int(
            _g(d, "supporting_lesson_count", "supportingLessonCount", default=0)
        )
        values["avg_confidence"] = float(
            _g(d, "avg_confidence", "avgConfidence", default=0.0)
        )
        values["avg_reinforcement"] = float(
            _g(d, "avg_reinforcement", "avgReinforcement", default=0.0)
        )
        values["dominant_lesson_type"] = str(
            _g(d, "dominant_lesson_type", "dominantLessonType", default="")
        )
        values["dominant_scope"] = str(_g(d, "dominant_scope", "dominantScope", default=""))
        # ``or []`` also covers a present-but-empty/falsy value.
        raw_ids = _g(d, "lesson_ids", "lessonIds", default=[]) or []
        values["lesson_ids"] = [str(item) for item in raw_ids]
        return cls(**values)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
class StrategiesResponse(BaseModel):
    """Response body for the surfaced-strategies endpoint."""

    # Sub-scope the strategies were requested for, if any.
    namespace: str | None = None
    lane: str
    strategies: list[Strategy] = Field(
        default_factory=list,
    )
    # Populated by whoever builds the response; it is not derived from
    # ``strategies`` in this schema.
    count: int = 0
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
"""Schemas for the multi-step workflow recommendation endpoint."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pydantic import BaseModel, Field
|
|
6
|
+
|
|
7
|
+
from minima.schemas.common import Constraints, TaskInput
|
|
8
|
+
from minima.schemas.recommend import RecommendResponse
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class WorkflowStep(BaseModel):
    """One step of a multi-step workflow recommendation request."""

    # Must be a non-empty string.
    step_id: str = Field(..., min_length=1)
    task: TaskInput
    constraints: Constraints | None = Field(
        default=None,
        description="per-step override; merged over global",
    )
    # Identifiers of other steps; this schema does not check that they exist.
    depends_on: list[str] = Field(
        default_factory=list,
    )
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class WorkflowRequest(BaseModel):
    """Request body for the multi-step workflow recommendation endpoint."""

    # At least one step is required.
    steps: list[WorkflowStep] = Field(..., min_length=1)
    # Bounded dial in [0, 10].
    cost_quality_tradeoff: float = Field(default=5.0, ge=0, le=10)
    # Request-wide constraints; a fresh default instance is built per request.
    constraints: Constraints = Field(default_factory=Constraints)
    user_id: str | None = Field(
        default=None,
        description="optional within-org actor label (NOT a tenant/auth boundary)",
    )
    namespace: str | None = Field(
        default=None,
        description="optional within-org sub-scope; tenant boundary is the Minima key",
    )
    allow_llm_escalation: bool = True
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class StepRecommendation(BaseModel):
    """Pairs a workflow step id with the recommendation produced for that step."""

    # Identifier of the step this recommendation belongs to.
    step_id: str
    # Full per-call recommendation payload for the step.
    recommendation: RecommendResponse
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class WorkflowResponse(BaseModel):
    """Response body for the multi-step workflow recommendation endpoint."""

    workflow_recommendation_id: str
    # One entry per step.
    steps: list[StepRecommendation]
    # Both totals are validated to be non-negative.
    total_est_cost_usd: float = Field(
        ...,
        ge=0,
    )
    total_est_cost_if_all_premium: float = Field(
        ...,
        ge=0,
    )
    # Validated to lie in [0, 1].
    confidence: float = Field(
        ...,
        ge=0,
        le=1,
    )
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Cold-start seeding of Mubit memory (offline import)."""
|
minima/seeding/items.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
"""Build Mubit batch_insert items from outcome records.
|
|
2
|
+
|
|
3
|
+
``metadata_json`` must be a JSON *string* on the batch_insert wire (unlike remember(),
|
|
4
|
+
which JSON-encodes a dict for you). ``embedding: []`` lets the server embed on ingest.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import json
|
|
10
|
+
import time
|
|
11
|
+
from collections.abc import Sequence
|
|
12
|
+
from dataclasses import dataclass
|
|
13
|
+
|
|
14
|
+
from minima.memory.records import OutcomeRecord
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@dataclass(slots=True)
|
|
18
|
+
class SeedItem:
|
|
19
|
+
item_id: str
|
|
20
|
+
content: str
|
|
21
|
+
record: OutcomeRecord
|
|
22
|
+
env_tags: list[str]
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def build_item(seed: SeedItem, source: str = "system") -> dict:
    """Turn a ``SeedItem`` into one batch_insert item dict.

    Side effect: when the record has no ``recorded_at`` it is stamped in place with
    the current time before its metadata is serialized.

    Args:
        seed: The staged item to convert.
        source: Value written to the payload's ``source`` key.

    Returns:
        A dict with ``item_id``, ``text``, ``metadata_json`` (a JSON string),
        ``source``, an empty ``embedding`` list and a copy of ``env_tags``.
    """
    record = seed.record
    # Stamp seed time, not dataset time: seeds are then governed by the explicit
    # seed-vs-live weighting (source_dataset), not by pretending to be fresh or
    # being decayed by an arbitrary benchmark-publication age.
    if record.recorded_at is None:
        record.recorded_at = time.time()
    # Serialized only after the timestamp is in place.
    metadata_json = json.dumps(record.to_metadata())
    return dict(
        item_id=seed.item_id,
        text=seed.content,
        metadata_json=metadata_json,
        source=source,
        embedding=[],
        env_tags=list(seed.env_tags),
    )
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def chunked(items: Sequence[dict], size: int) -> list[list[dict]]:
    """Split ``items`` into consecutive lists of at most ``size`` elements.

    Args:
        items: The items to split; order is preserved.
        size: Maximum chunk length; must be positive.

    Returns:
        The chunks in order (the last may be shorter); ``[]`` for empty ``items``.

    Raises:
        ValueError: If ``size`` is not positive. A negative step would otherwise
            produce an empty range and silently drop every item.
    """
    if size <= 0:
        raise ValueError(f"chunk size must be a positive integer, got {size!r}")
    return [list(items[i : i + size]) for i in range(0, len(items), size)]
|
|
@@ -0,0 +1,232 @@
|
|
|
1
|
+
"""Fetch + raw-load LLMRouterBench / OpenRouterBench (offline benchmark).
|
|
2
|
+
|
|
3
|
+
LLMRouterBench (HF dataset ``NPULH/LLMRouterBench``; code repo ``ynulihao/LLMRouterBench``,
|
|
4
|
+
Findings@ACL 2026) ships as ONE gzipped tarball ``bench-release.tar.gz`` (~1.28 GB) whose
|
|
5
|
+
members are::
|
|
6
|
+
|
|
7
|
+
bench-release/<dataset>/<model_name>/<dataset>-<model_name>-<YYYYMMDD_HHMMSS>.json
|
|
8
|
+
|
|
9
|
+
Each JSON is one ``(dataset, split, model)`` run: top-level aggregate fields
|
|
10
|
+
(``dataset_name``, ``split``, ``model_name``, ``demo``, ``data_fingerprint`` …) plus a
|
|
11
|
+
``records`` list. Each record carries::
|
|
12
|
+
|
|
13
|
+
index, origin_query, prompt, prompt_tokens, completion_tokens,
|
|
14
|
+
cost, score, prediction, ground_truth, raw_output
|
|
15
|
+
|
|
16
|
+
- ``score`` is 0.0/1.0 correctness (a few datasets use a graded [0,1] judge score).
|
|
17
|
+
- ``cost`` is the *real* per-call USD cost — BUT open models run locally report ``0.0``
|
|
18
|
+
(handle this when building the cost axis; see Phase 2 of the plan and guard V2 — the
|
|
19
|
+
router must *decide* on independent market prices, not on this column it is *scored* on).
|
|
20
|
+
|
|
21
|
+
We stream records straight out of the tarball (no multi-GB extraction) and drop the bulky
|
|
22
|
+
``raw_output`` and ``prediction`` fields. This module is **Phase 1** of
|
|
23
|
+
``docs/PLAN/LLMRouterBench-H1-setup.md`` — fetch + raw load only. The wide-DataFrame pivot
|
|
24
|
+
the eval harness consumes (``<model>`` score + ``<model>|total_cost`` columns) is **Phase 3**
|
|
25
|
+
and builds on :func:`iter_raw_records`.
|
|
26
|
+
|
|
27
|
+
Requires the ``seed`` extra (``huggingface-hub``/``pandas``); raises a clear error otherwise.
|
|
28
|
+
"""
|
|
29
|
+
|
|
30
|
+
from __future__ import annotations
|
|
31
|
+
|
|
32
|
+
import json
|
|
33
|
+
import re
|
|
34
|
+
import tarfile
|
|
35
|
+
from collections import defaultdict
|
|
36
|
+
from collections.abc import Iterator, Sequence
|
|
37
|
+
from typing import Any
|
|
38
|
+
|
|
39
|
+
_REPO_ID = "NPULH/LLMRouterBench"
|
|
40
|
+
_TARBALL = "bench-release.tar.gz"
|
|
41
|
+
_ROOT = "bench-release"
|
|
42
|
+
_TS_RE = re.compile(r"(\d{8}_\d{6})\.json$")
|
|
43
|
+
# Cost-column suffix the eval harness keys on. MUST match
|
|
44
|
+
# ``minima.seeding.routerbench.detect_model_columns`` so the wide frame this module emits is
|
|
45
|
+
# consumed by the same code path as RouterBench.
|
|
46
|
+
_COST_SUFFIX = "|total_cost"
|
|
47
|
+
|
|
48
|
+
# The per-record fields we keep. We drop the two bulky ones the eval never needs:
|
|
49
|
+
# ``raw_output`` and ``prediction`` (the latter is a full role/content conversation in the
|
|
50
|
+
# real data, despite schema.py declaring it ``str`` — confirmed by inspecting the bytes).
|
|
51
|
+
_RECORD_FIELDS = (
|
|
52
|
+
"index", "origin_query", "prompt", "prompt_tokens",
|
|
53
|
+
"completion_tokens", "cost", "score", "ground_truth",
|
|
54
|
+
)
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def _require_seed_extra():
|
|
58
|
+
try:
|
|
59
|
+
from huggingface_hub import hf_hub_download # noqa: F401
|
|
60
|
+
except ImportError as exc: # pragma: no cover
|
|
61
|
+
raise RuntimeError(
|
|
62
|
+
"LLMRouterBench needs the 'seed' extra: `uv sync --extra seed`."
|
|
63
|
+
) from exc
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def download_tarball() -> str:
    """Fetch ``bench-release.tar.gz`` through the Hugging Face cache and return its local path.

    Repeated calls are cheap: ``hf_hub_download`` hands back the cached file instead
    of downloading again. The ~1.28 GB archive lives under the HF cache, not the repo.

    Raises:
        RuntimeError: If the ``seed`` extra (``huggingface_hub``) is not installed.
    """
    _require_seed_extra()
    # Imported lazily so the module itself loads without the optional extra.
    from huggingface_hub import hf_hub_download

    local_path = hf_hub_download(
        repo_id=_REPO_ID,
        filename=_TARBALL,
        repo_type="dataset",
    )
    return local_path
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def _split_member(name: str) -> tuple[str, str, str, str] | None:
    """Decompose a tarball member path into ``(dataset, subset, model, timestamp)``.

    Two layouts occur in the release::

        bench-release/<dataset>/<model>/<…ts>.json            (subset == "")
        bench-release/<dataset>/<subset>/<model>/<…ts>.json

    The dataset is always the first directory under the root and the model is
    always the directory directly above the file; whatever lies between them is
    joined with ``/`` into the subset. Reading a fixed position as the model would
    mis-read the subset on the deeper layout.

    Returns:
        The four-tuple, with ``""`` as timestamp when the file name carries none,
        or ``None`` when the path is not a ``.json`` file at least four segments
        deep under the release root.
    """
    segments = name.split("/")
    looks_like_run = (
        len(segments) >= 4
        and segments[0] == _ROOT
        and segments[-1].endswith(".json")
    )
    if not looks_like_run:
        return None
    _root, dataset, *middle, model, filename = segments
    stamp = _TS_RE.search(filename)
    timestamp = stamp.group(1) if stamp else ""
    return dataset, "/".join(middle), model, timestamp
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def list_release_contents(tarball_path: str | None = None) -> dict[str, Any]:
    """Inventory the tarball WITHOUT parsing record bodies (one decompression pass).

    Args:
        tarball_path: Local archive to read; when falsy the release is downloaded
            (or taken from the HF cache) first.

    Returns:
        A dict with:

        - ``"datasets"``: sorted dataset names;
        - ``"models"``: sorted model names;
        - ``"latest"``: ``{(dataset, subset, model): member name}`` keeping only the
          newest-timestamp file per key, so duplicate re-runs are de-duplicated
          (mirroring the authors' ``BaselineDataLoader`` behaviour);
        - ``"models_by_dataset"``: ``{dataset: sorted model names}``;
        - ``"file_count"``: number of recognised ``.json`` members, duplicates included.
    """
    archive = tarball_path or download_tarball()
    # (dataset, subset, model) -> (timestamp, member name)
    newest: dict[tuple[str, str, str], tuple[str, str]] = {}
    file_count = 0
    with tarfile.open(archive, mode="r|gz") as tf:
        for member in tf:
            info = _split_member(member.name) if member.isfile() else None
            if info is None:
                continue
            dataset, subset, model, stamp = info
            file_count += 1
            run_key = (dataset, subset, model)
            current = newest.get(run_key)
            # Timestamps are fixed-width digit strings, so comparing them as text
            # orders them chronologically; a missing timestamp ("") sorts first.
            if current is None or stamp > current[0]:
                newest[run_key] = (stamp, member.name)

    models_by_dataset: dict[str, set[str]] = {}
    for dataset, _subset, model in newest:
        models_by_dataset.setdefault(dataset, set()).add(model)

    return {
        "datasets": sorted({key[0] for key in newest}),
        "models": sorted({key[2] for key in newest}),
        "latest": {key: entry[1] for key, entry in newest.items()},
        "models_by_dataset": {
            dataset: sorted(names) for dataset, names in models_by_dataset.items()
        },
        "file_count": file_count,
    }
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
def iter_raw_records(
    tarball_path: str | None = None,
    *,
    datasets: set[str] | None = None,
    models: set[str] | None = None,
    limit: int | None = None,
    skip_demo: bool = True,
) -> Iterator[dict[str, Any]]:
    """Stream flattened per-(prompt, model) records out of the tarball.

    Each yielded dict has the keys ``dataset_id``, ``split``, ``model_name`` plus
    :data:`_RECORD_FIELDS` (the bulky ``raw_output`` and ``prediction`` are dropped).
    Record fields missing from the source JSON come back as ``None``.

    Args:
        tarball_path: Local archive to read; when falsy the release is downloaded
            (or taken from the HF cache) first.
        datasets: If given, only members whose path dataset segment is in this set
            are read; unwanted files are skipped before JSON parsing.
        models: If given, only members whose path model segment is in this set are read.
        limit: Maximum number of records to yield. ``None`` means no limit; zero or
            a negative value yields nothing.
        skip_demo: Skip run files whose top-level ``demo`` flag is truthy.

    Raises:
        RuntimeError: If the ``seed`` extra (``huggingface_hub``) is not installed.

    NOTE: this reads every ``.json`` member; for a pristine release there is exactly one
    file per ``(dataset, model)`` so no de-duplication is needed. If a release ever ships
    duplicate timestamped re-runs, restrict to :func:`list_release_contents`'s ``latest``
    set first.
    """
    # The per-record check below only runs after a yield, so a non-positive limit
    # must be handled up front or it would still emit one record.
    if limit is not None and limit <= 0:
        return
    _require_seed_extra()
    path = tarball_path or download_tarball()
    emitted = 0
    with tarfile.open(path, mode="r|gz") as tf:
        for m in tf:
            if not m.isfile():
                continue
            parsed = _split_member(m.name)
            if parsed is None:
                continue
            ds, _subset, model, _ts = parsed
            if datasets is not None and ds not in datasets:
                continue
            if models is not None and model not in models:
                continue
            fh = tf.extractfile(m)
            if fh is None:
                continue
            data = json.load(fh)
            if skip_demo and data.get("demo", False):
                continue
            # Prefer the names recorded inside the file; fall back to the path segments.
            dataset_id = data.get("dataset_name", ds)
            split = data.get("split", "")
            model_name = data.get("model_name", model)
            for rec in data.get("records", []):
                row = {"dataset_id": dataset_id, "split": split, "model_name": model_name}
                for f in _RECORD_FIELDS:
                    row[f] = rec.get(f)
                yield row
                emitted += 1
                if limit is not None and emitted >= limit:
                    return
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
def load_llmrouterbench_df(
    candidates: Sequence[str],
    datasets: Sequence[str],
    tarball_path: str | None = None,
):
    """Pivot the long per-(prompt, model) records into the WIDE DataFrame the eval consumes.

    This is Phase 3 of ``docs/PLAN/LLMRouterBench-H1-setup.md``. The result has one row
    per ``(dataset, question index)`` with columns ``prompt``, ``eval_name`` (= dataset
    id) and, for each candidate model ``m``, a score column ``m`` and a cost column
    ``m|total_cost`` — the contract ``routerbench.detect_model_columns`` /
    ``harness.prepare_rows`` expect.

    Rows are keyed by ``(dataset_id, index)`` rather than prompt text, so repeated or
    boilerplate prompt strings within a dataset do not merge questions. A question is
    dropped when any candidate has a ``None`` score or cost; a cost of ``0.0`` is kept.

    Args:
        candidates: Model names to include; their order sets the column order.
        datasets: Dataset names to read.
        tarball_path: Local archive; when falsy the release is downloaded first.

    Returns:
        A ``pandas.DataFrame``; it has no columns at all when no question qualifies.

    Raises:
        RuntimeError: If the ``seed`` extra (``huggingface_hub``) is not installed.
    """
    _require_seed_extra()
    import pandas as pd

    cand = list(candidates)
    wanted_datasets = set(datasets)
    # NOTE(review): the key ignores split/subset. If one dataset ships several subsets
    # that reuse the same ``index``, later records overwrite earlier ones for a model —
    # confirm this cannot happen for the datasets requested.
    per_question: dict[tuple[str, Any], dict[str, Any]] = {}
    records = iter_raw_records(tarball_path, datasets=wanted_datasets, models=set(cand))
    for rec in records:
        # The prompt of the first record seen for a question is the one kept.
        slot = per_question.setdefault(
            (rec["dataset_id"], rec["index"]),
            {"prompt": rec["prompt"], "scores": {}, "costs": {}},
        )
        model_name = rec["model_name"]
        slot["scores"][model_name] = rec["score"]
        slot["costs"][model_name] = rec["cost"]

    wide_rows: list[dict[str, Any]] = []
    for (dataset_id, _idx), slot in per_question.items():
        scores, costs = slot["scores"], slot["costs"]
        complete = all(
            scores.get(m) is not None and costs.get(m) is not None for m in cand
        )
        if not complete:
            continue
        wide: dict[str, Any] = {"prompt": slot["prompt"], "eval_name": dataset_id}
        for m in cand:
            wide[m] = scores[m]
            wide[f"{m}{_COST_SUFFIX}"] = costs[m]
        wide_rows.append(wide)
    return pd.DataFrame(wide_rows)
|
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
"""Load RouterBench (offline benchmark) into Minima outcome records.
|
|
2
|
+
|
|
3
|
+
RouterBench (``withmartian/routerbench``) ships as pickled pandas frames
|
|
4
|
+
(``routerbench_0shot.pkl``), not a ``datasets``-loadable layout — so it is fetched with
|
|
5
|
+
``hf_hub_download`` + ``pandas.read_pickle``. Each row is a prompt; per model there is a
|
|
6
|
+
``<model>`` correctness column (0/1) and a ``<model>|total_cost`` column (real USD for
|
|
7
|
+
that call). We emit one outcome record per (prompt, model). Requires the ``seed`` extra
|
|
8
|
+
(``datasets``/``huggingface-hub``/``pandas``); raises a clear error otherwise.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
from typing import Any
|
|
14
|
+
|
|
15
|
+
from minima.memory.keys import build_content, task_cluster, task_fingerprint
|
|
16
|
+
from minima.memory.records import OutcomeRecord
|
|
17
|
+
from minima.seeding.items import SeedItem
|
|
18
|
+
|
|
19
|
+
_REPO_ID = "withmartian/routerbench"
|
|
20
|
+
_SPLIT_FILES = {"0shot": "routerbench_0shot.pkl", "5shot": "routerbench_5shot.pkl"}
|
|
21
|
+
_COST_SUFFIX = "|total_cost"
|
|
22
|
+
|
|
23
|
+
_EVAL_TO_TASK_TYPE = {
|
|
24
|
+
"mbpp": "code",
|
|
25
|
+
"humaneval": "code",
|
|
26
|
+
"code-llama": "code",
|
|
27
|
+
"gsm8k": "reasoning",
|
|
28
|
+
"grade-school-math": "reasoning",
|
|
29
|
+
"math": "reasoning",
|
|
30
|
+
"mmlu": "qa",
|
|
31
|
+
"arc": "qa",
|
|
32
|
+
"hellaswag": "reasoning",
|
|
33
|
+
"winogrande": "reasoning",
|
|
34
|
+
"rag": "rag",
|
|
35
|
+
"mt-bench": "other",
|
|
36
|
+
"mtbench": "other",
|
|
37
|
+
}
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _reverse_aliases(aliases: dict[str, list[str]]) -> dict[str, str]:
    """Invert a ``{canonical: [alias, ...]}`` mapping into ``{alias: canonical}``.

    When the same alias is listed under several canonical names, the canonical
    name that comes last in iteration order wins.
    """
    return {
        alias: canonical_id
        for canonical_id, alias_list in aliases.items()
        for alias in alias_list
    }
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _task_type_for(eval_name: str) -> str:
    """Map a RouterBench ``eval_name`` to a Minima task type.

    The name is lower-cased and checked against the keys of
    ``_EVAL_TO_TASK_TYPE`` by substring, in the mapping's insertion order; the
    first key found inside the name decides the result. A falsy name, or one
    that contains none of the keys, yields ``"other"``.
    """
    lowered = (eval_name or "").lower()
    return next(
        (task_type for marker, task_type in _EVAL_TO_TASK_TYPE.items() if marker in lowered),
        "other",
    )
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def detect_model_columns(columns: list[str]) -> dict[str, str]:
    """Return ``{score_column: cost_column}`` for every model with a cost sibling.

    A cost column is any string column name ending in ``_COST_SUFFIX``; its
    score column is the same name with that suffix removed. Cost columns whose
    score column is absent from ``columns`` are ignored.

    Pairs are reported in the order their cost columns appear in ``columns``,
    so the result (and anything that iterates it) is reproducible across
    interpreter runs instead of following set iteration order.
    """
    # Set for O(1) membership tests; the list itself is kept for ordering.
    present = set(columns)
    pairs: dict[str, str] = {}
    for column in columns:
        # Non-string labels (e.g. integer column names) cannot carry the suffix.
        if not isinstance(column, str) or not column.endswith(_COST_SUFFIX):
            continue
        score_col = column[: -len(_COST_SUFFIX)]
        if score_col in present:
            pairs[score_col] = column
    return pairs
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def load_routerbench_df(split: str = "0shot") -> Any:
    """Fetch the RouterBench pickle for ``split`` and load it with pandas.

    ``split`` is looked up in ``_SPLIT_FILES``; a name that is not listed there
    silently resolves to the ``"0shot"`` file. The file is obtained through
    ``hf_hub_download`` from the ``_REPO_ID`` dataset repository and the result
    of ``pandas.read_pickle`` on it is returned.

    Raises:
        RuntimeError: If ``pandas`` or ``huggingface_hub`` cannot be imported.
    """
    try:
        import pandas as pd
        from huggingface_hub import hf_hub_download
    except ImportError as exc:  # pragma: no cover
        message = (
            "RouterBench needs the 'seed' extra: `uv sync --extra seed`. "
            "For a network-free smoke test use `--dataset synthetic`."
        )
        raise RuntimeError(message) from exc

    if split in _SPLIT_FILES:
        filename = _SPLIT_FILES[split]
    else:
        # Unknown split names quietly resolve to the 0-shot pickle.
        filename = _SPLIT_FILES["0shot"]

    local_path = hf_hub_download(repo_id=_REPO_ID, filename=filename, repo_type="dataset")
    # NOTE(security): unpickling can execute arbitrary code, so this trusts the
    # contents of the downloaded file. Only point `_REPO_ID` at a trusted source.
    return pd.read_pickle(local_path)
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def load_records(limit: int, aliases: dict[str, list[str]], split: str = "0shot") -> list[SeedItem]:
    """Build seed items from RouterBench, one per (prompt, model) pair.

    Args:
        limit: Stop starting new rows once this many items have been collected.
            The check runs once per row, so the last processed row still
            contributes all of its models and the result may exceed ``limit``
            by up to one row's worth of items.
        aliases: ``{canonical_model_id: [dataset_column_name, ...]}``. A score
            column listed as an alias is recorded under its canonical id; any
            other column name is used as the model id unchanged.
        split: RouterBench split handed to ``load_routerbench_df``.

    Returns:
        The collected ``SeedItem`` objects, in row order. Rows with a missing
        or blank prompt are skipped, as are (row, model) pairs whose score
        ``_to_float`` cannot convert.

    Raises:
        RuntimeError: If no score/cost column pair is detected in the frame, or
            (from ``load_routerbench_df``) if the optional dependencies are
            not installed.
    """
    df = load_routerbench_df(split)
    columns = list(df.columns)
    model_columns = detect_model_columns(columns)
    if not model_columns:
        raise RuntimeError(
            f"could not detect RouterBench model columns in {columns[:10]}...; "
            "the dataset schema may have changed."
        )

    reverse = _reverse_aliases(aliases)
    prompt_col = "prompt" if "prompt" in columns else columns[0]
    # Every RouterBench record is stored under the same difficulty bucket.
    difficulty = "medium"
    out: list[SeedItem] = []

    for row_index, row in enumerate(df.itertuples(index=False)):
        if len(out) >= limit:
            break
        # Pair values with the original column labels by position.
        rowd = dict(zip(columns, row, strict=False))

        raw_prompt = rowd.get(prompt_col)
        # A missing cell arrives as None or a float NaN; str() would turn it
        # into the literal prompt "None"/"nan", so skip the row instead.
        if raw_prompt is None or (isinstance(raw_prompt, float) and raw_prompt != raw_prompt):
            continue
        prompt = str(raw_prompt).strip()
        if not prompt:
            continue

        task_type = _task_type_for(str(rowd.get("eval_name", "")))
        fingerprint = task_fingerprint(prompt)
        cluster = task_cluster(task_type, difficulty)
        content = build_content(task_type, difficulty, prompt)

        for score_col, cost_col in model_columns.items():
            quality = _to_float(rowd.get(score_col))
            if quality is None:
                # No usable score for this model on this prompt.
                continue
            record = OutcomeRecord(
                model_id=reverse.get(score_col, score_col),
                task_type=task_type,
                difficulty=difficulty,
                task_fingerprint=fingerprint,
                task_cluster=cluster,
                # An unusable or zero cost is stored as 0.0.
                cost_usd=_to_float(rowd.get(cost_col)) or 0.0,
                # Clamp into [0, 1]; the outcome uses the unclamped score.
                quality_score=max(0.0, min(1.0, quality)),
                outcome="success" if quality >= 0.5 else "failure",
                source_dataset="routerbench",
            )
            out.append(
                SeedItem(
                    item_id=f"rb-{row_index}-{score_col}",
                    content=content,
                    record=record,
                    env_tags=["seed:routerbench"],
                )
            )
    return out
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def _to_float(value: Any) -> float | None:
|
|
138
|
+
try:
|
|
139
|
+
return float(value)
|
|
140
|
+
except (TypeError, ValueError):
|
|
141
|
+
return None
|