minima-cli 0.4.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- minima/__init__.py +5 -0
- minima/api/__init__.py +1 -0
- minima/api/auth.py +39 -0
- minima/api/errors.py +40 -0
- minima/api/routers/__init__.py +1 -0
- minima/api/routers/calibration.py +50 -0
- minima/api/routers/feedback.py +279 -0
- minima/api/routers/health.py +50 -0
- minima/api/routers/models.py +42 -0
- minima/api/routers/recommend.py +66 -0
- minima/api/routers/savings.py +55 -0
- minima/api/routers/strategies.py +33 -0
- minima/catalog/__init__.py +1 -0
- minima/catalog/data/capability_priors.json +210 -0
- minima/catalog/data/model_aliases.json +12 -0
- minima/catalog/merge.py +69 -0
- minima/catalog/refresh.py +54 -0
- minima/catalog/sources/__init__.py +1 -0
- minima/catalog/sources/litellm.py +19 -0
- minima/catalog/sources/openrouter.py +25 -0
- minima/catalog/store.py +86 -0
- minima/config.py +288 -0
- minima/deps.py +35 -0
- minima/llm/__init__.py +1 -0
- minima/llm/anthropic.py +106 -0
- minima/llm/base.py +196 -0
- minima/llm/gemini.py +124 -0
- minima/llm/registry.py +54 -0
- minima/logging.py +28 -0
- minima/main.py +109 -0
- minima/memory/__init__.py +1 -0
- minima/memory/adapter.py +572 -0
- minima/memory/keys.py +83 -0
- minima/memory/records.py +190 -0
- minima/memory/threadpool.py +41 -0
- minima/metrics/__init__.py +1 -0
- minima/metrics/calibration.py +415 -0
- minima/metrics/report.py +116 -0
- minima/metrics/savings.py +98 -0
- minima/recommender/__init__.py +1 -0
- minima/recommender/_pg_pool.py +38 -0
- minima/recommender/_redis_client.py +32 -0
- minima/recommender/aggregate.py +157 -0
- minima/recommender/classify.py +165 -0
- minima/recommender/decisionlog.py +505 -0
- minima/recommender/durablerefs.py +312 -0
- minima/recommender/engine.py +997 -0
- minima/recommender/escalation.py +83 -0
- minima/recommender/propensity.py +189 -0
- minima/recommender/recstore.py +368 -0
- minima/recommender/score.py +318 -0
- minima/recommender/types.py +166 -0
- minima/schemas/__init__.py +1 -0
- minima/schemas/common.py +73 -0
- minima/schemas/feedback.py +34 -0
- minima/schemas/models_catalog.py +36 -0
- minima/schemas/recommend.py +104 -0
- minima/schemas/savings.py +39 -0
- minima/schemas/strategies.py +57 -0
- minima/schemas/workflow.py +43 -0
- minima/seeding/__init__.py +1 -0
- minima/seeding/items.py +42 -0
- minima/seeding/llmrouterbench.py +232 -0
- minima/seeding/routerbench.py +141 -0
- minima/seeding/run_seed.py +56 -0
- minima/seeding/synthetic.py +70 -0
- minima/tenancy/__init__.py +8 -0
- minima/tenancy/context.py +37 -0
- minima/tenancy/passthrough.py +110 -0
- minima/version.py +3 -0
- minima_cli-0.4.9.dist-info/METADATA +275 -0
- minima_cli-0.4.9.dist-info/RECORD +161 -0
- minima_cli-0.4.9.dist-info/WHEEL +4 -0
- minima_cli-0.4.9.dist-info/entry_points.txt +5 -0
- minima_cli-0.4.9.dist-info/licenses/LICENSE +295 -0
- minima_client/__init__.py +19 -0
- minima_client/autocapture.py +101 -0
- minima_client/client.py +301 -0
- minima_client/errors.py +23 -0
- minima_harness/LICENSE_PI +32 -0
- minima_harness/__init__.py +16 -0
- minima_harness/agent/__init__.py +72 -0
- minima_harness/agent/agent.py +276 -0
- minima_harness/agent/events.py +124 -0
- minima_harness/agent/loop.py +311 -0
- minima_harness/agent/state.py +79 -0
- minima_harness/agent/tools.py +97 -0
- minima_harness/ai/__init__.py +66 -0
- minima_harness/ai/compat.py +71 -0
- minima_harness/ai/errors.py +96 -0
- minima_harness/ai/events.py +117 -0
- minima_harness/ai/openrouter_catalog.py +153 -0
- minima_harness/ai/provider_catalog.py +299 -0
- minima_harness/ai/provider_quirks.py +37 -0
- minima_harness/ai/providers/__init__.py +75 -0
- minima_harness/ai/providers/_common.py +48 -0
- minima_harness/ai/providers/anthropic.py +290 -0
- minima_harness/ai/providers/base.py +65 -0
- minima_harness/ai/providers/faux.py +173 -0
- minima_harness/ai/providers/google.py +221 -0
- minima_harness/ai/providers/openai_compat.py +278 -0
- minima_harness/ai/registry.py +184 -0
- minima_harness/ai/stream.py +82 -0
- minima_harness/ai/tools.py +51 -0
- minima_harness/ai/types.py +204 -0
- minima_harness/ai/usage.py +41 -0
- minima_harness/minima/__init__.py +40 -0
- minima_harness/minima/cache.py +102 -0
- minima_harness/minima/config.py +85 -0
- minima_harness/minima/goals.py +226 -0
- minima_harness/minima/judge.py +144 -0
- minima_harness/minima/mapping.py +147 -0
- minima_harness/minima/meter.py +143 -0
- minima_harness/minima/router.py +220 -0
- minima_harness/minima/runtime.py +544 -0
- minima_harness/minima/signals.py +195 -0
- minima_harness/session/__init__.py +14 -0
- minima_harness/session/format.py +35 -0
- minima_harness/session/store.py +236 -0
- minima_harness/tasks/__init__.py +17 -0
- minima_harness/tasks/task_set.py +78 -0
- minima_harness/tools/__init__.py +7 -0
- minima_harness/tools/_io.py +34 -0
- minima_harness/tools/bash.py +70 -0
- minima_harness/tools/builtin.py +23 -0
- minima_harness/tools/edit.py +50 -0
- minima_harness/tools/find.py +38 -0
- minima_harness/tools/grep.py +73 -0
- minima_harness/tools/ls.py +35 -0
- minima_harness/tools/read.py +38 -0
- minima_harness/tools/tasks.py +75 -0
- minima_harness/tools/write.py +36 -0
- minima_harness/tui/__init__.py +3 -0
- minima_harness/tui/analytics.py +111 -0
- minima_harness/tui/app.py +1927 -0
- minima_harness/tui/bridge.py +103 -0
- minima_harness/tui/cli.py +227 -0
- minima_harness/tui/clipboard.py +60 -0
- minima_harness/tui/commands.py +49 -0
- minima_harness/tui/compaction.py +17 -0
- minima_harness/tui/config_cli.py +141 -0
- minima_harness/tui/config_store.py +237 -0
- minima_harness/tui/context.py +93 -0
- minima_harness/tui/customize.py +95 -0
- minima_harness/tui/diff.py +53 -0
- minima_harness/tui/editor.py +43 -0
- minima_harness/tui/extensions.py +84 -0
- minima_harness/tui/extra_models.py +52 -0
- minima_harness/tui/history.py +71 -0
- minima_harness/tui/mubit.py +295 -0
- minima_harness/tui/overlays.py +593 -0
- minima_harness/tui/packages.py +59 -0
- minima_harness/tui/run_modes.py +66 -0
- minima_harness/tui/theme.py +77 -0
- minima_harness/tui/welcome.py +83 -0
- minima_harness/tui/widgets/__init__.py +3 -0
- minima_harness/tui/widgets/banner.py +38 -0
- minima_harness/tui/widgets/editor.py +83 -0
- minima_harness/tui/widgets/footer.py +73 -0
- minima_harness/tui/widgets/messages.py +151 -0
- minima_harness/tui/widgets/status.py +57 -0
minima/metrics/report.py
ADDED
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
"""Ops-side calibration/savings report over the shared decision-log DB.
|
|
2
|
+
|
|
3
|
+
Usage: ``minima-calibration-report [--days 30] [--org <org_id>]``. Reads the SQLite
|
|
4
|
+
decision log at MINIMA_SQLITE_PATH directly (all orgs unless --org), so it works
|
|
5
|
+
without the API and without a tenant key. With the in-memory store there is nothing
|
|
6
|
+
to read across processes — run the service with MINIMA_RECOMMENDATION_STORE=sqlite.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import argparse
|
|
12
|
+
import json
|
|
13
|
+
import time
|
|
14
|
+
from dataclasses import asdict
|
|
15
|
+
|
|
16
|
+
from minima.config import get_settings
|
|
17
|
+
from minima.metrics.calibration import calibration_by_task_type, cusum_flags, routing_health
|
|
18
|
+
from minima.metrics.savings import summarize
|
|
19
|
+
from minima.recommender.decisionlog import DecisionRecord, SqliteDecisionLog
|
|
20
|
+
|
|
21
|
+
# Seconds in one day; main() multiplies it by --days to get the Unix-time cutoff.
_SECONDS_PER_DAY = 86_400.0
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _print_org(org_id: str, rows: list[DecisionRecord], settings) -> None:  # noqa: ANN001
    """Write the text report section for one org to stdout.

    Every metric is computed before anything is printed. The section consists of
    a header, a routing-health JSON line, a savings line, one ECE line per
    calibration slice, and the first 20 CUSUM drift flags (or "none").

    Args:
        org_id: Org identifier shown in the section header.
        rows: Decision records belonging to that org.
        settings: Settings object supplying the calibration and CUSUM parameters.
    """
    health_stats = routing_health(rows)
    reports = calibration_by_task_type(
        rows,
        n_bins=settings.minima_calibration_bins,
        shrinkage_k=settings.minima_calibration_shrinkage_k,
    )
    flags = cusum_flags(rows, k=settings.minima_cusum_k, h=settings.minima_cusum_h)
    summary = summarize(rows)
    est = summary.estimated
    real = summary.realized

    print(f"\n=== org: {org_id} ===")
    print(f"health: {json.dumps(health_stats)}")
    print(
        f"savings: est vs premium ${est.savings_vs_premium_usd:.4f} over {est.n} recs"
        f" | est vs declared ${est.savings_vs_declared_usd:.4f} over {est.n_declared}"
        f" | realized(vs est premium) ${real.savings_vs_premium_est_usd:.4f}"
        f" over {real.n_reconciled} reconciled"
    )

    print("calibration (ECE by task_type, shrunk toward global):")
    for r in reports:
        print(
            f"  {r.slice_key:<16} n={r.n:<6} ece={r.ece:.4f}"
            f" shrunk={r.ece_shrunk:.4f} quality={r.ece_quality:.4f}"
        )

    if not flags:
        print("drift flags: none")
        return

    print("drift flags (CUSUM):")
    # Only the first 20 flags are listed to keep the section short.
    for f in flags[:20]:
        print(
            f"  {f.cluster} / {f.model_id}: {f.direction}"
            f" (S={f.statistic}, n={f.n})"
        )
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def main() -> None:
    """CLI entry point: print a per-org calibration/savings report.

    Parses ``--days``, ``--org`` and ``--json``, reads the decision rows newer
    than ``now - days`` from the SQLite decision log named in the settings
    (optionally restricted to one org), groups them by ``org_id`` and prints
    either a text report or, with ``--json``, a single JSON object keyed by org.

    When no rows match, text mode prints a short explanatory line. JSON mode
    prints an empty object instead, so stdout is always parseable JSON.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--days", type=float, default=30.0)
    parser.add_argument("--org", default=None, help="restrict to one org id")
    parser.add_argument("--json", action="store_true", help="emit raw JSON instead of text")
    args = parser.parse_args()

    settings = get_settings()
    backend = SqliteDecisionLog(
        settings.minima_sqlite_path, settings.minima_decision_log_retention_days
    )
    since = time.time() - args.days * _SECONDS_PER_DAY
    rows = backend.rows(since=since, org_id=args.org)
    if not rows:
        if args.json:
            # Keep stdout machine-readable: an empty report is an empty object,
            # not a free-text message that would break a downstream JSON parser.
            print(json.dumps({}, indent=2))
            return
        print(
            f"no decisions in the last {args.days:g} days"
            f" (db: {settings.minima_sqlite_path}"
            f"{', org: ' + args.org if args.org else ''})"
        )
        return

    # Bucket rows per org; both output modes iterate the orgs in sorted order.
    by_org: dict[str, list[DecisionRecord]] = {}
    for r in rows:
        by_org.setdefault(r.org_id, []).append(r)

    if args.json:
        payload = {
            org: {
                "health": routing_health(org_rows),
                "savings": asdict(summarize(org_rows)),
                "calibration": [
                    asdict(rep)
                    for rep in calibration_by_task_type(
                        org_rows,
                        n_bins=settings.minima_calibration_bins,
                        shrinkage_k=settings.minima_calibration_shrinkage_k,
                    )
                ],
                "drift_flags": [
                    asdict(f)
                    for f in cusum_flags(
                        org_rows, k=settings.minima_cusum_k, h=settings.minima_cusum_h
                    )
                ],
            }
            for org, org_rows in sorted(by_org.items())
        }
        print(json.dumps(payload, indent=2))
        return

    print(f"decision-log report — last {args.days:g} days, {len(rows)} decisions")
    for org, org_rows in sorted(by_org.items()):
        _print_org(org, org_rows, settings)


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
"""Savings accounting over the decision log: estimated and realized, two baselines.
|
|
2
|
+
|
|
3
|
+
Both counterfactual baselines are always reported side by side and explicitly labeled:
|
|
4
|
+
``vs_premium`` (the most expensive scored candidate — generous, overstates savings for
|
|
5
|
+
callers who would never have used the premium model) and ``vs_declared`` (the caller's
|
|
6
|
+
stated default via RecommendRequest.baseline_model_id — honest, but only present when
|
|
7
|
+
callers declare one). Realized figures use the actual cost reported at feedback and are
|
|
8
|
+
restricted to the reconciled subset; estimated and realized are never mixed in one number.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
from dataclasses import dataclass, field
|
|
14
|
+
|
|
15
|
+
from minima.recommender.decisionlog import DecisionRecord
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@dataclass(slots=True)
class SavingsEstimated:
    """Estimated-cost totals that ``summarize`` accumulates over every row."""

    # Number of rows counted.
    n: int = 0
    # Sum of est_cost_recommended over all rows.
    cost_recommended_usd: float = 0.0
    # Sum of est_cost_premium over all rows.
    cost_premium_usd: float = 0.0
    # Sum of (est_cost_premium - est_cost_recommended) over all rows.
    savings_vs_premium_usd: float = 0.0
    # Number of rows whose est_cost_baseline_declared is not None.
    n_declared: int = 0
    # Sum of est_cost_baseline_declared over those rows.
    cost_declared_usd: float = 0.0
    # Sum of (est_cost_baseline_declared - est_cost_recommended) over those rows.
    savings_vs_declared_usd: float = 0.0
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@dataclass(slots=True)
class SavingsRealized:
    """Totals that ``summarize`` accumulates over the reconciled subset only.

    A row belongs to the subset when it is reconciled and carries a realized
    cost greater than zero.
    """

    # Number of rows in the reconciled subset.
    n_reconciled: int = 0
    # Sum of realized_cost_usd over the subset.
    realized_cost_usd: float = 0.0
    # Sum of est_cost_recommended over the subset.
    est_cost_recommended_usd: float = 0.0
    # Sum of est_cost_premium over the subset.
    est_cost_premium_usd: float = 0.0
    # Sum of (est_cost_premium - realized_cost_usd): realized cost against the
    # estimated premium baseline.
    savings_vs_premium_est_usd: float = 0.0
    # Number of subset rows whose est_cost_baseline_declared is not None.
    n_declared: int = 0
    # Sum of est_cost_baseline_declared over those rows.
    est_cost_declared_usd: float = 0.0
    # Sum of (est_cost_baseline_declared - realized_cost_usd) over those rows.
    savings_vs_declared_est_usd: float = 0.0
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
@dataclass(slots=True)
class SavingsSummary:
    """Pair of estimated and realized savings totals returned by ``summarize``."""

    # Totals over every row, based on estimated costs only.
    estimated: SavingsEstimated = field(default_factory=SavingsEstimated)
    # Totals over the reconciled subset, based on realized costs.
    realized: SavingsRealized = field(default_factory=SavingsRealized)
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def summarize(rows: list[DecisionRecord]) -> SavingsSummary:
    """Fold decision rows into estimated and realized savings totals.

    Every row contributes to the estimated totals. Only rows that are reconciled
    and carry a realized cost greater than zero contribute to the realized
    totals. Declared-baseline figures are accumulated only for rows that have a
    declared baseline cost. All float totals are rounded to 8 decimal places.

    Args:
        rows: Decision records to summarize.

    Returns:
        A ``SavingsSummary`` holding both sets of totals.
    """
    estimated = SavingsEstimated()
    realized = SavingsRealized()

    for row in rows:
        has_declared = row.est_cost_baseline_declared is not None

        estimated.n += 1
        estimated.cost_recommended_usd += row.est_cost_recommended
        estimated.cost_premium_usd += row.est_cost_premium
        estimated.savings_vs_premium_usd += row.est_cost_premium - row.est_cost_recommended
        if has_declared:
            estimated.n_declared += 1
            estimated.cost_declared_usd += row.est_cost_baseline_declared
            estimated.savings_vs_declared_usd += (
                row.est_cost_baseline_declared - row.est_cost_recommended
            )

        in_realized_subset = (
            row.reconciled and row.realized_cost_usd is not None and row.realized_cost_usd > 0
        )
        if not in_realized_subset:
            continue

        realized.n_reconciled += 1
        realized.realized_cost_usd += row.realized_cost_usd
        realized.est_cost_recommended_usd += row.est_cost_recommended
        realized.est_cost_premium_usd += row.est_cost_premium
        # The premium model was never run, so its ESTIMATED cost is the only
        # counterfactual available for the realized figure (hence the "est" label).
        realized.savings_vs_premium_est_usd += row.est_cost_premium - row.realized_cost_usd
        if has_declared:
            realized.n_declared += 1
            realized.est_cost_declared_usd += row.est_cost_baseline_declared
            realized.savings_vs_declared_est_usd += (
                row.est_cost_baseline_declared - row.realized_cost_usd
            )

    # Round the float totals only; the integer counters are left untouched.
    for bucket in (estimated, realized):
        for field_name in bucket.__dataclass_fields__:
            current = getattr(bucket, field_name)
            if not isinstance(current, float):
                continue
            setattr(bucket, field_name, round(current, 8))

    return SavingsSummary(estimated=estimated, realized=realized)
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def group_rows(
    rows: list[DecisionRecord], group_by: str | None
) -> dict[str, list[DecisionRecord]]:
    """Bucket rows by one of the supported record attributes.

    Args:
        rows: Decision records to group.
        group_by: ``"cluster"``, ``"task_type"`` or ``"lane"``. Any other value,
            including ``None``, produces an empty mapping.

    Returns:
        A dict from attribute value to the rows carrying it. Keys appear in
        first-seen order and rows keep their input order.
    """
    attribute = None
    for supported in ("cluster", "task_type", "lane"):
        if group_by == supported:
            attribute = supported
            break
    if attribute is None:
        return {}

    buckets: dict[str, list[DecisionRecord]] = {}
    for row in rows:
        bucket_key = getattr(row, attribute)
        buckets.setdefault(bucket_key, []).append(row)
    return buckets
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""The recommendation engine: classify -> recall -> aggregate -> score -> optimize."""
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
"""Shared psycopg2 ThreadedConnectionPool registry keyed by database URL.
|
|
2
|
+
|
|
3
|
+
All four durable store classes (RecStore, DecisionLog, Propensity, DurableRefs) call
|
|
4
|
+
get_pool() with the same URL, so the process holds exactly one connection pool per
|
|
5
|
+
database regardless of how many store objects are instantiated.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import contextlib
|
|
11
|
+
from collections.abc import Generator
|
|
12
|
+
from threading import Lock
|
|
13
|
+
|
|
14
|
+
import psycopg2
|
|
15
|
+
from psycopg2.pool import ThreadedConnectionPool
|
|
16
|
+
|
|
17
|
+
# Process-wide registry of connection pools, keyed by database URL.
_pools: dict[str, ThreadedConnectionPool] = {}
# Held by get_pool() while it looks up or creates a registry entry.
_lock = Lock()
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def get_pool(url: str, minconn: int = 1, maxconn: int = 5) -> ThreadedConnectionPool:
    """Return the shared connection pool for ``url``, creating it on first use.

    ``minconn`` and ``maxconn`` only take effect on the call that creates the
    pool; later calls for the same URL return the existing pool unchanged.
    """
    with _lock:
        pool = _pools.get(url)
        if pool is None:
            pool = ThreadedConnectionPool(minconn, maxconn, url)
            _pools[url] = pool
        return pool
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@contextlib.contextmanager
def cursor(url: str) -> Generator[psycopg2.extensions.cursor, None, None]:
    """Yield a cursor inside a committed transaction; return the connection to the pool.

    The connection is borrowed from the shared pool for ``url`` and is always
    handed back, whether the managed body succeeds or raises.
    """
    pool = get_pool(url)
    connection = pool.getconn()
    try:
        # Leaving the connection context commits on a clean exit and rolls back
        # on an exception; the cursor context is exited first.
        with connection, connection.cursor() as cur:
            yield cur
    finally:
        pool.putconn(connection)
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
"""Shared Redis client registry keyed by URL.
|
|
2
|
+
|
|
3
|
+
RecStore and DurableRefs both call get_client() with the same URL, so the process
|
|
4
|
+
holds exactly one connection pool (redis.ConnectionPool is built into redis.Redis)
|
|
5
|
+
per Redis instance.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from threading import Lock
|
|
11
|
+
|
|
12
|
+
import redis as _redis
|
|
13
|
+
|
|
14
|
+
# Process-wide registry of Redis clients, keyed by URL.
_clients: dict[str, _redis.Redis] = {}
# Held by get_client() while it looks up or creates a registry entry.
_lock = Lock()
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def get_client(url: str) -> _redis.Redis:
    """Return the shared Redis client for ``url``, creating it on first use.

    Clients are created with ``decode_responses=True``.
    """
    with _lock:
        client = _clients.get(url)
        if client is None:
            client = _redis.from_url(url, decode_responses=True)
            _clients[url] = client
        return client
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def decode(value: bytes | str) -> str:
    """Narrow a redis response to ``str``.

    ``get_client`` always sets ``decode_responses=True``, so values are ``str`` at
    runtime, yet the redis-py stubs type them ``bytes | str``. This helper bridges
    that gap for callers and still decodes properly should the flag ever flip.
    """
    if isinstance(value, bytes):
        return value.decode()
    return value
|
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
"""Turn recalled outcomes into a weighted per-model summary."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import time
|
|
6
|
+
from collections.abc import Iterable
|
|
7
|
+
|
|
8
|
+
from minima.memory.records import OutcomeRecord, RecalledEvidence, clamp01
|
|
9
|
+
from minima.recommender.types import ModelAggregate
|
|
10
|
+
|
|
11
|
+
# Floor on the confidence multiplier so freshly-seeded (un-reinforced) but
# topically-relevant evidence still counts — just less than reinforced evidence.
KC_FLOOR = 0.3
# Decay multiplier for stale evidence in neighbor_weight: used as-is when the
# record has no usable timestamp, and as an upper cap when age decay applies.
STALE_DECAY = 0.5

# Seconds in one day; age_decay uses it to convert an age in seconds to days.
_SECONDS_PER_DAY = 86_400.0
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def age_decay(
    recorded_at: float | None,
    *,
    half_life_days: float,
    floor: float,
    now: float | None = None,
) -> float | None:
    """Exponential decay by observation age: halves every half-life, never below ``floor``.

    Args:
        recorded_at: Unix timestamp of the observation, or None.
        half_life_days: Age in days at which the factor reaches 0.5.
        floor: Lower bound for the returned factor.
        now: Reference time; defaults to the current time.

    Returns:
        The decay factor, or None when the record has no usable timestamp
        (missing or non-positive) or the half-life is non-positive — the caller
        then falls back to the binary staleness penalty. Timestamps in the
        future count as age zero, i.e. no decay.
    """
    if recorded_at is None:
        return None
    if recorded_at <= 0.0 or half_life_days <= 0.0:
        return None

    reference = time.time() if now is None else now
    elapsed_seconds = max(0.0, reference - recorded_at)
    age_days = elapsed_seconds / _SECONDS_PER_DAY
    decayed = 0.5 ** (age_days / half_life_days)
    return max(floor, decayed)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def neighbor_weight(
    ev: RecalledEvidence,
    *,
    half_life_days: float = 0.0,
    decay_floor: float = 0.1,
    now: float | None = None,
) -> float:
    """Weight of one recalled neighbor: similarity x confidence multiplier x decay.

    The decay term is the observation-age decay when the record carries a usable
    timestamp, and the legacy binary staleness penalty otherwise. Supersession
    (``is_stale``) caps the decay at ``STALE_DECAY`` in both cases.
    ``knowledge_confidence`` is deliberately left as-is: its server-side recency
    component tracks *reinforcement* recency, whereas this decay tracks
    *observation* age.
    """
    record = ev.record
    observed_at = record.recorded_at if record else None
    age_factor = age_decay(
        observed_at,
        half_life_days=half_life_days,
        floor=decay_floor,
        now=now,
    )
    if age_factor is None:
        # No timestamp (or decay disabled): legacy all-or-nothing penalty.
        age_factor = STALE_DECAY if ev.is_stale else 1.0
    elif ev.is_stale:
        age_factor = min(age_factor, STALE_DECAY)

    similarity = max(0.0, ev.score)
    confidence_mult = KC_FLOOR + (1.0 - KC_FLOOR) * clamp01(ev.knowledge_confidence)
    return similarity * confidence_mult * age_factor
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def seed_factor(n_live: int, *, seed_weight: float, crowdout_n: int) -> float:
    """Weight multiplier for seeded evidence, crowded out linearly by live outcomes.

    With a non-positive ``crowdout_n`` crowd-out is off and ``seed_weight`` is
    returned unchanged. Otherwise the multiplier falls linearly from
    ``seed_weight`` at zero live outcomes to 0 at ``crowdout_n`` or more.
    """
    if crowdout_n <= 0:
        return seed_weight
    live_share = n_live / float(crowdout_n)
    remaining = max(0.0, 1.0 - live_share)
    return seed_weight * remaining
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def aggregate_by_model(
    evidence: Iterable[RecalledEvidence],
    candidate_ids: set[str] | None = None,
    *,
    half_life_days: float = 0.0,
    decay_floor: float = 0.1,
    seed_weight: float = 1.0,
    seed_crowdout_n: int = 0,
    now: float | None = None,
) -> dict[str, ModelAggregate]:
    """Group neighbors by model and accumulate weighted success statistics.

    Two passes: the first keeps the usable evidence and counts live (non-seed)
    outcomes per model, so seeded evidence can be crowded out as real feedback
    accumulates; the second accumulates the weights. Defaults preserve legacy
    behavior (no age decay, seeds at full weight).

    Args:
        evidence: Recalled neighbors; entries without a record are skipped.
        candidate_ids: When given, only records for these model ids are kept.
        half_life_days: Half-life passed to ``neighbor_weight`` for age decay.
        decay_floor: Lower bound passed to ``neighbor_weight`` for age decay.
        seed_weight: Base multiplier for seeded evidence (records whose
            ``source_dataset`` is set).
        seed_crowdout_n: Live-outcome count at which seeded evidence for a model
            is fully crowded out; non-positive disables crowd-out.
        now: Reference time for age decay; defaults to the current time.

    Returns:
        One ``ModelAggregate`` per model id that had usable evidence.
    """
    items: list[tuple[RecalledEvidence, OutcomeRecord]] = []
    n_live: dict[str, int] = {}
    for ev in evidence:
        rec = ev.record
        if rec is None:
            continue
        if candidate_ids is not None and rec.model_id not in candidate_ids:
            continue
        items.append((ev, rec))
        # A record without a source dataset is a live (non-seed) outcome.
        if rec.source_dataset is None:
            n_live[rec.model_id] = n_live.get(rec.model_id, 0) + 1

    aggs: dict[str, ModelAggregate] = {}
    kc_totals: dict[str, float] = {}

    for ev, rec in items:
        model_id = rec.model_id

        weight = neighbor_weight(
            ev, half_life_days=half_life_days, decay_floor=decay_floor, now=now
        )
        if rec.source_dataset is not None:
            # Always apply the seed factor. The previous `seed_weight != 1.0`
            # guard also skipped crowd-out, so seeds were never discounted when
            # seed_weight was 1.0 but seed_crowdout_n was positive. With the
            # legacy defaults the factor is exactly 1.0, so nothing changes there.
            weight *= seed_factor(
                n_live.get(model_id, 0), seed_weight=seed_weight, crowdout_n=seed_crowdout_n
            )
        agg = aggs.get(model_id)
        if agg is None:
            agg = ModelAggregate(model_id=model_id)
            aggs[model_id] = agg
            kc_totals[model_id] = 0.0

        y = clamp01(rec.quality_score)
        agg.weight_sum += weight
        agg.weighted_success += weight * y
        agg.n += 1
        agg.evidence.append(ev)
        kc_totals[model_id] += clamp01(ev.knowledge_confidence)
        # Observed cost is derived on demand from agg.evidence (robust median, similarity
        # weighted) — see ModelAggregate.observed_cost — not accumulated here.

    for model_id, agg in aggs.items():
        agg.avg_knowledge_confidence = kc_totals[model_id] / agg.n if agg.n else 0.0

    return aggs
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def is_conflicted(agg: ModelAggregate, min_n: int = 4, lo: float = 0.35, hi: float = 0.70) -> bool:
    """Tell whether a model's neighbors show mixed success.

    True when the aggregate has at least ``min_n`` neighbors and its weighted
    success rate lies in the closed band ``[lo, hi]``. The band is kept wide so
    that degrading models are caught as well.
    """
    if not agg.n >= min_n:
        return False
    success_rate = agg.weighted_success_rate
    return lo <= success_rate <= hi
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
def apply_ipw(
    aggs: dict[str, ModelAggregate],
    propensities: dict[str, float],
    clip_low: float,
    clip_high: float,
) -> None:
    """Scale each model's evidence mass by its clipped inverse propensity, in place.

    ``weight_sum`` and ``weighted_success`` are multiplied by the same factor, so
    the empirical success rate is unchanged while evidence from
    rarely-recommended (low-propensity) models gains mass and is not drowned out
    by selection bias. Models with a missing, zero or negative propensity are
    left untouched.
    """
    for model_id in aggs:
        propensity = propensities.get(model_id)
        if not propensity or propensity <= 0:
            continue
        inverse = 1.0 / propensity
        scale = min(clip_high, max(clip_low, inverse))
        target = aggs[model_id]
        target.weight_sum *= scale
        target.weighted_success *= scale
|
|
@@ -0,0 +1,165 @@
|
|
|
1
|
+
"""Heuristic task classification: type and difficulty from the prompt text.
|
|
2
|
+
|
|
3
|
+
Cheap and deterministic. Caller-supplied ``task_type``/``difficulty`` always win.
|
|
4
|
+
A cheap-LLM classifier is layered in a later phase when confidence is low.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import re
|
|
10
|
+
|
|
11
|
+
from minima.schemas.common import Difficulty, TaskInput, TaskType
|
|
12
|
+
|
|
13
|
+
# Ordered most-specific-first; the first matching pattern wins.
# NOTE(review): because the code pattern comes first and contains `\bfunction\b`,
# a prompt saying "function call" is classified as code before the tool_use
# pattern is ever tried — confirm that ordering is intended.
_TYPE_PATTERNS: list[tuple[TaskType, re.Pattern[str]]] = [
    (
        TaskType.code,
        re.compile(
            r"```|\bdef \b|\bclass \b|\bfunction\b|\bimport \b|\bSELECT \b|regex|stack ?trace|compile|refactor|implement|unit test|debug",
            re.I,
        ),
    ),
    (
        TaskType.translation,
        re.compile(
            r"\btranslate\b|\bin (french|spanish|german|chinese|japanese|hindi)\b|\bto (french|spanish|german)\b",
            re.I,
        ),
    ),
    (
        TaskType.summarization,
        re.compile(r"\bsummari[sz]e\b|\btl;?dr\b|\bsummary\b|\bcondense\b|\bin brief\b", re.I),
    ),
    (
        TaskType.extraction,
        re.compile(
            r"\bextract\b|\bparse\b|\bpull out\b|\blist all\b|\bfind all\b|\bjson schema\b|\bfields?\b.*\bfrom\b",
            re.I,
        ),
    ),
    (
        TaskType.classification,
        re.compile(
            r"\bclassif|\bcategori[sz]e\b|\blabel\b|\bsentiment\b|\bwhich (category|class|label)\b|\btrue or false\b",
            re.I,
        ),
    ),
    (
        TaskType.tool_use,
        re.compile(
            r"\bcall the\b|\buse the tool\b|\bfunction call\b|\btool[_ ]?call\b|\bapi call\b", re.I
        ),
    ),
    (
        TaskType.rag,
        # `context:` carries no trailing \b: a boundary after the colon would need
        # a word character right behind it, so "Context: ..." and "Context:\n..."
        # (the usual forms) would never match.
        re.compile(
            r"\bbased on the (following|context|document)\b|\baccording to the\b|\bcontext:|\bgiven the passage\b",
            re.I,
        ),
    ),
    (
        TaskType.creative,
        re.compile(
            r"\bwrite a (story|poem|song|essay)\b|\bcreative\b|\bimagine\b|\bbrainstorm\b", re.I
        ),
    ),
    (
        TaskType.reasoning,
        re.compile(
            r"\bprove\b|\bcalculate\b|\bsolve\b|\bequation\b|\bstep[- ]by[- ]step\b|\breason(ing)?\b|\bderive\b|\bwhy (does|is|are)\b",
            re.I,
        ),
    ),
    (
        TaskType.qa,
        re.compile(r"^\s*(what|who|when|where|why|how|which|is|are|does|can)\b|\?\s*$", re.I),
    ),
]
|
|
78
|
+
|
|
79
|
+
# Phrases hinting at multi-step or constrained work; infer_difficulty raises the
# level by one when at least two matches are found.
# NOTE(review): the trailing \b also applies to the `\d\.\s` alternative, so a list
# marker such as "1. " only matches when a word character follows the whitespace —
# confirm that is intended.
_COMPLEXITY_MARKERS = re.compile(
    r"\b(and then|after that|must|ensure|constraint|optimi[sz]e|edge case|step \d|\d\.\s)\b", re.I
)

# Task types that tend to need more capability for the same text length.
_HARD_TYPES = {TaskType.code, TaskType.reasoning}
# Task types for which infer_difficulty lowers the level by one.
_EASY_TYPES = {
    TaskType.classification,
    TaskType.extraction,
    TaskType.summarization,
    TaskType.translation,
}

# Difficulty ladder from lowest to highest; infer_difficulty indexes into it.
_ORDER = [
    Difficulty.trivial,
    Difficulty.easy,
    Difficulty.medium,
    Difficulty.hard,
    Difficulty.expert,
]
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def infer_task_type(text: str) -> TaskType:
    """Return the type of the first pattern in ``_TYPE_PATTERNS`` that matches ``text``.

    Falls back to ``TaskType.other`` when no pattern matches.
    """
    matches = (kind for kind, regex in _TYPE_PATTERNS if regex.search(text))
    return next(matches, TaskType.other)
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def infer_difficulty(text: str, task_type: TaskType) -> Difficulty:
    """Estimate difficulty from prompt length, complexity markers and task type.

    The starting level comes from the whitespace-separated word count: under 40
    words is easy, under 150 medium, under 400 hard, anything longer expert. Two
    or more complexity markers raise it by one. A hard task type raises it by
    one more and an easy task type lowers it by one. The result is clamped to
    the ``_ORDER`` ladder.
    """
    word_count = len(text.split())
    # Level 1 (easy) plus one step for every length threshold reached.
    level = 1 + sum(word_count >= threshold for threshold in (40, 150, 400))

    marker_hits = _COMPLEXITY_MARKERS.findall(text)
    if len(marker_hits) >= 2:
        level += 1  # multiple multi-step / constraint markers

    if task_type in _HARD_TYPES:
        level += 1
    elif task_type in _EASY_TYPES:
        level -= 1

    top = len(_ORDER) - 1
    return _ORDER[min(max(level, 0), top)]
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def classify(task: TaskInput) -> tuple[TaskType, Difficulty]:
    """Return ``(task_type, difficulty)`` for a task.

    Values supplied on the task always win. Whatever is missing is inferred
    from the prompt text, and the difficulty inference uses the task type that
    was finally chosen.
    """
    task_type = task.task_type
    if not task_type:
        task_type = infer_task_type(task.task)

    difficulty = task.difficulty
    if not difficulty:
        difficulty = infer_difficulty(task.task, task_type)

    return task_type, difficulty
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def classify_from_neighbors(
    votes: list[tuple[str, float]], *, min_neighbors: int = 2, min_share: float = 0.5
) -> TaskType | None:
    """Disambiguate an `other` classification from ANN-recalled semantic neighbors.

    ``votes`` holds ``(neighbor_task_type, similarity)`` pairs over recalled
    outcomes. Empty and `other` types are ignored, and negative similarities
    count as zero. The similarity-weighted plurality type is returned when it has
    >= ``min_neighbors`` supporters and holds >= ``min_share`` of the weighted
    vote; otherwise, or when the winner is not a valid ``TaskType`` value, the
    result is None. This is the free, semantic alternative to a paid
    LLM-classify call for prompts the regex can't place.
    """
    mass_by_type: dict[str, float] = {}
    supporters: dict[str, int] = {}
    total_mass = 0.0
    for type_name, similarity in votes:
        if not type_name or type_name == TaskType.other.value:
            continue
        mass = max(0.0, similarity)
        mass_by_type[type_name] = mass_by_type.get(type_name, 0.0) + mass
        supporters[type_name] = supporters.get(type_name, 0) + 1
        total_mass += mass

    if total_mass <= 0.0:
        return None

    winner = max(mass_by_type, key=mass_by_type.__getitem__)
    if supporters[winner] < min_neighbors:
        return None
    if (mass_by_type[winner] / total_mass) < min_share:
        return None

    try:
        return TaskType(winner)
    except ValueError:
        return None
|