modelab 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- modelab/__init__.py +126 -0
- modelab/_assignment.py +163 -0
- modelab/_engine.py +40 -0
- modelab/_errors.py +26 -0
- modelab/_server_storage.py +102 -0
- modelab/_state.py +42 -0
- modelab/_types.py +70 -0
- modelab/py.typed +0 -0
- modelab-0.1.0.dist-info/METADATA +163 -0
- modelab-0.1.0.dist-info/RECORD +11 -0
- modelab-0.1.0.dist-info/WHEEL +4 -0
modelab/__init__.py
ADDED
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
"""modelab — provider-agnostic A/B testing for LLM systems."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import logging
|
|
7
|
+
import urllib.request
|
|
8
|
+
from typing import Any, Sequence
|
|
9
|
+
|
|
10
|
+
from modelab._assignment import Assignment
|
|
11
|
+
from modelab._engine import assign_variant
|
|
12
|
+
from modelab._errors import FlagNotFoundError, NotInitializedError
|
|
13
|
+
from modelab._server_storage import ServerStorage
|
|
14
|
+
from modelab._state import _global_state
|
|
15
|
+
from modelab._types import (
|
|
16
|
+
AssignmentRecord,
|
|
17
|
+
EvalContext,
|
|
18
|
+
Flag,
|
|
19
|
+
Variant,
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
__all__ = [
|
|
23
|
+
"init",
|
|
24
|
+
"assign",
|
|
25
|
+
"evaluate",
|
|
26
|
+
"Flag",
|
|
27
|
+
"Variant",
|
|
28
|
+
"EvalContext",
|
|
29
|
+
"Assignment",
|
|
30
|
+
]
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def init(
    server: str,
    flags: Sequence[Flag] = (),
    api_key: str = "",
) -> None:
    """Configure the process-wide modelab client.

    Creates a ``ServerStorage`` pointed at ``server`` and hands it, together
    with the flag definitions, to the module-level state object.

    Args:
        server: The modelab server URL (e.g. "http://localhost:8100").
        flags: The experiment flags to register.
        api_key: Optional API key for server authentication.
    """
    # Storage is created first, then the flags are materialised into a list,
    # because the state object expects a concrete list.
    backend = ServerStorage(server, api_key=api_key)
    registered = list(flags)
    _global_state.configure(backend, registered, server_url=server)
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def assign(flag_name: str, ctx: EvalContext) -> Assignment | None:
    """Assign a variant for the given flag and context.

    The chosen variant is buffered to the storage backend as an
    ``AssignmentRecord``; a failure to buffer is logged and does not prevent
    the assignment from being returned.

    Args:
        flag_name: Name of a flag registered through ``init()``.
        ctx: Identifies who is being assigned.

    Returns:
        An ``Assignment`` for the selected variant, or ``None`` if the user is
        outside the rollout percentage.

    Raises:
        NotInitializedError: If ``init()`` hasn't been called.
        FlagNotFoundError: If the flag name isn't registered.
    """
    # Read the storage reference exactly once and validate it explicitly.
    # An ``assert`` would be stripped under ``python -O`` and a concurrent
    # ``reset()`` could otherwise slip ``None`` into the returned Assignment.
    storage = _global_state.storage
    if storage is None:
        raise NotInitializedError()

    flag = _global_state.flags.get(flag_name)
    if flag is None:
        raise FlagNotFoundError(flag_name)

    variant = assign_variant(flag, ctx)
    if variant is None:
        return None

    record = AssignmentRecord(
        flag_name=flag_name,
        variant_name=variant.name,
        user_id=ctx.user_id,
        session_id=ctx.session_id,
        # Copy so the persisted record is independent of the Variant's dict.
        config_json=dict(variant.config),
    )

    # Best-effort persistence: tracking problems must never break the caller.
    try:
        storage.save_assignment(record)
    except Exception:
        logging.getLogger("modelab").warning(
            "Failed to save assignment for %s", flag_name, exc_info=True
        )

    return Assignment(
        flag_name=flag_name,
        variant_name=variant.name,
        # A second copy, so callers mutating ``assignment.config`` cannot
        # affect the flag definition.
        config=dict(variant.config),
        context=ctx,
        storage=storage,
        assignment_id=record.assignment_id,
    )
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def evaluate(flag_name: str) -> dict[str, Any]:
    """Fetch per-variant metrics for a flag from the server.

    Flushes any buffered data, then queries the server's
    GET /api/v1/flags/{flag_name} endpoint and returns the decoded JSON body.

    Args:
        flag_name: Name of a flag registered through ``init()``.

    Raises:
        NotInitializedError: If ``init()`` hasn't been called.
        FlagNotFoundError: If the flag name isn't registered.
        urllib.error.URLError: If the HTTP request fails (not caught here).
    """
    # Local import keeps this block self-contained.
    from urllib.parse import quote

    # Explicit check instead of ``assert`` (asserts vanish under ``python -O``).
    storage = _global_state.storage
    if storage is None:
        raise NotInitializedError()

    if flag_name not in _global_state.flags:
        raise FlagNotFoundError(flag_name)

    # Push buffered records first so the metrics include the latest data.
    storage.flush()

    base = _global_state.server_url.rstrip("/")
    # Percent-encode the name: flag names containing "/", "?", "#" or spaces
    # would otherwise produce a different (or invalid) URL.
    url = f"{base}/api/v1/flags/{quote(flag_name, safe='')}"

    headers: dict[str, str] = {}
    # Send the same key the ingest calls use.
    # NOTE(review): assumes the read endpoint accepts X-API-Key like the
    # ingest endpoints do — confirm against the server.
    api_key = getattr(storage, "_api_key", "")
    if api_key:
        headers["X-API-Key"] = api_key

    req = urllib.request.Request(url, headers=headers, method="GET")
    with urllib.request.urlopen(req, timeout=10) as resp:
        return json.loads(resp.read())
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def reset() -> None:
    """Return modelab to its uninitialized state.

    Delegates to the module-level state object, which drops the registered
    flags, the storage reference and the server URL. Mainly useful in tests
    that need a clean slate between runs.
    """
    state = _global_state
    state.reset()
|
modelab/_assignment.py
ADDED
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
"""Assignment class — the main object returned by modelab.assign()."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
from typing import TYPE_CHECKING, Any
|
|
7
|
+
|
|
8
|
+
from modelab._types import (
|
|
9
|
+
EvalContext,
|
|
10
|
+
EventRecord,
|
|
11
|
+
ExecutionRecord,
|
|
12
|
+
)
|
|
13
|
+
|
|
14
|
+
if TYPE_CHECKING:
|
|
15
|
+
from modelab._server_storage import ServerStorage
|
|
16
|
+
|
|
17
|
+
logger = logging.getLogger("modelab")
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _extract_usage_from_response(response: Any) -> tuple[int | None, int | None]:
|
|
21
|
+
"""Duck-type token usage from a provider response object.
|
|
22
|
+
|
|
23
|
+
Tries OpenAI-style attrs first (usage.prompt_tokens / completion_tokens),
|
|
24
|
+
then Anthropic-style (usage.input_tokens / output_tokens).
|
|
25
|
+
Returns (input_tokens, output_tokens) or (None, None) if not found.
|
|
26
|
+
"""
|
|
27
|
+
usage = getattr(response, "usage", None)
|
|
28
|
+
if usage is None:
|
|
29
|
+
return None, None
|
|
30
|
+
|
|
31
|
+
# OpenAI: usage.prompt_tokens / usage.completion_tokens
|
|
32
|
+
prompt = getattr(usage, "prompt_tokens", None)
|
|
33
|
+
completion = getattr(usage, "completion_tokens", None)
|
|
34
|
+
if prompt is not None or completion is not None:
|
|
35
|
+
return prompt, completion
|
|
36
|
+
|
|
37
|
+
# Anthropic: usage.input_tokens / usage.output_tokens
|
|
38
|
+
inp = getattr(usage, "input_tokens", None)
|
|
39
|
+
out = getattr(usage, "output_tokens", None)
|
|
40
|
+
if inp is not None or out is not None:
|
|
41
|
+
return inp, out
|
|
42
|
+
|
|
43
|
+
return None, None
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class Assignment:
    """Handle for one variant assignment, used to report what happened next.

    Exposes the assignment's identifying data as read-only properties and
    offers methods that forward execution metrics and outcome events to the
    storage backend. Storage failures are logged, never raised.
    """

    def __init__(
        self,
        flag_name: str,
        variant_name: str,
        config: dict[str, Any],
        context: EvalContext,
        storage: ServerStorage,
        assignment_id: str,
    ) -> None:
        self._assignment_id = assignment_id
        self._storage = storage
        self._context = context
        self._config = config
        self._variant_name = variant_name
        self._flag_name = flag_name

    # ── Read-only accessors ────────────────────────────────────────────

    @property
    def flag_name(self) -> str:
        """Name of the flag this assignment belongs to."""
        return self._flag_name

    @property
    def variant_name(self) -> str:
        """Name of the variant that was selected."""
        return self._variant_name

    @property
    def config(self) -> dict[str, Any]:
        """Config dict supplied for the selected variant."""
        return self._config

    @property
    def context(self) -> EvalContext:
        """Evaluation context the assignment was made for."""
        return self._context

    @property
    def assignment_id(self) -> str:
        """Identifier linking executions and events to this assignment."""
        return self._assignment_id

    # ── Reporting ──────────────────────────────────────────────────────

    def record(
        self,
        response: Any = None,
        *,
        latency_ms: float | None = None,
        input_tokens: int | None = None,
        output_tokens: int | None = None,
        cost: float | None = None,
        error: str | None = None,
        **metadata: Any,
    ) -> None:
        """Record execution metrics for this assignment.

        When ``response`` is given, token counts are read from
        ``response.usage`` (OpenAI and Anthropic formats) and used for any
        token argument that was not passed explicitly. Explicit keyword
        arguments always take precedence. Extra keyword arguments are stored
        as metadata.
        """
        if response is not None:
            found_in, found_out = _extract_usage_from_response(response)
            # Only fill the gaps; never overwrite a caller-supplied value.
            input_tokens = found_in if input_tokens is None else input_tokens
            output_tokens = found_out if output_tokens is None else output_tokens

        execution = ExecutionRecord(
            assignment_id=self._assignment_id,
            latency_ms=latency_ms,
            input_tokens=input_tokens,
            output_tokens=output_tokens,
            cost=cost,
            error=error,
            metadata_json=metadata,
        )
        self._save_execution(execution)

    def mark_success(self, payload: dict[str, Any] | None = None) -> None:
        """Record a success event.

        Args:
            payload: Optional additional data to attach to the event.
        """
        data = payload if payload else {}
        self._save_event("success", "", data)

    def mark_failure(self, payload: dict[str, Any] | None = None) -> None:
        """Record a failure event.

        Args:
            payload: Optional additional data to attach to the event.
        """
        data = payload if payload else {}
        self._save_event("failure", "", data)

    def mark_custom_event(self, name: str, payload: dict[str, Any] | None = None) -> None:
        """Record a custom, named event.

        Args:
            name: The name of the custom event (e.g., "copied", "dismissed").
            payload: Optional additional data to attach to the event.
        """
        data = payload if payload else {}
        self._save_event("custom", name, data)

    # ── Internal helpers ───────────────────────────────────────────────

    def _save_execution(self, record: ExecutionRecord) -> None:
        """Hand an execution record to storage, logging instead of raising."""
        try:
            self._storage.save_execution(record)
        except Exception:
            logger.warning("Failed to save execution for %s", self._assignment_id, exc_info=True)

    def _save_event(self, event_type: str, event_name: str, payload: dict[str, Any]) -> None:
        """Build an event record and hand it to storage, logging instead of raising."""
        try:
            # Record construction stays inside the try so that any failure
            # here is logged rather than propagated.
            event = EventRecord(
                assignment_id=self._assignment_id,
                event_type=event_type,
                event_name=event_name,
                payload_json=payload,
            )
            self._storage.save_event(event)
        except Exception:
            logger.warning("Failed to save event for %s", self._assignment_id, exc_info=True)
|
modelab/_engine.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
"""Deterministic assignment engine: hash → bucket → rollout gate → variant."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import hashlib
|
|
6
|
+
|
|
7
|
+
from modelab._types import EvalContext, Flag, Variant
|
|
8
|
+
|
|
9
|
+
# Number of hash buckets; 10,000 gives 0.01% rollout granularity.
BUCKET_COUNT = 10_000


def _bucket(flag_name: str, user_id: str) -> int:
    """Return a deterministic bucket in [0, BUCKET_COUNT) for flag + user.

    The bucket is the MD5 digest of ``"{flag_name}:{user_id}"`` interpreted
    as an integer, modulo ``BUCKET_COUNT``.
    """
    key = f"{flag_name}:{user_id}"
    # MD5 is used purely for bucketing, not security. Declaring that lets the
    # call succeed on FIPS-restricted builds without changing the digest.
    digest = hashlib.md5(key.encode(), usedforsecurity=False).hexdigest()
    return int(digest, 16) % BUCKET_COUNT


def assign_variant(flag: Flag, ctx: EvalContext) -> Variant | None:
    """Return the Variant assigned to ``ctx`` for ``flag``.

    Args:
        flag: The flag definition (name, variants, rollout percentage).
        ctx: The evaluation context; only ``user_id`` is used.

    Returns:
        The selected variant, or ``None`` if the user falls outside the
        rollout or the flag has no variants.
    """
    variants = flag.variants
    if not variants:
        # Every path below would return None for an empty variant list.
        return None

    bucket = _bucket(flag.name, ctx.user_id)

    # Rollout gate: rollout_pct of 100 means buckets 0–9999 pass.
    # ``round`` rather than ``int``: float arithmetic can land a hair below
    # the intended integer, and truncation would then silently exclude one
    # bucket.
    rollout_threshold = round(flag.rollout_pct / 100.0 * BUCKET_COUNT)
    if bucket >= rollout_threshold:
        return None

    # Weighted variant selection within the rollout population.
    total_weight = sum(v.weight for v in variants)
    if total_weight == 0:
        return variants[0]

    # NOTE(review): ``point`` is derived from the same bucket as the rollout
    # gate. When rollout_threshold is not a multiple of total_weight (e.g.
    # 0.5% rollout with 50/50 weights) the split is skewed toward earlier
    # variants. Fixing it needs an independent hash, which would reshuffle
    # existing sticky assignments, so it is flagged here rather than changed.
    point = bucket % total_weight
    cumulative = 0
    for variant in variants:
        cumulative += variant.weight
        if point < cumulative:
            return variant

    # Fallback (shouldn't happen if weights > 0)
    return variants[-1]
|
modelab/_errors.py
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
"""Exception hierarchy for modelab."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class ModelabError(Exception):
    """Base exception for all modelab errors."""


class NotInitializedError(ModelabError):
    """Raised when the SDK is used before modelab.init() has been called."""

    def __init__(self) -> None:
        # Both assign() and evaluate() raise this, so the message names both.
        super().__init__("modelab.init() must be called before assign() or evaluate()")

    def __reduce__(self):
        # The constructor takes no arguments, while the default exception
        # pickling would replay ``self.args`` (the message) into it and fail.
        return (type(self), (), self.__dict__)


class FlagNotFoundError(ModelabError):
    """Raised when a flag name is not in the registry.

    Attributes are documented inline: ``name`` is the flag name that was
    looked up.
    """

    def __init__(self, name: str) -> None:
        super().__init__(f"Flag not found: {name!r}")
        self.name = name

    def __reduce__(self):
        # Rebuild from the original flag name; the default would pass the
        # already-formatted message back in and wrap it a second time.
        return (type(self), (self.name,), self.__dict__)


class InvalidFlagError(ModelabError):
    """Raised when a Flag definition is invalid."""
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
"""ServerStorage — HTTP storage backend that buffers and flushes to modelab-server."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import atexit
|
|
6
|
+
import json
|
|
7
|
+
import logging
|
|
8
|
+
import threading
|
|
9
|
+
import urllib.request
|
|
10
|
+
from dataclasses import asdict
|
|
11
|
+
from datetime import datetime, timezone
|
|
12
|
+
from typing import Any
|
|
13
|
+
|
|
14
|
+
from modelab._types import AssignmentRecord, EventRecord, ExecutionRecord
|
|
15
|
+
|
|
16
|
+
logger = logging.getLogger("modelab")
|
|
17
|
+
|
|
18
|
+
_FLUSH_SIZE = 50
|
|
19
|
+
_FLUSH_INTERVAL = 5.0 # seconds
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _default_serializer(obj: Any) -> str:
    """``json.dumps`` fallback: render datetimes as ISO-8601 strings.

    Raises:
        TypeError: For any object that is not a ``datetime``.
    """
    # Guard clause first: anything but a datetime is rejected.
    if not isinstance(obj, datetime):
        raise TypeError(f"Not serializable: {type(obj)}")
    return obj.isoformat()
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class ServerStorage:
    """HTTP storage that buffers records and flushes to the modelab server.

    Records are appended to in-memory buffers (one per record kind). A buffer
    is sent when it reaches ``_FLUSH_SIZE`` entries, every ``_FLUSH_INTERVAL``
    seconds by a background timer, on an explicit ``flush()``, and at
    interpreter exit. Delivery is best-effort: a batch that fails to send is
    logged and dropped.

    The lock protects only the buffers. Network I/O happens after the lock is
    released, so a slow or unreachable server cannot block threads that are
    merely recording data.
    """

    def __init__(self, base_url: str, api_key: str = "") -> None:
        """Create the buffers, start the flush timer and register the exit hook.

        Args:
            base_url: Server base URL; a trailing slash is stripped.
            api_key: If non-empty, sent as the ``X-API-Key`` header.
        """
        self._base_url = base_url.rstrip("/")
        self._api_key = api_key
        self._lock = threading.Lock()
        self._assignments: list[dict[str, Any]] = []
        self._executions: list[dict[str, Any]] = []
        self._events: list[dict[str, Any]] = []

        # Background flush timer
        self._timer: threading.Timer | None = None
        self._start_timer()
        atexit.register(self.flush)

    def _start_timer(self) -> None:
        """Arm a one-shot daemon timer that triggers the next periodic flush."""
        self._timer = threading.Timer(_FLUSH_INTERVAL, self._timer_flush)
        self._timer.daemon = True
        self._timer.start()

    def _timer_flush(self) -> None:
        """Timer callback: flush, then re-arm the timer."""
        try:
            self.flush()
        finally:
            # Always re-arm, otherwise a single unexpected error would stop
            # periodic flushing for the rest of the process.
            self._start_timer()

    def save_assignment(self, record: AssignmentRecord) -> None:
        """Buffer an assignment record; send the batch once it is full."""
        row = asdict(record)
        batch: list[dict[str, Any]] | None = None
        with self._lock:
            self._assignments.append(row)
            if len(self._assignments) >= _FLUSH_SIZE:
                # Detach the full buffer under the lock, send it afterwards.
                batch, self._assignments = self._assignments, []
        if batch:
            self._flush_locked("assignments", batch)

    def save_execution(self, record: ExecutionRecord) -> None:
        """Buffer an execution record; send the batch once it is full."""
        row = asdict(record)
        batch: list[dict[str, Any]] | None = None
        with self._lock:
            self._executions.append(row)
            if len(self._executions) >= _FLUSH_SIZE:
                batch, self._executions = self._executions, []
        if batch:
            self._flush_locked("executions", batch)

    def save_event(self, record: EventRecord) -> None:
        """Buffer an event record; send the batch once it is full."""
        row = asdict(record)
        batch: list[dict[str, Any]] | None = None
        with self._lock:
            self._events.append(row)
            if len(self._events) >= _FLUSH_SIZE:
                batch, self._events = self._events, []
        if batch:
            self._flush_locked("events", batch)

    def flush(self) -> None:
        """Send everything currently buffered, assignments first."""
        with self._lock:
            pending = (
                ("assignments", self._assignments),
                ("executions", self._executions),
                ("events", self._events),
            )
            self._assignments, self._executions, self._events = [], [], []

        # Network calls run without the lock held.
        for endpoint, records in pending:
            if records:
                self._flush_locked(endpoint, records)

    def _flush_locked(self, endpoint: str, records: list[dict[str, Any]]) -> None:
        """POST one batch to ``/api/v1/ingest/{endpoint}``; never raises.

        The name is kept for compatibility. The batch passed in has already
        been detached from the shared buffers, so the lock does not need to
        be held while this runs.
        """
        url = f"{self._base_url}/api/v1/ingest/{endpoint}"
        try:
            # Serialisation is inside the try so that one non-JSON-serialisable
            # value is logged and dropped like any other delivery failure,
            # instead of raising and leaving the record stuck in a buffer.
            data = json.dumps(records, default=_default_serializer).encode()
            req = urllib.request.Request(
                url,
                data=data,
                headers={
                    "Content-Type": "application/json",
                    **({"X-API-Key": self._api_key} if self._api_key else {}),
                },
                method="POST",
            )
            with urllib.request.urlopen(req, timeout=10) as resp:
                resp.read()
        except Exception:
            logger.warning("Failed to flush %d %s to %s", len(records), endpoint, url, exc_info=True)
|
modelab/_state.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
"""Module-level singleton state for modelab."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import TYPE_CHECKING
|
|
6
|
+
|
|
7
|
+
from modelab._errors import InvalidFlagError
|
|
8
|
+
from modelab._types import Flag
|
|
9
|
+
|
|
10
|
+
if TYPE_CHECKING:
|
|
11
|
+
from modelab._server_storage import ServerStorage
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class _State:
    """Holds the configured storage backend, flag registry and server URL."""

    def __init__(self) -> None:
        self.storage: ServerStorage | None = None
        self.flags: dict[str, Flag] = {}
        self.server_url: str = ""

    def configure(self, storage: ServerStorage, flags: list[Flag], server_url: str = "") -> None:
        """Validate ``flags`` and install them together with ``storage``.

        Nothing is modified if validation fails.

        Args:
            storage: Backend that records will be written to.
            flags: Flag definitions; later entries replace earlier ones that
                share a name.
            server_url: Base URL of the modelab server.

        Raises:
            InvalidFlagError: If a flag has no variants, a rollout_pct outside
                0-100, or a variant with a negative weight.
        """
        for flag in flags:
            if not flag.variants:
                raise InvalidFlagError(f"Flag {flag.name!r} has no variants")
            if not (0 <= flag.rollout_pct <= 100):
                raise InvalidFlagError(
                    f"Flag {flag.name!r} rollout_pct must be 0-100, got {flag.rollout_pct}"
                )
            # Negative weights break the cumulative-weight selection (the
            # running total is no longer monotonic), so reject them up front.
            for variant in flag.variants:
                if variant.weight < 0:
                    raise InvalidFlagError(
                        f"Flag {flag.name!r} variant {variant.name!r} "
                        f"weight must be >= 0, got {variant.weight}"
                    )
        self.storage = storage
        self.flags = {f.name: f for f in flags}
        self.server_url = server_url

    def reset(self) -> None:
        """Drop storage, flags and server URL, returning to the initial state."""
        self.storage = None
        self.flags = {}
        self.server_url = ""

    @property
    def initialized(self) -> bool:
        """True once a storage backend has been configured."""
        return self.storage is not None


# Process-wide singleton used by the public API functions.
_global_state = _State()
|
modelab/_types.py
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
"""Core types for modelab: Flag, Variant, EvalContext, records, Storage protocol."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import uuid
|
|
6
|
+
from dataclasses import dataclass, field
|
|
7
|
+
from datetime import datetime, timezone
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@dataclass(frozen=True)
class Variant:
    """One arm of an experiment flag."""

    # Identifier of the variant; reported back on assignments.
    name: str
    # Relative share of traffic among the flag's variants.
    weight: int = 50
    # Free-form settings the caller uses to parameterize its LLM call.
    config: dict[str, Any] = field(default_factory=dict)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@dataclass(frozen=True)
class Flag:
    """A named experiment made up of variants and a rollout percentage."""

    # Unique flag name; used as the registry key and as hashing input.
    name: str
    # The variants traffic is split between.
    variants: list[Variant] = field(default_factory=list)
    # Percentage (0-100) of users that take part in the experiment.
    rollout_pct: float = 100.0
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@dataclass(frozen=True)
class EvalContext:
    """Identifies the subject of an assignment."""

    # Stable user identifier; combined with the flag name to pick a bucket.
    user_id: str
    # Optional session identifier, stored alongside the assignment.
    session_id: str = ""
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
# ── Records (persisted to storage) ──────────────────────────────────
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
@dataclass
class AssignmentRecord:
    """Persisted record of one variant assignment."""

    # Random UUID4 string generated per record.
    assignment_id: str = field(default_factory=lambda: str(uuid.uuid4()))
    flag_name: str = ""
    variant_name: str = ""
    user_id: str = ""
    session_id: str = ""
    # Copy of the variant's config at assignment time.
    config_json: dict[str, Any] = field(default_factory=dict)
    # Timezone-aware UTC timestamp taken when the record is created.
    assigned_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
@dataclass
class ExecutionRecord:
    """Persisted metrics for one execution under an assignment."""

    # Links the execution back to its AssignmentRecord.
    assignment_id: str = ""
    # All metrics are optional; None means "not reported".
    latency_ms: float | None = None
    input_tokens: int | None = None
    output_tokens: int | None = None
    cost: float | None = None
    error: str | None = None
    # Arbitrary extra key/value data supplied by the caller.
    metadata_json: dict[str, Any] = field(default_factory=dict)
    # Timezone-aware UTC timestamp taken when the record is created.
    recorded_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
@dataclass
class EventRecord:
    """Persisted outcome event attached to an assignment."""

    # Random UUID4 string generated per record.
    event_id: str = field(default_factory=lambda: str(uuid.uuid4()))
    # Links the event back to its AssignmentRecord.
    assignment_id: str = ""
    event_type: str = ""  # success / failure / custom
    event_name: str = ""  # for custom events
    # Optional extra data supplied by the caller.
    payload_json: dict[str, Any] = field(default_factory=dict)
    # Timezone-aware UTC timestamp taken when the record is created.
    created_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
|
modelab/py.typed
ADDED
|
File without changes
|
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: modelab
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Provider-agnostic A/B testing for LLM systems
|
|
5
|
+
License-Expression: MIT
|
|
6
|
+
Keywords: ab-testing,experiments,feature-flags,llm
|
|
7
|
+
Requires-Python: >=3.10
|
|
8
|
+
Provides-Extra: dev
|
|
9
|
+
Requires-Dist: httpx>=0.27; extra == 'dev'
|
|
10
|
+
Requires-Dist: pytest-asyncio>=0.23; extra == 'dev'
|
|
11
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
12
|
+
Provides-Extra: server
|
|
13
|
+
Requires-Dist: fastapi>=0.110; extra == 'server'
|
|
14
|
+
Requires-Dist: psycopg[binary,pool]>=3.1; extra == 'server'
|
|
15
|
+
Requires-Dist: uvicorn[standard]>=0.29; extra == 'server'
|
|
16
|
+
Description-Content-Type: text/markdown
|
|
17
|
+
|
|
18
|
+
# modelab
|
|
19
|
+
|
|
20
|
+
Provider-agnostic A/B testing for LLM systems in production.
|
|
21
|
+
|
|
22
|
+
**Two components:**
|
|
23
|
+
1. **Python SDK** — zero-dependency library for assignment, tracking, and evaluation
|
|
24
|
+
2. **Server + Dashboard** — self-hosted FastAPI + React app for visualization (Docker Compose)
|
|
25
|
+
|
|
26
|
+
## Quick Start
|
|
27
|
+
|
|
28
|
+
### SDK (local development)
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
pip install modelab
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
```python
|
|
35
|
+
import modelab
|
|
36
|
+
from modelab import Flag, Variant, EvalContext
|
|
37
|
+
|
|
38
|
+
# Initialize — point to the modelab server
|
|
39
|
+
modelab.init(
|
|
40
|
+
server="http://localhost:8100",
|
|
41
|
+
flags=[
|
|
42
|
+
Flag(
|
|
43
|
+
name="summarizer_v2",
|
|
44
|
+
variants=[
|
|
45
|
+
Variant("control", weight=50, config={"model": "gpt-3.5-turbo", "prompt": "Summarize: {input}"}),
|
|
46
|
+
Variant("treatment", weight=50, config={"model": "gpt-4", "prompt": "Concisely summarize: {input}"}),
|
|
47
|
+
],
|
|
48
|
+
rollout_pct=100,
|
|
49
|
+
),
|
|
50
|
+
],
|
|
51
|
+
)
|
|
52
|
+
|
|
53
|
+
# Assign a variant
|
|
54
|
+
ctx = EvalContext(user_id="user_123", session_id="abc")
|
|
55
|
+
assignment = modelab.assign("summarizer_v2", ctx)
|
|
56
|
+
|
|
57
|
+
if assignment is None:
|
|
58
|
+
# Outside rollout — use default behavior
|
|
59
|
+
response = call_llm(model="gpt-3.5-turbo", prompt=text)
|
|
60
|
+
else:
|
|
61
|
+
# In experiment — use assigned variant config
|
|
62
|
+
response = call_llm(
|
|
63
|
+
model=assignment.config["model"],
|
|
64
|
+
prompt=assignment.config["prompt"].format(input=text),
|
|
65
|
+
)
|
|
66
|
+
assignment.record(response, cost=0.013)
|
|
67
|
+
assignment.mark_success()
|
|
68
|
+
|
|
69
|
+
# Evaluate results
|
|
70
|
+
results = modelab.evaluate("summarizer_v2")
|
|
71
|
+
print(results)
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
### Self-Hosted Server + Dashboard
|
|
75
|
+
|
|
76
|
+
```bash
|
|
77
|
+
docker compose up
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
This starts:
|
|
81
|
+
- **PostgreSQL** on port 5432
|
|
82
|
+
- **modelab server + dashboard** on port 8100
|
|
83
|
+
|
|
84
|
+
## Concepts
|
|
85
|
+
|
|
86
|
+
### Flags
|
|
87
|
+
An experiment with one or more variants and a rollout percentage (0-100%).
|
|
88
|
+
|
|
89
|
+
### Variants
|
|
90
|
+
Each variant has a name, weight (for traffic splitting), and a config dict you use to parameterize your LLM calls.
|
|
91
|
+
|
|
92
|
+
### Assignment
|
|
93
|
+
Deterministic — the same `(flag_name, user_id)` always maps to the same variant. Uses MD5 hashing into 10,000 buckets for 0.01% rollout granularity.
|
|
94
|
+
|
|
95
|
+
### Recording
|
|
96
|
+
|
|
97
|
+
Use `assignment.record(response)` to capture execution metrics. Token counts are automatically extracted from the response object via duck-typing (supports OpenAI and Anthropic response formats). Cost, latency, error, and arbitrary metadata can be passed as keyword arguments:
|
|
98
|
+
|
|
99
|
+
```python
|
|
100
|
+
assignment.record(response, cost=0.013, latency_ms=250.0, model="gpt-4o")
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
You can also record without a response object:
|
|
104
|
+
|
|
105
|
+
```python
|
|
106
|
+
assignment.record(input_tokens=50, output_tokens=100, cost=0.01)
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
### Events
|
|
110
|
+
Mark assignments as success/failure or record custom events (e.g., "copied", "thumbs_up").
|
|
111
|
+
|
|
112
|
+
### Evaluation
|
|
113
|
+
`modelab.evaluate(flag_name)` returns per-variant metrics: success rate, avg latency, avg cost, token usage, and custom event counts.
|
|
114
|
+
|
|
115
|
+
## Server API
|
|
116
|
+
|
|
117
|
+
### Ingestion (from SDK)
|
|
118
|
+
```
|
|
119
|
+
POST /api/v1/ingest/assignments (batch)
|
|
120
|
+
POST /api/v1/ingest/executions (batch)
|
|
121
|
+
POST /api/v1/ingest/events (batch)
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
### Dashboard API
|
|
125
|
+
```
|
|
126
|
+
GET /api/v1/flags — list flags with summary stats
|
|
127
|
+
GET /api/v1/flags/{name} — detailed per-variant evaluation
|
|
128
|
+
GET /api/v1/flags/{name}/timeline — time-series metrics
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
## Development
|
|
132
|
+
|
|
133
|
+
```bash
|
|
134
|
+
# Install in dev mode
|
|
135
|
+
pip install -e ".[dev]"
|
|
136
|
+
|
|
137
|
+
# Run tests
|
|
138
|
+
pytest
|
|
139
|
+
|
|
140
|
+
# Run dashboard dev server
|
|
141
|
+
cd dashboard && npm install && npm run dev
|
|
142
|
+
|
|
143
|
+
# Run API server (requires Postgres)
|
|
144
|
+
uvicorn server.app:app --reload --port 8100
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
## Architecture
|
|
148
|
+
|
|
149
|
+
```
|
|
150
|
+
Developer's App
|
|
151
|
+
│
|
|
152
|
+
├── modelab SDK (pip install modelab)
|
|
153
|
+
│ └── ServerStorage ──HTTP POST──▶ modelab-server
|
|
154
|
+
│
|
|
155
|
+
modelab-server (docker compose up)
|
|
156
|
+
├── FastAPI backend
|
|
157
|
+
├── React dashboard (served as static files)
|
|
158
|
+
└── PostgreSQL
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
## License
|
|
162
|
+
|
|
163
|
+
MIT
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
modelab/__init__.py,sha256=ToaPXEjWZYi8poObLXTHXOkTz8DugUHLyKcZsqQ_jLg,3463
|
|
2
|
+
modelab/_assignment.py,sha256=XXH9ZNMftobh6m2rj62T-pqpjwxw8wKG41JLmXz4Mvo,5211
|
|
3
|
+
modelab/_engine.py,sha256=rPhlSXIRQLvhHWxnLs6YayJ1HJX5iKwZbVFrkmIB49A,1333
|
|
4
|
+
modelab/_errors.py,sha256=y6aqP28CihLrlM2MTxm0v3ISdK-qbkW4aOetOZRh_oM,693
|
|
5
|
+
modelab/_server_storage.py,sha256=pmlIRwIY9mmNKEUemVeF2f41YnJ5RYcr_2YZlmCSksA,3537
|
|
6
|
+
modelab/_state.py,sha256=pERDfuCx5Ylh7YzqTdpbgZ90aTHz9bO5HyqCcdbk0kY,1225
|
|
7
|
+
modelab/_types.py,sha256=3zc_be5PTmlANlLrY1ZhYoX9vaTyWO4PdBd5NK6zh_s,2033
|
|
8
|
+
modelab/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
9
|
+
modelab-0.1.0.dist-info/METADATA,sha256=lPiLzc6Hs0Ni5Qk4V1TvHxSHgVLloG0bteexe3ea5-g,4386
|
|
10
|
+
modelab-0.1.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
11
|
+
modelab-0.1.0.dist-info/RECORD,,
|