fluxloop 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of fluxloop might be problematic. Click here for more details.
- fluxloop/__init__.py +58 -0
- fluxloop/buffer.py +186 -0
- fluxloop/client.py +175 -0
- fluxloop/config.py +191 -0
- fluxloop/context.py +219 -0
- fluxloop/decorators.py +465 -0
- fluxloop/models.py +92 -0
- fluxloop/recording.py +205 -0
- fluxloop/schemas/__init__.py +48 -0
- fluxloop/schemas/config.py +312 -0
- fluxloop/schemas/trace.py +197 -0
- fluxloop/serialization.py +116 -0
- fluxloop/storage.py +53 -0
- fluxloop-0.1.0.dist-info/METADATA +76 -0
- fluxloop-0.1.0.dist-info/RECORD +17 -0
- fluxloop-0.1.0.dist-info/WHEEL +5 -0
- fluxloop-0.1.0.dist-info/top_level.txt +1 -0
fluxloop/models.py
ADDED
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Internal SDK models for observations and traces.
|
|
3
|
+
These are lightweight versions optimized for SDK use.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from datetime import datetime, timezone
|
|
7
|
+
from enum import Enum
|
|
8
|
+
from typing import Any, Dict, List, Optional
|
|
9
|
+
from uuid import UUID, uuid4
|
|
10
|
+
|
|
11
|
+
from pydantic import BaseModel, Field
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class ObservationType(str, Enum):
    """Kinds of observation the SDK can record.

    Every member is also a ``str``, so it compares equal to its lowercase
    string value (for example ``ObservationType.SPAN == "span"``).
    """

    SPAN = "span"
    EVENT = "event"
    GENERATION = "generation"
    TOOL = "tool"
    AGENT = "agent"
    CHAIN = "chain"
    EVALUATION = "evaluation"
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class ObservationLevel(str, Enum):
    """Severity level attached to an observation.

    Members are ``str`` subclasses whose values are the lowercase level names.
    """

    DEBUG = "debug"
    INFO = "info"
    WARNING = "warning"
    ERROR = "error"
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class ObservationData(BaseModel):
    """Lightweight observation record used inside the SDK.

    Only ``type`` and ``name`` are required. ``id`` defaults to a fresh
    UUID4 and ``start_time`` to the current UTC time; every other field
    defaults to ``None`` or an empty container.
    """

    # --- Identity -------------------------------------------------------
    id: UUID = Field(default_factory=uuid4)
    type: ObservationType
    name: str

    # --- Timing ---------------------------------------------------------
    # start_time is timezone-aware (UTC); end_time stays None until set.
    start_time: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
    end_time: Optional[datetime] = None

    # --- Content --------------------------------------------------------
    input: Optional[Any] = None
    output: Optional[Any] = None
    error: Optional[str] = None

    # --- Metadata -------------------------------------------------------
    level: ObservationLevel = ObservationLevel.INFO
    status_message: Optional[str] = None
    metadata: Dict[str, Any] = Field(default_factory=dict)

    # --- LLM specific ---------------------------------------------------
    model: Optional[str] = None
    llm_parameters: Optional[Dict[str, Any]] = None
    prompt_tokens: Optional[int] = None
    completion_tokens: Optional[int] = None
    total_tokens: Optional[int] = None

    # --- Parent reference (set by context) ------------------------------
    parent_observation_id: Optional[UUID] = None
    trace_id: Optional[UUID] = None
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
class TraceData(BaseModel):
    """Lightweight trace record used inside the SDK.

    Only ``name`` is required. ``id`` defaults to a fresh UUID4 and
    ``start_time`` to the current UTC time; every other field defaults to
    ``None`` or an empty container.
    """

    # --- Identity -------------------------------------------------------
    id: UUID = Field(default_factory=uuid4)
    name: str

    # --- Timing ---------------------------------------------------------
    # start_time is timezone-aware (UTC); end_time stays None until set.
    start_time: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
    end_time: Optional[datetime] = None

    # --- Context --------------------------------------------------------
    session_id: Optional[UUID] = None
    user_id: Optional[str] = None
    metadata: Dict[str, Any] = Field(default_factory=dict)
    tags: List[str] = Field(default_factory=list)

    # --- Experiment specific --------------------------------------------
    experiment_id: Optional[str] = None
    iteration: Optional[int] = None
    persona: Optional[str] = None
    variation_seed: Optional[str] = None

    # --- Input/Output ---------------------------------------------------
    input: Optional[Any] = None
    output: Optional[Any] = None
|
fluxloop/recording.py
ADDED
|
@@ -0,0 +1,205 @@
|
|
|
1
|
+
"""Argument recording utilities for the FluxLoop SDK (MVP implementation)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
from collections.abc import Mapping, Sequence
|
|
7
|
+
from datetime import datetime
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Any, Dict, Optional
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
# Lowercase substrings; any kwarg, mapping key or attribute name containing
# one of them (case-insensitively) has its value replaced with "***".
_SENSITIVE_KEY_PATTERNS = [
    "token",
    "password",
    "secret",
    "key",
    "auth",
    "credential",
]

try:
    from pydantic import BaseModel
except ImportError:  # pragma: no cover - fallback if optional dependency missing
    class BaseModel:  # type: ignore
        """Minimal stand-in that stores keyword arguments as attributes."""

        def __init__(self, **kwargs):
            for key, value in kwargs.items():
                setattr(self, key, value)


class ArgsRecorder:
    """Simple argument recorder that writes call metadata to a JSONL file.

    Each :meth:`record` call appends one JSON object with the keys
    ``_version``, ``iteration``, ``target``, ``kwargs`` and ``timestamp``.
    Values stored under a name that matches ``_SENSITIVE_KEY_PATTERNS`` are
    replaced with ``"***"`` before anything is written.
    """

    def __init__(self, output_file: Path) -> None:
        """Remember *output_file* and create its parent directory if missing."""
        self.output_file = output_file
        self.output_file.parent.mkdir(parents=True, exist_ok=True)
        # Last iteration number handed out (or explicitly supplied) per target.
        self._iteration_counters: Dict[str, int] = {}

    def record(self, target: str, *, iteration: Optional[int], **kwargs: Any) -> None:
        """Record call arguments for the given target.

        Args:
            target: Name under which the call is recorded; also the key of
                the per-target iteration counter.
            iteration: Explicit iteration number, or ``None`` to use the next
                value of the per-target counter.
            **kwargs: The call arguments to serialize.
        """
        serializable_kwargs: Dict[str, Any] = {
            key: self._serialize_kwarg(key, value) for key, value in kwargs.items()
        }

        record = {
            "_version": "1",
            "iteration": self._resolve_iteration(target, iteration),
            "target": target,
            "kwargs": serializable_kwargs,
            # datetime.now() yields a naive local timestamp.
            "timestamp": datetime.now().isoformat(),
        }

        with self.output_file.open("a", encoding="utf-8") as fp:
            fp.write(json.dumps(record, default=str) + "\n")

    def _serialize_kwarg(self, key: str, value: Any) -> Any:
        """Return the JSON-ready representation of a single keyword argument."""
        if callable(value):
            return self._serialize_callable(value)

        if self._is_sensitive_key(key):
            return "***"

        safe_value = self._coerce_to_json_safe(value)
        try:
            # Probe only: the result is discarded, we just need to know
            # whether the coerced value survives JSON encoding.
            json.dumps(safe_value)
        except (TypeError, ValueError):
            return self._serialize_non_json_value(key, value)
        return safe_value

    def _resolve_iteration(self, target: str, iteration: Optional[int]) -> int:
        """Return the iteration to record and update the per-target counter.

        An explicit *iteration* is stored and returned unchanged; otherwise
        the counter for *target* advances by one, starting at 0.
        """
        if iteration is not None:
            self._iteration_counters[target] = iteration
            return iteration

        next_value = self._iteration_counters.get(target, -1) + 1
        self._iteration_counters[target] = next_value
        return next_value

    def _serialize_callable(self, value: Any) -> str:
        """Return a placeholder string describing a callable argument."""
        marker = getattr(value, "__fluxloop_builtin__", None)
        if marker:
            return f"<builtin:{marker}>"

        # Callables exposing these attributes are mapped to fixed builtin names.
        if hasattr(value, "messages"):
            return "<builtin:collector.send>"

        if hasattr(value, "errors"):
            return "<builtin:collector.error>"

        name = getattr(value, "__name__", "unknown")
        return f"<callable:{name}>"

    def _serialize_non_json_value(self, key: str, value: Any) -> Any:
        """Return a JSON-safe stand-in for a value that failed JSON encoding.

        Falls back to ``"<repr:...>"`` holding at most 100 characters of the
        ``repr`` of the *masked* structure, so that secrets nested next to a
        non-serializable leaf are not leaked through the fallback string.
        """
        if self._is_sensitive_key(key):
            return "***"

        coerced = self._coerce_to_json_safe(value)
        try:
            json.dumps(coerced)
        except (TypeError, ValueError):
            # repr() the coerced value, never the raw one: coercion has
            # already replaced sensitive entries with "***".
            representation = repr(coerced)[:100]
            return f"<repr:{representation}>"
        return coerced

    def _coerce_to_json_safe(self, value: Any, *, depth: int = 0) -> Any:
        """Convert *value* into plain JSON-friendly containers, masking secrets.

        Recursion stops below nesting depth 3, where the value is replaced by
        ``"<repr:TypeName>"``. Values of unrecognised types are returned
        unchanged, so the caller must still verify JSON encodability.
        """
        if depth > 3:
            return f"<repr:{type(value).__name__}>"

        if value is None or isinstance(value, (str, int, float, bool)):
            return value

        if isinstance(value, datetime):
            return value.isoformat()

        if isinstance(value, Mapping):
            result: Dict[str, Any] = {}
            for key, item in value.items():
                mask_key = str(key)
                if self._is_sensitive_key(mask_key):
                    result[mask_key] = "***"
                else:
                    result[mask_key] = self._coerce_to_json_safe(item, depth=depth + 1)
            return result

        if isinstance(value, Sequence) and not isinstance(value, (str, bytes, bytearray)):
            # An element that contains a sensitive key anywhere is masked whole.
            return [
                "***"
                if self._is_collection_with_sensitive_keys(item)
                else self._coerce_to_json_safe(item, depth=depth + 1)
                for item in value
            ]

        if hasattr(value, "__dict__"):
            # Mask sensitive attribute names exactly like mapping keys, so an
            # object's ``api_key``/``password`` is never written in clear text.
            return {
                str(attr): (
                    "***"
                    if self._is_sensitive_key(str(attr))
                    else self._coerce_to_json_safe(attr_value, depth=depth + 1)
                )
                for attr, attr_value in vars(value).items()
                if not attr.startswith("__")
            }

        return value

    def _is_collection_with_sensitive_keys(self, value: Any) -> bool:
        """Return True if *value* is a mapping with a sensitive key, or a
        non-string sequence that contains such a mapping at any depth."""
        if isinstance(value, Mapping):
            return any(self._is_sensitive_key(str(key)) for key in value.keys())

        if isinstance(value, Sequence) and not isinstance(value, (str, bytes, bytearray)):
            return any(self._is_collection_with_sensitive_keys(item) for item in value)

        return False

    def _is_sensitive_key(self, key: str) -> bool:
        """Return True if *key* contains any sensitive pattern, ignoring case."""
        key_lower = key.lower()
        return any(pattern in key_lower for pattern in _SENSITIVE_KEY_PATTERNS)
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
# Active recorder for this module; None means recording is disabled.
_global_recorder: Optional[ArgsRecorder] = None


class RecordingConfig(BaseModel):
    """Module-wide options that control how calls are recorded."""

    # When False, record_call_args() records iteration 0 for calls that do
    # not pass an explicit iteration.
    iteration_auto_increment: bool = True


# Single shared options instance, updated by set_recording_options().
_recording_config = RecordingConfig()
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
def enable_recording(output_file: str) -> None:
    """Turn argument recording on, writing to *output_file*.

    The path has ``~`` expanded and is resolved to an absolute path, then a
    new ``ArgsRecorder`` for it replaces the module-level recorder.
    """
    global _global_recorder

    destination = Path(output_file).expanduser().resolve()
    _global_recorder = ArgsRecorder(destination)
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
def record_call_args(target: str, *, iteration: Optional[int] = None, **kwargs: Any) -> None:
    """Forward a call's arguments to the global recorder, if one is enabled.

    Does nothing while recording is disabled. When no *iteration* is given
    and auto-increment is switched off, iteration 0 is recorded; otherwise
    *iteration* is passed through unchanged (``None`` lets the recorder pick
    the next value for *target*).
    """
    if _global_recorder is None:
        return

    use_fixed_zero = iteration is None and not _recording_config.iteration_auto_increment
    effective_iteration = 0 if use_fixed_zero else iteration

    _global_recorder.record(target, iteration=effective_iteration, **kwargs)
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
def disable_recording() -> None:
    """Turn argument recording off by clearing the module-level recorder."""
    global _global_recorder

    _global_recorder = None
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
def set_recording_options(*, iteration_auto_increment: Optional[bool] = None) -> None:
    """Update the shared recording options.

    Args:
        iteration_auto_increment: New value for the option of the same name;
            ``None`` leaves the current setting untouched.
    """
    if iteration_auto_increment is None:
        return

    _recording_config.iteration_auto_increment = iteration_auto_increment
|
|
204
|
+
|
|
205
|
+
|
fluxloop/schemas/__init__.py
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
"""
|
|
2
|
+
FluxLoop Shared Schemas
|
|
3
|
+
|
|
4
|
+
Common data models shared across all FluxLoop components.
|
|
5
|
+
Based on Langfuse data model for compatibility.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from .trace import (
|
|
9
|
+
Trace,
|
|
10
|
+
Observation,
|
|
11
|
+
ObservationType,
|
|
12
|
+
Score,
|
|
13
|
+
ScoreDataType,
|
|
14
|
+
TraceStatus,
|
|
15
|
+
ObservationLevel,
|
|
16
|
+
)
|
|
17
|
+
from .config import (
|
|
18
|
+
ExperimentConfig,
|
|
19
|
+
InputGenerationConfig,
|
|
20
|
+
InputGenerationMode,
|
|
21
|
+
LLMGeneratorConfig,
|
|
22
|
+
PersonaConfig,
|
|
23
|
+
ReplayArgsConfig,
|
|
24
|
+
VariationStrategy,
|
|
25
|
+
EvaluatorConfig,
|
|
26
|
+
RunnerConfig,
|
|
27
|
+
)
|
|
28
|
+
|
|
29
|
+
__all__ = [
|
|
30
|
+
# Trace models
|
|
31
|
+
"Trace",
|
|
32
|
+
"Observation",
|
|
33
|
+
"ObservationType",
|
|
34
|
+
"Score",
|
|
35
|
+
"ScoreDataType",
|
|
36
|
+
"TraceStatus",
|
|
37
|
+
"ObservationLevel",
|
|
38
|
+
# Config models
|
|
39
|
+
"ExperimentConfig",
|
|
40
|
+
"InputGenerationConfig",
|
|
41
|
+
"InputGenerationMode",
|
|
42
|
+
"LLMGeneratorConfig",
|
|
43
|
+
"PersonaConfig",
|
|
44
|
+
"ReplayArgsConfig",
|
|
45
|
+
"VariationStrategy",
|
|
46
|
+
"EvaluatorConfig",
|
|
47
|
+
"RunnerConfig",
|
|
48
|
+
]
|
fluxloop/schemas/config.py
ADDED
|
@@ -0,0 +1,312 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Configuration schemas for experiments and simulations.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from enum import Enum
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Any, Dict, List, Optional
|
|
8
|
+
|
|
9
|
+
from pydantic import BaseModel, Field, PrivateAttr, field_validator, model_validator
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class VariationStrategy(str, Enum):
    """Ways in which a prompt can be varied.

    Members are ``str`` subclasses whose values are the lowercase names.
    """

    # Rephrase the same intent
    REPHRASE = "rephrase"
    # Add typos/errors
    TYPO = "typo"
    # Make more verbose
    VERBOSE = "verbose"
    # Make more concise
    CONCISE = "concise"
    # Based on persona characteristics
    PERSONA_BASED = "persona_based"
    # Edge cases and attacks
    ADVERSARIAL = "adversarial"
    # Different languages
    MULTILINGUAL = "multilingual"
    # Custom variation prompt
    CUSTOM = "custom"
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class InputGenerationMode(str, Enum):
    """Available approaches for generating inputs.

    Members are ``str`` subclasses whose values are the lowercase names.
    """

    DETERMINISTIC = "deterministic"
    LLM = "llm"
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class LLMGeneratorConfig(BaseModel):
    """Settings for generating inputs with an LLM.

    Every field has a default; the numeric fields carry the inclusive
    ``ge``/``le`` bounds declared on their ``Field``.
    """

    # --- Provider selection ---------------------------------------------
    enabled: bool = False
    provider: str = "openai"
    model: str = "gpt-4o-mini"
    api_key: Optional[str] = None

    # --- Prompting ------------------------------------------------------
    system_prompt: Optional[str] = None
    user_prompt_template: Optional[str] = None
    strategy_prompts: Dict[str, str] = Field(default_factory=dict)

    # --- Sampling and request limits (bounds are inclusive) -------------
    max_outputs: int = Field(default=3, ge=1, le=20)
    temperature: float = Field(default=0.7, ge=0.0, le=2.0)
    top_p: float = Field(default=1.0, ge=0.0, le=1.0)
    frequency_penalty: float = Field(default=0.0, ge=-2.0, le=2.0)
    presence_penalty: float = Field(default=0.0, ge=-2.0, le=2.0)
    max_tokens: int = Field(default=1024, ge=16, le=4096)
    request_timeout: int = Field(default=60, ge=1, le=600)
    batch_size: int = Field(default=1, ge=1, le=10)
    metadata: Dict[str, Any] = Field(default_factory=dict)

    # --- GPT-5 specific controls ----------------------------------------
    reasoning_effort: Optional[str] = Field(
        default=None,
        description="Reasoning effort for GPT-5 models: minimal, low, medium, high",
    )
    text_verbosity: Optional[str] = Field(
        default=None,
        description="Output verbosity for GPT-5 models: low, medium, high",
    )
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
class InputGenerationConfig(BaseModel):
    """Container for the input generation mode and its LLM settings."""

    # Defaults to LLM-based generation.
    mode: InputGenerationMode = InputGenerationMode.LLM
    # A fresh default LLMGeneratorConfig is created per instance.
    llm: LLMGeneratorConfig = Field(default_factory=LLMGeneratorConfig)
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
class PersonaConfig(BaseModel):
    """Description of a simulated user persona.

    ``name`` and ``description`` are required; the list and dict fields
    default to empty containers.
    """

    name: str
    description: str
    characteristics: List[str] = Field(default_factory=list)
    language: str = "en"
    # novice, intermediate, expert
    expertise_level: str = "intermediate"
    goals: List[str] = Field(default_factory=list)
    constraints: List[str] = Field(default_factory=list)
    custom_attributes: Dict[str, Any] = Field(default_factory=dict)

    def to_prompt(self) -> str:
        """Render the persona as newline-separated ``Label: value`` lines.

        Name and description come first; characteristics, goals and
        constraints are included only when non-empty; language and expertise
        level always come last. ``custom_attributes`` is not rendered.
        """
        sections = [f"User Persona: {self.name}", f"Description: {self.description}"]

        if self.characteristics:
            sections += [f"Characteristics: {', '.join(self.characteristics)}"]
        if self.goals:
            sections += [f"Goals: {', '.join(self.goals)}"]
        if self.constraints:
            sections += [f"Constraints: {', '.join(self.constraints)}"]

        sections += [
            f"Language: {self.language}",
            f"Expertise Level: {self.expertise_level}",
        ]
        return "\n".join(sections)
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
class EvaluatorConfig(BaseModel):
    """Settings for a single evaluator.

    ``name`` and ``type`` are required. The remaining fields are grouped by
    the evaluator kind they apply to and all have defaults.
    """

    name: str
    # "llm_judge", "rule_based", "metric", "custom"
    type: str
    enabled: bool = True

    # --- LLM judge ------------------------------------------------------
    model: Optional[str] = None
    prompt_template: Optional[str] = None

    # --- Rule-based -----------------------------------------------------
    rules: List[Dict[str, Any]] = Field(default_factory=list)

    # --- Metrics --------------------------------------------------------
    metric_name: Optional[str] = None
    threshold: Optional[float] = None

    # --- Custom evaluator -----------------------------------------------
    module_path: Optional[str] = None
    class_name: Optional[str] = None

    # --- Common ---------------------------------------------------------
    weight: float = 1.0
    metadata: Dict[str, Any] = Field(default_factory=dict)
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
class RunnerConfig(BaseModel):
    """Settings describing how the agent under test is executed.

    ``module_path`` is the only required field.
    """

    # --- Entry point ----------------------------------------------------
    # e.g., "my_agent.main"
    module_path: str
    # Function to call
    function_name: str = "run"
    target: Optional[str] = Field(
        default=None,
        description=(
            "Optional combined target specification. Use 'module:function' or "
            "'module:Class.method'. When provided, takes precedence over "
            "module_path + function_name."
        ),
    )

    # --- Execution environment ------------------------------------------
    working_directory: Optional[str] = None
    python_path: Optional[str] = None
    environment_vars: Dict[str, str] = Field(default_factory=dict)

    # --- Dependencies ---------------------------------------------------
    requirements_file: Optional[str] = None
    setup_commands: List[str] = Field(default_factory=list)

    # --- Execution settings ---------------------------------------------
    timeout_seconds: int = 300
    max_retries: int = 3
    retry_delay: int = 5

    # --- Docker settings (optional) -------------------------------------
    use_docker: bool = False
    docker_image: Optional[str] = None
    docker_build_context: Optional[str] = None
    docker_volumes: List[str] = Field(default_factory=list)
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
class ExperimentConfig(BaseModel):
    """Complete experiment configuration.

    ``name`` and ``runner`` are required, and at least one of ``base_inputs``
    or ``inputs_file`` must be set (enforced by ``validate_input_sources``).
    Three private attributes hold values that are filled in after loading:
    the directory the config came from and the resolved input/persona counts.
    """

    # Private state; not part of the validated/serialized fields.
    _source_dir: Optional[Path] = PrivateAttr(default=None)
    _resolved_input_count: Optional[int] = PrivateAttr(default=None)
    _resolved_persona_count: Optional[int] = PrivateAttr(default=None)

    # Basic info
    name: str
    description: Optional[str] = None
    version: str = "1.0.0"

    # Simulation settings
    iterations: int = Field(default=10, ge=1, le=1000)
    parallel_runs: int = Field(default=1, ge=1, le=10)
    seed: Optional[int] = None
    run_delay_seconds: float = Field(default=0.0, ge=0.0)

    # Personas
    personas: List[PersonaConfig] = Field(default_factory=list)

    # Variation settings
    variation_strategies: List[VariationStrategy] = Field(default_factory=list)
    variation_count: int = Field(default=1, ge=1, le=10)
    variation_temperature: float = Field(default=0.7, ge=0, le=2)
    variation_model: str = "gpt-3.5-turbo"
    custom_variation_prompt: Optional[str] = None

    # Base prompts/inputs
    base_inputs: List[Dict[str, Any]] = Field(default_factory=list)
    inputs_file: Optional[str] = None
    input_template: Optional[str] = None
    input_generation: InputGenerationConfig = Field(default_factory=InputGenerationConfig)

    # Runner configuration
    runner: RunnerConfig

    # Argument replay (optional)
    replay_args: Optional["ReplayArgsConfig"] = None

    # Evaluators
    evaluators: List[EvaluatorConfig] = Field(default_factory=list)

    # Output settings
    output_directory: str = "./experiments"
    save_traces: bool = True
    save_aggregated_metrics: bool = True

    # Collector settings
    collector_url: Optional[str] = None
    collector_api_key: Optional[str] = None

    # Metadata
    tags: List[str] = Field(default_factory=list)
    metadata: Dict[str, Any] = Field(default_factory=dict)

    @field_validator("iterations")
    @classmethod
    def validate_iterations(cls, v):
        """Ensure reasonable iteration count.

        The ``le=1000`` bound on the field already rejects larger values;
        this check repeats the limit with a more explicit message.
        """
        if v > 1000:
            raise ValueError("iterations must be <= 1000 for safety")
        return v

    @model_validator(mode="after")
    def validate_input_sources(self) -> "ExperimentConfig":
        """Ensure at least one input source is configured.

        Raises:
            ValueError: If both ``base_inputs`` and ``inputs_file`` are empty.
        """
        if not self.base_inputs and not self.inputs_file:
            raise ValueError("Either base_inputs or inputs_file must be provided")
        return self

    def has_external_inputs(self) -> bool:
        """Return True when inputs should be loaded from an external file."""
        return bool(self.inputs_file)

    def set_source_dir(self, source_dir: Path) -> None:
        """Remember the directory where this config file was loaded from."""
        self._source_dir = source_dir

    def get_source_dir(self) -> Optional[Path]:
        """Return the directory where the config file was loaded from."""
        return self._source_dir

    def set_resolved_input_count(self, count: int) -> None:
        """Record the effective input count after resolution.

        Raises:
            ValueError: If *count* is negative.
        """
        if count < 0:
            raise ValueError("resolved input count must be non-negative")
        self._resolved_input_count = count

    def get_resolved_input_count(self) -> Optional[int]:
        """Return the resolved input count if it has been set."""
        return self._resolved_input_count

    def set_resolved_persona_count(self, count: int) -> None:
        """Record the effective persona multiplier after resolution.

        Raises:
            ValueError: If *count* is less than 1.
        """
        if count < 1:
            raise ValueError("resolved persona count must be >= 1")
        self._resolved_persona_count = count

    def get_resolved_persona_count(self) -> Optional[int]:
        """Return the resolved persona multiplier if available."""
        return self._resolved_persona_count

    def _default_input_count(self) -> int:
        """Fallback calculation when no resolved count is available.

        With an external inputs file the count is ``len(base_inputs)`` (or 1
        when that is empty). Otherwise each base input is multiplied by
        ``variation_count``; with no base inputs the multiplier alone is used.
        """
        base_count = len(self.base_inputs)
        if self.has_external_inputs():
            return base_count if base_count else 1
        variation_multiplier = max(1, self.variation_count)
        return base_count * variation_multiplier if base_count else variation_multiplier

    def get_input_count(self) -> int:
        """Return the effective number of inputs that will be executed."""
        if self._resolved_input_count is not None:
            return self._resolved_input_count
        return self._default_input_count()

    def estimate_total_runs(self) -> int:
        """Calculate total number of runs: iterations x personas x inputs.

        The resolved persona count is used when set; otherwise the number of
        configured personas, treating an empty list as 1.
        """
        if self._resolved_persona_count is not None:
            persona_count = self._resolved_persona_count
        else:
            persona_count = len(self.personas) if self.personas else 1
        input_count = self.get_input_count()
        return self.iterations * persona_count * input_count

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary for serialization, omitting ``None`` fields."""
        return self.model_dump(exclude_none=True)
|
|
295
|
+
|
|
296
|
+
|
|
297
|
+
class ReplayArgsConfig(BaseModel):
    """Settings for replaying previously recorded call arguments (MVP scope).

    Replay is off by default and no recording file is set.
    """

    enabled: bool = False
    recording_file: Optional[str] = None

    # Each instance receives its own copy of the default provider mapping.
    callable_providers: Dict[str, str] = Field(
        default_factory=lambda: {
            "send_message_callback": "builtin:collector.send",
            "send_error_callback": "builtin:collector.error",
        },
        description="Mapping of callable parameter names to builtin providers",
    )
    override_param_path: Optional[str] = Field(
        default="data.content",
        description="Single dot-notation path whose value should be overridden with runtime input",
    )
|