aiqa-client 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aiqa/__init__.py +44 -7
- aiqa/aiqa_exporter.py +286 -53
- aiqa/client.py +170 -0
- aiqa/experiment_runner.py +336 -0
- aiqa/object_serialiser.py +361 -0
- aiqa/test_experiment_runner.py +176 -0
- aiqa/test_tracing.py +230 -0
- aiqa/tracing.py +1102 -153
- {aiqa_client-0.1.0.dist-info → aiqa_client-0.1.2.dist-info}/METADATA +95 -4
- aiqa_client-0.1.2.dist-info/RECORD +14 -0
- aiqa_client-0.1.0.dist-info/RECORD +0 -9
- {aiqa_client-0.1.0.dist-info → aiqa_client-0.1.2.dist-info}/WHEEL +0 -0
- {aiqa_client-0.1.0.dist-info → aiqa_client-0.1.2.dist-info}/licenses/LICENSE +0 -0
- {aiqa_client-0.1.0.dist-info → aiqa_client-0.1.2.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,361 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Object serialization utilities for converting Python objects to JSON-safe formats.
|
|
3
|
+
Handles objects, dataclasses, circular references, and size limits.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import json
|
|
7
|
+
import os
|
|
8
|
+
import dataclasses
|
|
9
|
+
from datetime import datetime, date, time
|
|
10
|
+
from typing import Any, Callable, Set
|
|
11
|
+
|
|
12
|
+
def toNumber(value: str|int|None) -> int:
|
|
13
|
+
"""Convert string to number. handling units like g, m, k, (also mb kb gb though these should be avoided)"""
|
|
14
|
+
if value is None:
|
|
15
|
+
return 0
|
|
16
|
+
if isinstance(value, int):
|
|
17
|
+
return value
|
|
18
|
+
if value.endswith("b"): # drop the b
|
|
19
|
+
value = value[:-1]
|
|
20
|
+
if value.endswith("g"):
|
|
21
|
+
return int(value[:-1]) * 1024 * 1024 * 1024
|
|
22
|
+
elif value.endswith("m"):
|
|
23
|
+
return int(value[:-1]) * 1024 * 1024
|
|
24
|
+
elif value.endswith("k"):
|
|
25
|
+
return int(value[:-1]) * 1024
|
|
26
|
+
return int(value)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
# Configurable limit for object string representation (in characters).
# Read once at import time from the AIQA_MAX_OBJECT_STR_CHARS environment
# variable; the "1m" default is parsed by toNumber (1024 * 1024 characters).
AIQA_MAX_OBJECT_STR_CHARS = toNumber(os.getenv("AIQA_MAX_OBJECT_STR_CHARS", "1m"))
|
|
31
|
+
|
|
32
|
+
# Data filters configuration
|
|
33
|
+
def _get_enabled_filters() -> Set[str]:
    """Return the names of the data filters switched on via AIQA_DATA_FILTERS.

    The variable holds a comma-separated list of filter names. When it is
    unset, the four built-in filters are enabled. An empty value or the
    word "false" (any letter case) disables filtering and yields an empty set.
    """
    raw = os.getenv(
        "AIQA_DATA_FILTERS",
        "RemovePasswords, RemoveJWT, RemoveAuthHeaders, RemoveAPIKeys",
    )
    enabled: Set[str] = set()
    if raw and raw.lower() != "false":
        for piece in raw.split(","):
            name = piece.strip()
            if name:  # ignore empty entries such as "a,,b"
                enabled.add(name)
    return enabled
|
|
39
|
+
|
|
40
|
+
# Filter names resolved once at import time; later changes to the
# AIQA_DATA_FILTERS environment variable do not update this set.
_ENABLED_FILTERS = _get_enabled_filters()
|
|
41
|
+
|
|
42
|
+
def _is_jwt_token(value: Any) -> bool:
    """Return True when ``value`` is a string shaped like a JWT.

    The heuristic requires the 'eyJ' prefix (base64 of '{"') and exactly
    three non-empty dot-separated segments (header.payload.signature).
    """
    if isinstance(value, str) and value.startswith('eyJ'):
        segments = value.split('.')
        if len(segments) == 3:
            # every segment must carry at least one character
            return '' not in segments
    return False
|
|
50
|
+
|
|
51
|
+
def _is_api_key(value: Any) -> bool:
    """Return True when ``value`` is a string starting with a known key prefix.

    Leading and trailing whitespace is ignored. The prefix match is
    case-sensitive.
    """
    if not isinstance(value, str):
        return False
    known_prefixes = (
        'sk-',   # OpenAI secret key
        'pk-',   # possibly public key
        'AKIA',  # AWS access key
        'ghp_',  # GitHub personal access token
        'gho_',  # GitHub OAuth access token
        'ghu_',  # GitHub App user-to-server token
        'ghs_',  # GitHub App server-to-server token
        'ghr_',  # GitHub App refresh token
    )
    # str.startswith accepts a tuple and returns True if any prefix matches
    return value.strip().startswith(known_prefixes)
|
|
68
|
+
|
|
69
|
+
def _apply_data_filters(key: str, value: Any) -> Any:
    """Mask ``value`` with "****" when an enabled filter matches the pair.

    Filters are looked up by name in the module-level ``_ENABLED_FILTERS``:

    - RemovePasswords: key contains "password" (case-insensitive).
    - RemoveJWT: value looks like a JWT token.
    - RemoveAuthHeaders: key equals "authorization" (case-insensitive).
    - RemoveAPIKeys: key contains an API-key pattern, or value looks like
      an API key.

    Args:
        key: The attribute or dictionary key; converted with ``str()``.
        value: The value stored under ``key``.

    Returns:
        "****" when a filter matches, otherwise ``value`` unchanged. Falsy
        values are always returned unchanged.
    """
    # Falsy values are never filtered. Some objects (array-likes, objects
    # with a raising __bool__/__len__) cannot be truth-tested; treat those as
    # non-empty instead of letting the error escape into the caller.
    try:
        is_empty = not value
    except Exception:
        is_empty = False
    if is_empty:
        return value

    key_lower = str(key).lower()

    # RemovePasswords filter: if key contains "password", replace value with "****"
    if "RemovePasswords" in _ENABLED_FILTERS and "password" in key_lower:
        return "****"

    # RemoveJWT filter: if value looks like a JWT token, replace with "****"
    if "RemoveJWT" in _ENABLED_FILTERS and _is_jwt_token(value):
        return "****"

    # RemoveAuthHeaders filter: if key is "authorization" (case-insensitive), replace value with "****"
    if "RemoveAuthHeaders" in _ENABLED_FILTERS and key_lower == "authorization":
        return "****"

    # RemoveAPIKeys filter: if key contains API key patterns or value looks like an API key
    if "RemoveAPIKeys" in _ENABLED_FILTERS:
        # Check key patterns
        api_key_key_patterns = ('api_key', 'apikey', 'api-key')
        if any(pattern in key_lower for pattern in api_key_key_patterns):
            return "****"
        # Check value patterns
        if _is_api_key(value):
            return "****"

    return value
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def serialize_for_span(value: Any) -> Any:
    """
    Turn a value into something that can be stored as a span attribute.

    OpenTelemetry only accepts primitives (bool, str, bytes, int, float) or
    sequences of those, so:

    - None and primitives are returned unchanged;
    - a list/tuple made only of primitives (or None) is returned as a list;
    - anything else (dicts, objects, mixed sequences) becomes a JSON string
      via safe_json_dumps, which guards against circular references,
      unconvertible parts and oversized objects.
    """
    scalar_types = (str, int, float, bool, bytes)

    if value is None or isinstance(value, scalar_types):
        return value

    if isinstance(value, (list, tuple)):
        for element in value:
            if element is not None and not isinstance(element, scalar_types):
                break
        else:
            # no complex element found: keep it as a plain list
            return list(value)
        try:
            return safe_json_dumps(value)
        except Exception:
            return str(value)

    # dicts and arbitrary objects
    try:
        encoded = safe_json_dumps(value)
    except Exception:
        # JSON serialization failed, fall back to a guarded repr
        encoded = safe_str_repr(value)
    return encoded
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def safe_str_repr(value: Any) -> str:
    """
    Produce a string for ``value`` without ever raising.

    ``repr(value)`` is used when it works; output longer than the
    module-level AIQA_MAX_OBJECT_STR_CHARS limit is cut at that length and
    suffixed with "... (truncated)". If ``repr`` raises, the result is
    "<TypeName object>", and "<unknown object>" if even that fails.
    """
    try:
        text = repr(value)
        limit = AIQA_MAX_OBJECT_STR_CHARS
        if len(text) <= limit:
            return text
        # keep only the leading part of very long representations
        return text[:limit] + "... (truncated)"
    except Exception:
        pass

    # repr() failed: describe the object by its type name instead
    try:
        return f"<{type(value).__name__} object>"
    except Exception:
        return "<unknown object>"
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def object_to_dict(obj: Any, visited: Set[int], max_depth: int = 10, current_depth: int = 0) -> Any:
    """
    Convert an object to a JSON-friendly representation.

    None and primitives are returned unchanged, date/time values become ISO
    strings, and containers are converted recursively. Dataclasses and
    objects exposing ``__dict__`` or ``__slots__`` become dictionaries of
    their attributes. Every keyed value passes through
    ``_apply_data_filters`` before conversion; list and tuple items are not
    filtered.

    Args:
        obj: The object to convert
        visited: ids of the containers currently being converted; used to
            detect circular references. Each id is removed again once its
            container has been processed.
        max_depth: Maximum recursion depth
        current_depth: Current recursion depth

    Returns:
        The converted structure, or a placeholder string: "<max depth
        exceeded>", "<circular reference>", or the ``safe_str_repr`` of the
        object when its conversion fails.
    """
    if current_depth > max_depth:
        return "<max depth exceeded>"

    obj_id = id(obj)
    if obj_id in visited:
        return "<circular reference>"

    # Handle None
    if obj is None:
        return None

    # Handle primitives
    if isinstance(obj, (str, int, float, bool, bytes)):
        return obj

    # Handle datetime, date and time objects
    if isinstance(obj, (datetime, date, time)):
        return obj.isoformat()

    next_depth = current_depth + 1

    # Handle dict
    if isinstance(obj, dict):
        visited.add(obj_id)
        try:
            result = {}
            for k, v in obj.items():
                # keys that are not str/int/float/bool are stringified
                key_str = str(k) if not isinstance(k, (str, int, float, bool)) else k
                filtered_value = _apply_data_filters(key_str, v)
                result[key_str] = object_to_dict(filtered_value, visited, max_depth, next_depth)
            return result
        except Exception:
            return safe_str_repr(obj)
        finally:
            # always release the id so sibling references are not misreported
            visited.discard(obj_id)

    # Handle list/tuple
    if isinstance(obj, (list, tuple)):
        visited.add(obj_id)
        try:
            return [object_to_dict(item, visited, max_depth, next_depth) for item in obj]
        except Exception:
            return safe_str_repr(obj)
        finally:
            visited.discard(obj_id)

    # Handle dataclasses
    if dataclasses.is_dataclass(obj):
        visited.add(obj_id)
        try:
            result = {}
            for field in dataclasses.fields(obj):
                value = getattr(obj, field.name, None)
                filtered_value = _apply_data_filters(field.name, value)
                result[field.name] = object_to_dict(filtered_value, visited, max_depth, next_depth)
            return result
        except Exception:
            return safe_str_repr(obj)
        finally:
            visited.discard(obj_id)

    # Handle objects with __dict__
    if hasattr(obj, "__dict__"):
        visited.add(obj_id)
        try:
            result = {}
            for key, value in obj.__dict__.items():
                # Skip private attributes that start with __
                if isinstance(key, str) and key.startswith("__"):
                    continue
                filtered_value = _apply_data_filters(key, value)
                result[key] = object_to_dict(filtered_value, visited, max_depth, next_depth)
            return result
        except Exception:
            return safe_str_repr(obj)
        finally:
            visited.discard(obj_id)

    # Handle objects with __slots__
    if hasattr(obj, "__slots__"):
        visited.add(obj_id)
        try:
            slot_names = obj.__slots__
            if isinstance(slot_names, str):
                # __slots__ = "name" declares one slot, not one per character
                slot_names = (slot_names,)
            result = {}
            for slot in slot_names:
                if hasattr(obj, slot):  # declared slots may be unset
                    value = getattr(obj, slot, None)
                    filtered_value = _apply_data_filters(slot, value)
                    result[slot] = object_to_dict(filtered_value, visited, max_depth, next_depth)
            return result
        except Exception:
            return safe_str_repr(obj)
        finally:
            visited.discard(obj_id)

    # Fallback: try to get a few common attributes
    try:
        result = {}
        for attr in ["name", "id", "value", "type", "status"]:
            if hasattr(obj, attr):
                value = getattr(obj, attr, None)
                filtered_value = _apply_data_filters(attr, value)
                result[attr] = object_to_dict(filtered_value, visited, max_depth, next_depth)
        if result:
            return result
    except Exception:
        pass

    # Final fallback: string representation
    return safe_str_repr(obj)
|
|
280
|
+
|
|
281
|
+
|
|
282
|
+
def safe_json_dumps(value: Any) -> str:
    """
    Safely serialize a value to JSON string with safeguards against:
    - Circular references
    - Large objects (size limits)
    - Unconvertible parts

    The value is first converted with ``object_to_dict`` and the result is
    passed to ``json.dumps``. Output longer than the module-level
    AIQA_MAX_OBJECT_STR_CHARS limit is replaced by a short
    "<object ... too large ...>" summary holding the first 100 characters.

    Args:
        value: The value to serialize

    Returns:
        JSON string representation, a "too large" summary string, or the
        ``safe_str_repr`` of ``value`` when serialization fails.
    """
    max_size_chars = AIQA_MAX_OBJECT_STR_CHARS
    visited: Set[int] = set()

    # Convert the entire structure to ensure circular references are detected
    # across the whole object graph
    try:
        converted = object_to_dict(value, visited)
    except Exception as conversion_error:
        # If conversion fails, try with a fresh visited set and json default handler.
        # The exception is bound to a name so the summary below can report it;
        # referencing an unbound name here would raise NameError and silently
        # send the result down the repr fallback instead.
        try:
            json_str = json.dumps(value, default=json_default_handler_factory(set()))
            if len(json_str) > max_size_chars:
                return f"<object {type(value)} too large: {len(json_str)} chars (limit: {max_size_chars} chars) begins: {json_str[:100]}... conversion error: {conversion_error}>"
            return json_str
        except Exception:
            return safe_str_repr(value)

    # Try JSON serialization of the converted structure
    try:
        json_str = json.dumps(converted, default=json_default_handler_factory(set()))
        # Check size
        if len(json_str) > max_size_chars:
            return f"<object {type(value)} too large: {len(json_str)} chars (limit: {max_size_chars} chars) begins: {json_str[:100]}...>"
        return json_str
    except Exception:
        # Final fallback
        return safe_str_repr(value)
|
|
324
|
+
|
|
325
|
+
|
|
326
|
+
def json_default_handler_factory(visited: Set[int]) -> Callable[[Any], Any]:
    """
    Build a ``default=`` callback for ``json.dumps``.

    The returned handler turns date/time values into ISO strings, decodes
    bytes as UTF-8 (or describes them by length when they are not valid
    UTF-8), and converts everything else with ``object_to_dict``. All calls
    share the given ``visited`` set for circular reference detection.
    """
    def handler(obj: Any) -> Any:
        # datetime, date and time all expose isoformat()
        if isinstance(obj, (datetime, date, time)):
            return obj.isoformat()

        if isinstance(obj, bytes):
            try:
                text = obj.decode('utf-8')
            except UnicodeDecodeError:
                text = f"<bytes: {len(obj)} bytes>"
            return text

        # Generic objects: convert using the shared visited set
        try:
            converted = object_to_dict(obj, visited)
        except Exception:
            converted = safe_str_repr(obj)
        return converted

    return handler
|
|
353
|
+
|
|
354
|
+
|
|
355
|
+
def json_default_handler(obj: Any) -> Any:
    """
    Standalone ``default=`` callback for JSON serialization.

    Builds a handler with its own empty visited set on every call and
    applies it to ``obj``.
    """
    handler = json_default_handler_factory(set())
    return handler(obj)
|
|
361
|
+
|
|
@@ -0,0 +1,176 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Example usage of the ExperimentRunner class.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
import os
|
|
7
|
+
from dotenv import load_dotenv
|
|
8
|
+
from aiqa import ExperimentRunner
|
|
9
|
+
|
|
10
|
+
# Load environment variables
|
|
11
|
+
load_dotenv()
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
# A dummy test engine that returns a dummy response
|
|
15
|
+
async def my_engine(input_data):
    """
    Dummy engine that imitates a slow API call.

    Waits between 0.5 and 1 second, then returns a dictionary shaped like an
    OpenAI chat response whose message content is "hello <input_data>".

    Note: For run(), the engine only takes input_data.
    For run_example(), you can use an engine that takes (input_data, parameters).
    """
    import random

    # Simulated latency: random pause of about 0.5 - 1 seconds
    await asyncio.sleep(0.5 + 0.5 * random.random())

    message = {"content": f"hello {input_data}"}
    return {"choices": [{"message": message}]}
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
async def scorer(output, example):
    """
    Placeholder scorer that returns an empty score dictionary.

    A real scorer would evaluate ``output`` against ``example`` using the
    metrics defined on the dataset.

    Note: For run(), the scorer only takes (output, example).
    For run_example(), you can use a scorer that takes (output, example, parameters).
    """
    # Add your scoring logic here; nothing is scored in this example.
    return {}
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
async def example_basic_usage():
    """
    Walk through the simplest ExperimentRunner flow.

    Prints the dataset's metrics, runs every example input through
    my_engine, scores it, and prints per-example and summary results.
    """
    api_key = os.getenv("AIQA_API_KEY")
    if not api_key:
        print("Warning: AIQA_API_KEY environment variable is not set. Example may fail.")

    runner = ExperimentRunner(
        dataset_id="your-dataset-id-here",
        organisation_id="your-organisation-id-here",
    )

    # Metrics come from the dataset definition
    metrics = runner.get_dataset().get("metrics", [])
    print(f"Found {len(metrics)} metrics in dataset: {[m['name'] for m in metrics]}")

    # Scorer for all metrics of the dataset. This is a placeholder that
    # delegates to the module-level scorer - implement based on your metrics.
    async def dataset_scorer(output, example):
        return await scorer(output, example)

    inputs = runner.get_example_inputs()
    print(f"Processing {len(inputs)} examples")

    # One run per example
    for item in inputs:
        outcome = await runner.run_example(item, my_engine, dataset_scorer)
        if outcome and len(outcome) > 0:
            print(f"Scored example {item['id']}: {outcome}")
        else:
            print(f"No results for example {item['id']}")

    # Aggregate results across all examples
    print(f"Summary results: {runner.get_summary_results()}")
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
async def example_with_experiment_setup():
    """
    Create an experiment with explicit settings, then run it.

    The experiment is given a name, base parameters and two comparison
    parameter sets before run() is called with my_engine and scorer.
    """
    runner = ExperimentRunner(
        dataset_id="your-dataset-id-here",
        organisation_id="your-organisation-id-here",
    )

    # Custom experiment definition
    base_parameters = {
        "model": "gpt-4",
        "temperature": 0.7,
    }
    comparisons = [
        {"temperature": 0.5},
        {"temperature": 0.9},
    ]
    setup = {
        "name": "My Custom Experiment",
        "parameters": base_parameters,
        "comparison_parameters": comparisons,
    }
    created = runner.create_experiment(setup)

    print(f"Created experiment: {created['id']}")

    # Run the experiment that was just created
    await runner.run(my_engine, scorer)
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
async def example_stepwise():
    """
    Run an experiment one example at a time (more control).

    Fetches up to 100 example inputs, runs each through run_example() with
    a parameter-aware scorer, reports failures per example, and prints the
    final summary.
    """
    runner = ExperimentRunner(
        dataset_id="your-dataset-id-here",
        organisation_id="your-organisation-id-here",
    )

    # Inspect the dataset first
    metrics = runner.get_dataset().get("metrics", [])
    print(f"Found {len(metrics)} metrics in dataset")

    # run_example() passes parameters, so this scorer accepts them too
    async def my_scorer(output, example, parameters):
        # Implement your scoring logic here
        return {"score": 0.8}  # Placeholder

    batch = runner.get_example_inputs(limit=100)
    print(f"Processing {len(batch)} examples")

    # A failing example is reported and does not stop the remaining ones
    for item in batch:
        try:
            outcome = await runner.run_example(item, my_engine, my_scorer)
            print(f"Example {item['id']} completed: {outcome}")
        except Exception as e:
            print(f"Example {item['id']} failed: {e}")

    # Final summary
    print(f"Final summary: {runner.get_summary_results()}")
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
if __name__ == "__main__":
    # Pick an example by uncommenting exactly one of these lines:
    # asyncio.run(example_basic_usage())
    # asyncio.run(example_with_experiment_setup())
    # asyncio.run(example_stepwise())
    for notice in (
        "Please uncomment one of the examples above to run it.",
        "Make sure to set your dataset_id and organisation_id in the example functions.",
    ):
        print(notice)
|
|
176
|
+
|