aiqa-client 0.1.2__py3-none-any.whl → 0.1.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aiqa/__init__.py +7 -44
- aiqa/aiqa_exporter.py +21 -79
- aiqa/tracing.py +170 -1107
- {aiqa_client-0.1.2.dist-info → aiqa_client-0.1.4.dist-info}/METADATA +4 -94
- aiqa_client-0.1.4.dist-info/RECORD +9 -0
- aiqa/client.py +0 -170
- aiqa/experiment_runner.py +0 -336
- aiqa/object_serialiser.py +0 -361
- aiqa/test_experiment_runner.py +0 -176
- aiqa/test_tracing.py +0 -230
- aiqa_client-0.1.2.dist-info/RECORD +0 -14
- {aiqa_client-0.1.2.dist-info → aiqa_client-0.1.4.dist-info}/WHEEL +0 -0
- {aiqa_client-0.1.2.dist-info → aiqa_client-0.1.4.dist-info}/licenses/LICENSE +0 -0
- {aiqa_client-0.1.2.dist-info → aiqa_client-0.1.4.dist-info}/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: aiqa-client
|
|
3
|
-
Version: 0.1.2
|
|
3
|
+
Version: 0.1.4
|
|
4
4
|
Summary: OpenTelemetry-based Python client for tracing functions and sending traces to the AIQA server
|
|
5
5
|
Author-email: AIQA <info@aiqa.dev>
|
|
6
6
|
License: MIT
|
|
@@ -72,15 +72,7 @@ export AIQA_API_KEY="your-api-key"
|
|
|
72
72
|
### Basic Usage
|
|
73
73
|
|
|
74
74
|
```python
|
|
75
|
-
from dotenv import load_dotenv
|
|
76
|
-
from aiqa import get_aiqa_client, WithTracing
|
|
77
|
-
|
|
78
|
-
# Load environment variables from .env file (if using one)
|
|
79
|
-
load_dotenv()
|
|
80
|
-
|
|
81
|
-
# Initialize client (must be called before using WithTracing)
|
|
82
|
-
# This loads environment variables and initializes the tracing system
|
|
83
|
-
get_aiqa_client()
|
|
75
|
+
from aiqa import WithTracing
|
|
84
76
|
|
|
85
77
|
@WithTracing
|
|
86
78
|
def my_function(x, y):
|
|
@@ -116,12 +108,12 @@ def my_function(data):
|
|
|
116
108
|
Spans are automatically flushed every 5 seconds. To flush immediately:
|
|
117
109
|
|
|
118
110
|
```python
|
|
119
|
-
from aiqa import
|
|
111
|
+
from aiqa import flush_spans
|
|
120
112
|
import asyncio
|
|
121
113
|
|
|
122
114
|
async def main():
|
|
123
115
|
# Your code here
|
|
124
|
-
await
|
|
116
|
+
await flush_spans()
|
|
125
117
|
|
|
126
118
|
asyncio.run(main())
|
|
127
119
|
```
|
|
@@ -152,87 +144,6 @@ def my_function():
|
|
|
152
144
|
# ... rest of function
|
|
153
145
|
```
|
|
154
146
|
|
|
155
|
-
### Grouping Traces by Conversation
|
|
156
|
-
|
|
157
|
-
To group multiple traces together that are part of the same conversation or session:
|
|
158
|
-
|
|
159
|
-
```python
|
|
160
|
-
from aiqa import WithTracing, set_conversation_id
|
|
161
|
-
|
|
162
|
-
@WithTracing
|
|
163
|
-
def handle_user_request(user_id: str, session_id: str):
|
|
164
|
-
# Set conversation ID to group all traces for this user session
|
|
165
|
-
set_conversation_id(f"user_{user_id}_session_{session_id}")
|
|
166
|
-
# All spans created in this function and its children will have this gen_ai.conversation.id
|
|
167
|
-
# ... rest of function
|
|
168
|
-
```
|
|
169
|
-
|
|
170
|
-
The `gen_ai.conversation.id` attribute allows you to filter and group traces in the AIQA server by conversation, making it easier to analyze multi-step interactions or user sessions. See the [OpenTelemetry GenAI Events specification](https://opentelemetry.io/docs/specs/semconv/gen-ai/gen-ai-events/) for more details.
|
|
171
|
-
|
|
172
|
-
### Trace ID Propagation Across Services/Agents
|
|
173
|
-
|
|
174
|
-
To link traces across different services or agents, you can extract and propagate trace IDs:
|
|
175
|
-
|
|
176
|
-
#### Getting Current Trace ID
|
|
177
|
-
|
|
178
|
-
```python
|
|
179
|
-
from aiqa import get_trace_id, get_span_id
|
|
180
|
-
|
|
181
|
-
# Get the current trace ID and span ID
|
|
182
|
-
trace_id = get_trace_id() # Returns hex string (32 chars) or None
|
|
183
|
-
span_id = get_span_id() # Returns hex string (16 chars) or None
|
|
184
|
-
|
|
185
|
-
# Pass these to another service (e.g., in HTTP headers, message queue, etc.)
|
|
186
|
-
```
|
|
187
|
-
|
|
188
|
-
#### Continuing a Trace in Another Service
|
|
189
|
-
|
|
190
|
-
```python
|
|
191
|
-
from aiqa import create_span_from_trace_id
|
|
192
|
-
|
|
193
|
-
# Continue a trace from another service/agent
|
|
194
|
-
# trace_id and parent_span_id come from the other service
|
|
195
|
-
with create_span_from_trace_id(
|
|
196
|
-
trace_id="abc123...",
|
|
197
|
-
parent_span_id="def456...",
|
|
198
|
-
span_name="service_b_operation"
|
|
199
|
-
):
|
|
200
|
-
# Your code here - this span will be linked to the original trace
|
|
201
|
-
pass
|
|
202
|
-
```
|
|
203
|
-
|
|
204
|
-
#### Using OpenTelemetry Context Propagation (Recommended)
|
|
205
|
-
|
|
206
|
-
For HTTP requests, use the built-in context propagation:
|
|
207
|
-
|
|
208
|
-
```python
|
|
209
|
-
from aiqa import inject_trace_context, extract_trace_context
|
|
210
|
-
import requests
|
|
211
|
-
from opentelemetry.trace import use_span
|
|
212
|
-
|
|
213
|
-
# In the sending service:
|
|
214
|
-
headers = {}
|
|
215
|
-
inject_trace_context(headers) # Adds trace context to headers
|
|
216
|
-
response = requests.get("http://other-service/api", headers=headers)
|
|
217
|
-
|
|
218
|
-
# In the receiving service:
|
|
219
|
-
# Extract context from incoming request headers
|
|
220
|
-
ctx = extract_trace_context(request.headers)
|
|
221
|
-
|
|
222
|
-
# Use the context to create a span
|
|
223
|
-
from opentelemetry.trace import use_span
|
|
224
|
-
with use_span(ctx):
|
|
225
|
-
# Your code here
|
|
226
|
-
pass
|
|
227
|
-
|
|
228
|
-
# Or create a span with the context
|
|
229
|
-
from opentelemetry import trace
|
|
230
|
-
tracer = trace.get_tracer("aiqa-tracer")
|
|
231
|
-
with tracer.start_as_current_span("operation", context=ctx):
|
|
232
|
-
# Your code here
|
|
233
|
-
pass
|
|
234
|
-
```
|
|
235
|
-
|
|
236
147
|
## Features
|
|
237
148
|
|
|
238
149
|
- Automatic tracing of function calls (sync and async)
|
|
@@ -240,7 +151,6 @@ with tracer.start_as_current_span("operation", context=ctx):
|
|
|
240
151
|
- Automatic error tracking and exception recording
|
|
241
152
|
- Thread-safe span buffering and auto-flushing
|
|
242
153
|
- OpenTelemetry context propagation for nested spans
|
|
243
|
-
- Trace ID propagation utilities for distributed tracing
|
|
244
154
|
|
|
245
155
|
## Example
|
|
246
156
|
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
aiqa/__init__.py,sha256=MwDG0A3kszFsknTCxQ3l8lX1QB2soTMSMZ2YhI7FcYU,470
|
|
2
|
+
aiqa/aiqa_exporter.py,sha256=vXyX6Q_iOjrDz3tCPOMXuBTQg7ocACdOOqzpkUqhy9g,19131
|
|
3
|
+
aiqa/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
4
|
+
aiqa/tracing.py,sha256=Aq4VbX6czSO7LtIcA6tTXghtw9-upZ802g3DNylzoq8,12680
|
|
5
|
+
aiqa_client-0.1.4.dist-info/licenses/LICENSE,sha256=kIzkzLuzG0HHaWYm4F4W5FeJ1Yxut3Ec6bhLWyw798A,1062
|
|
6
|
+
aiqa_client-0.1.4.dist-info/METADATA,sha256=WU-tzSni5NhyJfUHdq4f0lX6JHOPW0VRGrtkW6XL7mo,3772
|
|
7
|
+
aiqa_client-0.1.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
8
|
+
aiqa_client-0.1.4.dist-info/top_level.txt,sha256=nwcsuVVSuWu27iLxZd4n1evVzv1W6FVTrSnCXCc-NQs,5
|
|
9
|
+
aiqa_client-0.1.4.dist-info/RECORD,,
|
aiqa/client.py
DELETED
|
@@ -1,170 +0,0 @@
|
|
|
1
|
-
# aiqa/client.py
|
|
2
|
-
import os
|
|
3
|
-
import logging
|
|
4
|
-
from functools import lru_cache
|
|
5
|
-
from opentelemetry import trace
|
|
6
|
-
from opentelemetry.sdk.trace import TracerProvider
|
|
7
|
-
from opentelemetry.sdk.trace.export import BatchSpanProcessor
|
|
8
|
-
|
|
9
|
-
logger = logging.getLogger("AIQA")
|
|
10
|
-
|
|
11
|
-
# Compatibility import for TraceIdRatioBased sampler
|
|
12
|
-
# In older OpenTelemetry versions it was TraceIdRatioBasedSampler
|
|
13
|
-
# In newer versions (>=1.24.0) it's TraceIdRatioBased
|
|
14
|
-
TraceIdRatioBased = None
|
|
15
|
-
try:
|
|
16
|
-
from opentelemetry.sdk.trace.sampling import TraceIdRatioBased
|
|
17
|
-
except ImportError:
|
|
18
|
-
try:
|
|
19
|
-
from opentelemetry.sdk.trace.sampling import TraceIdRatioBasedSampler as TraceIdRatioBased
|
|
20
|
-
except ImportError:
|
|
21
|
-
logger.warning(
|
|
22
|
-
"Could not import TraceIdRatioBased or TraceIdRatioBasedSampler from "
|
|
23
|
-
"opentelemetry.sdk.trace.sampling. AIQA tracing may not work correctly. "
|
|
24
|
-
"Please ensure opentelemetry-sdk>=1.24.0 is installed. "
|
|
25
|
-
"Try: pip install --upgrade opentelemetry-sdk"
|
|
26
|
-
)
|
|
27
|
-
# Set to None so we can check later
|
|
28
|
-
TraceIdRatioBased = None
|
|
29
|
-
|
|
30
|
-
from .aiqa_exporter import AIQASpanExporter
|
|
31
|
-
|
|
32
|
-
AIQA_TRACER_NAME = "aiqa-tracer"
|
|
33
|
-
|
|
34
|
-
client = {
|
|
35
|
-
"provider": None,
|
|
36
|
-
"exporter": None,
|
|
37
|
-
}
|
|
38
|
-
|
|
39
|
-
# Component tag to add to all spans (can be set via AIQA_COMPONENT_TAG env var or programmatically)
|
|
40
|
-
_component_tag: str = ""
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
def get_component_tag() -> str:
|
|
44
|
-
"""Get the current component tag."""
|
|
45
|
-
return _component_tag
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
def set_component_tag(tag: str | None) -> None:
|
|
49
|
-
"""Set the component tag programmatically (overrides environment variable)."""
|
|
50
|
-
global _component_tag
|
|
51
|
-
_component_tag = tag or ""
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
@lru_cache(maxsize=1)
|
|
55
|
-
def get_aiqa_client():
|
|
56
|
-
"""
|
|
57
|
-
Initialize and return the AIQA client.
|
|
58
|
-
|
|
59
|
-
This function must be called before using any AIQA tracing functionality to ensure
|
|
60
|
-
that environment variables (such as AIQA_SERVER_URL, AIQA_API_KEY, AIQA_COMPONENT_TAG)
|
|
61
|
-
are properly loaded and the tracing system is initialized.
|
|
62
|
-
|
|
63
|
-
The function is idempotent - calling it multiple times is safe and will only
|
|
64
|
-
initialize once.
|
|
65
|
-
|
|
66
|
-
Example:
|
|
67
|
-
from aiqa import get_aiqa_client, WithTracing
|
|
68
|
-
|
|
69
|
-
# Initialize client (loads env vars)
|
|
70
|
-
get_aiqa_client()
|
|
71
|
-
|
|
72
|
-
@WithTracing
|
|
73
|
-
def my_function():
|
|
74
|
-
pass
|
|
75
|
-
"""
|
|
76
|
-
global client
|
|
77
|
-
try:
|
|
78
|
-
_init_tracing()
|
|
79
|
-
except Exception as e:
|
|
80
|
-
logger.error(f"Failed to initialize AIQA tracing: {e}")
|
|
81
|
-
logger.warning("AIQA tracing is disabled. Your application will continue to run without tracing.")
|
|
82
|
-
# optionally return a richer client object; for now you just need init
|
|
83
|
-
return client
|
|
84
|
-
|
|
85
|
-
def _init_tracing():
|
|
86
|
-
"""Initialize tracing system and load configuration from environment variables."""
|
|
87
|
-
try:
|
|
88
|
-
# Initialize component tag from environment variable
|
|
89
|
-
set_component_tag(os.getenv("AIQA_COMPONENT_TAG", None))
|
|
90
|
-
|
|
91
|
-
provider = trace.get_tracer_provider()
|
|
92
|
-
|
|
93
|
-
# Get sampling rate from environment (default: 1.0 = sample all)
|
|
94
|
-
sampling_rate = 1.0
|
|
95
|
-
if env_rate := os.getenv("AIQA_SAMPLING_RATE"):
|
|
96
|
-
try:
|
|
97
|
-
rate = float(env_rate)
|
|
98
|
-
sampling_rate = max(0.0, min(1.0, rate)) # Clamp to [0, 1]
|
|
99
|
-
except ValueError:
|
|
100
|
-
logger.warning(f"Invalid AIQA_SAMPLING_RATE value '{env_rate}', using default 1.0")
|
|
101
|
-
|
|
102
|
-
# If it's still the default proxy, install a real SDK provider
|
|
103
|
-
if not isinstance(provider, TracerProvider):
|
|
104
|
-
if TraceIdRatioBased is None:
|
|
105
|
-
raise ImportError(
|
|
106
|
-
"TraceIdRatioBased sampler is not available. "
|
|
107
|
-
"Please install opentelemetry-sdk>=1.24.0"
|
|
108
|
-
)
|
|
109
|
-
|
|
110
|
-
# Create sampler based on trace-id for deterministic sampling
|
|
111
|
-
sampler = TraceIdRatioBased(sampling_rate)
|
|
112
|
-
provider = TracerProvider(sampler=sampler)
|
|
113
|
-
trace.set_tracer_provider(provider)
|
|
114
|
-
|
|
115
|
-
# Idempotently add your processor
|
|
116
|
-
_attach_aiqa_processor(provider)
|
|
117
|
-
global client
|
|
118
|
-
client["provider"] = provider
|
|
119
|
-
|
|
120
|
-
# Log successful initialization
|
|
121
|
-
server_url = os.getenv("AIQA_SERVER_URL", "not configured")
|
|
122
|
-
logger.info(f"AIQA initialized and tracing (sampling rate: {sampling_rate:.2f}, server: {server_url})")
|
|
123
|
-
|
|
124
|
-
except Exception as e:
|
|
125
|
-
logger.error(f"Error initializing AIQA tracing: {e}")
|
|
126
|
-
raise
|
|
127
|
-
|
|
128
|
-
def _attach_aiqa_processor(provider: TracerProvider):
|
|
129
|
-
"""Attach AIQA span processor to the provider. Idempotent - safe to call multiple times."""
|
|
130
|
-
try:
|
|
131
|
-
# Avoid double-adding if get_aiqa_client() is called multiple times
|
|
132
|
-
for p in provider._active_span_processor._span_processors:
|
|
133
|
-
if isinstance(getattr(p, "exporter", None), AIQASpanExporter):
|
|
134
|
-
logger.debug("AIQA span processor already attached, skipping")
|
|
135
|
-
return
|
|
136
|
-
|
|
137
|
-
exporter = AIQASpanExporter(
|
|
138
|
-
server_url=os.getenv("AIQA_SERVER_URL"),
|
|
139
|
-
api_key=os.getenv("AIQA_API_KEY"),
|
|
140
|
-
)
|
|
141
|
-
provider.add_span_processor(BatchSpanProcessor(exporter))
|
|
142
|
-
global client
|
|
143
|
-
client["exporter"] = exporter
|
|
144
|
-
logger.debug("AIQA span processor attached successfully")
|
|
145
|
-
except Exception as e:
|
|
146
|
-
logger.error(f"Error attaching AIQA span processor: {e}")
|
|
147
|
-
# Re-raise to let _init_tracing handle it - it will log and continue
|
|
148
|
-
raise
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
def get_aiqa_tracer():
|
|
152
|
-
"""
|
|
153
|
-
Get the AIQA tracer with version from __init__.py __version__.
|
|
154
|
-
This should be used instead of trace.get_tracer() to ensure version is set.
|
|
155
|
-
"""
|
|
156
|
-
try:
|
|
157
|
-
# Import here to avoid circular import
|
|
158
|
-
from . import __version__
|
|
159
|
-
|
|
160
|
-
# Compatibility: version parameter may not be supported in older OpenTelemetry versions
|
|
161
|
-
try:
|
|
162
|
-
# Try with version parameter (newer OpenTelemetry versions)
|
|
163
|
-
return trace.get_tracer(AIQA_TRACER_NAME, version=__version__)
|
|
164
|
-
except TypeError:
|
|
165
|
-
# Fall back to without version parameter (older versions)
|
|
166
|
-
return trace.get_tracer(AIQA_TRACER_NAME)
|
|
167
|
-
except Exception as e:
|
|
168
|
-
logger.error(f"Error getting AIQA tracer: {e}")
|
|
169
|
-
# Return a basic tracer as fallback to prevent crashes
|
|
170
|
-
return trace.get_tracer(AIQA_TRACER_NAME)
|
aiqa/experiment_runner.py
DELETED
|
@@ -1,336 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
ExperimentRunner - runs experiments on datasets and scores results
|
|
3
|
-
"""
|
|
4
|
-
|
|
5
|
-
import os
|
|
6
|
-
import time
|
|
7
|
-
from typing import Any, Dict, List, Optional, Callable, Awaitable, Union
|
|
8
|
-
import requests
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
class ExperimentRunner:
|
|
12
|
-
"""
|
|
13
|
-
The ExperimentRunner is the main class for running experiments on datasets.
|
|
14
|
-
It can create an experiment, run it, and score the results.
|
|
15
|
-
Handles setting up environment variables and passing parameters to the engine function.
|
|
16
|
-
"""
|
|
17
|
-
|
|
18
|
-
def __init__(
|
|
19
|
-
self,
|
|
20
|
-
dataset_id: str,
|
|
21
|
-
experiment_id: Optional[str] = None,
|
|
22
|
-
server_url: Optional[str] = None,
|
|
23
|
-
api_key: Optional[str] = None,
|
|
24
|
-
organisation_id: Optional[str] = None,
|
|
25
|
-
):
|
|
26
|
-
"""
|
|
27
|
-
Initialize the ExperimentRunner.
|
|
28
|
-
|
|
29
|
-
Args:
|
|
30
|
-
dataset_id: ID of the dataset to run experiments on
|
|
31
|
-
experiment_id: Usually unset, and a fresh experiment is created with a random ID
|
|
32
|
-
server_url: URL of the AIQA server (defaults to AIQA_SERVER_URL env var)
|
|
33
|
-
api_key: API key for authentication (defaults to AIQA_API_KEY env var)
|
|
34
|
-
organisation_id: Organisation ID for the experiment
|
|
35
|
-
"""
|
|
36
|
-
self.dataset_id = dataset_id
|
|
37
|
-
self.experiment_id = experiment_id
|
|
38
|
-
self.server_url = (server_url or os.getenv("AIQA_SERVER_URL", "")).rstrip("/")
|
|
39
|
-
self.api_key = api_key or os.getenv("AIQA_API_KEY", "")
|
|
40
|
-
self.organisation = organisation_id
|
|
41
|
-
self.experiment: Optional[Dict[str, Any]] = None
|
|
42
|
-
self.scores: List[Dict[str, Any]] = []
|
|
43
|
-
|
|
44
|
-
def _get_headers(self) -> Dict[str, str]:
|
|
45
|
-
"""Build HTTP headers for API requests."""
|
|
46
|
-
headers = {"Content-Type": "application/json"}
|
|
47
|
-
if self.api_key:
|
|
48
|
-
headers["Authorization"] = f"ApiKey {self.api_key}"
|
|
49
|
-
return headers
|
|
50
|
-
|
|
51
|
-
def get_dataset(self) -> Dict[str, Any]:
|
|
52
|
-
"""
|
|
53
|
-
Fetch the dataset to get its metrics.
|
|
54
|
-
|
|
55
|
-
Returns:
|
|
56
|
-
The dataset object with metrics and other information
|
|
57
|
-
"""
|
|
58
|
-
response = requests.get(
|
|
59
|
-
f"{self.server_url}/dataset/{self.dataset_id}",
|
|
60
|
-
headers=self._get_headers(),
|
|
61
|
-
)
|
|
62
|
-
|
|
63
|
-
if not response.ok:
|
|
64
|
-
error_text = response.text if hasattr(response, "text") else "Unknown error"
|
|
65
|
-
raise Exception(
|
|
66
|
-
f"Failed to fetch dataset: {response.status_code} {response.reason} - {error_text}"
|
|
67
|
-
)
|
|
68
|
-
|
|
69
|
-
return response.json()
|
|
70
|
-
|
|
71
|
-
def get_example_inputs(self, limit: int = 10000) -> List[Dict[str, Any]]:
|
|
72
|
-
"""
|
|
73
|
-
Fetch example inputs from the dataset.
|
|
74
|
-
|
|
75
|
-
Args:
|
|
76
|
-
limit: Maximum number of examples to fetch (default: 10000)
|
|
77
|
-
|
|
78
|
-
Returns:
|
|
79
|
-
List of example objects
|
|
80
|
-
"""
|
|
81
|
-
params = {
|
|
82
|
-
"dataset_id": self.dataset_id,
|
|
83
|
-
"limit": str(limit),
|
|
84
|
-
}
|
|
85
|
-
if self.organisation:
|
|
86
|
-
params["organisation"] = self.organisation
|
|
87
|
-
|
|
88
|
-
response = requests.get(
|
|
89
|
-
f"{self.server_url}/example",
|
|
90
|
-
params=params,
|
|
91
|
-
headers=self._get_headers(),
|
|
92
|
-
)
|
|
93
|
-
|
|
94
|
-
if not response.ok:
|
|
95
|
-
error_text = response.text if hasattr(response, "text") else "Unknown error"
|
|
96
|
-
raise Exception(
|
|
97
|
-
f"Failed to fetch example inputs: {response.status_code} {response.reason} - {error_text}"
|
|
98
|
-
)
|
|
99
|
-
|
|
100
|
-
data = response.json()
|
|
101
|
-
return data.get("hits", [])
|
|
102
|
-
|
|
103
|
-
def create_experiment(
|
|
104
|
-
self, experiment_setup: Optional[Dict[str, Any]] = None
|
|
105
|
-
) -> Dict[str, Any]:
|
|
106
|
-
"""
|
|
107
|
-
Create an experiment if one does not exist.
|
|
108
|
-
|
|
109
|
-
Args:
|
|
110
|
-
experiment_setup: Optional setup for the experiment object. You may wish to set:
|
|
111
|
-
- name (recommended for labelling the experiment)
|
|
112
|
-
- parameters
|
|
113
|
-
- comparison_parameters
|
|
114
|
-
|
|
115
|
-
Returns:
|
|
116
|
-
The created experiment object
|
|
117
|
-
"""
|
|
118
|
-
if not self.organisation or not self.dataset_id:
|
|
119
|
-
raise Exception("Organisation and dataset ID are required to create an experiment")
|
|
120
|
-
|
|
121
|
-
if not experiment_setup:
|
|
122
|
-
experiment_setup = {}
|
|
123
|
-
|
|
124
|
-
# Fill in if not set
|
|
125
|
-
experiment_setup = {
|
|
126
|
-
**experiment_setup,
|
|
127
|
-
"organisation": self.organisation,
|
|
128
|
-
"dataset": self.dataset_id,
|
|
129
|
-
"results": [],
|
|
130
|
-
"summary_results": {},
|
|
131
|
-
}
|
|
132
|
-
|
|
133
|
-
print("Creating experiment")
|
|
134
|
-
response = requests.post(
|
|
135
|
-
f"{self.server_url}/experiment",
|
|
136
|
-
json=experiment_setup,
|
|
137
|
-
headers=self._get_headers(),
|
|
138
|
-
)
|
|
139
|
-
|
|
140
|
-
if not response.ok:
|
|
141
|
-
error_text = response.text if hasattr(response, "text") else "Unknown error"
|
|
142
|
-
raise Exception(
|
|
143
|
-
f"Failed to create experiment: {response.status_code} {response.reason} - {error_text}"
|
|
144
|
-
)
|
|
145
|
-
|
|
146
|
-
experiment = response.json()
|
|
147
|
-
self.experiment_id = experiment["id"]
|
|
148
|
-
self.experiment = experiment
|
|
149
|
-
return experiment
|
|
150
|
-
|
|
151
|
-
def score_and_store(
|
|
152
|
-
self,
|
|
153
|
-
example: Dict[str, Any],
|
|
154
|
-
result: Any,
|
|
155
|
-
scores: Optional[Dict[str, Any]] = None,
|
|
156
|
-
) -> Dict[str, Any]:
|
|
157
|
-
"""
|
|
158
|
-
Ask the server to score an example result. Stores the score for later summary calculation.
|
|
159
|
-
|
|
160
|
-
Args:
|
|
161
|
-
example: The example object
|
|
162
|
-
result: The output from running the engine on the example
|
|
163
|
-
scores: Optional pre-computed scores
|
|
164
|
-
|
|
165
|
-
Returns:
|
|
166
|
-
The score result from the server
|
|
167
|
-
"""
|
|
168
|
-
# Do we have an experiment ID? If not, we need to create the experiment first
|
|
169
|
-
if not self.experiment_id:
|
|
170
|
-
self.create_experiment()
|
|
171
|
-
|
|
172
|
-
if scores is None:
|
|
173
|
-
scores = {}
|
|
174
|
-
|
|
175
|
-
print(f"Scoring and storing example: {example['id']}")
|
|
176
|
-
print(f"Scores: {scores}")
|
|
177
|
-
|
|
178
|
-
response = requests.post(
|
|
179
|
-
f"{self.server_url}/experiment/{self.experiment_id}/example/{example['id']}/scoreAndStore",
|
|
180
|
-
json={
|
|
181
|
-
"output": result,
|
|
182
|
-
"traceId": example.get("traceId"),
|
|
183
|
-
"scores": scores,
|
|
184
|
-
},
|
|
185
|
-
headers=self._get_headers(),
|
|
186
|
-
)
|
|
187
|
-
|
|
188
|
-
if not response.ok:
|
|
189
|
-
error_text = response.text if hasattr(response, "text") else "Unknown error"
|
|
190
|
-
raise Exception(
|
|
191
|
-
f"Failed to score and store: {response.status_code} {response.reason} - {error_text}"
|
|
192
|
-
)
|
|
193
|
-
|
|
194
|
-
json_result = response.json()
|
|
195
|
-
print(f"scoreAndStore response: {json_result}")
|
|
196
|
-
return json_result
|
|
197
|
-
|
|
198
|
-
async def run(
|
|
199
|
-
self,
|
|
200
|
-
engine: Callable[[Any], Union[Any, Awaitable[Any]]],
|
|
201
|
-
scorer: Optional[
|
|
202
|
-
Callable[[Any, Dict[str, Any]], Awaitable[Dict[str, Any]]]
|
|
203
|
-
] = None,
|
|
204
|
-
) -> None:
|
|
205
|
-
"""
|
|
206
|
-
Run an engine function on all examples and score the results.
|
|
207
|
-
|
|
208
|
-
Args:
|
|
209
|
-
engine: Function that takes input, returns output (can be async)
|
|
210
|
-
scorer: Optional function that scores the output given the example
|
|
211
|
-
"""
|
|
212
|
-
examples = self.get_example_inputs()
|
|
213
|
-
|
|
214
|
-
# Wrap engine to match run_example signature (input, parameters)
|
|
215
|
-
def wrapped_engine(input_data, parameters):
|
|
216
|
-
return engine(input_data)
|
|
217
|
-
|
|
218
|
-
# Wrap scorer to match run_example signature (output, example, parameters)
|
|
219
|
-
async def wrapped_scorer(output, example, parameters):
|
|
220
|
-
if scorer:
|
|
221
|
-
return await scorer(output, example)
|
|
222
|
-
return {}
|
|
223
|
-
|
|
224
|
-
for example in examples:
|
|
225
|
-
scores = await self.run_example(example, wrapped_engine, wrapped_scorer)
|
|
226
|
-
if scores:
|
|
227
|
-
self.scores.append(
|
|
228
|
-
{
|
|
229
|
-
"example": example,
|
|
230
|
-
"result": scores,
|
|
231
|
-
"scores": scores,
|
|
232
|
-
}
|
|
233
|
-
)
|
|
234
|
-
|
|
235
|
-
async def run_example(
|
|
236
|
-
self,
|
|
237
|
-
example: Dict[str, Any],
|
|
238
|
-
call_my_code: Callable[[Any, Dict[str, Any]], Union[Any, Awaitable[Any]]],
|
|
239
|
-
score_this_output: Optional[
|
|
240
|
-
Callable[[Any, Dict[str, Any], Dict[str, Any]], Awaitable[Dict[str, Any]]]
|
|
241
|
-
] = None,
|
|
242
|
-
) -> List[Dict[str, Any]]:
|
|
243
|
-
"""
|
|
244
|
-
Run the engine on an example with the given parameters (looping over comparison parameters),
|
|
245
|
-
and score the result. Also calls scoreAndStore to store the result in the server.
|
|
246
|
-
|
|
247
|
-
Args:
|
|
248
|
-
example: The example to run
|
|
249
|
-
call_my_code: Function that takes input and parameters, returns output (can be async)
|
|
250
|
-
score_this_output: Optional function that scores the output given the example and parameters
|
|
251
|
-
|
|
252
|
-
Returns:
|
|
253
|
-
One set of scores for each comparison parameter set. If no comparison parameters,
|
|
254
|
-
returns an array of one.
|
|
255
|
-
"""
|
|
256
|
-
# Ensure experiment exists
|
|
257
|
-
if not self.experiment:
|
|
258
|
-
self.create_experiment()
|
|
259
|
-
if not self.experiment:
|
|
260
|
-
raise Exception("Failed to create experiment")
|
|
261
|
-
|
|
262
|
-
# Make the parameters
|
|
263
|
-
parameters_fixed = self.experiment.get("parameters") or {}
|
|
264
|
-
# If comparison_parameters is empty/undefined, default to [{}] so we run at least once
|
|
265
|
-
parameters_loop = self.experiment.get("comparison_parameters") or [{}]
|
|
266
|
-
|
|
267
|
-
# Handle both spans array and input field
|
|
268
|
-
input_data = example.get("input")
|
|
269
|
-
if not input_data and example.get("spans") and len(example["spans"]) > 0:
|
|
270
|
-
input_data = example["spans"][0].get("attributes", {}).get("input")
|
|
271
|
-
|
|
272
|
-
if not input_data:
|
|
273
|
-
print(
|
|
274
|
-
f"Warning: Example has no input field or spans with input attribute: {example}"
|
|
275
|
-
)
|
|
276
|
-
# Run engine anyway -- this could make sense if it's all about the parameters
|
|
277
|
-
|
|
278
|
-
all_scores: List[Dict[str, Any]] = []
|
|
279
|
-
# This loop should not be parallelized - it should run sequentially, one after the other
|
|
280
|
-
# to avoid creating interference between the runs.
|
|
281
|
-
for parameters in parameters_loop:
|
|
282
|
-
parameters_here = {**parameters_fixed, **parameters}
|
|
283
|
-
print(f"Running with parameters: {parameters_here}")
|
|
284
|
-
|
|
285
|
-
# Set env vars from parameters_here
|
|
286
|
-
for key, value in parameters_here.items():
|
|
287
|
-
if value:
|
|
288
|
-
os.environ[key] = str(value)
|
|
289
|
-
|
|
290
|
-
start = time.time() * 1000 # milliseconds
|
|
291
|
-
output = call_my_code(input_data, parameters_here)
|
|
292
|
-
# Handle async functions
|
|
293
|
-
if hasattr(output, "__await__"):
|
|
294
|
-
import asyncio
|
|
295
|
-
|
|
296
|
-
output = await output
|
|
297
|
-
end = time.time() * 1000 # milliseconds
|
|
298
|
-
duration = int(end - start)
|
|
299
|
-
|
|
300
|
-
print(f"Output: {output}")
|
|
301
|
-
|
|
302
|
-
scores: Dict[str, Any] = {}
|
|
303
|
-
if score_this_output:
|
|
304
|
-
scores = await score_this_output(output, example, parameters_here)
|
|
305
|
-
|
|
306
|
-
scores["duration"] = duration
|
|
307
|
-
|
|
308
|
-
# TODO: this call as async and wait for all to complete before returning
|
|
309
|
-
print(f"Call scoreAndStore ... for example: {example['id']} with scores: {scores}")
|
|
310
|
-
result = self.score_and_store(example, output, scores)
|
|
311
|
-
print(f"scoreAndStore returned: {result}")
|
|
312
|
-
all_scores.append(result)
|
|
313
|
-
|
|
314
|
-
return all_scores
|
|
315
|
-
|
|
316
|
-
def get_summary_results(self) -> Dict[str, Any]:
|
|
317
|
-
"""
|
|
318
|
-
Get summary results from the experiment.
|
|
319
|
-
|
|
320
|
-
Returns:
|
|
321
|
-
Dictionary of metric names to summary statistics
|
|
322
|
-
"""
|
|
323
|
-
response = requests.get(
|
|
324
|
-
f"{self.server_url}/experiment/{self.experiment_id}",
|
|
325
|
-
headers=self._get_headers(),
|
|
326
|
-
)
|
|
327
|
-
|
|
328
|
-
if not response.ok:
|
|
329
|
-
error_text = response.text if hasattr(response, "text") else "Unknown error"
|
|
330
|
-
raise Exception(
|
|
331
|
-
f"Failed to fetch summary results: {response.status_code} {response.reason} - {error_text}"
|
|
332
|
-
)
|
|
333
|
-
|
|
334
|
-
experiment2 = response.json()
|
|
335
|
-
return experiment2.get("summary_results", {})
|
|
336
|
-
|