agenta 0.72.4__py3-none-any.whl → 0.75.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agenta/__init__.py +9 -3
- agenta/sdk/__init__.py +2 -4
- agenta/sdk/agenta_init.py +22 -75
- agenta/sdk/context/serving.py +2 -0
- agenta/sdk/contexts/routing.py +2 -0
- agenta/sdk/contexts/running.py +3 -2
- agenta/sdk/decorators/running.py +8 -4
- agenta/sdk/decorators/serving.py +82 -41
- agenta/sdk/engines/tracing/inline.py +8 -1
- agenta/sdk/evaluations/preview/evaluate.py +36 -8
- agenta/sdk/evaluations/runs.py +2 -1
- agenta/sdk/litellm/mockllm.py +2 -2
- agenta/sdk/managers/config.py +3 -1
- agenta/sdk/managers/secrets.py +25 -8
- agenta/sdk/managers/testsets.py +143 -227
- agenta/sdk/middleware/vault.py +33 -18
- agenta/sdk/middlewares/running/vault.py +33 -17
- agenta/sdk/router.py +30 -5
- agenta/sdk/tracing/inline.py +8 -1
- agenta/sdk/types.py +13 -19
- agenta/sdk/utils/client.py +10 -9
- agenta/sdk/utils/lazy.py +253 -0
- agenta/sdk/workflows/builtin.py +2 -0
- agenta/sdk/workflows/configurations.py +1 -0
- agenta/sdk/workflows/handlers.py +236 -81
- agenta/sdk/workflows/interfaces.py +47 -0
- agenta/sdk/workflows/runners/base.py +6 -2
- agenta/sdk/workflows/runners/daytona.py +250 -131
- agenta/sdk/workflows/runners/local.py +22 -56
- agenta/sdk/workflows/runners/registry.py +1 -1
- agenta/sdk/workflows/sandbox.py +17 -5
- agenta/sdk/workflows/templates.py +81 -0
- agenta/sdk/workflows/utils.py +6 -0
- {agenta-0.72.4.dist-info → agenta-0.75.0.dist-info}/METADATA +4 -8
- {agenta-0.72.4.dist-info → agenta-0.75.0.dist-info}/RECORD +36 -36
- agenta/config.py +0 -25
- agenta/config.toml +0 -4
- {agenta-0.72.4.dist-info → agenta-0.75.0.dist-info}/WHEEL +0 -0
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
from abc import ABC, abstractmethod
|
|
2
|
-
from typing import Any, Dict, Union
|
|
2
|
+
from typing import Any, Dict, Union, Optional
|
|
3
3
|
|
|
4
4
|
|
|
5
5
|
class CodeRunner(ABC):
|
|
@@ -13,16 +13,20 @@ class CodeRunner(ABC):
|
|
|
13
13
|
inputs: Dict[str, Any],
|
|
14
14
|
output: Union[dict, str],
|
|
15
15
|
correct_answer: Any,
|
|
16
|
+
runtime: Optional[str] = None,
|
|
17
|
+
templates: Optional[Dict[str, str]] = None,
|
|
16
18
|
) -> Union[float, None]:
|
|
17
19
|
"""
|
|
18
20
|
Execute code and return a float score between 0 and 1.
|
|
19
21
|
|
|
20
22
|
Args:
|
|
21
|
-
code:
|
|
23
|
+
code: Code to execute
|
|
22
24
|
app_params: Application parameters
|
|
23
25
|
inputs: Input data for the code
|
|
24
26
|
output: Output from the application variant
|
|
25
27
|
correct_answer: Expected/correct answer for comparison
|
|
28
|
+
runtime: Runtime environment (python, javascript, typescript), None = python
|
|
29
|
+
templates: Wrapper templates keyed by runtime.
|
|
26
30
|
|
|
27
31
|
Returns:
|
|
28
32
|
Float score between 0 and 1, or None if execution fails
|
|
@@ -1,42 +1,55 @@
|
|
|
1
1
|
import os
|
|
2
2
|
import json
|
|
3
|
-
from
|
|
4
|
-
|
|
5
|
-
from daytona import Daytona, DaytonaConfig, Sandbox
|
|
3
|
+
from contextlib import contextmanager
|
|
4
|
+
from typing import Any, Dict, Generator, Union, Optional, TYPE_CHECKING
|
|
6
5
|
|
|
6
|
+
import agenta as ag
|
|
7
7
|
from agenta.sdk.workflows.runners.base import CodeRunner
|
|
8
|
+
from agenta.sdk.contexts.running import RunningContext
|
|
9
|
+
from agenta.sdk.utils.lazy import _load_daytona
|
|
8
10
|
|
|
9
11
|
from agenta.sdk.utils.logging import get_module_logger
|
|
10
12
|
|
|
13
|
+
if TYPE_CHECKING:
|
|
14
|
+
from daytona import Sandbox
|
|
15
|
+
|
|
11
16
|
log = get_module_logger(__name__)
|
|
12
17
|
|
|
13
|
-
# Template for wrapping user code with evaluation context
|
|
14
|
-
EVALUATION_CODE_TEMPLATE = """
|
|
15
|
-
import json
|
|
16
18
|
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
19
|
+
def _extract_error_message(error_text: str) -> str:
|
|
20
|
+
"""Extract a clean error message from a Python traceback.
|
|
21
|
+
|
|
22
|
+
Given a full traceback string, extracts just the final error line
|
|
23
|
+
(e.g., "NameError: name 'foo' is not defined") instead of the full
|
|
24
|
+
noisy traceback with base64-encoded code.
|
|
25
|
+
|
|
26
|
+
Args:
|
|
27
|
+
error_text: Full error/traceback string
|
|
28
|
+
|
|
29
|
+
Returns:
|
|
30
|
+
Clean error message, or original text if extraction fails
|
|
31
|
+
"""
|
|
32
|
+
if not error_text:
|
|
33
|
+
return "Unknown error"
|
|
23
34
|
|
|
24
|
-
|
|
25
|
-
{user_code}
|
|
35
|
+
lines = error_text.strip().split("\n")
|
|
26
36
|
|
|
27
|
-
#
|
|
28
|
-
|
|
37
|
+
# Look for common Python error patterns from the end
|
|
38
|
+
for line in reversed(lines):
|
|
39
|
+
line = line.strip()
|
|
40
|
+
# Match patterns like "NameError: ...", "ValueError: ...", etc.
|
|
41
|
+
if ": " in line and not line.startswith("File "):
|
|
42
|
+
# Check if it looks like an error line (ErrorType: message)
|
|
43
|
+
parts = line.split(": ", 1)
|
|
44
|
+
if parts[0].replace(".", "").replace("_", "").isalnum():
|
|
45
|
+
return line
|
|
29
46
|
|
|
30
|
-
#
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
except (ValueError, TypeError):
|
|
35
|
-
result = None
|
|
47
|
+
# Fallback: return last non-empty line
|
|
48
|
+
for line in reversed(lines):
|
|
49
|
+
if line.strip():
|
|
50
|
+
return line.strip()
|
|
36
51
|
|
|
37
|
-
|
|
38
|
-
print(json.dumps({{"result": result}}))
|
|
39
|
-
"""
|
|
52
|
+
return error_text[:200] if len(error_text) > 200 else error_text
|
|
40
53
|
|
|
41
54
|
|
|
42
55
|
class DaytonaRunner(CodeRunner):
|
|
@@ -57,7 +70,7 @@ class DaytonaRunner(CodeRunner):
|
|
|
57
70
|
return
|
|
58
71
|
|
|
59
72
|
self._initialized = True
|
|
60
|
-
self.daytona
|
|
73
|
+
self.daytona = None
|
|
61
74
|
self._validate_config()
|
|
62
75
|
|
|
63
76
|
def _validate_config(self) -> None:
|
|
@@ -77,6 +90,8 @@ class DaytonaRunner(CodeRunner):
|
|
|
77
90
|
return
|
|
78
91
|
|
|
79
92
|
try:
|
|
93
|
+
Daytona, DaytonaConfig, _, _ = _load_daytona()
|
|
94
|
+
|
|
80
95
|
# Get configuration with fallbacks
|
|
81
96
|
api_url = os.getenv("DAYTONA_API_URL") or "https://app.daytona.io/api"
|
|
82
97
|
api_key = os.getenv("DAYTONA_API_KEY")
|
|
@@ -92,26 +107,114 @@ class DaytonaRunner(CodeRunner):
|
|
|
92
107
|
except Exception as e:
|
|
93
108
|
raise RuntimeError(f"Failed to initialize Daytona client: {e}")
|
|
94
109
|
|
|
95
|
-
def
|
|
96
|
-
"""
|
|
110
|
+
def _get_provider_env_vars(self) -> Dict[str, str]:
|
|
111
|
+
"""
|
|
112
|
+
Fetch user secrets and extract standard provider keys as environment variables.
|
|
113
|
+
|
|
114
|
+
Returns:
|
|
115
|
+
Dictionary of environment variables for standard providers
|
|
116
|
+
"""
|
|
117
|
+
env_vars = {}
|
|
118
|
+
|
|
119
|
+
# Get secrets from context (set by vault middleware)
|
|
120
|
+
ctx = RunningContext.get()
|
|
121
|
+
secrets = getattr(ctx, "vault_secrets", [])
|
|
122
|
+
|
|
123
|
+
# Standard provider keys mapping
|
|
124
|
+
provider_env_mapping = {
|
|
125
|
+
"openai": "OPENAI_API_KEY",
|
|
126
|
+
"cohere": "COHERE_API_KEY",
|
|
127
|
+
"anyscale": "ANYSCALE_API_KEY",
|
|
128
|
+
"deepinfra": "DEEPINFRA_API_KEY",
|
|
129
|
+
"alephalpha": "ALEPHALPHA_API_KEY",
|
|
130
|
+
"groq": "GROQ_API_KEY",
|
|
131
|
+
"mistralai": "MISTRALAI_API_KEY",
|
|
132
|
+
"anthropic": "ANTHROPIC_API_KEY",
|
|
133
|
+
"perplexityai": "PERPLEXITYAI_API_KEY",
|
|
134
|
+
"togetherai": "TOGETHERAI_API_KEY",
|
|
135
|
+
"openrouter": "OPENROUTER_API_KEY",
|
|
136
|
+
"gemini": "GEMINI_API_KEY",
|
|
137
|
+
}
|
|
138
|
+
|
|
139
|
+
# Extract provider keys from secrets
|
|
140
|
+
for secret in secrets:
|
|
141
|
+
if secret.get("kind") == "provider_key":
|
|
142
|
+
secret_data = secret.get("data", {})
|
|
143
|
+
provider_kind = secret_data.get("kind")
|
|
144
|
+
|
|
145
|
+
if provider_kind in provider_env_mapping:
|
|
146
|
+
provider_settings = secret_data.get("provider", {})
|
|
147
|
+
api_key = provider_settings.get("key")
|
|
148
|
+
|
|
149
|
+
if api_key:
|
|
150
|
+
env_var_name = provider_env_mapping[provider_kind]
|
|
151
|
+
env_vars[env_var_name] = api_key
|
|
152
|
+
|
|
153
|
+
return env_vars
|
|
154
|
+
|
|
155
|
+
def _create_sandbox(self, runtime: Optional[str] = None) -> Any:
|
|
156
|
+
"""Create a new sandbox for this run from snapshot.
|
|
157
|
+
|
|
158
|
+
Args:
|
|
159
|
+
runtime: Runtime environment (python, javascript, typescript), None = python
|
|
160
|
+
"""
|
|
97
161
|
try:
|
|
98
162
|
if self.daytona is None:
|
|
99
163
|
raise RuntimeError("Daytona client not initialized")
|
|
100
164
|
|
|
101
|
-
|
|
165
|
+
# Normalize runtime: None means python
|
|
166
|
+
runtime = runtime or "python"
|
|
167
|
+
|
|
168
|
+
# Select general snapshot
|
|
169
|
+
snapshot_id = os.getenv("DAYTONA_SNAPSHOT")
|
|
102
170
|
|
|
103
171
|
if not snapshot_id:
|
|
104
172
|
raise RuntimeError(
|
|
105
|
-
"
|
|
106
|
-
"Set
|
|
173
|
+
f"No Daytona snapshot configured for runtime '{runtime}'. "
|
|
174
|
+
f"Set DAYTONA_SNAPSHOT environment variable."
|
|
107
175
|
)
|
|
108
176
|
|
|
109
|
-
|
|
177
|
+
_, _, _, CreateSandboxFromSnapshotParams = _load_daytona()
|
|
178
|
+
|
|
179
|
+
agenta_host = (
|
|
180
|
+
ag.DEFAULT_AGENTA_SINGLETON_INSTANCE.host
|
|
181
|
+
#
|
|
182
|
+
or ""
|
|
183
|
+
)
|
|
184
|
+
agenta_api_url = (
|
|
185
|
+
ag.DEFAULT_AGENTA_SINGLETON_INSTANCE.api_url
|
|
186
|
+
#
|
|
187
|
+
or ""
|
|
188
|
+
)
|
|
189
|
+
agenta_credentials = (
|
|
190
|
+
RunningContext.get().credentials
|
|
191
|
+
#
|
|
192
|
+
or ""
|
|
193
|
+
)
|
|
194
|
+
agenta_api_key = (
|
|
195
|
+
agenta_credentials[7:]
|
|
196
|
+
if agenta_credentials.startswith("ApiKey ")
|
|
197
|
+
else ""
|
|
198
|
+
)
|
|
199
|
+
|
|
200
|
+
# Get provider API keys from user secrets
|
|
201
|
+
provider_env_vars = self._get_provider_env_vars()
|
|
202
|
+
|
|
203
|
+
# Combine base env vars with provider keys
|
|
204
|
+
env_vars = {
|
|
205
|
+
"AGENTA_HOST": agenta_host,
|
|
206
|
+
"AGENTA_API_URL": agenta_api_url,
|
|
207
|
+
"AGENTA_API_KEY": agenta_api_key,
|
|
208
|
+
"AGENTA_CREDENTIALS": agenta_credentials,
|
|
209
|
+
**provider_env_vars, # Add provider API keys
|
|
210
|
+
}
|
|
110
211
|
|
|
111
212
|
sandbox = self.daytona.create(
|
|
112
213
|
CreateSandboxFromSnapshotParams(
|
|
113
214
|
snapshot=snapshot_id,
|
|
114
215
|
ephemeral=True,
|
|
216
|
+
env_vars=env_vars,
|
|
217
|
+
language=runtime,
|
|
115
218
|
)
|
|
116
219
|
)
|
|
117
220
|
|
|
@@ -120,6 +223,29 @@ class DaytonaRunner(CodeRunner):
|
|
|
120
223
|
except Exception as e:
|
|
121
224
|
raise RuntimeError(f"Failed to create sandbox from snapshot: {e}")
|
|
122
225
|
|
|
226
|
+
@contextmanager
|
|
227
|
+
def _sandbox_context(
|
|
228
|
+
self, runtime: Optional[str] = None
|
|
229
|
+
) -> Generator["Sandbox", None, None]:
|
|
230
|
+
"""Context manager for sandbox lifecycle.
|
|
231
|
+
|
|
232
|
+
Ensures sandbox is deleted even if an error occurs during execution.
|
|
233
|
+
|
|
234
|
+
Args:
|
|
235
|
+
runtime: Runtime environment (python, javascript, typescript), None = python
|
|
236
|
+
|
|
237
|
+
Yields:
|
|
238
|
+
Sandbox instance
|
|
239
|
+
"""
|
|
240
|
+
sandbox = self._create_sandbox(runtime=runtime)
|
|
241
|
+
try:
|
|
242
|
+
yield sandbox
|
|
243
|
+
finally:
|
|
244
|
+
try:
|
|
245
|
+
sandbox.delete()
|
|
246
|
+
except Exception as e:
|
|
247
|
+
log.error("Failed to delete sandbox: %s", e)
|
|
248
|
+
|
|
123
249
|
def run(
|
|
124
250
|
self,
|
|
125
251
|
code: str,
|
|
@@ -127,130 +253,123 @@ class DaytonaRunner(CodeRunner):
|
|
|
127
253
|
inputs: Dict[str, Any],
|
|
128
254
|
output: Union[dict, str],
|
|
129
255
|
correct_answer: Any,
|
|
256
|
+
runtime: Optional[str] = None,
|
|
257
|
+
templates: Optional[Dict[str, str]] = None,
|
|
130
258
|
) -> Union[float, None]:
|
|
131
259
|
"""
|
|
132
|
-
Execute provided
|
|
260
|
+
Execute provided code in Daytona sandbox.
|
|
133
261
|
|
|
134
262
|
The code must define an `evaluate()` function that takes
|
|
135
263
|
(app_params, inputs, output, correct_answer) and returns a float (0-1).
|
|
136
264
|
|
|
137
265
|
Args:
|
|
138
|
-
code: The
|
|
266
|
+
code: The code to be executed
|
|
139
267
|
app_params: The parameters of the app variant
|
|
140
268
|
inputs: Inputs to be used during code execution
|
|
141
269
|
output: The output of the app variant after being called
|
|
142
270
|
correct_answer: The correct answer (or target) for comparison
|
|
271
|
+
runtime: Runtime environment (python, javascript, typescript), None = python
|
|
272
|
+
templates: Wrapper templates keyed by runtime.
|
|
143
273
|
|
|
144
274
|
Returns:
|
|
145
275
|
Float score between 0 and 1, or None if execution fails
|
|
146
276
|
"""
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
try:
|
|
151
|
-
# Prepare all parameters as a single dict
|
|
152
|
-
params = {
|
|
153
|
-
"app_params": app_params,
|
|
154
|
-
"inputs": inputs,
|
|
155
|
-
"output": output,
|
|
156
|
-
"correct_answer": correct_answer,
|
|
157
|
-
}
|
|
158
|
-
params_json = json.dumps(params)
|
|
159
|
-
|
|
160
|
-
# Wrap the user code with the necessary context and evaluation
|
|
161
|
-
wrapped_code = EVALUATION_CODE_TEMPLATE.format(
|
|
162
|
-
params_json=params_json,
|
|
163
|
-
user_code=code,
|
|
164
|
-
)
|
|
165
|
-
|
|
166
|
-
# Log the input parameters for debugging
|
|
167
|
-
# log.debug("Input parameters to evaluation:")
|
|
168
|
-
# print("\n" + "=" * 80)
|
|
169
|
-
# print("INPUT PARAMETERS:")
|
|
170
|
-
# print("=" * 80)
|
|
171
|
-
# print(f"app_params: {app_params}")
|
|
172
|
-
# print(f"inputs: {inputs}")
|
|
173
|
-
# print(f"output: {output}")
|
|
174
|
-
# print(f"correct_answer: {correct_answer}")
|
|
175
|
-
# print("=" * 80 + "\n")
|
|
176
|
-
|
|
177
|
-
# Log the generated code for debugging
|
|
178
|
-
# log.debug("Generated code to send to Daytona:")
|
|
179
|
-
# print("=" * 80)
|
|
180
|
-
# print("GENERATED CODE TO SEND TO DAYTONA:")
|
|
181
|
-
# print("=" * 80)
|
|
182
|
-
# code_lines = wrapped_code.split("\n")
|
|
183
|
-
# for i, line in enumerate(code_lines, 1):
|
|
184
|
-
# log.debug(f" {i:3d}: {line}")
|
|
185
|
-
# print(f" {i:3d}: {line}")
|
|
186
|
-
# print("=" * 80)
|
|
187
|
-
# print(f"Total lines: {len(code_lines)}")
|
|
188
|
-
# print("=" * 80 + "\n")
|
|
189
|
-
|
|
190
|
-
# Callback functions to capture output and errors
|
|
191
|
-
stdout_lines = []
|
|
192
|
-
stderr_lines = []
|
|
193
|
-
|
|
194
|
-
def on_stdout(line: str) -> None:
|
|
195
|
-
"""Capture stdout output."""
|
|
196
|
-
# log.debug(f"[STDOUT] {line}")
|
|
197
|
-
# print(f"[STDOUT] {line}")
|
|
198
|
-
stdout_lines.append(line)
|
|
199
|
-
|
|
200
|
-
def on_stderr(line: str) -> None:
|
|
201
|
-
"""Capture stderr output."""
|
|
202
|
-
# log.warning(f"[STDERR] {line}")
|
|
203
|
-
# print(f"[STDERR] {line}")
|
|
204
|
-
stderr_lines.append(line)
|
|
205
|
-
|
|
206
|
-
def on_error(error: Exception) -> None:
|
|
207
|
-
"""Capture errors."""
|
|
208
|
-
log.error(f"[ERROR] {type(error).__name__}: {error}")
|
|
209
|
-
# print(f"[ERROR] {type(error).__name__}: {error}")
|
|
210
|
-
|
|
211
|
-
# Execute the code in the Daytona sandbox
|
|
212
|
-
# log.debug("Executing code in Daytona sandbox")
|
|
213
|
-
response = sandbox.code_interpreter.run_code(
|
|
214
|
-
wrapped_code,
|
|
215
|
-
on_stdout=on_stdout,
|
|
216
|
-
on_stderr=on_stderr,
|
|
217
|
-
on_error=on_error,
|
|
218
|
-
)
|
|
219
|
-
|
|
220
|
-
# log.debug(f"Raw response: {response}")
|
|
221
|
-
# print(f"Raw response: {response}")
|
|
277
|
+
# Normalize runtime: None means python
|
|
278
|
+
runtime = runtime or "python"
|
|
222
279
|
|
|
223
|
-
|
|
224
|
-
# Response has stdout, stderr, and error fields
|
|
225
|
-
response_stdout = response.stdout if hasattr(response, "stdout") else ""
|
|
226
|
-
response_error = response.error if hasattr(response, "error") else None
|
|
280
|
+
self._initialize_client()
|
|
227
281
|
|
|
228
|
-
|
|
282
|
+
with self._sandbox_context(runtime=runtime) as sandbox:
|
|
283
|
+
try:
|
|
284
|
+
# Prepare all parameters as a single dict
|
|
285
|
+
params = {
|
|
286
|
+
"app_params": app_params,
|
|
287
|
+
"inputs": inputs,
|
|
288
|
+
"output": output,
|
|
289
|
+
"correct_answer": correct_answer,
|
|
290
|
+
}
|
|
291
|
+
params_json = json.dumps(params)
|
|
292
|
+
|
|
293
|
+
if not templates:
|
|
294
|
+
raise RuntimeError(
|
|
295
|
+
"Missing evaluator templates for Daytona execution"
|
|
296
|
+
)
|
|
297
|
+
|
|
298
|
+
template = templates.get(runtime)
|
|
299
|
+
if template is None:
|
|
300
|
+
raise RuntimeError(
|
|
301
|
+
f"Missing evaluator template for runtime '{runtime}'"
|
|
302
|
+
)
|
|
303
|
+
|
|
304
|
+
# Wrap the user code with the necessary context and evaluation
|
|
305
|
+
wrapped_code = template.format(
|
|
306
|
+
params_json=params_json,
|
|
307
|
+
user_code=code,
|
|
308
|
+
)
|
|
229
309
|
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
310
|
+
# Execute the code in the Daytona sandbox
|
|
311
|
+
response = sandbox.process.code_run(wrapped_code)
|
|
312
|
+
response_stdout = response.result if hasattr(response, "result") else ""
|
|
313
|
+
response_exit_code = getattr(response, "exit_code", 0)
|
|
314
|
+
response_error = getattr(response, "error", None) or getattr(
|
|
315
|
+
response, "stderr", None
|
|
316
|
+
)
|
|
233
317
|
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
318
|
+
if response_exit_code and response_exit_code != 0:
|
|
319
|
+
raw_error = response_error or response_stdout or "Unknown error"
|
|
320
|
+
# Log full error for debugging
|
|
321
|
+
# log.warning(
|
|
322
|
+
# "Sandbox execution error (exit_code=%s): %s",
|
|
323
|
+
# response_exit_code,
|
|
324
|
+
# raw_error,
|
|
325
|
+
# )
|
|
326
|
+
# Extract clean error message for user display
|
|
327
|
+
clean_error = _extract_error_message(raw_error)
|
|
328
|
+
raise RuntimeError(clean_error)
|
|
329
|
+
|
|
330
|
+
# Parse the result from stdout
|
|
331
|
+
output_lines = response_stdout.strip().split("\n")
|
|
332
|
+
for line in reversed(output_lines):
|
|
333
|
+
if not line.strip():
|
|
334
|
+
continue
|
|
335
|
+
try:
|
|
336
|
+
result_obj = json.loads(line)
|
|
337
|
+
if isinstance(result_obj, dict) and "result" in result_obj:
|
|
338
|
+
result = result_obj["result"]
|
|
339
|
+
if isinstance(result, (float, int, type(None))):
|
|
340
|
+
return float(result) if result is not None else None
|
|
341
|
+
except json.JSONDecodeError:
|
|
342
|
+
continue
|
|
343
|
+
|
|
344
|
+
# Fallback: attempt to extract a JSON object containing "result"
|
|
345
|
+
for line in reversed(output_lines):
|
|
346
|
+
if "result" not in line:
|
|
347
|
+
continue
|
|
348
|
+
start = line.find("{")
|
|
349
|
+
end = line.rfind("}")
|
|
350
|
+
if start == -1 or end == -1 or end <= start:
|
|
351
|
+
continue
|
|
352
|
+
try:
|
|
353
|
+
result_obj = json.loads(line[start : end + 1])
|
|
354
|
+
except json.JSONDecodeError:
|
|
355
|
+
continue
|
|
241
356
|
if isinstance(result_obj, dict) and "result" in result_obj:
|
|
242
357
|
result = result_obj["result"]
|
|
243
358
|
if isinstance(result, (float, int, type(None))):
|
|
244
359
|
return float(result) if result is not None else None
|
|
245
|
-
except json.JSONDecodeError:
|
|
246
|
-
continue
|
|
247
360
|
|
|
248
|
-
|
|
361
|
+
# log.warning(
|
|
362
|
+
# "Evaluation output did not include JSON result: %s", response_stdout
|
|
363
|
+
# )
|
|
364
|
+
raise ValueError(
|
|
365
|
+
"Could not parse evaluation result from Daytona output"
|
|
366
|
+
)
|
|
249
367
|
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
368
|
+
except Exception as e:
|
|
369
|
+
# log.warning(
|
|
370
|
+
# f"Error during Daytona code execution:\n {e}", exc_info=True
|
|
371
|
+
# )
|
|
372
|
+
raise RuntimeError(e)
|
|
254
373
|
|
|
255
374
|
def cleanup(self) -> None:
|
|
256
375
|
"""Clean up Daytona client resources."""
|
|
@@ -1,20 +1,10 @@
|
|
|
1
|
-
from typing import Any, Dict, Union,
|
|
2
|
-
|
|
3
|
-
from RestrictedPython import safe_builtins, compile_restricted, utility_builtins
|
|
4
|
-
from RestrictedPython.Eval import (
|
|
5
|
-
default_guarded_getiter,
|
|
6
|
-
default_guarded_getitem,
|
|
7
|
-
)
|
|
8
|
-
from RestrictedPython.Guards import (
|
|
9
|
-
guarded_iter_unpack_sequence,
|
|
10
|
-
full_write_guard,
|
|
11
|
-
)
|
|
1
|
+
from typing import Any, Dict, Union, Optional
|
|
12
2
|
|
|
13
3
|
from agenta.sdk.workflows.runners.base import CodeRunner
|
|
14
4
|
|
|
15
5
|
|
|
16
6
|
class LocalRunner(CodeRunner):
|
|
17
|
-
"""Local code runner using
|
|
7
|
+
"""Local code runner using direct Python execution."""
|
|
18
8
|
|
|
19
9
|
def run(
|
|
20
10
|
self,
|
|
@@ -23,9 +13,11 @@ class LocalRunner(CodeRunner):
|
|
|
23
13
|
inputs: Dict[str, Any],
|
|
24
14
|
output: Union[dict, str],
|
|
25
15
|
correct_answer: Any,
|
|
16
|
+
runtime: Optional[str] = None,
|
|
17
|
+
templates: Optional[Dict[str, str]] = None,
|
|
26
18
|
) -> Union[float, None]:
|
|
27
19
|
"""
|
|
28
|
-
Execute provided Python code
|
|
20
|
+
Execute provided Python code directly.
|
|
29
21
|
|
|
30
22
|
Args:
|
|
31
23
|
code: The Python code to be executed
|
|
@@ -33,55 +25,29 @@ class LocalRunner(CodeRunner):
|
|
|
33
25
|
inputs: Inputs to be used during code execution
|
|
34
26
|
output: The output of the app variant after being called
|
|
35
27
|
correct_answer: The correct answer (or target) for comparison
|
|
36
|
-
|
|
28
|
+
runtime: Runtime environment (only "python" is supported for local runner)
|
|
29
|
+
templates: Wrapper templates keyed by runtime (unused for local runner).
|
|
37
30
|
|
|
38
31
|
Returns:
|
|
39
32
|
Float score between 0 and 1, or None if execution fails
|
|
40
33
|
"""
|
|
41
|
-
#
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
#
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
"typing",
|
|
55
|
-
]
|
|
56
|
-
|
|
57
|
-
# Create a dictionary to simulate allowed imports
|
|
58
|
-
allowed_modules = {}
|
|
59
|
-
for package_name in allowed_imports:
|
|
60
|
-
allowed_modules[package_name] = __import__(package_name)
|
|
61
|
-
|
|
62
|
-
# Add the allowed modules to the local built-ins
|
|
63
|
-
local_builtins.update(allowed_modules)
|
|
64
|
-
local_builtins.update(utility_builtins)
|
|
65
|
-
|
|
66
|
-
# Define the environment for the code execution
|
|
67
|
-
environment = {
|
|
68
|
-
"_getiter_": default_guarded_getiter,
|
|
69
|
-
"_getitem_": default_guarded_getitem,
|
|
70
|
-
"_iter_unpack_sequence_": guarded_iter_unpack_sequence,
|
|
71
|
-
"_write_": full_write_guard,
|
|
72
|
-
"__builtins__": local_builtins,
|
|
73
|
-
}
|
|
74
|
-
|
|
75
|
-
# Compile the code in a restricted environment
|
|
76
|
-
byte_code = compile_restricted(code, filename="<inline>", mode="exec")
|
|
77
|
-
|
|
78
|
-
# Call the evaluation function, extract the result if it exists
|
|
79
|
-
# and is a float between 0 and 1
|
|
34
|
+
# Normalize runtime: None means python
|
|
35
|
+
runtime = runtime or "python"
|
|
36
|
+
|
|
37
|
+
# Local runner only supports Python
|
|
38
|
+
if runtime != "python":
|
|
39
|
+
raise ValueError(
|
|
40
|
+
f"LocalRunner only supports 'python' runtime, got: {runtime}"
|
|
41
|
+
)
|
|
42
|
+
|
|
43
|
+
# Define the environment for code execution
|
|
44
|
+
environment: dict[str, Any] = dict()
|
|
45
|
+
|
|
46
|
+
# Execute the code directly
|
|
80
47
|
try:
|
|
81
|
-
|
|
82
|
-
exec(byte_code, environment)
|
|
48
|
+
exec(code, environment)
|
|
83
49
|
|
|
84
|
-
# Call the evaluation function
|
|
50
|
+
# Call the evaluation function
|
|
85
51
|
result = environment["evaluate"](app_params, inputs, output, correct_answer)
|
|
86
52
|
|
|
87
53
|
# Attempt to convert result to float
|
|
@@ -19,7 +19,7 @@ def get_runner() -> CodeRunner:
|
|
|
19
19
|
Registry to get the appropriate code runner based on environment configuration.
|
|
20
20
|
|
|
21
21
|
Uses AGENTA_SERVICES_SANDBOX_RUNNER environment variable:
|
|
22
|
-
- "local" (default): Uses
|
|
22
|
+
- "local" (default): Uses current container for local execution
|
|
23
23
|
- "daytona": Uses Daytona remote sandbox
|
|
24
24
|
|
|
25
25
|
Returns:
|
agenta/sdk/workflows/sandbox.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
from typing import Union, Text, Dict, Any
|
|
1
|
+
from typing import Union, Text, Dict, Any, Optional
|
|
2
2
|
|
|
3
3
|
from agenta.sdk.workflows.runners import get_runner
|
|
4
4
|
|
|
@@ -29,11 +29,13 @@ def execute_code_safely(
|
|
|
29
29
|
output: Union[dict, str],
|
|
30
30
|
correct_answer: Any, # for backward compatibility reasons
|
|
31
31
|
code: Text,
|
|
32
|
+
runtime: Optional[str] = None,
|
|
33
|
+
templates: Optional[Dict[str, str]] = None,
|
|
32
34
|
) -> Union[float, None]:
|
|
33
35
|
"""
|
|
34
|
-
Execute the provided
|
|
36
|
+
Execute the provided code safely.
|
|
35
37
|
|
|
36
|
-
Uses the configured runner (local
|
|
38
|
+
Uses the configured runner (local or remote Daytona)
|
|
37
39
|
based on the AGENTA_SERVICES_SANDBOX_RUNNER environment variable.
|
|
38
40
|
|
|
39
41
|
Args:
|
|
@@ -41,7 +43,9 @@ def execute_code_safely(
|
|
|
41
43
|
- inputs (Dict[str, Any]): Inputs to be used during code execution.
|
|
42
44
|
- output (Union[dict, str]): The output of the app variant after being called.
|
|
43
45
|
- correct_answer (Any): The correct answer (or target) of the app variant.
|
|
44
|
-
- code (Text): The
|
|
46
|
+
- code (Text): The code to be executed.
|
|
47
|
+
- runtime (Optional[str]): Runtime environment (python, javascript, typescript). None = python.
|
|
48
|
+
- templates (Optional[Dict[str, str]]): Wrapper templates keyed by runtime.
|
|
45
49
|
|
|
46
50
|
Returns:
|
|
47
51
|
- (float): Result of the execution if successful. Should be between 0 and 1.
|
|
@@ -52,4 +56,12 @@ def execute_code_safely(
|
|
|
52
56
|
if _runner is None:
|
|
53
57
|
_runner = get_runner()
|
|
54
58
|
|
|
55
|
-
return _runner.run(
|
|
59
|
+
return _runner.run(
|
|
60
|
+
code,
|
|
61
|
+
app_params,
|
|
62
|
+
inputs,
|
|
63
|
+
output,
|
|
64
|
+
correct_answer,
|
|
65
|
+
runtime,
|
|
66
|
+
templates,
|
|
67
|
+
)
|