synth-ai 0.1.0.dev4__py3-none-any.whl → 0.1.0.dev6__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only.
Potentially problematic release.
This version of synth-ai might be problematic.
- public_tests/synth_sdk.py +389 -0
- public_tests/test_agent.py +538 -0
- public_tests/test_recursive_structured_outputs.py +180 -0
- public_tests/test_structured_outputs.py +100 -0
- synth_ai/zyk/lms/__init__.py +0 -0
- synth_ai/zyk/lms/caching/__init__.py +0 -0
- synth_ai/zyk/lms/caching/constants.py +1 -0
- synth_ai/zyk/lms/caching/dbs.py +0 -0
- synth_ai/zyk/lms/caching/ephemeral.py +50 -0
- synth_ai/zyk/lms/caching/handler.py +92 -0
- synth_ai/zyk/lms/caching/initialize.py +13 -0
- synth_ai/zyk/lms/caching/persistent.py +55 -0
- synth_ai/zyk/lms/config.py +8 -0
- synth_ai/zyk/lms/core/__init__.py +0 -0
- synth_ai/zyk/lms/core/all.py +35 -0
- synth_ai/zyk/lms/core/exceptions.py +9 -0
- synth_ai/zyk/lms/core/main.py +245 -0
- synth_ai/zyk/lms/core/vendor_clients.py +60 -0
- synth_ai/zyk/lms/cost/__init__.py +0 -0
- synth_ai/zyk/lms/cost/monitor.py +1 -0
- synth_ai/zyk/lms/cost/statefulness.py +1 -0
- synth_ai/zyk/lms/structured_outputs/__init__.py +0 -0
- synth_ai/zyk/lms/structured_outputs/handler.py +388 -0
- synth_ai/zyk/lms/structured_outputs/inject.py +185 -0
- synth_ai/zyk/lms/structured_outputs/rehabilitate.py +186 -0
- synth_ai/zyk/lms/vendors/__init__.py +0 -0
- synth_ai/zyk/lms/vendors/base.py +15 -0
- synth_ai/zyk/lms/vendors/constants.py +5 -0
- synth_ai/zyk/lms/vendors/core/__init__.py +0 -0
- synth_ai/zyk/lms/vendors/core/anthropic_api.py +191 -0
- synth_ai/zyk/lms/vendors/core/gemini_api.py +146 -0
- synth_ai/zyk/lms/vendors/core/openai_api.py +145 -0
- synth_ai/zyk/lms/vendors/local/__init__.py +0 -0
- synth_ai/zyk/lms/vendors/local/ollama.py +0 -0
- synth_ai/zyk/lms/vendors/openai_standard.py +141 -0
- synth_ai/zyk/lms/vendors/retries.py +3 -0
- synth_ai/zyk/lms/vendors/supported/__init__.py +0 -0
- synth_ai/zyk/lms/vendors/supported/deepseek.py +18 -0
- synth_ai/zyk/lms/vendors/supported/together.py +11 -0
- {synth_ai-0.1.0.dev4.dist-info → synth_ai-0.1.0.dev6.dist-info}/METADATA +1 -1
- synth_ai-0.1.0.dev6.dist-info/RECORD +46 -0
- synth_ai-0.1.0.dev6.dist-info/top_level.txt +2 -0
- synth_ai-0.1.0.dev4.dist-info/RECORD +0 -7
- synth_ai-0.1.0.dev4.dist-info/top_level.txt +0 -1
- {synth_ai-0.1.0.dev4.dist-info → synth_ai-0.1.0.dev6.dist-info}/LICENSE +0 -0
- {synth_ai-0.1.0.dev4.dist-info → synth_ai-0.1.0.dev6.dist-info}/WHEEL +0 -0
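Only the first file listed above, public_tests/synth_sdk.py (+389 lines), has its contents rendered in this diff; the reconstructed hunk follows below. Judging from how that test drives the new synth_ai.zyk package, the main entry point is the LM class. The sketch below distills the usage pattern the test exercises; the LM constructor and respond_async keyword arguments are inferred from test usage in this diff rather than from any documented synth-ai API, and the Answer model is a hypothetical stand-in for the Plan/Solution models in the test.

import asyncio

from pydantic import BaseModel

from synth_ai.zyk import LM


class Answer(BaseModel):
    # Hypothetical response schema, mirroring the Plan/Solution models in the test.
    content: str


async def main() -> None:
    # Constructor arguments mirror TrivialHendryksMathAgent.__init__ in the hunk below.
    lm = LM(
        model_name="gpt-4o-mini",
        formatting_model_name="gpt-4o-mini",
        temperature=0.1,
        synth_logging=True,
    )
    # respond_async pairs a system/user message with a Pydantic response_model
    # and (per the test's usage) returns an instance of that model.
    response = await lm.respond_async(
        system_message="You are a concise assistant.",
        user_message="What is 2 + 2? Reply with just the number.",
        response_model=Answer,
    )
    print(response.content)


if __name__ == "__main__":
    asyncio.run(main())

The structured_outputs/ modules added in this release (handler.py, inject.py, rehabilitate.py) presumably implement the coercion of model output into the supplied response_model.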
@@ -0,0 +1,389 @@ public_tests/synth_sdk.py (new file)

# from dev.testing.hendryks import HendryksMathBenchmark, TrivialHendryksMathAgent
import asyncio
import json
import logging
import os
import re
import sys
import time
import uuid
from typing import Dict, List

import pytest
from datasets import load_dataset
from dotenv import load_dotenv
from pydantic import BaseModel
from synth_sdk.tracing.abstractions import (
    Dataset,
    RewardSignal,
    TrainingQuestion,
)
from synth_sdk.tracing.client_manager import ClientManager
from synth_sdk.tracing.decorators import get_tracing_config, trace_system_async
from synth_sdk.tracing.upload import upload
from synth_sdk.tracing.utils import get_system_id

from synth_ai.zyk import LM


class HendryksMathBenchmark:
    def __init__(self):
        self.name = "hendryks_math"
        self.temp_dir = "temp"
        os.makedirs(self.temp_dir, exist_ok=True)
        os.makedirs("datasets/competition_math", exist_ok=True)

    def load_data(self):
        cache_path = "datasets/competition_math/dataset.json"

        # Try to load from cache first
        if os.path.exists(cache_path):
            with open(cache_path, "r") as f:
                dataset = json.load(f)
            problems = []
            for item in dataset["train"]:  # Using train split for consistency
                problem = {
                    "question": item["problem"],
                    "answer": item["solution"],
                    "subject": item.get("type", "unknown"),
                    "level": "competition",  # All problems are competition level
                }
                problems.append(problem)
            return problems

        # If not cached, load from HF and cache
        dataset = load_dataset("competition_math", "main")
        with open(cache_path, "w") as f:
            json.dump(
                {"train": list(dataset["train"]), "test": list(dataset["test"])}, f
            )

        # Convert to our format
        problems = []
        for item in dataset["train"]:
            problem = {
                "question": item["problem"],
                "answer": item["solution"],
                "subject": item.get("type", "unknown"),
                "level": "competition",
            }
            problems.append(problem)

        return problems

    def get_problems(self):
        temp_path = os.path.join(self.temp_dir, "hendryks_math.json")

        # Load from temp file if it exists
        if os.path.exists(temp_path):
            with open(temp_path, "r") as f:
                return json.load(f)

        # Otherwise load from dataset and save
        problems = self.load_data()
        with open(temp_path, "w") as f:
            json.dump(problems, f)
        return problems

    def score_answer(self, question: str, proposed_answer: str) -> bool:
        """Score a proposed answer against the correct answer for a given question."""
        # Find the problem that matches the question
        problems = self.get_problems()
        matching_problem = next(
            (p for p in problems if p["question"] == question), None
        )

        if not matching_problem:
            raise ValueError("Question not found in benchmark")

        # Extract answer from proposed solution's \boxed{} format
        proposed_match = re.search(r"\\boxed{((?:[^{}]|{[^{}]*})*)}", proposed_answer)
        if not proposed_match:
            return False

        # Extract answer from correct solution's \boxed{} format
        correct_match = re.search(
            r"\\boxed{((?:[^{}]|{[^{}]*})*)}", matching_problem["answer"]
        )
        if not correct_match:
            return False

        extracted_proposed = proposed_match.group(1).strip()
        extracted_correct = correct_match.group(1).strip()

        # print(f"Proposed answer: {extracted_proposed}")
        # print(f"Correct answer: {extracted_correct}")

        return extracted_proposed == extracted_correct


class TrivialHendryksMathAgent:
    def __init__(self):
        self.lm = LM(  # gemini-1.5-flash
            model_name="gpt-4o-mini",
            formatting_model_name="gpt-4o-mini",
            temperature=0.1,
            synth_logging=True,
        )
        self.system_name = "HendryksMathAgent"
        self.system_id = get_system_id(self.system_name)
        self.system_instance_id = str(uuid.uuid4())

    @trace_system_async(
        origin="agent",
        event_type="plan",
        manage_event="create_and_end",
        increment_partition=True,
        verbose=True,
    )
    async def plan(self, math_question: str) -> str:
        logger.debug("Starting plan method with trace decorator")
        try:

            class Plan(BaseModel):
                content: str

            response = await self.lm.respond_async(
                system_message="""You are an AI assisting a colleague in completing a mathematics problem.
You will be given a mathematics problem statement. Your task is to create a detailed plan to solve the problem,
breaking it down into clear, logical steps.""",
                user_message=f"""Please provide a detailed, step-by-step plan to solve this math problem:
{math_question}

Your plan should include:
1. A clear statement of the given information and problem to be solved
2. Identification of relevant mathematical concepts and techniques
3. Definition of variables and known relationships
4. A step-by-step approach to solving the problem
5. Explanation of the reasoning behind each step""",
                response_model=Plan,
            )
            logger.debug("Successfully got response from LM in plan method")
            return response.content
        except Exception as e:
            logger.error(f"Error in plan method: {str(e)}", exc_info=True)
            raise

    @trace_system_async(
        origin="agent",
        event_type="execute",
        manage_event="create_and_end",
        increment_partition=True,
        verbose=True,
    )
    async def execute(self, plan: str) -> str:
        logger.debug("Starting execute method with trace decorator")
        try:

            class Solution(BaseModel):
                content: str

            response = await self.lm.respond_async(
                system_message="""You are an AI mathematical problem-solving assistant.
You will be given a solution plan. Your task is to implement this plan,
showing all work and verifying correctness at each step.""",
                user_message=f"""
Plan:
{plan}

Please solve this problem by carefully following the provided plan. Show all your work and calculations.
Leave your final answer at the very end in the format \\boxed{{answer}}.""",
                response_model=Solution,
            )
            logger.debug("Successfully got response from LM in execute method")
            return response.content
        except Exception as e:
            logger.error(f"Error in execute method: {str(e)}", exc_info=True)
            raise

    async def run(self, math_question: str) -> str:
        logger.debug("Starting run method")
        plan = await self.plan(math_question)
        logger.debug("Completed plan method")
        solution = await self.execute(plan)
        logger.debug("Completed execute method")
        return solution


# Configure logging
logging.basicConfig(
    level=logging.ERROR, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)

# Add logging for trace decorator
trace_logger = logging.getLogger("synth_sdk.tracing.decorators")
trace_logger.setLevel(logging.ERROR)

# Add logging for client manager
client_logger = logging.getLogger("synth_sdk.tracing.client_manager")
client_logger.setLevel(logging.ERROR)

load_dotenv()


async def setup_synth_config():
    """Setup synth configuration for deferred logging."""
    logger.info("Setting up synth configuration for deferred logging")
    os.environ["SYNTH_LOGGING_MODE"] = "deferred"
    os.environ["SYNTH_ENDPOINT_OVERRIDE"] = "https://agent-learning.onrender.com"
    config = get_tracing_config()
    ClientManager.initialize(config)
    logger.info("Synth config:")
    logger.info(f"  Mode: {config.mode}")
    logger.info(f"  API Key present: {bool(config.api_key)}")
    logger.info(f"  Base URL: {config.base_url}")


@pytest.mark.asyncio
async def test_deferred_logging():
    """Test deferred logging with both pytest and regular assertions."""
    logger.info("=== STARTING DEFERRED LOGGING TEST ===")
    start_time = time.time()
    logger.info(f"Test start time: {start_time}")

    # Determine if running under pytest
    is_pytest = "pytest" in sys.modules

    try:
        await setup_synth_config()

        # Initialize and run agent
        benchmark = HendryksMathBenchmark()
        agent = TrivialHendryksMathAgent()
        logger.info(f"Agent system ID: {agent.system_id}")
        logger.info(f"Agent system instance ID: {agent.system_instance_id}")

        problems = benchmark.get_problems()
        test_problem = problems[0]["question"]
        logger.info(f"Using test problem: {test_problem}")

        # Run the agent
        logger.info("Running agent...")
        solution = await agent.run(test_problem)
        logger.info(f"Agent solution: {solution}")

        # Create dataset and upload results
        logger.info("Creating dataset and uploading results...")
        dataset = Dataset(
            questions=[
                TrainingQuestion(
                    id="q0",
                    intent="Test math problem",
                    criteria="Testing deferred tracing and upload functionality",
                )
            ],
            reward_signals=[
                RewardSignal(
                    question_id="q0",
                    system_instance_id=agent.system_instance_id,
                    reward=1.0,
                    annotation="Test reward",
                )
            ],
        )

        # Upload the dataset and traces
        logger.info("Starting upload process...")
        upload_id, questions_json, reward_signals_json, traces_json = upload(
            dataset=dataset
        )

        logger.info(f"Upload completed with ID: {upload_id}")
        logger.debug(f"Number of traces: {len(traces_json)}")
        print(traces_json)

        # Verify upload results
        if is_pytest:
            assert upload_id
            assert questions_json
            assert reward_signals_json
            assert traces_json
        else:
            assert upload_id, "Upload ID should not be empty"
            assert questions_json, "Questions JSON should not be empty"
            assert reward_signals_json, "Reward signals JSON should not be empty"
            assert traces_json, "Traces JSON should not be empty"

        # Verify trace content
        for i, trace in enumerate(traces_json):
            logger.debug(f"Verifying trace {i}:")
            verify_trace_content(trace, is_pytest)

        logger.info("All traces verified successfully!")
        return True

    except AssertionError as e:
        logger.error(f"Test failed: {str(e)}")
        if is_pytest:
            raise
        return False
    except Exception as e:
        logger.error(f"Unexpected error: {str(e)}", exc_info=True)
        if is_pytest:
            raise
        return False


def verify_trace_content(trace: dict, is_pytest: bool = False) -> None:
    """Verify the content of a trace."""
    if is_pytest:
        assert trace["system_instance_id"]
    else:
        assert trace["system_instance_id"], "Trace missing system_instance_id"

    # Verify events were captured
    has_events = False
    for partition in trace["partition"]:
        if len(partition["events"]) > 0:
            has_events = True
            for event in partition["events"]:
                logger.debug(f"Checking event: {json.dumps(event, indent=2)}")
                if "agent_compute_step" in event:
                    step = event["agent_compute_step"]
                    logger.debug(f"Checking compute step: {json.dumps(step, indent=2)}")
                    if is_pytest:
                        assert step.get("model_name") is not None
                        assert step.get("model_name") != ""
                    else:
                        assert (
                            step.get("model_name") is not None
                        ), "Model name is missing"
                        assert step.get("model_name") != "", "Model name is empty"

                    if step.get("compute_input"):
                        for input_item in step["compute_input"]:
                            if is_pytest:
                                assert "messages" in input_item, input_item.keys()
                            else:
                                assert "messages" in input_item, (
                                    f"Input must have 'messages' key, but found keys: {list(input_item.keys())}"
                                    f"\nFull input: {json.dumps(input_item, indent=2)}"
                                )
                            messages = input_item["messages"]
                            if is_pytest:
                                assert isinstance(messages, list)
                                assert len(messages) == 2
                            else:
                                assert isinstance(
                                    messages, list
                                ), "Messages must be a list"
                                assert len(messages) == 2, (
                                    f"Expected exactly 2 messages (system and user), but found {len(messages)}"
                                    f"\nMessages: {json.dumps(messages, indent=2)}"
                                )
            break

    if is_pytest:
        assert has_events
    else:
        assert (
            has_events
        ), f"At least one partition should contain events - {trace['partition']}"


if __name__ == "__main__":
    # Remove the pytest check so the test always runs
    success = asyncio.run(test_deferred_logging())
    print("✅ All tests passed!" if success else "❌ Tests failed!")
    exit(0 if success else 1)
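A note on the scoring logic: HendryksMathBenchmark.score_answer compares answers by extracting the contents of \boxed{...} from both the proposed and reference solutions with a regex that tolerates one level of nested braces. A standalone sketch of that extraction, with the regex copied verbatim from the file above and the sample strings invented for illustration:

import re

# Regex from score_answer: matches \boxed{...}, allowing one level of
# nested braces inside the box (e.g. \boxed{\frac{1}{2}}).
BOXED = r"\\boxed{((?:[^{}]|{[^{}]*})*)}"

samples = [
    r"The answer is \boxed{42}.",    # simple content -> "42"
    r"So x = \boxed{\frac{1}{2}}.",  # one nesting level -> "\frac{1}{2}"
    r"No boxed answer here.",        # no match -> None (score_answer returns False)
]
for s in samples:
    m = re.search(BOXED, s)
    print(m.group(1).strip() if m else None)

Because the inner alternation only admits one level of braces, a doubly nested answer such as \boxed{\frac{\sqrt{2}}{2}} would be truncated at the first unmatchable brace, so exact string equality of the extracted spans is a conservative scoring rule.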