hud-python 0.5.1__py3-none-any.whl → 0.5.13__py3-none-any.whl

This diff shows the content changes between publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (67)
  1. hud/__init__.py +1 -1
  2. hud/agents/__init__.py +65 -6
  3. hud/agents/base.py +33 -15
  4. hud/agents/claude.py +60 -31
  5. hud/agents/gateway.py +42 -0
  6. hud/agents/gemini.py +15 -26
  7. hud/agents/gemini_cua.py +6 -17
  8. hud/agents/misc/response_agent.py +7 -0
  9. hud/agents/openai.py +16 -29
  10. hud/agents/openai_chat.py +3 -19
  11. hud/agents/operator.py +5 -17
  12. hud/agents/resolver.py +70 -0
  13. hud/agents/tests/test_claude.py +2 -4
  14. hud/agents/tests/test_openai.py +2 -1
  15. hud/agents/tests/test_resolver.py +192 -0
  16. hud/agents/types.py +148 -0
  17. hud/cli/__init__.py +34 -3
  18. hud/cli/build.py +37 -5
  19. hud/cli/dev.py +11 -2
  20. hud/cli/eval.py +51 -39
  21. hud/cli/flows/init.py +1 -1
  22. hud/cli/pull.py +1 -1
  23. hud/cli/push.py +9 -2
  24. hud/cli/tests/test_build.py +2 -2
  25. hud/cli/tests/test_push.py +1 -1
  26. hud/cli/utils/metadata.py +1 -1
  27. hud/cli/utils/tests/test_metadata.py +1 -1
  28. hud/clients/mcp_use.py +6 -1
  29. hud/datasets/loader.py +17 -18
  30. hud/datasets/runner.py +16 -10
  31. hud/datasets/tests/test_loader.py +15 -15
  32. hud/environment/__init__.py +5 -3
  33. hud/environment/connection.py +58 -6
  34. hud/environment/connectors/mcp_config.py +29 -1
  35. hud/environment/environment.py +218 -77
  36. hud/environment/router.py +175 -24
  37. hud/environment/scenarios.py +313 -186
  38. hud/environment/tests/test_connectors.py +10 -23
  39. hud/environment/tests/test_environment.py +432 -0
  40. hud/environment/tests/test_local_connectors.py +81 -40
  41. hud/environment/tests/test_scenarios.py +820 -14
  42. hud/eval/context.py +63 -10
  43. hud/eval/instrument.py +4 -2
  44. hud/eval/manager.py +79 -12
  45. hud/eval/task.py +36 -4
  46. hud/eval/tests/test_eval.py +1 -1
  47. hud/eval/tests/test_task.py +147 -1
  48. hud/eval/types.py +2 -0
  49. hud/eval/utils.py +14 -3
  50. hud/patches/mcp_patches.py +178 -21
  51. hud/telemetry/instrument.py +8 -1
  52. hud/telemetry/tests/test_eval_telemetry.py +8 -8
  53. hud/tools/__init__.py +2 -0
  54. hud/tools/agent.py +223 -0
  55. hud/tools/computer/__init__.py +34 -5
  56. hud/tools/shell.py +3 -3
  57. hud/tools/tests/test_agent_tool.py +355 -0
  58. hud/types.py +62 -34
  59. hud/utils/hud_console.py +30 -17
  60. hud/utils/strict_schema.py +1 -1
  61. hud/utils/tests/test_version.py +1 -1
  62. hud/version.py +1 -1
  63. {hud_python-0.5.1.dist-info → hud_python-0.5.13.dist-info}/METADATA +2 -2
  64. {hud_python-0.5.1.dist-info → hud_python-0.5.13.dist-info}/RECORD +67 -61
  65. {hud_python-0.5.1.dist-info → hud_python-0.5.13.dist-info}/WHEEL +0 -0
  66. {hud_python-0.5.1.dist-info → hud_python-0.5.13.dist-info}/entry_points.txt +0 -0
  67. {hud_python-0.5.1.dist-info → hud_python-0.5.13.dist-info}/licenses/LICENSE +0 -0
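
The diff body below is hud/environment/tests/test_scenarios.py (entry 41, +820 -14), which exercises the reworked scenario session API. As orientation, here is a minimal sketch of the lifecycle those tests walk through; it is inferred from the test code itself (Environment, @env.scenario, run_scenario_setup, submit, run_scenario_evaluate), not from package documentation, so treat it as illustrative rather than authoritative.

import asyncio

from hud.environment import Environment

env = Environment("demo-env")


@env.scenario("qa")
async def qa_scenario():
    # First yield: the prompt given to the agent.
    answer = yield "What is 2+2?"
    # Second yield: the reward computed from the submitted answer.
    yield 1.0 if answer == "4" else 0.0


async def main() -> None:
    # Setup creates the active session and returns the prompt.
    prompt = await env.run_scenario_setup("qa", {})
    print(prompt)

    # The agent's answer is stored on the active session.
    await env.submit("qa", "4")

    # Evaluate resumes the generator with the answer and returns the reward.
    reward = await env.run_scenario_evaluate("qa")
    print(reward)  # 1.0


asyncio.run(main())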
hud/environment/tests/test_scenarios.py
@@ -2,11 +2,55 @@

  from __future__ import annotations

+ from datetime import datetime
+ from enum import Enum
+ from typing import Any
+
  import pytest
+ from pydantic import BaseModel

  from hud.environment import Environment


+ # Module-level models for Pydantic/Enum/datetime deserialization tests
+ # (prefixed with underscore to avoid pytest collection warnings)
+ class _UserConfig(BaseModel):
+     """Pydantic model for testing."""
+
+     name: str
+     age: int
+     active: bool = True
+
+
+ class _Status(Enum):
+     """Enum for testing."""
+
+     PENDING = "pending"
+     ACTIVE = "active"
+     COMPLETED = "completed"
+
+
+ class _Address(BaseModel):
+     """Nested Pydantic model for testing."""
+
+     street: str
+     city: str
+
+
+ class _Person(BaseModel):
+     """Pydantic model with nested model for testing."""
+
+     name: str
+     address: _Address
+
+
+ class _Item(BaseModel):
+     """Pydantic model for list tests."""
+
+     id: int
+     name: str
+
+
  class TestScenarioDecorator:
      """Tests for @env.scenario decorator."""

@@ -110,8 +154,9 @@ class TestScenarioExecution:
          assert prompt is not None
          await prompt.render({})

-         # Check session was stored
-         assert "test" in env._scenario_latest
+         # Check session was stored in _active_session
+         assert env._active_session is not None
+         assert env._active_session.local_name == "test"

      @pytest.mark.asyncio
      async def test_scenario_full_flow(self) -> None:
@@ -170,7 +215,7 @@ class TestScenarioSubmit:

      @pytest.mark.asyncio
      async def test_submit_stores_answer(self) -> None:
-         """submit() stores answer for scenario."""
+         """submit() stores answer in active session."""
          env = Environment("test-env")

          @env.scenario("test")
@@ -178,15 +223,15 @@ class TestScenarioSubmit:
              yield "What is 2+2?"
              yield 1.0

-         # Run setup
-         prompt = env._prompt_manager._prompts.get("test-env:test")
-         assert prompt is not None
-         await prompt.render({})
+         # Run setup via proper API (creates _active_session)
+         await env.run_scenario_setup("test", {})

          # Submit answer
          await env.submit("test", "4")

-         assert env._scenario_answers.get("test") == "4"
+         # Answer is stored in active session (not _scenario_answers for client-side)
+         assert env._active_session is not None
+         assert env._active_session.answer == "4"

      @pytest.mark.asyncio
      async def test_scenario_receives_answer(self) -> None:
@@ -201,13 +246,14 @@ class TestScenarioSubmit:
              received_answer = answer
              yield 1.0 if answer == "4" else 0.0

-         # Run setup
+         # Run setup (creates _active_session)
          prompt = env._prompt_manager._prompts.get("test-env:qa")
          assert prompt is not None
          await prompt.render({})

-         # Submit answer
-         env._scenario_answers["qa"] = "4"
+         # Submit answer via _active_session
+         assert env._active_session is not None
+         env._active_session.answer = "4"

          # Run evaluate
          resource = env._resource_manager._resources.get("test-env:qa")
@@ -226,13 +272,14 @@ class TestScenarioSubmit:
              answer = yield "What is the capital of France?"
              yield 1.0 if "paris" in answer.lower() else 0.0

-         # Run setup
+         # Run setup (creates _active_session)
          prompt = env._prompt_manager._prompts.get("test-env:grading")
          assert prompt is not None
          await prompt.render({})

-         # Submit correct answer
-         env._scenario_answers["grading"] = "Paris"
+         # Submit correct answer via _active_session
+         assert env._active_session is not None
+         env._active_session.answer = "Paris"

          # Run evaluate
          resource = env._resource_manager._resources.get("test-env:grading")
@@ -244,6 +291,46 @@ class TestScenarioSubmit:
          data = json.loads(result)
          assert data["reward"] == 1.0

+     @pytest.mark.asyncio
+     async def test_hud_submit_normalizes_prefixed_scenario_name(self) -> None:
+         """_hud_submit with prefixed name stores answer in _active_session.
+
+         Regression test: answers submitted with "env:scenario" prefix must
+         match the active session's local_name for storage.
+         """
+         env = Environment("my-env")
+
+         @env.scenario("greet")
+         async def greet_scenario():
+             answer = yield "Say hello"
+             yield 1.0 if answer == "hello" else 0.0
+
+         # Run setup via prompt (creates _active_session)
+         prompt = env._prompt_manager._prompts.get("my-env:greet")
+         assert prompt is not None
+         await prompt.render({})
+
+         # Verify session exists before _hud_submit
+         assert env._active_session is not None
+         assert env._active_session.local_name == "greet"
+
+         # Simulate _hud_submit with PREFIXED scenario name (as happens in remote calls)
+         # This should normalize to "greet" and match the active session
+         await env.call_tool("_hud_submit", scenario="my-env:greet", answer="hello")
+
+         # Verify answer was stored in _active_session
+         assert env._active_session.answer == "hello"
+
+         # Verify evaluation works
+         resource = env._resource_manager._resources.get("my-env:greet")
+         assert resource is not None
+         result = await resource.read()
+
+         import json
+
+         data = json.loads(result)
+         assert data["reward"] == 1.0
+

  class TestScenarioMeta:
      """Tests for scenario _meta containing code."""
@@ -278,3 +365,722 @@ class TestScenarioMeta:
          assert resource.meta is not None
          assert "code" in resource.meta
          assert "async def example_scenario" in resource.meta["code"]
+
+
+ class TestScenarioJsonSerialization:
+     """Tests for JSON serialization of complex argument types.
+
+     MCP prompts only support string arguments (dict[str, str]).
+     Complex types like lists, dicts, and numbers are JSON-serialized
+     when sent and deserialized based on type annotations when received.
+     """
+
+     @pytest.mark.asyncio
+     async def test_list_argument_deserialization(self) -> None:
+         """List arguments are JSON-deserialized from strings."""
+         env = Environment("test-env")
+         received_items: list[str] = []
+
+         @env.scenario("process_items")
+         async def process_items_scenario(items: list[str]):
+             received_items.extend(items)
+             yield f"Processing {len(items)} items"
+             yield 1.0
+
+         prompt = env._prompt_manager._prompts.get("test-env:process_items")
+         assert prompt is not None
+
+         # Simulate MCP sending JSON-encoded list as string
+         await prompt.render({"items": '["apple", "banana", "cherry"]'})
+
+         assert received_items == ["apple", "banana", "cherry"]
+
+     @pytest.mark.asyncio
+     async def test_dict_argument_deserialization(self) -> None:
+         """Dict arguments are JSON-deserialized from strings."""
+         env = Environment("test-env")
+         received_config: dict[str, Any] = {}
+
+         @env.scenario("configure")
+         async def configure_scenario(config: dict[str, Any]):
+             received_config.update(config)
+             yield "Configuring..."
+             yield 1.0
+
+         prompt = env._prompt_manager._prompts.get("test-env:configure")
+         assert prompt is not None
+
+         # Simulate MCP sending JSON-encoded dict as string
+         await prompt.render({"config": '{"timeout": 30, "retries": 3}'})
+
+         assert received_config == {"timeout": 30, "retries": 3}
+
+     @pytest.mark.asyncio
+     async def test_int_argument_deserialization(self) -> None:
+         """Integer arguments are JSON-deserialized from strings."""
+         env = Environment("test-env")
+         received_count = 0
+
+         @env.scenario("count")
+         async def count_scenario(count: int):
+             nonlocal received_count
+             received_count = count
+             yield f"Counting to {count}"
+             yield 1.0
+
+         prompt = env._prompt_manager._prompts.get("test-env:count")
+         assert prompt is not None
+
+         # Simulate MCP sending JSON-encoded int as string
+         await prompt.render({"count": "42"})
+
+         assert received_count == 42
+         assert isinstance(received_count, int)
+
+     @pytest.mark.asyncio
+     async def test_float_argument_deserialization(self) -> None:
+         """Float arguments are JSON-deserialized from strings."""
+         env = Environment("test-env")
+         received_value = 0.0
+
+         @env.scenario("precision")
+         async def precision_scenario(value: float):
+             nonlocal received_value
+             received_value = value
+             yield f"Value is {value}"
+             yield 1.0
+
+         prompt = env._prompt_manager._prompts.get("test-env:precision")
+         assert prompt is not None
+
+         # Simulate MCP sending JSON-encoded float as string
+         await prompt.render({"value": "3.14159"})
+
+         assert received_value == 3.14159
+         assert isinstance(received_value, float)
+
+     @pytest.mark.asyncio
+     async def test_bool_argument_deserialization(self) -> None:
+         """Boolean arguments are JSON-deserialized from strings."""
+         env = Environment("test-env")
+         received_flag = False
+
+         @env.scenario("toggle")
+         async def toggle_scenario(enabled: bool):
+             nonlocal received_flag
+             received_flag = enabled
+             yield f"Enabled: {enabled}"
+             yield 1.0
+
+         prompt = env._prompt_manager._prompts.get("test-env:toggle")
+         assert prompt is not None
+
+         # Simulate MCP sending JSON-encoded bool as string
+         await prompt.render({"enabled": "true"})
+
+         assert received_flag is True
+         assert isinstance(received_flag, bool)
+
+     @pytest.mark.asyncio
+     async def test_string_argument_unchanged(self) -> None:
+         """String arguments are passed through unchanged."""
+         env = Environment("test-env")
+         received_name = ""
+
+         @env.scenario("greet")
+         async def greet_scenario(name: str):
+             nonlocal received_name
+             received_name = name
+             yield f"Hello, {name}!"
+             yield 1.0
+
+         prompt = env._prompt_manager._prompts.get("test-env:greet")
+         assert prompt is not None
+
+         # String should pass through as-is (not double-encoded)
+         await prompt.render({"name": "Alice"})
+
+         assert received_name == "Alice"
+
+     @pytest.mark.asyncio
+     async def test_mixed_argument_types(self) -> None:
+         """Mixed argument types are handled correctly."""
+         env = Environment("test-env")
+         received_args: dict[str, Any] = {}
+
+         @env.scenario("mixed")
+         async def mixed_scenario(
+             name: str,
+             count: int,
+             items: list[str],
+             options: dict[str, bool],
+         ):
+             received_args["name"] = name
+             received_args["count"] = count
+             received_args["items"] = items
+             received_args["options"] = options
+             yield "Processing..."
+             yield 1.0
+
+         prompt = env._prompt_manager._prompts.get("test-env:mixed")
+         assert prompt is not None
+
+         await prompt.render(
+             {
+                 "name": "test",
+                 "count": "5",
+                 "items": '["a", "b", "c"]',
+                 "options": '{"verbose": true, "dry_run": false}',
+             }
+         )
+
+         assert received_args["name"] == "test"
+         assert received_args["count"] == 5
+         assert received_args["items"] == ["a", "b", "c"]
+         assert received_args["options"] == {"verbose": True, "dry_run": False}
+
+     @pytest.mark.asyncio
+     async def test_invalid_json_falls_back_to_string(self) -> None:
+         """Invalid JSON for non-string type falls back to string value."""
+         env = Environment("test-env")
+         received_items: list[str] = []
+
+         @env.scenario("fallback")
+         async def fallback_scenario(items: list[str]):
+             # This will receive the raw string if JSON parsing fails
+             received_items.append(str(items))
+             yield "Processing..."
+             yield 1.0
+
+         prompt = env._prompt_manager._prompts.get("test-env:fallback")
+         assert prompt is not None
+
+         # Invalid JSON - should fall back to string
+         await prompt.render({"items": "not valid json ["})
+
+         # Falls back to raw string
+         assert received_items == ["not valid json ["]
+
+     @pytest.mark.asyncio
+     async def test_nested_complex_types(self) -> None:
+         """Nested complex types are deserialized correctly."""
+         env = Environment("test-env")
+         received_data: dict[str, Any] = {}
+
+         @env.scenario("nested")
+         async def nested_scenario(data: dict[str, Any]):
+             received_data.update(data)
+             yield "Processing nested data..."
+             yield 1.0
+
+         prompt = env._prompt_manager._prompts.get("test-env:nested")
+         assert prompt is not None
+
+         nested_json = (
+             '{"users": [{"name": "Alice", "age": 30}, {"name": "Bob", "age": 25}], '
+             '"metadata": {"version": 1}}'
+         )
+         await prompt.render({"data": nested_json})
+
+         assert received_data == {
+             "users": [
+                 {"name": "Alice", "age": 30},
+                 {"name": "Bob", "age": 25},
+             ],
+             "metadata": {"version": 1},
+         }
+
+     @pytest.mark.asyncio
+     async def test_optional_list_with_value(self) -> None:
+         """Optional[list[str]] receives list when provided."""
+         env = Environment("test-env")
+         received_items: list[str] | None = None
+
+         @env.scenario("optional_list")
+         async def optional_list_scenario(items: list[str] | None = None):
+             nonlocal received_items
+             received_items = items
+             yield f"Got {items}"
+             yield 1.0
+
+         prompt = env._prompt_manager._prompts.get("test-env:optional_list")
+         assert prompt is not None
+
+         await prompt.render({"items": '["x", "y", "z"]'})
+
+         assert received_items == ["x", "y", "z"]
+
+     @pytest.mark.asyncio
+     async def test_optional_list_with_null(self) -> None:
+         """Optional[list[str]] receives None when 'null' is passed."""
+         env = Environment("test-env")
+         received_items: list[str] | None = ["initial"]
+
+         @env.scenario("optional_list_null")
+         async def optional_list_null_scenario(items: list[str] | None = None):
+             nonlocal received_items
+             received_items = items
+             yield f"Got {items}"
+             yield 1.0
+
+         prompt = env._prompt_manager._prompts.get("test-env:optional_list_null")
+         assert prompt is not None
+
+         await prompt.render({"items": "null"})
+
+         assert received_items is None
+
+     @pytest.mark.asyncio
+     async def test_optional_str_with_value(self) -> None:
+         """Optional[str] receives string value correctly."""
+         env = Environment("test-env")
+         received_name: str | None = None
+
+         @env.scenario("optional_str")
+         async def optional_str_scenario(name: str | None = None):
+             nonlocal received_name
+             received_name = name
+             yield f"Got {name}"
+             yield 1.0
+
+         prompt = env._prompt_manager._prompts.get("test-env:optional_str")
+         assert prompt is not None
+
+         await prompt.render({"name": "Alice"})
+
+         assert received_name == "Alice"
+
+     @pytest.mark.asyncio
+     async def test_optional_str_with_null(self) -> None:
+         """Optional[str] receives None when 'null' is passed."""
+         env = Environment("test-env")
+         received_name: str | None = "initial"
+
+         @env.scenario("optional_str_null")
+         async def optional_str_null_scenario(name: str | None = None):
+             nonlocal received_name
+             received_name = name
+             yield f"Got {name}"
+             yield 1.0
+
+         prompt = env._prompt_manager._prompts.get("test-env:optional_str_null")
+         assert prompt is not None
+
+         await prompt.render({"name": "null"})
+
+         assert received_name is None
+
+     @pytest.mark.asyncio
+     async def test_pydantic_model_deserialization(self) -> None:
+         """Pydantic models are properly deserialized from JSON."""
+         env = Environment("test-env")
+         received_config: _UserConfig | None = None
+
+         @env.scenario("pydantic_model")
+         async def pydantic_model_scenario(config: _UserConfig):
+             nonlocal received_config
+             received_config = config
+             yield f"Got config for {config.name}"
+             yield 1.0
+
+         prompt = env._prompt_manager._prompts.get("test-env:pydantic_model")
+         assert prompt is not None
+
+         await prompt.render({"config": '{"name": "Alice", "age": 30}'})
+
+         assert received_config is not None
+         assert isinstance(received_config, _UserConfig)
+         assert received_config.name == "Alice"
+         assert received_config.age == 30
+         assert received_config.active is True  # default value
+
+     @pytest.mark.asyncio
+     async def test_enum_deserialization(self) -> None:
+         """Enum values are properly deserialized from JSON strings."""
+         env = Environment("test-env")
+         received_status: _Status | None = None
+
+         @env.scenario("enum_status")
+         async def enum_scenario(status: _Status):
+             nonlocal received_status
+             received_status = status
+             yield f"Status is {status.value}"
+             yield 1.0
+
+         prompt = env._prompt_manager._prompts.get("test-env:enum_status")
+         assert prompt is not None
+
+         await prompt.render({"status": '"active"'})
+
+         assert received_status is not None
+         assert isinstance(received_status, _Status)
+         assert received_status == _Status.ACTIVE
+
+     @pytest.mark.asyncio
+     async def test_datetime_deserialization(self) -> None:
+         """Datetime values are properly deserialized from ISO strings."""
+         env = Environment("test-env")
+         received_dt: datetime | None = None
+
+         @env.scenario("datetime_scenario")
+         async def datetime_scenario(created_at: datetime):
+             nonlocal received_dt
+             received_dt = created_at
+             yield f"Created at {created_at}"
+             yield 1.0
+
+         prompt = env._prompt_manager._prompts.get("test-env:datetime_scenario")
+         assert prompt is not None
+
+         await prompt.render({"created_at": '"2024-06-15T10:30:00"'})
+
+         assert received_dt is not None
+         assert isinstance(received_dt, datetime)
+         assert received_dt.year == 2024
+         assert received_dt.month == 6
+         assert received_dt.day == 15
+         assert received_dt.hour == 10
+         assert received_dt.minute == 30
+
+     @pytest.mark.asyncio
+     async def test_nested_pydantic_model(self) -> None:
+         """Nested Pydantic models are properly deserialized."""
+         env = Environment("test-env")
+         received_person: _Person | None = None
+
+         @env.scenario("nested_pydantic")
+         async def nested_pydantic_scenario(person: _Person):
+             nonlocal received_person
+             received_person = person
+             yield f"Person {person.name} from {person.address.city}"
+             yield 1.0
+
+         prompt = env._prompt_manager._prompts.get("test-env:nested_pydantic")
+         assert prompt is not None
+
+         json_data = '{"name": "Bob", "address": {"street": "123 Main St", "city": "NYC"}}'
+         await prompt.render({"person": json_data})
+
+         assert received_person is not None
+         assert isinstance(received_person, _Person)
+         assert received_person.name == "Bob"
+         assert isinstance(received_person.address, _Address)
+         assert received_person.address.city == "NYC"
+
+     @pytest.mark.asyncio
+     async def test_list_of_pydantic_models(self) -> None:
+         """List of Pydantic models are properly deserialized."""
+         env = Environment("test-env")
+         received_items: list[_Item] = []
+
+         @env.scenario("list_pydantic")
+         async def list_pydantic_scenario(items: list[_Item]):
+             nonlocal received_items
+             received_items = items
+             yield f"Got {len(items)} items"
+             yield 1.0
+
+         prompt = env._prompt_manager._prompts.get("test-env:list_pydantic")
+         assert prompt is not None
+
+         json_data = '[{"id": 1, "name": "Apple"}, {"id": 2, "name": "Banana"}]'
+         await prompt.render({"items": json_data})
+
+         assert len(received_items) == 2
+         assert all(isinstance(item, _Item) for item in received_items)
+         assert received_items[0].name == "Apple"
+         assert received_items[1].name == "Banana"
+
+
+ class TestScenarioNameNormalization:
+     """Test edge cases for environment and scenario name handling."""
+
+     @pytest.mark.asyncio
+     async def test_env_name_with_underscores_normalizes(self) -> None:
+         """Environment name with underscores normalizes to hyphens."""
+         env = Environment("my_test_env")
+         assert env.name == "my-test-env"
+
+         @env.scenario("greet")
+         async def greet():
+             yield "Hello"
+             yield 1.0
+
+         # Scenario should be registered with normalized name
+         assert "my-test-env:greet" in [p.name for p in env._prompt_manager._prompts.values()]
+
+     @pytest.mark.asyncio
+     async def test_env_name_with_spaces_normalizes(self) -> None:
+         """Environment name with spaces normalizes to hyphens."""
+         env = Environment("my test env")
+         assert env.name == "my-test-env"
+
+     @pytest.mark.asyncio
+     async def test_env_name_with_caps_normalizes(self) -> None:
+         """Environment name with capitals normalizes to lowercase."""
+         env = Environment("MyTestEnv")
+         assert env.name == "mytestenv"
+
+     @pytest.mark.asyncio
+     async def test_env_name_mixed_formatting(self) -> None:
+         """Environment name with mixed formatting normalizes correctly."""
+         env = Environment("My_Test Env")
+         assert env.name == "my-test-env"
+
+     @pytest.mark.asyncio
+     async def test_prefix_matches_normalized_name(self) -> None:
+         """Scenario prefix should match normalized env name."""
+         env = Environment("my_env")  # Normalizes to "my-env"
+
+         @env.scenario("test")
+         async def test_scenario():
+             yield "Prompt"
+             yield 1.0
+
+         # Calling with normalized prefix should work as local
+         prompt = await env.run_scenario_setup("my-env:test", {})
+         assert prompt == "Prompt"
+         assert env._active_session is not None
+         assert env._active_session.is_local is True
+
+     @pytest.mark.asyncio
+     async def test_unnormalized_prefix_treated_as_remote(self) -> None:
+         """Calling with unnormalized prefix treats as remote (different env)."""
+         env = Environment("my_env")  # Normalizes to "my-env"
+
+         @env.scenario("test")
+         async def test_scenario():
+             yield "Prompt"
+             yield 1.0
+
+         # Calling with "my_env:test" (underscore) won't match "my-env"
+         # So it's treated as remote - which will fail since no connection
+         with pytest.raises(ValueError, match="Scenario not found"):
+             await env.run_scenario_setup("my_env:test", {})
+
+
+ class TestScenarioMalformedNames:
+     """Test handling of malformed scenario names."""
+
+     @pytest.mark.asyncio
+     async def test_empty_scenario_name_rejected(self) -> None:
+         """Empty scenario name should be handled gracefully."""
+         env = Environment("test-env")
+
+         @env.scenario("valid")
+         async def valid_scenario():
+             yield "Prompt"
+             yield 1.0
+
+         # Empty name - should fail since not registered
+         with pytest.raises((ValueError, KeyError)):
+             await env.run_scenario_setup("", {})
+
+     @pytest.mark.asyncio
+     async def test_only_colon_handled(self) -> None:
+         """Scenario name that is just ':' should be handled."""
+         env = Environment("test-env")
+
+         # ":" splits to prefix="" and short_name=""
+         with pytest.raises((ValueError, KeyError)):
+             await env.run_scenario_setup(":", {})
+
+     @pytest.mark.asyncio
+     async def test_colon_in_scenario_name_rejected_at_registration(self) -> None:
+         """Scenario names with colons are rejected at registration time."""
+         env = Environment("test-env")
+
+         # Colons are reserved as the separator between env and scenario names
+         with pytest.raises(ValueError, match="cannot contain ':'"):
+
+             @env.scenario("invalid:name")
+             async def scenario_with_colon():
+                 yield "Prompt"
+                 yield 1.0
+
+     @pytest.mark.asyncio
+     async def test_whitespace_in_scenario_name(self) -> None:
+         """Scenario names with whitespace should work (not normalized)."""
+         env = Environment("test-env")
+
+         @env.scenario("my scenario")
+         async def scenario_with_space():
+             yield "Prompt"
+             yield 1.0
+
+         # Scenario names are NOT normalized (only env names are)
+         prompt = await env.run_scenario_setup("my scenario", {})
+         assert prompt == "Prompt"
+
+
+ class TestScenarioRegistration:
+     """Test scenario registration edge cases."""
+
+     @pytest.mark.asyncio
+     async def test_duplicate_scenario_name_overwrites(self) -> None:
+         """Registering same scenario name twice should overwrite."""
+         env = Environment("test-env")
+
+         @env.scenario("greet")
+         async def greet_v1():
+             yield "Hello v1"
+             yield 1.0
+
+         @env.scenario("greet")
+         async def greet_v2():
+             yield "Hello v2"
+             yield 1.0
+
+         # Should use v2
+         prompt = await env.run_scenario_setup("greet", {})
+         assert prompt == "Hello v2"
+
+     @pytest.mark.asyncio
+     async def test_scenario_with_special_chars(self) -> None:
+         """Scenario names can contain special characters."""
+         env = Environment("test-env")
+
+         @env.scenario("test-scenario_v2.0")
+         async def special_scenario():
+             yield "Prompt"
+             yield 1.0
+
+         prompt = await env.run_scenario_setup("test-scenario_v2.0", {})
+         assert prompt == "Prompt"
+
+     @pytest.mark.asyncio
+     async def test_scenario_that_yields_once(self) -> None:
+         """Scenario that yields only once (no evaluate) should handle gracefully."""
+         env = Environment("test-env")
+
+         @env.scenario("one-yield")
+         async def one_yield_scenario():
+             yield "Prompt"
+             # No second yield!
+
+         prompt = await env.run_scenario_setup("one-yield", {})
+         assert prompt == "Prompt"
+
+         assert env._active_session is not None
+         env._active_session.answer = "test"
+         # Evaluate should handle StopAsyncIteration and return 1.0
+         reward = await env.run_scenario_evaluate("one-yield")
+         assert reward == 1.0
+
+     @pytest.mark.asyncio
+     async def test_scenario_that_yields_three_times(self) -> None:
+         """Scenario that yields more than twice - third yield ignored."""
+         env = Environment("test-env")
+
+         @env.scenario("three-yields")
+         async def three_yield_scenario():
+             yield "Prompt"
+             yield 0.5
+             yield "This should be ignored"
+
+         prompt = await env.run_scenario_setup("three-yields", {})
+         assert prompt == "Prompt"
+
+         assert env._active_session is not None
+         env._active_session.answer = "test"
+         reward = await env.run_scenario_evaluate("three-yields")
+         assert reward == 0.5
+
+
+ class TestScenarioSessionState:
+     """Test session state management edge cases."""
+
+     @pytest.mark.asyncio
+     async def test_submit_before_setup_raises(self) -> None:
+         """Calling submit() before run_scenario_setup() should raise."""
+         env = Environment("test-env")
+
+         @env.scenario("test")
+         async def test_scenario():
+             yield "Prompt"
+             yield 1.0
+
+         with pytest.raises(ValueError, match="No active scenario session"):
+             await env.submit("test", "answer")
+
+     @pytest.mark.asyncio
+     async def test_evaluate_before_setup_returns_none(self) -> None:
+         """Calling evaluate() before setup() should return None."""
+         env = Environment("test-env")
+
+         @env.scenario("test")
+         async def test_scenario():
+             yield "Prompt"
+             yield 1.0
+
+         result = await env.run_scenario_evaluate("test")
+         assert result is None
+
+     @pytest.mark.asyncio
+     async def test_double_evaluate_returns_none(self) -> None:
+         """Calling evaluate() twice should return None on second call."""
+         env = Environment("test-env")
+
+         @env.scenario("test")
+         async def test_scenario():
+             yield "Prompt"
+             yield 0.75
+
+         await env.run_scenario_setup("test", {})
+         assert env._active_session is not None
+         env._active_session.answer = "answer"
+
+         reward1 = await env.run_scenario_evaluate("test")
+         assert reward1 == 0.75
+
+         # Second call - session cleared
+         reward2 = await env.run_scenario_evaluate("test")
+         assert reward2 is None
+
+     @pytest.mark.asyncio
+     async def test_submit_wrong_scenario_raises(self) -> None:
+         """Submitting answer for wrong scenario should raise."""
+         env = Environment("test-env")
+
+         @env.scenario("scenario-a")
+         async def scenario_a():
+             yield "Prompt A"
+             yield 1.0
+
+         @env.scenario("scenario-b")
+         async def scenario_b():
+             yield "Prompt B"
+             yield 1.0
+
+         await env.run_scenario_setup("scenario-a", {})
+
+         with pytest.raises(ValueError, match="Scenario mismatch"):
+             await env.submit("scenario-b", "answer")
+
+     @pytest.mark.asyncio
+     async def test_second_setup_overwrites_first(self) -> None:
+         """Starting a new scenario before evaluating previous one overwrites."""
+         env = Environment("test-env")
+
+         @env.scenario("first")
+         async def first_scenario():
+             yield "First"
+             yield 1.0
+
+         @env.scenario("second")
+         async def second_scenario():
+             yield "Second"
+             yield 0.5
+
+         await env.run_scenario_setup("first", {})
+         assert env._active_session is not None
+         assert env._active_session.local_name == "first"
+
+         # Start second without evaluating first
+         await env.run_scenario_setup("second", {})
+         assert env._active_session is not None
+         assert env._active_session.local_name == "second"
+
+         env._active_session.answer = "answer"
+         reward = await env.run_scenario_evaluate("second")
+         assert reward == 0.5