@miller-tech/uap 1.15.1 → 1.15.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/templates/hooks/forgecode/session-start.sh +6 -22
- package/templates/hooks/session-start.sh +7 -31
- package/tools/agents/plugin/session-start.sh +6 -22
- package/tools/agents/scripts/anthropic_proxy.py +815 -51
- package/tools/agents/tests/test_anthropic_proxy_streaming.py +661 -0
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
#!/usr/bin/env python3
|
|
2
2
|
|
|
3
|
+
import asyncio
|
|
3
4
|
import importlib.util
|
|
5
|
+
import json
|
|
4
6
|
import unittest
|
|
5
7
|
from pathlib import Path
|
|
6
8
|
|
|
@@ -17,6 +19,27 @@ def _load_proxy_module():
|
|
|
17
19
|
proxy = _load_proxy_module()
|
|
18
20
|
|
|
19
21
|
|
|
22
|
+
class _FakeResponse:
|
|
23
|
+
def __init__(self, payload, status_code=200):
|
|
24
|
+
self._payload = payload
|
|
25
|
+
self.status_code = status_code
|
|
26
|
+
|
|
27
|
+
def json(self):
|
|
28
|
+
return self._payload
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class _FakeClient:
|
|
32
|
+
def __init__(self, responses):
|
|
33
|
+
self._responses = list(responses)
|
|
34
|
+
self.requests = []
|
|
35
|
+
|
|
36
|
+
async def post(self, *args, **kwargs):
|
|
37
|
+
self.requests.append({"args": args, "kwargs": kwargs})
|
|
38
|
+
if not self._responses:
|
|
39
|
+
raise AssertionError("No fake response queued")
|
|
40
|
+
return self._responses.pop(0)
|
|
41
|
+
|
|
42
|
+
|
|
20
43
|
class TestStreamingReasoningFallback(unittest.TestCase):
|
|
21
44
|
def test_fallback_disabled_returns_none(self):
|
|
22
45
|
text = proxy._build_reasoning_fallback_text(
|
|
@@ -185,6 +208,14 @@ class TestMalformedToolGuardrail(unittest.TestCase):
|
|
|
185
208
|
}
|
|
186
209
|
self.assertTrue(proxy._is_malformed_tool_response(openai_resp, anthropic_body))
|
|
187
210
|
|
|
211
|
+
def test_tool_call_apology_helper_detects_phrase(self):
    """`_contains_tool_call_apology` is truthy for the canned apology, falsy for plain text."""
    detect = proxy._contains_tool_call_apology
    canned_apology = (
        "I could not produce a valid tool-call format in this turn. "
        "Please continue; I will issue exactly one valid tool call next."
    )

    self.assertTrue(detect(canned_apology))
    self.assertFalse(detect("normal assistant response"))
|
|
218
|
+
|
|
188
219
|
def test_clean_tool_call_response_is_not_malformed(self):
|
|
189
220
|
openai_resp = {
|
|
190
221
|
"choices": [
|
|
@@ -442,6 +473,505 @@ class TestMalformedToolGuardrail(unittest.TestCase):
|
|
|
442
473
|
self.assertIn("Please retry the same request", text)
|
|
443
474
|
self.assertNotIn("I will issue exactly one valid tool call next", text)
|
|
444
475
|
|
|
476
|
+
def test_openai_to_anthropic_response_sanitizes_tool_call_apology(self):
    """A stop turn carrying the canned apology must come out with rewritten text.

    The first converted content block is expected to contain the retry prompt
    and to no longer contain the promise of a follow-up tool call.
    """
    assistant_message = {
        "content": (
            "I could not produce a valid tool-call format in this turn. "
            "Please continue; I will issue exactly one valid tool call next."
        ),
        "tool_calls": [],
    }
    upstream = {"choices": [{"finish_reason": "stop", "message": assistant_message}]}

    anthropic_resp = proxy.openai_to_anthropic_response(upstream, "test-model")
    first_block_text = anthropic_resp["content"][0]["text"]

    self.assertIn("Please retry the same request", first_block_text)
    self.assertNotIn(
        "I will issue exactly one valid tool call next", first_block_text
    )
|
|
496
|
+
|
|
497
|
+
def test_preflight_flags_invalid_json_tool_arguments(self):
    """Truncated JSON in a tool call's arguments is classified as ``malformed_payload``."""
    truncated_call = {
        "id": "call_1",
        "function": {"name": "ScheduleJob", "arguments": '{"cron":'},
    }
    upstream = {
        "choices": [
            {
                "finish_reason": "tool_calls",
                "message": {"content": "", "tool_calls": [truncated_call]},
            }
        ]
    }
    schedule_job_schema = {
        "type": "object",
        "required": ["cron"],
        "properties": {"cron": {"type": "string", "minLength": 1}},
    }
    request = {"tools": [{"name": "ScheduleJob", "input_schema": schedule_job_schema}]}

    verdict = proxy._classify_tool_response_issue(upstream, request)

    self.assertEqual(verdict.kind, "malformed_payload")
    self.assertIn("malformed pseudo tool payload", verdict.reason)
|
|
533
|
+
|
|
534
|
+
def test_preflight_flags_empty_required_field(self):
    """An empty string for a required ``minLength: 1`` field yields ``malformed_payload``."""
    required_fields = ("cron", "command")
    call_with_blank_cron = {
        "id": "call_1",
        "function": {
            "name": "ScheduleJob",
            "arguments": '{"cron":"","command":"echo hi"}',
        },
    }
    upstream = {
        "choices": [
            {
                "finish_reason": "tool_calls",
                "message": {"content": "", "tool_calls": [call_with_blank_cron]},
            }
        ]
    }
    schedule_job_schema = {
        "type": "object",
        "required": list(required_fields),
        # One fresh constraint dict per field so no object is shared.
        "properties": {
            field: {"type": "string", "minLength": 1} for field in required_fields
        },
    }
    request = {"tools": [{"name": "ScheduleJob", "input_schema": schedule_job_schema}]}

    verdict = proxy._classify_tool_response_issue(upstream, request)

    self.assertEqual(verdict.kind, "malformed_payload")
    self.assertIn("malformed pseudo tool payload", verdict.reason)
|
|
573
|
+
|
|
574
|
+
def test_preflight_flags_markup_inside_arguments(self):
    """``<parameter>`` markup inside an argument value is classified as ``malformed_payload``."""
    required_fields = ("cron", "command")
    call_with_markup = {
        "id": "call_1",
        "function": {
            "name": "ScheduleJob",
            "arguments": '{"cron":"*/5 * * * *","command":"<parameter>bad</parameter>"}',
        },
    }
    upstream = {
        "choices": [
            {
                "finish_reason": "tool_calls",
                "message": {"content": "", "tool_calls": [call_with_markup]},
            }
        ]
    }
    schedule_job_schema = {
        "type": "object",
        "required": list(required_fields),
        "properties": {field: {"type": "string"} for field in required_fields},
    }
    request = {"tools": [{"name": "ScheduleJob", "input_schema": schedule_job_schema}]}

    verdict = proxy._classify_tool_response_issue(upstream, request)

    self.assertEqual(verdict.kind, "malformed_payload")
    self.assertIn("malformed pseudo tool payload", verdict.reason)
|
|
613
|
+
|
|
614
|
+
def test_required_tool_turn_without_tool_call_is_flagged(self):
    """A plain-text reply on a required-tool turn is classified as ``required_tool_miss``."""
    text_only_message = {"content": "Done.", "tool_calls": []}
    upstream = {"choices": [{"finish_reason": "stop", "message": text_only_message}]}
    request = {"tools": [{"name": "Edit", "input_schema": {"type": "object"}}]}

    verdict = proxy._classify_tool_response_issue(
        upstream,
        request,
        required_tool_choice=True,
    )

    self.assertEqual(verdict.kind, "required_tool_miss")
|
|
634
|
+
|
|
635
|
+
def test_markup_repair_sanitizes_tool_arguments(self):
    """`_repair_tool_call_markup` reports one repair and drops stray closing tags."""
    polluted_call = {
        "id": "call_1",
        "function": {
            "name": "Bash",
            "arguments": '{"command":"echo ok </think> </parameter>"}',
        },
    }
    upstream = {
        "choices": [
            {
                "finish_reason": "tool_calls",
                "message": {"content": "", "tool_calls": [polluted_call]},
            }
        ]
    }

    fixed, repair_count = proxy._repair_tool_call_markup(upstream)

    self.assertEqual(repair_count, 1)
    repaired_call = fixed["choices"][0]["message"]["tool_calls"][0]
    cleaned_arguments = repaired_call["function"]["arguments"]
    for stray_tag in ("</think>", "</parameter>"):
        self.assertNotIn(stray_tag, cleaned_arguments)
|
|
663
|
+
|
|
664
|
+
def test_markup_repair_recovers_json_after_tag_stripping(self):
    """Arguments prefixed by a stray ``</parameter>`` parse as JSON after repair."""
    prefixed_call = {
        "id": "call_1",
        "function": {"name": "Bash", "arguments": '</parameter>{"command":"ls"}'},
    }
    upstream = {
        "choices": [
            {
                "finish_reason": "tool_calls",
                "message": {"content": "", "tool_calls": [prefixed_call]},
            }
        ]
    }

    fixed, repair_count = proxy._repair_tool_call_markup(upstream)

    self.assertEqual(repair_count, 1)
    repaired_call = fixed["choices"][0]["message"]["tool_calls"][0]
    parsed = json.loads(repaired_call["function"]["arguments"])
    self.assertEqual(parsed["command"], "ls")
|
|
691
|
+
|
|
692
|
+
def test_guardrail_accepts_repaired_markup_without_fallback(self):
    """With retries set to 0, the guardrail returns the markup-repaired tool call.

    The fake client has no queued responses, so any upstream retry would raise;
    the test also expects exactly one recorded preflight repair on the monitor.
    """
    polluted_call = {
        "id": "call_1",
        "function": {"name": "Bash", "arguments": '{"command":"ls </parameter>"}'},
    }
    upstream = {
        "choices": [
            {
                "finish_reason": "tool_calls",
                "message": {"content": "", "tool_calls": [polluted_call]},
            }
        ]
    }
    bash_schema = {
        "type": "object",
        "required": ["command"],
        "properties": {"command": {"type": "string", "minLength": 1}},
    }
    anthropic_request = {
        "tools": [{"name": "Bash", "input_schema": bash_schema}],
        "messages": [{"role": "user", "content": "run command"}],
    }
    openai_request = {
        "model": "test",
        "messages": [{"role": "user", "content": "run command"}],
        "tool_choice": "required",
    }

    saved_retry_max = proxy.PROXY_MALFORMED_TOOL_RETRY_MAX
    try:
        proxy.PROXY_MALFORMED_TOOL_RETRY_MAX = 0
        monitor = proxy.SessionMonitor(context_window=262144)

        guarded = proxy._apply_malformed_tool_guardrail(
            _FakeClient([]),
            upstream,
            openai_request,
            anthropic_request,
            monitor,
            "session-repair",
        )
        result = asyncio.run(guarded)

        message = result["choices"][0]["message"]
        self.assertTrue(message.get("tool_calls"))
        cleaned_arguments = message["tool_calls"][0]["function"]["arguments"]
        self.assertNotIn("</parameter>", cleaned_arguments)
        self.assertEqual(monitor.arg_preflight_repairs, 1)
    finally:
        # Always put the module-level setting back for the other tests.
        proxy.PROXY_MALFORMED_TOOL_RETRY_MAX = saved_retry_max
|
|
757
|
+
|
|
758
|
+
def test_required_field_repair_fills_missing_required_values(self):
    """`_repair_required_tool_args` leaves every required field non-blank.

    The call supplies only an empty ``cron``; after one reported repair the
    arguments must hold non-whitespace ``cron``, ``pattern`` and ``subject``.
    """
    required_fields = ("cron", "pattern", "subject")
    sparse_call = {
        "id": "call_1",
        "function": {"name": "ScheduleJob", "arguments": '{"cron":""}'},
    }
    upstream = {
        "choices": [
            {
                "finish_reason": "tool_calls",
                "message": {"content": "", "tool_calls": [sparse_call]},
            }
        ]
    }
    schedule_job_schema = {
        "type": "object",
        "required": list(required_fields),
        "properties": {
            field: {"type": "string", "minLength": 1} for field in required_fields
        },
    }
    request = {"tools": [{"name": "ScheduleJob", "input_schema": schedule_job_schema}]}

    fixed, repair_count = proxy._repair_required_tool_args(upstream, request)

    self.assertEqual(repair_count, 1)
    repaired_call = fixed["choices"][0]["message"]["tool_calls"][0]
    parsed = json.loads(repaired_call["function"]["arguments"])
    for field in required_fields:
        self.assertTrue(parsed[field].strip())
|
|
804
|
+
|
|
805
|
+
def test_guardrail_accepts_required_field_repair_without_fallback(self):
    """With retries set to 0, the guardrail returns a call whose required fields are filled.

    The fake client has no queued responses, so any upstream retry would raise;
    the monitor is expected to record exactly one preflight repair.
    """
    required_fields = ("cron", "pattern", "subject")
    sparse_call = {
        "id": "call_1",
        "function": {"name": "ScheduleJob", "arguments": '{"cron":""}'},
    }
    upstream = {
        "choices": [
            {
                "finish_reason": "tool_calls",
                "message": {"content": "", "tool_calls": [sparse_call]},
            }
        ]
    }
    schedule_job_schema = {
        "type": "object",
        "required": list(required_fields),
        "properties": {
            field: {"type": "string", "minLength": 1} for field in required_fields
        },
    }
    anthropic_request = {
        "tools": [{"name": "ScheduleJob", "input_schema": schedule_job_schema}],
        "messages": [{"role": "user", "content": "schedule it"}],
    }
    openai_request = {
        "model": "test",
        "messages": [{"role": "user", "content": "schedule it"}],
        "tool_choice": "required",
    }

    saved_retry_max = proxy.PROXY_MALFORMED_TOOL_RETRY_MAX
    try:
        proxy.PROXY_MALFORMED_TOOL_RETRY_MAX = 0
        monitor = proxy.SessionMonitor(context_window=262144)

        guarded = proxy._apply_malformed_tool_guardrail(
            _FakeClient([]),
            upstream,
            openai_request,
            anthropic_request,
            monitor,
            "session-repair-required",
        )
        result = asyncio.run(guarded)

        repaired_call = result["choices"][0]["message"]["tool_calls"][0]
        parsed = json.loads(repaired_call["function"]["arguments"])
        for field in required_fields:
            self.assertTrue(parsed[field].strip())
        self.assertEqual(monitor.arg_preflight_repairs, 1)
    finally:
        # Always put the module-level setting back for the other tests.
        proxy.PROXY_MALFORMED_TOOL_RETRY_MAX = saved_retry_max
|
|
875
|
+
|
|
876
|
+
def test_guardrail_retries_invalid_tool_args_and_recovers(self):
    """With one retry allowed, an empty required ``cron`` ends up as a usable call.

    The assertions accept either outcome the monitor may record (a preflight
    repair or a preflight rejection). Only when the fake client was actually
    called is the retry payload checked for the "TOOL CALL REPAIR" message.
    """
    required_fields = ("cron", "command")

    def schedule_job_response(call_id, arguments):
        # Build a one-choice tool_calls response wrapping a single ScheduleJob call.
        return {
            "choices": [
                {
                    "finish_reason": "tool_calls",
                    "message": {
                        "content": "",
                        "tool_calls": [
                            {
                                "id": call_id,
                                "function": {
                                    "name": "ScheduleJob",
                                    "arguments": arguments,
                                },
                            }
                        ],
                    },
                }
            ]
        }

    first_attempt = schedule_job_response(
        "call_1", '{"cron":"","command":"echo hi"}'
    )
    retry_answer = schedule_job_response(
        "call_2", '{"cron":"*/5 * * * *","command":"echo hi"}'
    )
    openai_request = {
        "model": "test",
        "messages": [{"role": "user", "content": "schedule this job"}],
        "tool_choice": "required",
    }
    schedule_job_schema = {
        "type": "object",
        "required": list(required_fields),
        "properties": {
            field: {"type": "string", "minLength": 1} for field in required_fields
        },
    }
    anthropic_request = {
        "tools": [{"name": "ScheduleJob", "input_schema": schedule_job_schema}],
        "messages": [{"role": "user", "content": "schedule this job"}],
    }

    saved_retry_max = proxy.PROXY_MALFORMED_TOOL_RETRY_MAX
    try:
        proxy.PROXY_MALFORMED_TOOL_RETRY_MAX = 1

        monitor = proxy.SessionMonitor(context_window=262144)
        monitor.consecutive_forced_count = 7
        upstream_stub = _FakeClient([_FakeResponse(retry_answer)])

        guarded = proxy._apply_malformed_tool_guardrail(
            upstream_stub,
            first_attempt,
            openai_request,
            anthropic_request,
            monitor,
            "session-test",
        )
        result = asyncio.run(guarded)

        final_call = result["choices"][0]["message"]["tool_calls"][0]
        parsed = json.loads(final_call["function"]["arguments"])
        for field in required_fields:
            self.assertTrue(parsed[field].strip())

        repaired_or_rejected = (
            monitor.arg_preflight_repairs >= 1
            or monitor.arg_preflight_rejections >= 1
        )
        self.assertTrue(repaired_or_rejected)

        if upstream_stub.requests:
            first_retry_payload = upstream_stub.requests[0]["kwargs"]["json"]
            last_message_text = first_retry_payload["messages"][-1]["content"]
            self.assertIn("TOOL CALL REPAIR", last_message_text)
    finally:
        # Always put the module-level setting back for the other tests.
        proxy.PROXY_MALFORMED_TOOL_RETRY_MAX = saved_retry_max
|
|
974
|
+
|
|
445
975
|
|
|
446
976
|
class TestToolTurnControls(unittest.TestCase):
|
|
447
977
|
def test_tool_narrowing_reduces_tool_count(self):
|
|
@@ -518,6 +1048,89 @@ class TestToolTurnControls(unittest.TestCase):
|
|
|
518
1048
|
finally:
|
|
519
1049
|
setattr(proxy, "PROXY_DISABLE_THINKING_ON_TOOL_TURNS", old_disable)
|
|
520
1050
|
|
|
1051
|
+
def test_forced_tool_dampener_temporarily_releases_required(self):
    """An activated dampener makes the next built request use ``tool_choice`` "auto".

    With the thresholds below, a monitor at 3 forced turns and an invalid
    streak of 1 must activate with a 2-turn cooldown; building one request
    is then expected to leave 1 cooldown turn.
    """
    overrides = {
        "PROXY_FORCED_TOOL_DAMPENER": True,
        "PROXY_FORCED_TOOL_DAMPENER_MIN_FORCED": 3,
        "PROXY_FORCED_TOOL_DAMPENER_BAD_STREAK": 1,
        "PROXY_FORCED_TOOL_DAMPENER_EMPTY_STREAK": 1,
        "PROXY_FORCED_TOOL_DAMPENER_REJECTIONS": 2,
        "PROXY_FORCED_TOOL_DAMPENER_AUTO_TURNS": 2,
    }
    # Snapshot current module settings so they can be restored afterwards.
    saved = {name: getattr(proxy, name) for name in overrides}
    request_body = {
        "model": "test",
        "messages": [
            {
                "role": "assistant",
                "content": [{"type": "text", "text": "I will continue."}],
            },
            {"role": "user", "content": "keep going"},
        ],
        "tools": [
            {
                "name": "Read",
                "description": "Read file",
                "input_schema": {"type": "object"},
            }
        ],
    }
    try:
        for name, value in overrides.items():
            setattr(proxy, name, value)

        monitor = proxy.SessionMonitor(context_window=262144)
        monitor.consecutive_forced_count = 3
        monitor.invalid_tool_call_streak = 1

        was_activated = monitor.maybe_activate_forced_tool_dampener(
            "invalid_tool_args"
        )
        self.assertTrue(was_activated)
        self.assertEqual(monitor.forced_auto_cooldown_turns, 2)

        built = proxy.build_openai_request(request_body, monitor)
        self.assertEqual(built.get("tool_choice"), "auto")
        self.assertEqual(monitor.forced_auto_cooldown_turns, 1)
    finally:
        for name, value in saved.items():
            setattr(proxy, name, value)
|
|
1102
|
+
|
|
1103
|
+
def test_forced_tool_dampener_uses_rejection_pressure(self):
    """Two preflight rejections activate the dampener with both streak thresholds at 5.

    After activation the cooldown must equal the configured 1 turn and the
    monitor's rejection counter must be back at 0.
    """
    overrides = {
        "PROXY_FORCED_TOOL_DAMPENER": True,
        "PROXY_FORCED_TOOL_DAMPENER_MIN_FORCED": 3,
        "PROXY_FORCED_TOOL_DAMPENER_BAD_STREAK": 5,
        "PROXY_FORCED_TOOL_DAMPENER_EMPTY_STREAK": 5,
        "PROXY_FORCED_TOOL_DAMPENER_REJECTIONS": 2,
        "PROXY_FORCED_TOOL_DAMPENER_AUTO_TURNS": 1,
    }
    # Snapshot current module settings so they can be restored afterwards.
    saved = {name: getattr(proxy, name) for name in overrides}
    try:
        for name, value in overrides.items():
            setattr(proxy, name, value)

        monitor = proxy.SessionMonitor(context_window=262144)
        monitor.consecutive_forced_count = 3
        monitor.arg_preflight_rejections = 2

        was_activated = monitor.maybe_activate_forced_tool_dampener(
            "invalid_tool_args"
        )

        self.assertTrue(was_activated)
        self.assertEqual(monitor.forced_auto_cooldown_turns, 1)
        self.assertEqual(monitor.arg_preflight_rejections, 0)
    finally:
        for name, value in saved.items():
            setattr(proxy, name, value)
|
|
1133
|
+
|
|
521
1134
|
def test_no_tools_does_not_inject_agentic_system_message(self):
|
|
522
1135
|
body = {
|
|
523
1136
|
"model": "test",
|
|
@@ -631,6 +1244,54 @@ class TestSessionContaminationBreaker(unittest.TestCase):
|
|
|
631
1244
|
setattr(proxy, "PROXY_SESSION_CONTAMINATION_THRESHOLD", old_threshold)
|
|
632
1245
|
setattr(proxy, "PROXY_SESSION_CONTAMINATION_KEEP_LAST", old_keep)
|
|
633
1246
|
|
|
1247
|
+
def test_contamination_breaker_triggers_on_forced_invalid_combo(self):
    """The breaker fires on invalid streak 2 and forced count 6 under the thresholds below.

    Expected outcome: one contamination reset recorded, the invalid streak
    cleared, five messages in the returned body, and "SESSION RESET" in the
    content of the message at index 1.
    """
    overrides = {
        "PROXY_SESSION_CONTAMINATION_BREAKER": True,
        "PROXY_SESSION_CONTAMINATION_THRESHOLD": 3,
        "PROXY_SESSION_CONTAMINATION_KEEP_LAST": 3,
        "PROXY_SESSION_CONTAMINATION_FORCED_THRESHOLD": 5,
        "PROXY_SESSION_CONTAMINATION_REQUIRED_MISS_THRESHOLD": 4,
    }
    # Snapshot current module settings so they can be restored afterwards.
    saved = {name: getattr(proxy, name) for name in overrides}
    transcript = (
        ("user", "start"),
        ("assistant", "a1"),
        ("user", "u2"),
        ("assistant", "a3"),
        ("user", "u4"),
        ("assistant", "a5"),
    )
    request_body = {
        "messages": [{"role": role, "content": text} for role, text in transcript]
    }
    try:
        for name, value in overrides.items():
            setattr(proxy, name, value)

        monitor = proxy.SessionMonitor(context_window=262144)
        monitor.invalid_tool_call_streak = 2
        monitor.consecutive_forced_count = 6

        trimmed = proxy._maybe_apply_session_contamination_breaker(
            request_body,
            monitor,
            "session-test",
        )

        self.assertEqual(monitor.contamination_resets, 1)
        self.assertEqual(monitor.invalid_tool_call_streak, 0)
        self.assertEqual(len(trimmed["messages"]), 5)
        self.assertIn("SESSION RESET", trimmed["messages"][1]["content"])
    finally:
        for name, value in saved.items():
            setattr(proxy, name, value)
|
|
1294
|
+
|
|
634
1295
|
|
|
635
1296
|
if __name__ == "__main__":
|
|
636
1297
|
unittest.main()
|