@miller-tech/uap 1.15.2 → 1.15.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/templates/hooks/forgecode/session-start.sh +6 -22
- package/templates/hooks/session-start.sh +7 -31
- package/tools/agents/plugin/session-start.sh +6 -22
- package/tools/agents/scripts/anthropic_proxy.py +780 -41
- package/tools/agents/tests/test_anthropic_proxy_streaming.py +632 -0
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
#!/usr/bin/env python3
|
|
2
2
|
|
|
3
|
+
import asyncio
|
|
3
4
|
import importlib.util
|
|
5
|
+
import json
|
|
4
6
|
import unittest
|
|
5
7
|
from pathlib import Path
|
|
6
8
|
|
|
@@ -17,6 +19,27 @@ def _load_proxy_module():
|
|
|
17
19
|
proxy = _load_proxy_module()
|
|
18
20
|
|
|
19
21
|
|
|
22
|
+
class _FakeResponse:
    """Minimal stand-in for an HTTP response object used by the guardrail tests.

    It stores a pre-built payload and a status code, and hands the payload
    back unchanged from ``json()``.
    """

    def __init__(self, payload, status_code=200):
        # Store the status first; both attributes are independent of each other.
        self.status_code = status_code
        self._payload = payload

    def json(self):
        """Return the payload given at construction time (same object, no copy)."""
        return self._payload
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class _FakeClient:
    """Async client double that replays queued responses and records every call.

    ``requests`` collects one ``{"args": ..., "kwargs": ...}`` entry per
    ``post()`` call so tests can inspect what was sent.
    """

    def __init__(self, responses):
        self.requests = []
        # Copy the iterable so the caller's sequence is never consumed.
        self._responses = list(responses)

    async def post(self, *args, **kwargs):
        """Record the call, then return the next queued response in FIFO order.

        Raises:
            AssertionError: if no response is left in the queue.
        """
        call_record = {"args": args, "kwargs": kwargs}
        self.requests.append(call_record)
        if self._responses:
            return self._responses.pop(0)
        raise AssertionError("No fake response queued")
|
|
41
|
+
|
|
42
|
+
|
|
20
43
|
class TestStreamingReasoningFallback(unittest.TestCase):
|
|
21
44
|
def test_fallback_disabled_returns_none(self):
|
|
22
45
|
text = proxy._build_reasoning_fallback_text(
|
|
@@ -471,6 +494,484 @@ class TestMalformedToolGuardrail(unittest.TestCase):
|
|
|
471
494
|
self.assertIn("Please retry the same request", text)
|
|
472
495
|
self.assertNotIn("I will issue exactly one valid tool call next", text)
|
|
473
496
|
|
|
497
|
+
def test_preflight_flags_invalid_json_tool_arguments(self):
    """Truncated JSON in tool-call arguments must be classified as malformed."""
    # The arguments string stops mid-object, so it is not parseable JSON.
    truncated_call = {
        "id": "call_1",
        "function": {"name": "ScheduleJob", "arguments": '{"cron":'},
    }
    choice = {
        "finish_reason": "tool_calls",
        "message": {"content": "", "tool_calls": [truncated_call]},
    }
    openai_resp = {"choices": [choice]}

    schedule_schema = {
        "type": "object",
        "required": ["cron"],
        "properties": {"cron": {"type": "string", "minLength": 1}},
    }
    anthropic_body = {
        "tools": [{"name": "ScheduleJob", "input_schema": schedule_schema}]
    }

    issue = proxy._classify_tool_response_issue(openai_resp, anthropic_body)

    self.assertEqual(issue.kind, "malformed_payload")
    self.assertIn("malformed pseudo tool payload", issue.reason)
|
|
533
|
+
|
|
534
|
+
def test_preflight_flags_empty_required_field(self):
    """An empty string for a required ``minLength: 1`` field is flagged as malformed."""
    # "cron" is present but empty, while the schema demands at least one character.
    empty_cron_call = {
        "id": "call_1",
        "function": {
            "name": "ScheduleJob",
            "arguments": '{"cron":"","command":"echo hi"}',
        },
    }
    choice = {
        "finish_reason": "tool_calls",
        "message": {"content": "", "tool_calls": [empty_cron_call]},
    }
    openai_resp = {"choices": [choice]}

    schedule_schema = {
        "type": "object",
        "required": ["cron", "command"],
        "properties": {
            "cron": {"type": "string", "minLength": 1},
            "command": {"type": "string", "minLength": 1},
        },
    }
    anthropic_body = {
        "tools": [{"name": "ScheduleJob", "input_schema": schedule_schema}]
    }

    issue = proxy._classify_tool_response_issue(openai_resp, anthropic_body)

    self.assertEqual(issue.kind, "malformed_payload")
    self.assertIn("malformed pseudo tool payload", issue.reason)
|
|
573
|
+
|
|
574
|
+
def test_preflight_flags_markup_inside_arguments(self):
    """Tag-like markup embedded in an argument value is flagged as malformed."""
    # The JSON itself is valid; only the "command" value carries <parameter> tags.
    markup_call = {
        "id": "call_1",
        "function": {
            "name": "ScheduleJob",
            "arguments": '{"cron":"*/5 * * * *","command":"<parameter>bad</parameter>"}',
        },
    }
    choice = {
        "finish_reason": "tool_calls",
        "message": {"content": "", "tool_calls": [markup_call]},
    }
    openai_resp = {"choices": [choice]}

    schedule_schema = {
        "type": "object",
        "required": ["cron", "command"],
        "properties": {
            "cron": {"type": "string"},
            "command": {"type": "string"},
        },
    }
    anthropic_body = {
        "tools": [{"name": "ScheduleJob", "input_schema": schedule_schema}]
    }

    issue = proxy._classify_tool_response_issue(openai_resp, anthropic_body)

    self.assertEqual(issue.kind, "malformed_payload")
    self.assertIn("malformed pseudo tool payload", issue.reason)
|
|
613
|
+
|
|
614
|
+
def test_required_tool_turn_without_tool_call_is_flagged(self):
    """A text-only reply on a required-tool turn is classified as a required-tool miss."""
    # The response finishes with "stop" and carries an empty tool_calls list.
    text_only_choice = {
        "finish_reason": "stop",
        "message": {"content": "Done.", "tool_calls": []},
    }
    openai_resp = {"choices": [text_only_choice]}

    edit_tool = {"name": "Edit", "input_schema": {"type": "object"}}
    anthropic_body = {"tools": [edit_tool]}

    issue = proxy._classify_tool_response_issue(
        openai_resp,
        anthropic_body,
        required_tool_choice=True,
    )

    self.assertEqual(issue.kind, "required_tool_miss")
|
|
634
|
+
|
|
635
|
+
def test_markup_repair_sanitizes_tool_arguments(self):
    """Markup repair strips stray closing tags from tool-call arguments."""
    dirty_call = {
        "id": "call_1",
        "function": {
            "name": "Bash",
            "arguments": '{"command":"echo ok </think> </parameter>"}',
        },
    }
    openai_resp = {
        "choices": [
            {
                "finish_reason": "tool_calls",
                "message": {"content": "", "tool_calls": [dirty_call]},
            }
        ]
    }

    repaired, count = proxy._repair_tool_call_markup(openai_resp)

    # Exactly one tool call is expected to be reported as repaired.
    self.assertEqual(count, 1)
    repaired_function = repaired["choices"][0]["message"]["tool_calls"][0]["function"]
    args = repaired_function["arguments"]
    for leftover_tag in ("</think>", "</parameter>"):
        self.assertNotIn(leftover_tag, args)
|
|
663
|
+
|
|
664
|
+
def test_markup_repair_recovers_json_after_tag_stripping(self):
    """Arguments prefixed by a stray tag become parseable JSON after repair."""
    # The leading "</parameter>" makes the raw arguments string invalid JSON.
    prefixed_call = {
        "id": "call_1",
        "function": {
            "name": "Bash",
            "arguments": '</parameter>{"command":"ls"}',
        },
    }
    openai_resp = {
        "choices": [
            {
                "finish_reason": "tool_calls",
                "message": {"content": "", "tool_calls": [prefixed_call]},
            }
        ]
    }

    repaired, count = proxy._repair_tool_call_markup(openai_resp)

    self.assertEqual(count, 1)
    repaired_function = repaired["choices"][0]["message"]["tool_calls"][0]["function"]
    args = json.loads(repaired_function["arguments"])
    self.assertEqual(args["command"], "ls")
|
|
691
|
+
|
|
692
|
+
def test_guardrail_accepts_repaired_markup_without_fallback(self):
    """The guardrail returns a markup-repaired tool call and counts one preflight repair.

    Retries are disabled and the fake client has no queued responses, so any
    ``post()`` on it would raise; presumably the guardrail resolves this
    case without an upstream call.
    """
    saved_retry_max = proxy.PROXY_MALFORMED_TOOL_RETRY_MAX
    try:
        proxy.PROXY_MALFORMED_TOOL_RETRY_MAX = 0

        monitor = proxy.SessionMonitor(context_window=262144)

        tagged_call = {
            "id": "call_1",
            "function": {
                "name": "Bash",
                "arguments": '{"command":"ls </parameter>"}',
            },
        }
        openai_resp = {
            "choices": [
                {
                    "finish_reason": "tool_calls",
                    "message": {"content": "", "tool_calls": [tagged_call]},
                }
            ]
        }
        bash_tool = {
            "name": "Bash",
            "input_schema": {
                "type": "object",
                "required": ["command"],
                "properties": {"command": {"type": "string", "minLength": 1}},
            },
        }
        anthropic_body = {
            "tools": [bash_tool],
            "messages": [{"role": "user", "content": "run command"}],
        }
        openai_body = {
            "model": "test",
            "messages": [{"role": "user", "content": "run command"}],
            "tool_choice": "required",
        }

        guardrail_call = proxy._apply_malformed_tool_guardrail(
            _FakeClient([]),
            openai_resp,
            openai_body,
            anthropic_body,
            monitor,
            "session-repair",
        )
        result = asyncio.run(guardrail_call)

        message = result["choices"][0]["message"]
        self.assertTrue(message.get("tool_calls"))
        args = result["choices"][0]["message"]["tool_calls"][0]["function"]["arguments"]
        self.assertNotIn("</parameter>", args)
        self.assertEqual(monitor.arg_preflight_repairs, 1)
    finally:
        # Always put the module-level setting back for the other tests.
        proxy.PROXY_MALFORMED_TOOL_RETRY_MAX = saved_retry_max
|
|
757
|
+
|
|
758
|
+
def test_required_field_repair_fills_missing_required_values(self):
    """Required-field repair leaves every required string field non-blank."""
    # Only "cron" is supplied (and it is empty); "pattern" and "subject" are absent.
    sparse_call = {
        "id": "call_1",
        "function": {"name": "ScheduleJob", "arguments": '{"cron":""}'},
    }
    openai_resp = {
        "choices": [
            {
                "finish_reason": "tool_calls",
                "message": {"content": "", "tool_calls": [sparse_call]},
            }
        ]
    }
    schedule_schema = {
        "type": "object",
        "required": ["cron", "pattern", "subject"],
        "properties": {
            "cron": {"type": "string", "minLength": 1},
            "pattern": {"type": "string", "minLength": 1},
            "subject": {"type": "string", "minLength": 1},
        },
    }
    anthropic_body = {
        "tools": [{"name": "ScheduleJob", "input_schema": schedule_schema}]
    }

    repaired, count = proxy._repair_required_tool_args(openai_resp, anthropic_body)

    self.assertEqual(count, 1)
    repaired_function = repaired["choices"][0]["message"]["tool_calls"][0]["function"]
    args_text = repaired_function["arguments"]
    args = json.loads(args_text)
    for field in ("cron", "pattern", "subject"):
        self.assertTrue(args[field].strip())
|
|
804
|
+
|
|
805
|
+
def test_guardrail_accepts_required_field_repair_without_fallback(self):
    """The guardrail returns a tool call with all required fields filled and one repair counted.

    Retries are disabled and the fake client has nothing queued, so any
    ``post()`` on it would raise; presumably the guardrail resolves this
    case without an upstream call.
    """
    saved_retry_max = proxy.PROXY_MALFORMED_TOOL_RETRY_MAX
    try:
        proxy.PROXY_MALFORMED_TOOL_RETRY_MAX = 0

        monitor = proxy.SessionMonitor(context_window=262144)

        sparse_call = {
            "id": "call_1",
            "function": {"name": "ScheduleJob", "arguments": '{"cron":""}'},
        }
        openai_resp = {
            "choices": [
                {
                    "finish_reason": "tool_calls",
                    "message": {"content": "", "tool_calls": [sparse_call]},
                }
            ]
        }
        schedule_tool = {
            "name": "ScheduleJob",
            "input_schema": {
                "type": "object",
                "required": ["cron", "pattern", "subject"],
                "properties": {
                    "cron": {"type": "string", "minLength": 1},
                    "pattern": {"type": "string", "minLength": 1},
                    "subject": {"type": "string", "minLength": 1},
                },
            },
        }
        anthropic_body = {
            "tools": [schedule_tool],
            "messages": [{"role": "user", "content": "schedule it"}],
        }
        openai_body = {
            "model": "test",
            "messages": [{"role": "user", "content": "schedule it"}],
            "tool_choice": "required",
        }

        guardrail_call = proxy._apply_malformed_tool_guardrail(
            _FakeClient([]),
            openai_resp,
            openai_body,
            anthropic_body,
            monitor,
            "session-repair-required",
        )
        result = asyncio.run(guardrail_call)

        returned_function = result["choices"][0]["message"]["tool_calls"][0]["function"]
        args = json.loads(returned_function["arguments"])
        for field in ("cron", "pattern", "subject"):
            self.assertTrue(args[field].strip())
        self.assertEqual(monitor.arg_preflight_repairs, 1)
    finally:
        # Always put the module-level setting back for the other tests.
        proxy.PROXY_MALFORMED_TOOL_RETRY_MAX = saved_retry_max
|
|
875
|
+
|
|
876
|
+
def test_guardrail_retries_invalid_tool_args_and_recovers(self):
    """With one retry allowed, the guardrail ends with a tool call whose fields are non-blank.

    The test accepts either outcome counter (a preflight repair or a
    preflight rejection). The retry payload is inspected only if the fake
    client actually received a request.
    """
    saved_retry_max = proxy.PROXY_MALFORMED_TOOL_RETRY_MAX
    try:
        proxy.PROXY_MALFORMED_TOOL_RETRY_MAX = 1

        monitor = proxy.SessionMonitor(context_window=262144)
        monitor.consecutive_forced_count = 7

        def schedule_job_response(call_id, arguments):
            # Build a fresh single-tool-call response each time (no shared dicts).
            return {
                "choices": [
                    {
                        "finish_reason": "tool_calls",
                        "message": {
                            "content": "",
                            "tool_calls": [
                                {
                                    "id": call_id,
                                    "function": {
                                        "name": "ScheduleJob",
                                        "arguments": arguments,
                                    },
                                }
                            ],
                        },
                    }
                ]
            }

        # First response has an empty "cron"; the queued one is fully populated.
        initial_resp = schedule_job_response(
            "call_1", '{"cron":"","command":"echo hi"}'
        )
        repaired_resp = schedule_job_response(
            "call_2", '{"cron":"*/5 * * * *","command":"echo hi"}'
        )

        fake_client = _FakeClient([_FakeResponse(repaired_resp)])
        openai_body = {
            "model": "test",
            "messages": [{"role": "user", "content": "schedule this job"}],
            "tool_choice": "required",
        }
        schedule_tool = {
            "name": "ScheduleJob",
            "input_schema": {
                "type": "object",
                "required": ["cron", "command"],
                "properties": {
                    "cron": {"type": "string", "minLength": 1},
                    "command": {"type": "string", "minLength": 1},
                },
            },
        }
        anthropic_body = {
            "tools": [schedule_tool],
            "messages": [{"role": "user", "content": "schedule this job"}],
        }

        guardrail_call = proxy._apply_malformed_tool_guardrail(
            fake_client,
            initial_resp,
            openai_body,
            anthropic_body,
            monitor,
            "session-test",
        )
        result = asyncio.run(guardrail_call)

        returned_function = result["choices"][0]["message"]["tool_calls"][0]["function"]
        args = json.loads(returned_function["arguments"])
        for field in ("cron", "command"):
            self.assertTrue(args[field].strip())
        self.assertTrue(
            monitor.arg_preflight_repairs >= 1
            or monitor.arg_preflight_rejections >= 1
        )
        if fake_client.requests:
            first_request = fake_client.requests[0]
            retry_payload = first_request["kwargs"]["json"]
            repair_message = retry_payload["messages"][-1]["content"]
            self.assertIn("TOOL CALL REPAIR", repair_message)
    finally:
        # Always put the module-level setting back for the other tests.
        proxy.PROXY_MALFORMED_TOOL_RETRY_MAX = saved_retry_max
|
|
974
|
+
|
|
474
975
|
|
|
475
976
|
class TestToolTurnControls(unittest.TestCase):
|
|
476
977
|
def test_tool_narrowing_reduces_tool_count(self):
|
|
@@ -547,6 +1048,89 @@ class TestToolTurnControls(unittest.TestCase):
|
|
|
547
1048
|
finally:
|
|
548
1049
|
setattr(proxy, "PROXY_DISABLE_THINKING_ON_TOOL_TURNS", old_disable)
|
|
549
1050
|
|
|
1051
|
+
def test_forced_tool_dampener_temporarily_releases_required(self):
    """An active dampener makes the next request use ``tool_choice == "auto"``.

    After activation the cooldown equals the configured auto-turn count (2);
    building one request is expected to reduce it to 1.
    """
    overrides = {
        "PROXY_FORCED_TOOL_DAMPENER": True,
        "PROXY_FORCED_TOOL_DAMPENER_MIN_FORCED": 3,
        "PROXY_FORCED_TOOL_DAMPENER_BAD_STREAK": 1,
        "PROXY_FORCED_TOOL_DAMPENER_EMPTY_STREAK": 1,
        "PROXY_FORCED_TOOL_DAMPENER_REJECTIONS": 2,
        "PROXY_FORCED_TOOL_DAMPENER_AUTO_TURNS": 2,
    }
    # Snapshot the current module settings before touching any of them.
    saved = {name: getattr(proxy, name) for name in overrides}
    try:
        for name, value in overrides.items():
            setattr(proxy, name, value)

        monitor = proxy.SessionMonitor(context_window=262144)
        monitor.consecutive_forced_count = 3
        monitor.invalid_tool_call_streak = 1

        activated = monitor.maybe_activate_forced_tool_dampener("invalid_tool_args")
        self.assertTrue(activated)
        self.assertEqual(monitor.forced_auto_cooldown_turns, 2)

        read_tool = {
            "name": "Read",
            "description": "Read file",
            "input_schema": {"type": "object"},
        }
        body = {
            "model": "test",
            "messages": [
                {
                    "role": "assistant",
                    "content": [{"type": "text", "text": "I will continue."}],
                },
                {"role": "user", "content": "keep going"},
            ],
            "tools": [read_tool],
        }

        openai = proxy.build_openai_request(body, monitor)
        self.assertEqual(openai.get("tool_choice"), "auto")
        self.assertEqual(monitor.forced_auto_cooldown_turns, 1)
    finally:
        # Restore every setting in the same order it was captured.
        for name, value in saved.items():
            setattr(proxy, name, value)
|
|
1102
|
+
|
|
1103
|
+
def test_forced_tool_dampener_uses_rejection_pressure(self):
    """The dampener activates when preflight rejections reach the configured threshold.

    Both streak thresholds are set to 5 while the monitor's streaks are left
    untouched, so presumably only the rejection count (2, equal to its
    threshold) can trigger activation. The rejection counter is expected to
    be back at 0 afterwards.
    """
    overrides = {
        "PROXY_FORCED_TOOL_DAMPENER": True,
        "PROXY_FORCED_TOOL_DAMPENER_MIN_FORCED": 3,
        "PROXY_FORCED_TOOL_DAMPENER_BAD_STREAK": 5,
        "PROXY_FORCED_TOOL_DAMPENER_EMPTY_STREAK": 5,
        "PROXY_FORCED_TOOL_DAMPENER_REJECTIONS": 2,
        "PROXY_FORCED_TOOL_DAMPENER_AUTO_TURNS": 1,
    }
    # Snapshot the current module settings before touching any of them.
    saved = {name: getattr(proxy, name) for name in overrides}
    try:
        for name, value in overrides.items():
            setattr(proxy, name, value)

        monitor = proxy.SessionMonitor(context_window=262144)
        monitor.consecutive_forced_count = 3
        monitor.arg_preflight_rejections = 2

        activated = monitor.maybe_activate_forced_tool_dampener("invalid_tool_args")
        self.assertTrue(activated)
        self.assertEqual(monitor.forced_auto_cooldown_turns, 1)
        self.assertEqual(monitor.arg_preflight_rejections, 0)
    finally:
        # Restore every setting in the same order it was captured.
        for name, value in saved.items():
            setattr(proxy, name, value)
|
|
1133
|
+
|
|
550
1134
|
def test_no_tools_does_not_inject_agentic_system_message(self):
|
|
551
1135
|
body = {
|
|
552
1136
|
"model": "test",
|
|
@@ -660,6 +1244,54 @@ class TestSessionContaminationBreaker(unittest.TestCase):
|
|
|
660
1244
|
setattr(proxy, "PROXY_SESSION_CONTAMINATION_THRESHOLD", old_threshold)
|
|
661
1245
|
setattr(proxy, "PROXY_SESSION_CONTAMINATION_KEEP_LAST", old_keep)
|
|
662
1246
|
|
|
1247
|
+
def test_contamination_breaker_triggers_on_forced_invalid_combo(self):
    """The breaker resets the session for a forced-count / invalid-streak combination.

    The invalid streak (2) is below the plain threshold (3) while the forced
    count (6) exceeds the forced threshold (5); presumably that combination
    is what triggers the reset here.
    """
    overrides = {
        "PROXY_SESSION_CONTAMINATION_BREAKER": True,
        "PROXY_SESSION_CONTAMINATION_THRESHOLD": 3,
        "PROXY_SESSION_CONTAMINATION_KEEP_LAST": 3,
        "PROXY_SESSION_CONTAMINATION_FORCED_THRESHOLD": 5,
        "PROXY_SESSION_CONTAMINATION_REQUIRED_MISS_THRESHOLD": 4,
    }
    # Snapshot the current module settings before touching any of them.
    saved = {name: getattr(proxy, name) for name in overrides}
    try:
        for name, value in overrides.items():
            setattr(proxy, name, value)

        monitor = proxy.SessionMonitor(context_window=262144)
        monitor.invalid_tool_call_streak = 2
        monitor.consecutive_forced_count = 6

        # Six alternating turns, starting with the user.
        turns = [
            ("user", "start"),
            ("assistant", "a1"),
            ("user", "u2"),
            ("assistant", "a3"),
            ("user", "u4"),
            ("assistant", "a5"),
        ]
        body = {
            "messages": [{"role": role, "content": text} for role, text in turns]
        }

        updated = proxy._maybe_apply_session_contamination_breaker(
            body,
            monitor,
            "session-test",
        )

        self.assertEqual(monitor.contamination_resets, 1)
        self.assertEqual(monitor.invalid_tool_call_streak, 0)
        self.assertEqual(len(updated["messages"]), 5)
        self.assertIn("SESSION RESET", updated["messages"][1]["content"])
    finally:
        # Restore every setting in the same order it was captured.
        for name, value in saved.items():
            setattr(proxy, name, value)
|
|
1294
|
+
|
|
663
1295
|
|
|
664
1296
|
if __name__ == "__main__":
|
|
665
1297
|
unittest.main()
|