tactus 0.32.2__py3-none-any.whl → 0.34.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tactus/__init__.py +1 -1
- tactus/adapters/__init__.py +18 -1
- tactus/adapters/broker_log.py +127 -34
- tactus/adapters/channels/__init__.py +153 -0
- tactus/adapters/channels/base.py +174 -0
- tactus/adapters/channels/broker.py +179 -0
- tactus/adapters/channels/cli.py +448 -0
- tactus/adapters/channels/host.py +225 -0
- tactus/adapters/channels/ipc.py +297 -0
- tactus/adapters/channels/sse.py +305 -0
- tactus/adapters/cli_hitl.py +223 -1
- tactus/adapters/control_loop.py +879 -0
- tactus/adapters/file_storage.py +35 -2
- tactus/adapters/ide_log.py +7 -1
- tactus/backends/http_backend.py +0 -1
- tactus/broker/client.py +31 -1
- tactus/broker/server.py +416 -92
- tactus/cli/app.py +270 -7
- tactus/cli/control.py +393 -0
- tactus/core/config_manager.py +33 -6
- tactus/core/dsl_stubs.py +102 -18
- tactus/core/execution_context.py +265 -8
- tactus/core/lua_sandbox.py +8 -9
- tactus/core/registry.py +19 -2
- tactus/core/runtime.py +235 -27
- tactus/docker/Dockerfile.pypi +49 -0
- tactus/docs/__init__.py +33 -0
- tactus/docs/extractor.py +326 -0
- tactus/docs/html_renderer.py +72 -0
- tactus/docs/models.py +121 -0
- tactus/docs/templates/base.html +204 -0
- tactus/docs/templates/index.html +58 -0
- tactus/docs/templates/module.html +96 -0
- tactus/dspy/agent.py +382 -22
- tactus/dspy/broker_lm.py +57 -6
- tactus/dspy/config.py +14 -3
- tactus/dspy/history.py +2 -1
- tactus/dspy/module.py +136 -11
- tactus/dspy/signature.py +0 -1
- tactus/ide/server.py +300 -9
- tactus/primitives/human.py +619 -47
- tactus/primitives/system.py +0 -1
- tactus/protocols/__init__.py +25 -0
- tactus/protocols/control.py +427 -0
- tactus/protocols/notification.py +207 -0
- tactus/sandbox/container_runner.py +79 -11
- tactus/sandbox/docker_manager.py +23 -0
- tactus/sandbox/entrypoint.py +26 -0
- tactus/sandbox/protocol.py +3 -0
- tactus/stdlib/README.md +77 -0
- tactus/stdlib/__init__.py +27 -1
- tactus/stdlib/classify/__init__.py +165 -0
- tactus/stdlib/classify/classify.spec.tac +195 -0
- tactus/stdlib/classify/classify.tac +257 -0
- tactus/stdlib/classify/fuzzy.py +282 -0
- tactus/stdlib/classify/llm.py +319 -0
- tactus/stdlib/classify/primitive.py +287 -0
- tactus/stdlib/core/__init__.py +57 -0
- tactus/stdlib/core/base.py +320 -0
- tactus/stdlib/core/confidence.py +211 -0
- tactus/stdlib/core/models.py +161 -0
- tactus/stdlib/core/retry.py +171 -0
- tactus/stdlib/core/validation.py +274 -0
- tactus/stdlib/extract/__init__.py +125 -0
- tactus/stdlib/extract/llm.py +330 -0
- tactus/stdlib/extract/primitive.py +256 -0
- tactus/stdlib/tac/tactus/classify/base.tac +51 -0
- tactus/stdlib/tac/tactus/classify/fuzzy.tac +87 -0
- tactus/stdlib/tac/tactus/classify/index.md +77 -0
- tactus/stdlib/tac/tactus/classify/init.tac +29 -0
- tactus/stdlib/tac/tactus/classify/llm.tac +150 -0
- tactus/stdlib/tac/tactus/classify.spec.tac +191 -0
- tactus/stdlib/tac/tactus/extract/base.tac +138 -0
- tactus/stdlib/tac/tactus/extract/index.md +96 -0
- tactus/stdlib/tac/tactus/extract/init.tac +27 -0
- tactus/stdlib/tac/tactus/extract/llm.tac +201 -0
- tactus/stdlib/tac/tactus/extract.spec.tac +153 -0
- tactus/stdlib/tac/tactus/generate/base.tac +142 -0
- tactus/stdlib/tac/tactus/generate/index.md +195 -0
- tactus/stdlib/tac/tactus/generate/init.tac +28 -0
- tactus/stdlib/tac/tactus/generate/llm.tac +169 -0
- tactus/stdlib/tac/tactus/generate.spec.tac +210 -0
- tactus/testing/behave_integration.py +171 -7
- tactus/testing/context.py +0 -1
- tactus/testing/evaluation_runner.py +0 -1
- tactus/testing/gherkin_parser.py +0 -1
- tactus/testing/mock_hitl.py +0 -1
- tactus/testing/mock_tools.py +0 -1
- tactus/testing/models.py +0 -1
- tactus/testing/steps/builtin.py +0 -1
- tactus/testing/steps/custom.py +81 -22
- tactus/testing/steps/registry.py +0 -1
- tactus/testing/test_runner.py +7 -1
- tactus/validation/semantic_visitor.py +11 -5
- tactus/validation/validator.py +0 -1
- {tactus-0.32.2.dist-info → tactus-0.34.0.dist-info}/METADATA +14 -2
- {tactus-0.32.2.dist-info → tactus-0.34.0.dist-info}/RECORD +100 -49
- {tactus-0.32.2.dist-info → tactus-0.34.0.dist-info}/WHEEL +0 -0
- {tactus-0.32.2.dist-info → tactus-0.34.0.dist-info}/entry_points.txt +0 -0
- {tactus-0.32.2.dist-info → tactus-0.34.0.dist-info}/licenses/LICENSE +0 -0
tactus/stdlib/core/confidence.py (new file, +211 lines):

```diff
@@ -0,0 +1,211 @@
+"""
+Confidence Extraction
+
+Utilities for extracting confidence scores from LLM responses.
+"""
+
+import re
+from typing import Optional
+
+
+def extract_confidence(
+    response: str,
+    mode: str = "heuristic",
+    classification: Optional[str] = None,
+) -> Optional[float]:
+    """
+    Extract confidence score from an LLM response.
+
+    Args:
+        response: The LLM response text
+        mode: Extraction mode - "heuristic", "explicit", or "none"
+        classification: The classification value (for context)
+
+    Returns:
+        Confidence score between 0.0 and 1.0, or None if extraction disabled
+
+    Modes:
+        - "heuristic": Look for confidence indicators in text (default)
+        - "explicit": Look for explicit confidence values like "Confidence: 85%"
+        - "none": Return None (confidence disabled)
+    """
+    if mode == "none":
+        return None
+
+    if mode == "explicit":
+        return _extract_explicit_confidence(response)
+
+    return _extract_heuristic_confidence(response)
+
+
+def _extract_explicit_confidence(response: str) -> Optional[float]:
+    """
+    Extract explicit confidence values from response.
+
+    Looks for patterns like:
+    - "Confidence: 85%"
+    - "confidence = 0.85"
+    - "(85% confident)"
+    """
+    patterns = [
+        r"confidence[:\s=]+(\d+)%",
+        r"confidence[:\s=]+0?\.(\d+)",
+        r"\((\d+)%\s*confident\)",
+        r"(\d+)%\s*confidence",
+    ]
+
+    response_lower = response.lower()
+
+    for pattern in patterns:
+        match = re.search(pattern, response_lower)
+        if match:
+            value = match.group(1)
+            # Convert to float between 0 and 1
+            if "." not in pattern:
+                return float(value) / 100.0
+            else:
+                return float(f"0.{value}")
+
+    # Fallback to heuristic if no explicit value found
+    return _extract_heuristic_confidence(response)
+
+
+def _extract_heuristic_confidence(response: str) -> Optional[float]:
+    """
+    Extract confidence using text heuristics.
+
+    Looks for language indicators of certainty level.
+    """
+    response_lower = response.lower()
+
+    # Very high confidence indicators (0.95)
+    very_high = [
+        "definitely",
+        "certainly",
+        "absolutely",
+        "100%",
+        "without a doubt",
+        "unquestionably",
+        "undoubtedly",
+        "clearly",
+        "obviously",
+    ]
+    for indicator in very_high:
+        if indicator in response_lower:
+            return 0.95
+
+    # High confidence indicators (0.85)
+    high = [
+        "very confident",
+        "highly likely",
+        "strongly believe",
+        "sure that",
+        "confident that",
+        "very likely",
+    ]
+    for indicator in high:
+        if indicator in response_lower:
+            return 0.85
+
+    # Medium-high confidence indicators (0.75)
+    med_high = [
+        "likely",
+        "probably",
+        "appears to be",
+        "seems to be",
+        "based on",
+        "indicates",
+        "suggests",
+    ]
+    for indicator in med_high:
+        if indicator in response_lower:
+            return 0.75
+
+    # Medium confidence indicators (0.60)
+    medium = [
+        "may be",
+        "might be",
+        "could be",
+        "possibly",
+        "perhaps",
+        "somewhat",
+    ]
+    for indicator in medium:
+        if indicator in response_lower:
+            return 0.60
+
+    # Low confidence indicators (0.45)
+    low = [
+        "not entirely sure",
+        "uncertain",
+        "difficult to determine",
+        "hard to tell",
+        "ambiguous",
+        "unclear",
+    ]
+    for indicator in low:
+        if indicator in response_lower:
+            return 0.45
+
+    # Very low confidence indicators (0.30)
+    very_low = [
+        "very uncertain",
+        "cannot determine",
+        "impossible to tell",
+        "no way to know",
+        "purely guessing",
+    ]
+    for indicator in very_low:
+        if indicator in response_lower:
+            return 0.30
+
+    # Default confidence when no indicators found
+    return 0.70
+
+
+# Confidence level mappings for string labels
+CONFIDENCE_LABELS = {
+    "very_high": 0.95,
+    "high": 0.85,
+    "medium_high": 0.75,
+    "medium": 0.60,
+    "low": 0.45,
+    "very_low": 0.30,
+}
+
+
+def confidence_to_label(confidence: float) -> str:
+    """
+    Convert numeric confidence to a label.
+
+    Args:
+        confidence: Confidence value between 0.0 and 1.0
+
+    Returns:
+        Label string: "very_high", "high", "medium_high", "medium", "low", or "very_low"
+    """
+    if confidence >= 0.90:
+        return "very_high"
+    elif confidence >= 0.80:
+        return "high"
+    elif confidence >= 0.70:
+        return "medium_high"
+    elif confidence >= 0.55:
+        return "medium"
+    elif confidence >= 0.40:
+        return "low"
+    else:
+        return "very_low"
+
+
+def label_to_confidence(label: str) -> float:
+    """
+    Convert a label to numeric confidence.
+
+    Args:
+        label: Confidence label
+
+    Returns:
+        Confidence value between 0.0 and 1.0
+    """
+    return CONFIDENCE_LABELS.get(label.lower(), 0.70)
```
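For orientation, here is a minimal usage sketch of the new confidence helpers. The function names and return values follow the diff above, but the sample response strings and the surrounding script are invented for illustration and are not taken from the package's documentation:

```python
from tactus.stdlib.core.confidence import (
    confidence_to_label,
    extract_confidence,
    label_to_confidence,
)

# Heuristic mode scans the response for certainty language ("definitely" -> 0.95).
score = extract_confidence("This is definitely a refund request.", mode="heuristic")
print(score, confidence_to_label(score))  # 0.95 very_high

# Explicit mode prefers stated values ("Confidence: 85%") and falls back to heuristics.
score = extract_confidence("Category: billing. Confidence: 85%", mode="explicit")
print(score, confidence_to_label(score))  # 0.85 high

# "none" disables extraction entirely; labels round-trip through CONFIDENCE_LABELS.
assert extract_confidence("anything", mode="none") is None
assert label_to_confidence("medium") == 0.60
```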
tactus/stdlib/core/models.py (new file, +161 lines):

```diff
@@ -0,0 +1,161 @@
+"""
+Pydantic models for stdlib result types.
+
+These models provide:
+- Type safety and validation
+- Consistent result structures across all classifiers/extractors
+- Easy serialization for Lua interop
+"""
+
+from typing import Any, Dict, List, Optional
+from pydantic import BaseModel, Field
+
+
+class ClassifierResult(BaseModel):
+    """
+    Result from any classifier (LLM, fuzzy match, etc.).
+
+    All classifiers return this same structure, enabling polymorphism.
+    """
+
+    value: str = Field(..., description="The classification result")
+    confidence: Optional[float] = Field(
+        None,
+        ge=0.0,
+        le=1.0,
+        description="Confidence score between 0.0 and 1.0",
+    )
+    explanation: Optional[str] = Field(
+        None, description="Reasoning or explanation for the classification"
+    )
+    matched_text: Optional[str] = Field(
+        None, description="The actual text that was matched (for fuzzy matching)"
+    )
+    retry_count: int = Field(0, ge=0, description="Number of retries needed to get valid result")
+    raw_response: Optional[str] = Field(None, description="Raw response from LLM (if applicable)")
+    error: Optional[str] = Field(None, description="Error message if classification failed")
+
+    def to_lua_dict(self) -> Dict[str, Any]:
+        """Convert to dict suitable for Lua interop."""
+        return {
+            "value": self.value,
+            "confidence": self.confidence,
+            "explanation": self.explanation,
+            "matched_text": self.matched_text,
+            "retry_count": self.retry_count,
+            "raw_response": self.raw_response,
+            "error": self.error,
+        }
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Convert to dict (convenience alias for to_lua_dict)."""
+        return self.to_lua_dict()
+
+    @property
+    def is_error(self) -> bool:
+        """Check if this result represents an error."""
+        return self.error is not None or self.value == "ERROR"
+
+
+class ExtractorResult(BaseModel):
+    """
+    Result from any extractor (LLM, schema-based, etc.).
+
+    Contains extracted fields plus validation information.
+    """
+
+    fields: Dict[str, Any] = Field(default_factory=dict, description="Extracted field values")
+    validation_errors: List[str] = Field(
+        default_factory=list, description="Validation errors for extracted fields"
+    )
+    retry_count: int = Field(0, ge=0, description="Number of retries needed to get valid result")
+    raw_response: Optional[str] = Field(None, description="Raw response from LLM (if applicable)")
+    error: Optional[str] = Field(None, description="Error message if extraction failed")
+
+    def to_lua_dict(self) -> Dict[str, Any]:
+        """Convert to dict suitable for Lua interop."""
+        result = dict(self.fields)  # Flatten fields to top level
+        result["_validation_errors"] = self.validation_errors
+        result["_retry_count"] = self.retry_count
+        result["_error"] = self.error
+        return result
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Convert to dict (convenience alias for to_lua_dict)."""
+        return self.to_lua_dict()
+
+    @property
+    def is_valid(self) -> bool:
+        """Check if extraction was valid (no errors)."""
+        return len(self.validation_errors) == 0 and self.error is None
+
+
+class ClassifierConfig(BaseModel):
+    """
+    Configuration for a classifier.
+
+    Used to validate and document classifier options.
+    """
+
+    classes: List[str] = Field(..., min_length=2, description="Valid classification values")
+    target_classes: List[str] = Field(
+        default_factory=list,
+        description="Target classes for precision/recall metrics (subset of classes)",
+    )
+    prompt: Optional[str] = Field(None, description="Classification instruction/prompt")
+    max_retries: int = Field(3, ge=0, description="Maximum retry attempts")
+    temperature: float = Field(0.3, ge=0.0, le=2.0, description="LLM temperature")
+    model: Optional[str] = Field(None, description="Model to use (optional)")
+    confidence_mode: str = Field(
+        "heuristic",
+        description="Confidence extraction mode: 'heuristic', 'logprobs', or 'none'",
+    )
+    parse_direction: str = Field(
+        "start",
+        description="Where to look for classification: 'start', 'end', or 'any'",
+    )
+    method: str = Field("llm", description="Classification method: 'llm' or 'fuzzy'")
+
+    # Fuzzy match specific
+    expected: Optional[str] = Field(None, description="Expected value for fuzzy matching")
+    threshold: float = Field(
+        0.8, ge=0.0, le=1.0, description="Similarity threshold for fuzzy matching"
+    )
+
+
+class ExtractorConfig(BaseModel):
+    """
+    Configuration for an extractor.
+
+    Used to validate and document extractor options.
+    """
+
+    fields: Dict[str, str] = Field(
+        ..., description="Fields to extract with their types (name -> type)"
+    )
+    prompt: Optional[str] = Field(None, description="Extraction instruction/prompt")
+    max_retries: int = Field(3, ge=0, description="Maximum retry attempts")
+    temperature: float = Field(0.3, ge=0.0, le=2.0, description="LLM temperature")
+    model: Optional[str] = Field(None, description="Model to use (optional)")
+    strict: bool = Field(
+        True, description="Whether to require all fields (strict) or allow missing"
+    )
+    method: str = Field("llm", description="Extraction method: 'llm' or 'schema'")
+
+
+class EvaluationResult(BaseModel):
+    """
+    Result from evaluating a classifier/extractor on test data.
+
+    Contains metrics like accuracy, precision, recall, F1.
+    """
+
+    accuracy: float = Field(..., ge=0.0, le=1.0)
+    precision: Optional[float] = Field(None, ge=0.0, le=1.0)
+    recall: Optional[float] = Field(None, ge=0.0, le=1.0)
+    f1: Optional[float] = Field(None, ge=0.0, le=1.0)
+    confusion_matrix: Optional[Dict[str, Dict[str, int]]] = None
+    total_samples: int = Field(..., ge=0)
+    total_retries: int = Field(0, ge=0)
+    mean_confidence: Optional[float] = Field(None, ge=0.0, le=1.0)
+    errors: List[str] = Field(default_factory=list)
```
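A short, illustrative sketch of how the result models above might be constructed and flattened for the Lua side; the field values are invented and not taken from the package's tests:

```python
from tactus.stdlib.core.models import ClassifierResult, ExtractorResult

# A classifier result: confidence is validated to stay within [0.0, 1.0].
cls = ClassifierResult(value="refund", confidence=0.85, explanation="Mentions money back")
print(cls.is_error)       # False
print(cls.to_lua_dict())  # plain dict with value/confidence/explanation/... keys

# An extractor result: fields are flattened to the top level for Lua interop,
# with metadata stored under underscore-prefixed keys.
ext = ExtractorResult(fields={"amount": 42.0, "currency": "USD"})
print(ext.is_valid)   # True
print(ext.to_dict())  # {'amount': 42.0, 'currency': 'USD', '_validation_errors': [], ...}
```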
tactus/stdlib/core/retry.py (new file, +171 lines):

```diff
@@ -0,0 +1,171 @@
+"""
+Retry with Conversational Feedback
+
+Provides intelligent retry logic that preserves conversation history
+and gives the LLM feedback about previous attempts.
+"""
+
+import logging
+from typing import Any, Callable, Dict, List, Optional, TypeVar
+
+logger = logging.getLogger(__name__)
+
+T = TypeVar("T")
+
+
+class RetryWithFeedback:
+    """
+    Retry logic with conversational feedback.
+
+    Unlike simple retry, this approach:
+    1. Preserves conversation history across attempts
+    2. Gives the LLM feedback about why the previous attempt failed
+    3. Enables "self-healing" where the LLM learns from mistakes
+
+    This is the core pattern used by Plexus LangGraphScore nodes
+    for reliable classification.
+    """
+
+    def __init__(
+        self,
+        max_retries: int = 3,
+        on_retry: Optional[Callable[[int, str, str], None]] = None,
+    ):
+        """
+        Initialize retry handler.
+
+        Args:
+            max_retries: Maximum number of retry attempts
+            on_retry: Optional callback called on each retry (attempt, error, feedback)
+        """
+        self.max_retries = max_retries
+        self.on_retry = on_retry
+
+    def execute(
+        self,
+        call_fn: Callable[[str], str],
+        initial_message: str,
+        validate_fn: Callable[[str], bool],
+        build_feedback_fn: Callable[[str], str],
+    ) -> Dict[str, Any]:
+        """
+        Execute with retry logic.
+
+        Args:
+            call_fn: Function to call the LLM (takes message, returns response)
+            initial_message: Initial message to send
+            validate_fn: Function to validate response (returns True if valid)
+            build_feedback_fn: Function to build feedback message from failed response
+
+        Returns:
+            Dict with:
+            - response: The successful response (or last failed response)
+            - success: Whether validation passed
+            - retry_count: Number of retries performed
+            - history: List of all (message, response) pairs
+        """
+        history: List[Dict[str, str]] = []
+        retry_count = 0
+        current_message = initial_message
+        last_response = None
+
+        for attempt in range(self.max_retries + 1):
+            # Call the LLM
+            response = call_fn(current_message)
+            last_response = response
+
+            # Record in history
+            history.append({"message": current_message, "response": response})
+
+            # Validate the response
+            if validate_fn(response):
+                logger.debug(f"Valid response on attempt {attempt + 1}")
+                return {
+                    "response": response,
+                    "success": True,
+                    "retry_count": retry_count,
+                    "history": history,
+                }
+
+            # Response invalid - prepare for retry
+            if attempt < self.max_retries:
+                retry_count += 1
+                feedback = build_feedback_fn(response)
+                current_message = feedback
+
+                logger.debug(f"Retry {retry_count}: {feedback[:100]}...")
+
+                if self.on_retry:
+                    self.on_retry(retry_count, response, feedback)
+
+        # All retries exhausted
+        logger.warning(f"Validation failed after {self.max_retries} retries")
+        return {
+            "response": last_response,
+            "success": False,
+            "retry_count": retry_count,
+            "history": history,
+        }
+
+
+def create_classification_validator(valid_classes: List[str]) -> Callable[[str], bool]:
+    """
+    Create a validator function for classification responses.
+
+    Args:
+        valid_classes: List of valid classification values
+
+    Returns:
+        Validator function that returns True if response contains valid class
+    """
+    valid_lower = {c.lower() for c in valid_classes}
+
+    def validator(response: str) -> bool:
+        if not response:
+            return False
+
+        # Check first line for classification
+        first_line = response.strip().split("\n")[0].strip().lower()
+
+        # Remove common formatting
+        import re
+
+        cleaned = re.sub(r"[\*\"\'\`\:\.]", "", first_line).strip()
+
+        # Check for exact match
+        if cleaned in valid_lower:
+            return True
+
+        # Check for prefix match
+        for cls in valid_lower:
+            if cleaned.startswith(cls):
+                return True
+
+        return False
+
+    return validator
+
+
+def create_classification_feedback(valid_classes: List[str]) -> Callable[[str], str]:
+    """
+    Create a feedback builder for classification retries.
+
+    Args:
+        valid_classes: List of valid classification values
+
+    Returns:
+        Function that builds feedback message from failed response
+    """
+    classes_str = ", ".join(f'"{c}"' for c in valid_classes)
+
+    def build_feedback(response: str) -> str:
+        return f"""Your previous response was not a valid classification.
+
+Your response: "{response[:200]}..."
+
+VALID CLASSIFICATIONS ARE: {classes_str}
+
+Please respond with EXACTLY one of these classifications on the first line, followed by your explanation.
+Do not include any other text on the first line."""
+
+    return build_feedback
```
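To show how the pieces above fit together, here is a self-contained sketch that wires `RetryWithFeedback` to the validator and feedback factories. The `call_fn` stub stands in for a real LLM call, and the ticket text and class names are invented for illustration:

```python
from tactus.stdlib.core.retry import (
    RetryWithFeedback,
    create_classification_feedback,
    create_classification_validator,
)

classes = ["refund", "billing", "other"]

# Stub LLM: the first answer fails validation, the second starts with a valid class.
answers = iter(["I'm not sure what you mean.", "refund\nThe customer asks for money back."])

retry = RetryWithFeedback(max_retries=3)
result = retry.execute(
    call_fn=lambda message: next(answers),
    initial_message="Classify this ticket: 'Please give me my money back.'",
    validate_fn=create_classification_validator(classes),
    build_feedback_fn=create_classification_feedback(classes),
)

print(result["success"], result["retry_count"])  # True 1
print(result["response"].splitlines()[0])        # refund
```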