outputguard 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- outputguard/__init__.py +53 -0
- outputguard/cli.py +227 -0
- outputguard/exceptions.py +38 -0
- outputguard/guard.py +91 -0
- outputguard/models.py +29 -0
- outputguard/py.typed +0 -0
- outputguard/repairer.py +102 -0
- outputguard/report.py +110 -0
- outputguard/retry.py +84 -0
- outputguard/strategies/__init__.py +69 -0
- outputguard/strategies/extract_json.py +48 -0
- outputguard/strategies/fix_booleans.py +34 -0
- outputguard/strategies/fix_closers.py +42 -0
- outputguard/strategies/fix_commas.py +12 -0
- outputguard/strategies/fix_ellipsis.py +106 -0
- outputguard/strategies/fix_inner_quotes.py +87 -0
- outputguard/strategies/fix_keys.py +44 -0
- outputguard/strategies/fix_newlines.py +58 -0
- outputguard/strategies/fix_quotes.py +61 -0
- outputguard/strategies/fix_truncated.py +116 -0
- outputguard/strategies/fix_unicode.py +131 -0
- outputguard/strategies/fix_values.py +39 -0
- outputguard/strategies/remove_comments.py +55 -0
- outputguard/strategies/strip_fences.py +18 -0
- outputguard/validator.py +56 -0
- outputguard-0.2.0.dist-info/METADATA +395 -0
- outputguard-0.2.0.dist-info/RECORD +30 -0
- outputguard-0.2.0.dist-info/WHEEL +4 -0
- outputguard-0.2.0.dist-info/entry_points.txt +2 -0
- outputguard-0.2.0.dist-info/licenses/LICENSE +21 -0
outputguard/__init__.py
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
from outputguard.exceptions import (
|
|
2
|
+
OutputGuardError,
|
|
3
|
+
ParseError,
|
|
4
|
+
RepairError,
|
|
5
|
+
SchemaValidationError,
|
|
6
|
+
StrategyError,
|
|
7
|
+
)
|
|
8
|
+
from outputguard.guard import OutputGuard
|
|
9
|
+
from outputguard.models import RepairResult, ValidationError, ValidationResult
|
|
10
|
+
from outputguard.report import RepairReport, StrategyApplication
|
|
11
|
+
|
|
12
|
+
_default_guard = OutputGuard()
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def validate(text: str, schema: dict) -> ValidationResult:
    """Validate ``text`` against ``schema`` with the module-level default guard."""
    guard = _default_guard
    return guard.validate(text, schema)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def repair(text: str) -> RepairResult:
    """Repair ``text`` with the module-level default guard (no report requested)."""
    guard = _default_guard
    return guard.repair(text)  # type: ignore[return-value]
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def validate_and_repair(text: str, schema: dict) -> ValidationResult:
    """Validate ``text`` and fall back to repair, using the default guard."""
    guard = _default_guard
    return guard.validate_and_repair(text, schema)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def parse(text: str, schema: dict) -> dict | list:
    """Return the parsed data for ``text`` after validation and repair.

    Delegates to the module-level default guard's ``parse``; raises on failure.
    """
    guard = _default_guard
    return guard.parse(text, schema)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def retry_prompt(text: str, schema: dict, errors: list[ValidationError]) -> str:
    """Build a retry prompt for ``text`` via the module-level default guard."""
    guard = _default_guard
    return guard.retry_prompt(text, schema, errors)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
__all__ = [
|
|
37
|
+
"OutputGuard",
|
|
38
|
+
"OutputGuardError",
|
|
39
|
+
"ParseError",
|
|
40
|
+
"RepairError",
|
|
41
|
+
"RepairReport",
|
|
42
|
+
"RepairResult",
|
|
43
|
+
"SchemaValidationError",
|
|
44
|
+
"StrategyApplication",
|
|
45
|
+
"StrategyError",
|
|
46
|
+
"ValidationError",
|
|
47
|
+
"ValidationResult",
|
|
48
|
+
"parse",
|
|
49
|
+
"repair",
|
|
50
|
+
"retry_prompt",
|
|
51
|
+
"validate",
|
|
52
|
+
"validate_and_repair",
|
|
53
|
+
]
|
outputguard/cli.py
ADDED
|
@@ -0,0 +1,227 @@
|
|
|
1
|
+
"""OutputGuard CLI — validate, repair, and inspect LLM JSON output."""
|
|
2
|
+
|
|
3
|
+
import dataclasses
|
|
4
|
+
import json
|
|
5
|
+
import sys
|
|
6
|
+
|
|
7
|
+
import click
|
|
8
|
+
from rich.console import Console
|
|
9
|
+
from rich.table import Table
|
|
10
|
+
|
|
11
|
+
import outputguard
|
|
12
|
+
from outputguard.guard import OutputGuard
|
|
13
|
+
from outputguard.models import RepairResult, ValidationResult
|
|
14
|
+
from outputguard.repairer import repair as _repair
|
|
15
|
+
from outputguard.strategies import ALL_STRATEGIES, STRATEGY_DESCRIPTIONS
|
|
16
|
+
|
|
17
|
+
console = Console(stderr=True)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _read_input(input_path: str) -> str:
    """Return the entire contents of ``input_path``, opened via ``click.open_file``."""
    with click.open_file(input_path, "r") as stream:
        contents = stream.read()
    return contents
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def _load_schema(schema_path: str) -> dict:
|
|
26
|
+
with open(schema_path) as f:
|
|
27
|
+
return json.load(f)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def _print_validation_text(result: ValidationResult) -> None:
    """Print a human-readable validation verdict to the module console.

    Valid results print a green tick, or a yellow warning listing the
    applied strategies when the text had to be repaired. Invalid results
    print a red cross followed by one line per validation error.
    """
    # Local import: rich is already a dependency of this module.
    from rich.markup import escape

    if not result.valid:
        console.print("[red]✗ Invalid[/red]")
        for err in result.errors:
            # Paths and messages are arbitrary text; escape them so bracketed
            # fragments (e.g. a regex like "[a-z]") are not read as Rich markup.
            console.print(f" [red]{escape(err.path)}[/red]: {escape(err.message)}")
        return

    if result.repaired:
        console.print(
            "[yellow]⚠ Repaired and valid[/yellow] "
            f"strategies: {', '.join(result.strategies_applied)}"
        )
    else:
        console.print("[green]✓ Valid[/green]")
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def _result_to_dict(obj: ValidationResult | RepairResult) -> dict:
    """Convert a result dataclass into a plain dict via ``dataclasses.asdict``."""
    as_mapping = dataclasses.asdict(obj)
    return as_mapping
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def _write_output(text: str, output_path: str | None) -> None:
|
|
50
|
+
if output_path:
|
|
51
|
+
with open(output_path, "w") as f:
|
|
52
|
+
f.write(text)
|
|
53
|
+
else:
|
|
54
|
+
click.echo(text)
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
# Root command group; the subcommands below are registered on it with @cli.command().
@click.group()
def cli() -> None:
    """OutputGuard — validate and repair LLM JSON output."""
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
@cli.command()
@click.argument("input_path", metavar="INPUT")
@click.option("-s", "--schema", "schema_path", required=True, help="Path to JSON Schema file.")
@click.option(
    "-r", "--repair", "do_repair", is_flag=True, help="Attempt repair if validation fails."
)
@click.option(
    "-f",
    "--format",
    "fmt",
    type=click.Choice(["text", "json"]),
    default="text",
    help="Output format.",
)
@click.option("-q", "--quiet", is_flag=True, help="Exit code only, no output.")
@click.option("-o", "--output", "output_path", default=None, help="Write result to file.")
@click.option("-d", "--diff", "show_diff", is_flag=True, help="Show diff of repairs.")
@click.option("-v", "--verbose", is_flag=True, help="Show each strategy's effect.")
def validate(
    input_path: str,
    schema_path: str,
    do_repair: bool,
    fmt: str,
    quiet: bool,
    output_path: str | None,
    show_diff: bool,
    verbose: bool,
) -> None:
    """Validate INPUT (file or - for stdin) against a JSON schema."""
    text = _read_input(input_path)
    schema = _load_schema(schema_path)

    # Pick the checker once: plain validation, or validation with repair fallback.
    checker = outputguard.validate_and_repair if do_repair else outputguard.validate
    result = checker(text, schema)

    # The process exit status mirrors validity: 0 when valid, 1 otherwise.
    if quiet:
        sys.exit(0 if result.valid else 1)

    if fmt == "json":
        payload = json.dumps(_result_to_dict(result), indent=2)
        _write_output(payload, output_path)
        sys.exit(0 if result.valid else 1)

    _print_validation_text(result)
    if result.valid and result.repaired:
        if show_diff or verbose:
            _show_repair_details(text, result, verbose)
        # In text mode the repaired document itself is the command's output.
        if result.repaired_text:
            _write_output(result.repaired_text, output_path)

    sys.exit(0 if result.valid else 1)
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def _show_repair_details(original: str, result: ValidationResult, verbose: bool) -> None:
    """Show diff/verbose output for a repair.

    Re-runs the repairer on ``original`` with ``report=True`` to obtain a
    report, then prints either the per-strategy diffs plus a confidence
    line (``verbose``) or the overall diff. Does nothing when ``result``
    was not repaired.
    """
    if not result.repaired:
        return
    _result, report = _repair(original, report=True)
    if verbose:
        step_diffs = report.step_diffs()
        if step_diffs:
            console.print("\n[bold]Strategy details:[/bold]")
            # Diff text is raw JSON; with markup enabled Rich would treat
            # fragments such as "[true]" or "[null]" as style tags and drop them.
            console.print(step_diffs, markup=False)
        console.print(f"[dim]Confidence: {report.confidence:.0%}[/dim]")
    else:
        diff = report.diff
        if diff:
            console.print("\n[bold]Diff:[/bold]")
            console.print(diff, markup=False)
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
@cli.command()
@click.argument("input_path", metavar="INPUT")
@click.option(
    "-f",
    "--format",
    "fmt",
    type=click.Choice(["text", "json"]),
    default="text",
    help="Output format.",
)
@click.option("-o", "--output", "output_path", default=None, help="Write result to file.")
@click.option("--strategies", default=None, help="Comma-separated strategy names.")
@click.option("-d", "--diff", "show_diff", is_flag=True, help="Show diff of repairs.")
@click.option("-v", "--verbose", is_flag=True, help="Show each strategy's effect.")
def repair(
    input_path: str,
    fmt: str,
    output_path: str | None,
    strategies: str | None,
    show_diff: bool,
    verbose: bool,
) -> None:
    """Repair malformed JSON from INPUT (file or - for stdin)."""
    # Local import: rich is already a dependency of this module.
    from rich.markup import escape

    text = _read_input(input_path)
    strategy_list = [s.strip() for s in strategies.split(",")] if strategies else None

    guard = OutputGuard(strategies=strategy_list)
    # A report is only built when its diffs will actually be shown.
    need_report = show_diff or verbose
    if need_report:
        result, report = guard.repair(text, report=True)
    else:
        result = guard.repair(text)
        report = None

    if fmt == "json":
        _write_output(json.dumps(_result_to_dict(result), indent=2), output_path)
    else:
        if result.repaired:
            console.print(
                f"[yellow]⚠ Repaired[/yellow] strategies: {', '.join(result.strategies_applied)}"
            )
            if report and verbose:
                step_diffs = report.step_diffs()
                if step_diffs:
                    console.print("\n[bold]Strategy details:[/bold]")
                    # Diff text is raw JSON; with markup enabled Rich would treat
                    # fragments such as "[true]" as style tags and drop them.
                    console.print(step_diffs, markup=False)
                console.print(f"[dim]Confidence: {report.confidence:.0%}[/dim]")
            elif report and show_diff:
                diff = report.diff
                if diff:
                    console.print("\n[bold]Diff:[/bold]")
                    console.print(diff, markup=False)
            _write_output(result.text, output_path)
        elif result.parse_error:
            # The parser message is arbitrary text; keep it out of markup parsing.
            console.print(f"[red]✗ Could not repair[/red]: {escape(result.parse_error)}")
        else:
            console.print("[green]✓ Already valid JSON[/green]")
            _write_output(result.text, output_path)

    # Exit 0 when the text was repaired or needed no repair; 1 when repair failed.
    sys.exit(0 if result.repaired or result.parse_error is None else 1)
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
@cli.command("retry-prompt")
@click.argument("input_path", metavar="INPUT")
@click.option("-s", "--schema", "schema_path", required=True, help="Path to JSON Schema file.")
def retry_prompt(input_path: str, schema_path: str) -> None:
    """Generate a retry prompt for invalid JSON from INPUT."""
    raw_text = _read_input(input_path)
    schema_doc = _load_schema(schema_path)

    # Validation runs only to collect the error list the prompt is built from.
    validation = outputguard.validate(raw_text, schema_doc)
    click.echo(outputguard.retry_prompt(raw_text, schema_doc, validation.errors))
    sys.exit(0)
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
@cli.command()
def strategies() -> None:
    """List all available repair strategies."""
    listing = Table(title="Repair Strategies")
    listing.add_column("#", style="dim", width=4)
    listing.add_column("Name", style="cyan")
    listing.add_column("Description")

    # Rows are numbered from 1 in the order ALL_STRATEGIES provides them.
    position = 0
    for strategy_name, _unused_fn in ALL_STRATEGIES:
        position += 1
        description = STRATEGY_DESCRIPTIONS.get(strategy_name, "")
        listing.add_row(str(position), strategy_name, description)

    console.print(listing)
    sys.exit(0)
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
@cli.command()
def version() -> None:
    """Show outputguard version."""
    # Imported lazily so the metadata lookup only happens for this command.
    from importlib.metadata import version as pkg_version

    installed = pkg_version("outputguard")
    click.echo(f"outputguard {installed}")
|
|
outputguard/exceptions.py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
class OutputGuardError(Exception):
|
|
2
|
+
"""Base exception for all outputguard errors."""
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class ParseError(OutputGuardError):
|
|
6
|
+
"""JSON could not be parsed even after repair attempts."""
|
|
7
|
+
|
|
8
|
+
def __init__(self, message: str, original_text: str, parse_error: str | None = None):
|
|
9
|
+
self.original_text = original_text
|
|
10
|
+
self.parse_error = parse_error
|
|
11
|
+
super().__init__(message)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class SchemaValidationError(OutputGuardError):
|
|
15
|
+
"""JSON parsed but doesn't match the schema, even after repair."""
|
|
16
|
+
|
|
17
|
+
def __init__(self, message: str, data: dict | list, errors: list, schema: dict):
|
|
18
|
+
self.data = data
|
|
19
|
+
self.validation_errors = errors
|
|
20
|
+
self.schema = schema
|
|
21
|
+
super().__init__(message)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class RepairError(OutputGuardError):
|
|
25
|
+
"""Repair was attempted but failed."""
|
|
26
|
+
|
|
27
|
+
def __init__(self, message: str, strategies_tried: list[str], original_text: str):
|
|
28
|
+
self.strategies_tried = strategies_tried
|
|
29
|
+
self.original_text = original_text
|
|
30
|
+
super().__init__(message)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class StrategyError(OutputGuardError):
|
|
34
|
+
"""A specific repair strategy encountered an error."""
|
|
35
|
+
|
|
36
|
+
def __init__(self, message: str, strategy_name: str):
|
|
37
|
+
self.strategy_name = strategy_name
|
|
38
|
+
super().__init__(message)
|
outputguard/guard.py
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Literal, overload
|
|
4
|
+
|
|
5
|
+
from outputguard import repairer as _repairer
|
|
6
|
+
from outputguard import retry as _retry
|
|
7
|
+
from outputguard import validator as _validator
|
|
8
|
+
from outputguard.exceptions import ParseError, SchemaValidationError
|
|
9
|
+
from outputguard.models import RepairResult, ValidationError, ValidationResult
|
|
10
|
+
from outputguard.report import RepairReport
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class OutputGuard:
    """Facade over the validator, repairer, and retry-prompt modules.

    Attributes:
        strategies: Strategy names forwarded to the repairer; ``None`` is
            forwarded unchanged.
        max_repair_attempts: Upper bound on repair rounds performed by
            ``validate_and_repair``.
    """

    def __init__(
        self,
        strategies: list[str] | None = None,
        max_repair_attempts: int = 3,
    ):
        self.strategies = strategies
        self.max_repair_attempts = max_repair_attempts

    def validate(self, text: str, schema: dict) -> ValidationResult:
        """Validate ``text`` against ``schema`` without attempting repair."""
        return _validator.validate(text, schema)

    @overload
    def repair(self, text: str) -> RepairResult: ...

    @overload
    def repair(self, text: str, *, report: Literal[True]) -> tuple[RepairResult, RepairReport]: ...

    def repair(
        self, text: str, *, report: bool = False
    ) -> RepairResult | tuple[RepairResult, RepairReport]:
        """Repair ``text`` using this guard's strategies.

        Returns a ``RepairResult``, or a ``(RepairResult, RepairReport)``
        tuple when ``report`` is true.
        """
        if report:
            return _repairer.repair(text, self.strategies, report=True)
        return _repairer.repair(text, self.strategies)

    def validate_and_repair(self, text: str, schema: dict) -> ValidationResult:
        """Validate, and if invalid, attempt repair then re-validate.

        Returns the first valid result, annotated with the repair details
        when a repair was needed. If no attempt produces valid output, the
        result of the initial validation is returned with
        ``original_text`` set.
        """
        result = self.validate(text, schema)
        if result.valid:
            return result

        current_text = text
        for _attempt in range(self.max_repair_attempts):
            repair_result = _repairer.repair(current_text, self.strategies)
            if not repair_result.repaired:
                # Nothing changed, so the next attempt would receive exactly the
                # same input; stop instead of repeating the same work.
                # (Assumes repair strategies are deterministic.)
                break

            revalidation = self.validate(repair_result.text, schema)
            if revalidation.valid:
                revalidation.repaired = True
                revalidation.strategies_applied = repair_result.strategies_applied
                revalidation.original_text = text
                revalidation.repaired_text = repair_result.text
                return revalidation
            current_text = repair_result.text

        # NOTE(review): the *initial* validation result is returned here even if a
        # repair produced parseable JSON that merely failed the schema — confirm
        # this is the intended error reporting for parse().
        result.original_text = text
        return result

    def parse(self, text: str, schema: dict) -> dict | list:
        """Validate, repair, and return parsed data. Raises on failure.

        This is the simplest API: give it text and a schema, get back
        parsed data or an exception.

        Raises:
            ParseError: If the text cannot be parsed as JSON even after repair.
            SchemaValidationError: If the parsed JSON doesn't match the schema.
        """
        result = self.validate_and_repair(text, schema)
        if result.valid:
            assert result.data is not None
            return result.data

        # No parsed data on the failed result is treated as a parse failure.
        if result.data is None:
            raise ParseError(
                "Could not parse JSON from LLM output",
                original_text=text,
                parse_error=result.errors[0].message if result.errors else None,
            )
        raise SchemaValidationError(
            f"JSON does not match schema: {len(result.errors)} error(s)",
            data=result.data,
            errors=result.errors,
            schema=schema,
        )

    def retry_prompt(self, text: str, schema: dict, errors: list[ValidationError]) -> str:
        """Build a retry prompt for ``text`` from ``schema`` and ``errors``."""
        return _retry.retry_prompt(text, schema, errors)
|
outputguard/models.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
from dataclasses import dataclass, field
|
|
2
|
+
from typing import Any
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
@dataclass
class ValidationError:
    """A single validation problem found in a JSON document."""

    # Human-readable description of the problem.
    message: str
    # JSON path, e.g. "$.items[0].name"
    path: str
    # Schema path that was violated
    schema_path: str
    # Optional associated value; defaults to None.
    value: Any = None
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@dataclass
class ValidationResult:
    """Outcome of validating (and optionally repairing) a piece of text."""

    # Whether validation succeeded.
    valid: bool
    # Parsed JSON value, when available.
    data: dict | list | None = None
    # Validation problems; empty by default.
    errors: list[ValidationError] = field(default_factory=list)
    # Repair bookkeeping; all default to "no repair happened".
    repaired: bool = False
    strategies_applied: list[str] = field(default_factory=list)
    original_text: str = ""
    repaired_text: str = ""
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
@dataclass
|
|
25
|
+
class RepairResult:
|
|
26
|
+
repaired: bool
|
|
27
|
+
text: str
|
|
28
|
+
strategies_applied: list[str] = field(default_factory=list)
|
|
29
|
+
parse_error: str | None = None
|
outputguard/py.typed
ADDED
|
File without changes
|
outputguard/repairer.py
ADDED
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
"""JSON repair engine — applies strategies in sequence to fix malformed JSON."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
from typing import Literal, overload
|
|
7
|
+
|
|
8
|
+
from outputguard.models import RepairResult
|
|
9
|
+
from outputguard.report import RepairReport, StrategyApplication
|
|
10
|
+
from outputguard.strategies import get_strategies
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def _json_error(text: str) -> str | None:
    """Return None when ``text`` parses as JSON, else the decoder's error message."""
    try:
        json.loads(text)
    except json.JSONDecodeError as exc:
        return str(exc)
    return None


def _apply_strategy(name: str, fn, text: str) -> StrategyApplication:
    """Run one strategy on ``text`` and record what it did."""
    try:
        output = fn(text)
    except Exception:
        # Best-effort: a strategy that raises is treated as having changed nothing.
        output = text
    return StrategyApplication(
        name=name, changed=output != text, input_text=text, output_text=output
    )


def _repaired_outcome(
    original: str, repaired: str, steps: list[StrategyApplication], report: bool
) -> RepairResult | tuple[RepairResult, RepairReport]:
    """Build the success return value, with a report when one was requested."""
    applied = [step.name for step in steps if step.changed]
    result = RepairResult(repaired=True, text=repaired, strategies_applied=applied)
    if report:
        return result, RepairReport(
            original_text=original, final_text=repaired, success=True, steps=steps
        )
    return result


@overload
def repair(text: str, strategies: list[str] | None = ...) -> RepairResult: ...


@overload
def repair(
    text: str, strategies: list[str] | None = ..., *, report: Literal[True]
) -> tuple[RepairResult, RepairReport]: ...


def repair(
    text: str, strategies: list[str] | None = None, *, report: bool = False
) -> RepairResult | tuple[RepairResult, RepairReport]:
    """Apply repair strategies in order, try to parse after each one.

    Text that already parses is returned unchanged with ``repaired=False``.
    Otherwise two passes are made over the selected strategies: first all
    of them are applied in sequence and the result is parsed once; if that
    fails, they are applied again from the original text with a parse
    attempt after every strategy. On failure the result carries the last
    parser error and the original text.

    If report=True, returns a (RepairResult, RepairReport) tuple.
    """
    if _json_error(text) is None:
        result = RepairResult(repaired=False, text=text)
        if report:
            return result, RepairReport(original_text=text, final_text=text, success=True)
        return result

    strategy_list = get_strategies(strategies)
    last_error: str = ""

    # First pass: apply ALL strategies in sequence, then try parsing
    steps: list[StrategyApplication] = []
    current = text
    for name, fn in strategy_list:
        step = _apply_strategy(name, fn, current)
        steps.append(step)
        current = step.output_text

    error = _json_error(current)
    if error is None:
        return _repaired_outcome(text, current, steps, report)
    last_error = error

    # Second pass: apply one at a time with parse attempts between each
    steps = []
    current = text
    for name, fn in strategy_list:
        step = _apply_strategy(name, fn, current)
        steps.append(step)
        current = step.output_text
        error = _json_error(current)
        if error is None:
            return _repaired_outcome(text, current, steps, report)
        last_error = error

    result = RepairResult(repaired=False, text=text, parse_error=last_error)
    if report:
        return result, RepairReport(
            original_text=text, final_text=text, success=False, steps=steps, parse_error=last_error
        )
    return result
|
outputguard/report.py
ADDED
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
from dataclasses import dataclass, field
|
|
2
|
+
from difflib import unified_diff
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
@dataclass
|
|
6
|
+
class StrategyApplication:
|
|
7
|
+
"""Record of a single strategy being applied."""
|
|
8
|
+
|
|
9
|
+
name: str
|
|
10
|
+
changed: bool
|
|
11
|
+
input_text: str
|
|
12
|
+
output_text: str
|
|
13
|
+
|
|
14
|
+
@property
|
|
15
|
+
def diff(self) -> str:
|
|
16
|
+
"""Unified diff of this strategy's changes."""
|
|
17
|
+
if not self.changed:
|
|
18
|
+
return ""
|
|
19
|
+
return "\n".join(
|
|
20
|
+
unified_diff(
|
|
21
|
+
self.input_text.splitlines(keepends=True),
|
|
22
|
+
self.output_text.splitlines(keepends=True),
|
|
23
|
+
fromfile=f"before_{self.name}",
|
|
24
|
+
tofile=f"after_{self.name}",
|
|
25
|
+
lineterm="",
|
|
26
|
+
)
|
|
27
|
+
)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
@dataclass
|
|
31
|
+
class RepairReport:
|
|
32
|
+
"""Detailed report of a repair operation."""
|
|
33
|
+
|
|
34
|
+
original_text: str
|
|
35
|
+
final_text: str
|
|
36
|
+
success: bool
|
|
37
|
+
steps: list[StrategyApplication] = field(default_factory=list)
|
|
38
|
+
parse_error: str | None = None
|
|
39
|
+
|
|
40
|
+
@property
|
|
41
|
+
def strategies_applied(self) -> list[str]:
|
|
42
|
+
"""Names of strategies that actually changed the text."""
|
|
43
|
+
return [s.name for s in self.steps if s.changed]
|
|
44
|
+
|
|
45
|
+
@property
|
|
46
|
+
def strategies_tried(self) -> list[str]:
|
|
47
|
+
"""Names of all strategies that were tried."""
|
|
48
|
+
return [s.name for s in self.steps]
|
|
49
|
+
|
|
50
|
+
@property
|
|
51
|
+
def diff(self) -> str:
|
|
52
|
+
"""Unified diff from original to final text."""
|
|
53
|
+
if self.original_text == self.final_text:
|
|
54
|
+
return ""
|
|
55
|
+
return "\n".join(
|
|
56
|
+
unified_diff(
|
|
57
|
+
self.original_text.splitlines(keepends=True),
|
|
58
|
+
self.final_text.splitlines(keepends=True),
|
|
59
|
+
fromfile="original",
|
|
60
|
+
tofile="repaired",
|
|
61
|
+
lineterm="",
|
|
62
|
+
)
|
|
63
|
+
)
|
|
64
|
+
|
|
65
|
+
@property
|
|
66
|
+
def confidence(self) -> float:
|
|
67
|
+
"""Heuristic confidence score (0.0 to 1.0) for the repair.
|
|
68
|
+
|
|
69
|
+
Higher when fewer strategies were needed and the change was minimal.
|
|
70
|
+
"""
|
|
71
|
+
if not self.success:
|
|
72
|
+
return 0.0
|
|
73
|
+
|
|
74
|
+
len(self.steps)
|
|
75
|
+
applied_count = len(self.strategies_applied)
|
|
76
|
+
|
|
77
|
+
if applied_count == 0:
|
|
78
|
+
return 1.0 # No repair needed, already valid
|
|
79
|
+
|
|
80
|
+
# Start at 1.0, reduce by:
|
|
81
|
+
# - Number of strategies needed (more = less confident)
|
|
82
|
+
# - Ratio of text changed (more change = less confident)
|
|
83
|
+
strategy_penalty = min(applied_count * 0.1, 0.5)
|
|
84
|
+
|
|
85
|
+
orig_len = max(len(self.original_text), 1)
|
|
86
|
+
final_len = max(len(self.final_text), 1)
|
|
87
|
+
change_ratio = abs(orig_len - final_len) / max(orig_len, final_len)
|
|
88
|
+
change_penalty = min(change_ratio * 0.5, 0.3)
|
|
89
|
+
|
|
90
|
+
return max(round(1.0 - strategy_penalty - change_penalty, 2), 0.1)
|
|
91
|
+
|
|
92
|
+
@property
|
|
93
|
+
def summary(self) -> str:
|
|
94
|
+
"""One-line summary of the repair."""
|
|
95
|
+
if not self.success:
|
|
96
|
+
return f"Repair failed after trying {len(self.steps)} strategies"
|
|
97
|
+
applied = self.strategies_applied
|
|
98
|
+
if not applied:
|
|
99
|
+
return "No repair needed — JSON was already valid"
|
|
100
|
+
return f"Repaired using {len(applied)} strategy(ies): {', '.join(applied)}"
|
|
101
|
+
|
|
102
|
+
def step_diffs(self) -> str:
|
|
103
|
+
"""Show diff for each strategy that made changes, useful for --verbose."""
|
|
104
|
+
parts = []
|
|
105
|
+
for step in self.steps:
|
|
106
|
+
if step.changed:
|
|
107
|
+
parts.append(f"=== {step.name} ===")
|
|
108
|
+
parts.append(step.diff)
|
|
109
|
+
parts.append("")
|
|
110
|
+
return "\n".join(parts)
|