shrinkray 0.0.0__py3-none-any.whl → 25.12.26.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- shrinkray/__main__.py +130 -960
- shrinkray/cli.py +70 -0
- shrinkray/display.py +75 -0
- shrinkray/formatting.py +108 -0
- shrinkray/passes/bytes.py +217 -10
- shrinkray/passes/clangdelta.py +47 -17
- shrinkray/passes/definitions.py +84 -4
- shrinkray/passes/genericlanguages.py +61 -7
- shrinkray/passes/json.py +6 -0
- shrinkray/passes/patching.py +65 -57
- shrinkray/passes/python.py +66 -23
- shrinkray/passes/sat.py +505 -91
- shrinkray/passes/sequences.py +26 -6
- shrinkray/problem.py +206 -27
- shrinkray/process.py +49 -0
- shrinkray/reducer.py +187 -25
- shrinkray/state.py +599 -0
- shrinkray/subprocess/__init__.py +24 -0
- shrinkray/subprocess/client.py +253 -0
- shrinkray/subprocess/protocol.py +190 -0
- shrinkray/subprocess/worker.py +491 -0
- shrinkray/tui.py +915 -0
- shrinkray/ui.py +72 -0
- shrinkray/work.py +34 -6
- {shrinkray-0.0.0.dist-info → shrinkray-25.12.26.0.dist-info}/METADATA +44 -27
- shrinkray-25.12.26.0.dist-info/RECORD +33 -0
- {shrinkray-0.0.0.dist-info → shrinkray-25.12.26.0.dist-info}/WHEEL +2 -1
- shrinkray-25.12.26.0.dist-info/entry_points.txt +3 -0
- shrinkray-25.12.26.0.dist-info/top_level.txt +1 -0
- shrinkray/learning.py +0 -221
- shrinkray-0.0.0.dist-info/RECORD +0 -22
- shrinkray-0.0.0.dist-info/entry_points.txt +0 -3
- {shrinkray-0.0.0.dist-info → shrinkray-25.12.26.0.dist-info/licenses}/LICENSE +0 -0
shrinkray/cli.py
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
"""CLI utilities and types for shrink ray."""
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
import shlex
|
|
5
|
+
import sys
|
|
6
|
+
from enum import Enum, IntEnum, auto
|
|
7
|
+
from shutil import which
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
import click
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def validate_command(ctx: Any, param: Any, value: str) -> list[str]:
    """Validate a command string and resolve its executable.

    Splits ``value`` with shell-like quoting rules, resolves the executable
    either as an existing filesystem path or via PATH lookup, and returns
    the command as an argv list whose first element is an absolute path.

    Args:
        ctx: The click context (unused, required by the callback protocol).
        param: The click parameter (unused, required by the callback protocol).
        value: The raw command string from the command line.

    Raises:
        click.BadParameter: If ``value`` is empty or the executable cannot
            be found on PATH.
    """
    parts = shlex.split(value)
    # Guard against an empty command string, which would otherwise raise
    # an opaque IndexError from parts[0].
    if not parts:
        raise click.BadParameter("command is empty")
    command = parts[0]

    if os.path.exists(command):
        command = os.path.abspath(command)
    else:
        resolved = which(command)
        if resolved is None:
            raise click.BadParameter(f"{command}: command not found")
        command = os.path.abspath(resolved)
    return [command] + parts[1:]
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class EnumChoice[EnumType: Enum](click.Choice):
|
|
29
|
+
"""A click Choice that works with Enums."""
|
|
30
|
+
|
|
31
|
+
def __init__(self, enum: type[EnumType]) -> None:
|
|
32
|
+
self.enum = enum
|
|
33
|
+
choices = [str(e.name) for e in enum]
|
|
34
|
+
self.__values = {e.name: e for e in enum}
|
|
35
|
+
super().__init__(choices)
|
|
36
|
+
|
|
37
|
+
def convert(self, value: str, param: Any, ctx: Any) -> EnumType:
|
|
38
|
+
return self.__values[value]
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class InputType(IntEnum):
    """How input is passed to the test function."""

    all = 0
    stdin = 1
    arg = 2
    basename = 3

    def enabled(self, value: "InputType") -> bool:
        """Return True if this mode permits *value* (``all`` matches everything)."""
        return self == InputType.all or self == value
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
class UIType(Enum):
    """Type of UI to use."""

    # Plain line-oriented output; selected when the session is not fully
    # interactive (see validate_ui in this module).
    basic = auto()
    # Full-screen terminal UI; selected when both stdin and stdout are ttys.
    textual = auto()
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def validate_ui(ctx, param, value) -> UIType:
    """Validate and determine UI type.

    An explicit choice is passed through unchanged. When no choice was
    given, the textual UI is used only for fully interactive sessions
    (both stdin and stdout are ttys); otherwise the basic UI is used.
    """
    if value is not None:
        return value
    interactive = sys.stdin.isatty() and sys.stdout.isatty()
    return UIType.textual if interactive else UIType.basic
|
shrinkray/display.py
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
"""Display utilities for shrink ray."""
|
|
2
|
+
|
|
3
|
+
import shutil
|
|
4
|
+
from collections.abc import Iterable
|
|
5
|
+
|
|
6
|
+
from binaryornot.check import is_binary_string # type: ignore[import-not-found]
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def get_terminal_size() -> tuple[int, int]:
    """Get terminal size, with sensible fallbacks.

    Returns:
        (columns, lines) tuple. Defaults to (80, 24) if the terminal
        size cannot be determined.
    """
    columns, lines = shutil.get_terminal_size(fallback=(80, 24))
    return (columns, lines)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def to_lines(test_case: bytes) -> list[str]:
    """Convert a test case to displayable lines.

    Each line is decoded as UTF-8 when possible; binary-looking or
    undecodable lines are rendered as hex instead.
    """
    rendered: list[str] = []
    for raw_line in test_case.split(b"\n"):
        if is_binary_string(raw_line):
            rendered.append(raw_line.hex())
            continue
        try:
            rendered.append(raw_line.decode("utf-8"))
        except UnicodeDecodeError:
            rendered.append(raw_line.hex())
    return rendered
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def to_blocks(test_case: bytes, block_size: int | None = None) -> list[str]:
|
|
35
|
+
"""Convert a test case to hex blocks for display.
|
|
36
|
+
|
|
37
|
+
Args:
|
|
38
|
+
test_case: The bytes to convert
|
|
39
|
+
block_size: Number of bytes per block. If None, automatically
|
|
40
|
+
calculated from terminal width (each byte becomes 2 hex chars).
|
|
41
|
+
"""
|
|
42
|
+
if block_size is None:
|
|
43
|
+
columns, _ = get_terminal_size()
|
|
44
|
+
# Each byte becomes 2 hex chars, leave some margin
|
|
45
|
+
block_size = max(1, (columns - 4) // 2)
|
|
46
|
+
return [
|
|
47
|
+
test_case[i : i + block_size].hex()
|
|
48
|
+
for i in range(0, len(test_case), block_size)
|
|
49
|
+
]
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def format_diff(diff: Iterable[str], max_lines: int | None = None) -> str:
|
|
53
|
+
"""Format a diff for display, truncating if too long.
|
|
54
|
+
|
|
55
|
+
Args:
|
|
56
|
+
diff: Iterable of diff lines
|
|
57
|
+
max_lines: Maximum number of lines to include. If None, uses
|
|
58
|
+
terminal height multiplied by a factor to allow scrolling
|
|
59
|
+
through substantial context.
|
|
60
|
+
"""
|
|
61
|
+
if max_lines is None:
|
|
62
|
+
_, lines = get_terminal_size()
|
|
63
|
+
# Allow multiple screenfuls of context for scrolling
|
|
64
|
+
max_lines = max(lines * 20, 100)
|
|
65
|
+
results = []
|
|
66
|
+
start_writing = False
|
|
67
|
+
for line in diff:
|
|
68
|
+
if not start_writing and line.startswith("@@"):
|
|
69
|
+
start_writing = True
|
|
70
|
+
if start_writing:
|
|
71
|
+
results.append(line)
|
|
72
|
+
if len(results) > max_lines:
|
|
73
|
+
results.append("...")
|
|
74
|
+
break
|
|
75
|
+
return "\n".join(results)
|
shrinkray/formatting.py
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
"""Formatting utilities for shrink ray."""
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
import sys
|
|
5
|
+
from shutil import which
|
|
6
|
+
|
|
7
|
+
import chardet
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def find_python_command(name: str) -> str | None:
|
|
11
|
+
"""Find a Python command, checking both PATH and the current Python's bin directory."""
|
|
12
|
+
first_attempt = which(name)
|
|
13
|
+
if first_attempt is not None:
|
|
14
|
+
return first_attempt
|
|
15
|
+
second_attempt = os.path.join(os.path.dirname(sys.executable), name)
|
|
16
|
+
if os.path.exists(second_attempt):
|
|
17
|
+
return second_attempt
|
|
18
|
+
return None
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def try_decode(data: bytes) -> tuple[str | None, str]:
    """Try to decode bytes using detected encoding.

    Returns a (encoding, text) pair for the first chardet guess that
    decodes cleanly, or (None, "") if no guess works.
    """
    for candidate in chardet.detect_all(data):
        encoding = candidate["encoding"]
        if encoding is None:
            continue
        try:
            return encoding, data.decode(encoding)
        except UnicodeDecodeError:
            continue
    return None, ""
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def default_formatter_command_for(filename: str) -> list[str] | str | None:
|
|
34
|
+
"""Get the default formatter command for a file based on its extension."""
|
|
35
|
+
*_, ext = os.path.splitext(filename)
|
|
36
|
+
|
|
37
|
+
if ext in (".c", ".h", ".cpp", ".hpp", ".cc", ".cxx"):
|
|
38
|
+
return which("clang-format")
|
|
39
|
+
|
|
40
|
+
if ext == ".py":
|
|
41
|
+
black = find_python_command("black")
|
|
42
|
+
if black is not None:
|
|
43
|
+
return [black, "-"]
|
|
44
|
+
|
|
45
|
+
return None
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def default_reformat_data(data: bytes) -> bytes:
    """Apply a simple language-agnostic reformatting to data.

    Decodes the input (returning it unchanged if no encoding can be
    detected), then re-emits it with a newline after "{", "}" and ";",
    a 4-space indent per brace level, leading spaces stripped, and
    trailing-whitespace/blank lines collapsed. The result is re-encoded
    with the detected encoding.
    """
    encoding, decoded = try_decode(data)
    if encoding is None:
        # Could not decode: leave (presumably binary) data untouched.
        return data
    result: list[str] = []
    indent = 0

    def newline() -> None:
        # Start a fresh line at the current indentation level.
        result.append("\n" + indent * " ")

    start_of_newline = True
    for i, c in enumerate(decoded):
        if c == "\n":
            start_of_newline = True
            newline()
            continue
        elif c == " ":
            # Drop leading spaces; spaces mid-line fall through to the
            # character handling below and are kept as-is.
            if start_of_newline:
                continue
        else:
            start_of_newline = False
        if c == "{":
            result.append(c)
            indent += 4
            # Don't break the line inside an immediately-empty "{}" pair.
            if i + 1 == len(decoded) or decoded[i + 1] != "}":
                newline()
        elif c == "}":
            # Remove one 4-space indent level before the closing brace.
            if len(result) > 1 and result[-1].endswith(" "):
                result[-1] = result[-1][:-4]
            result.append(c)
            indent -= 4
            newline()
        elif c == ";":
            result.append(c)
            newline()
        else:
            result.append(c)

    output = "".join(result)
    # Iterate to a fixed point: strip trailing whitespace before newlines
    # and collapse runs of blank lines.
    prev = None
    while prev != output:
        prev = output

        output = output.replace(" \n", "\n")
        output = output.replace("\n\n", "\n")

    return output.encode(encoding)
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def determine_formatter_command(formatter: str, filename: str) -> list[str] | None:
|
|
99
|
+
"""Determine the formatter command to use based on settings and filename."""
|
|
100
|
+
if formatter.lower() == "default":
|
|
101
|
+
formatter_command = default_formatter_command_for(filename)
|
|
102
|
+
elif formatter.lower() != "none":
|
|
103
|
+
formatter_command = formatter
|
|
104
|
+
else:
|
|
105
|
+
formatter_command = None
|
|
106
|
+
if isinstance(formatter_command, str):
|
|
107
|
+
formatter_command = [formatter_command]
|
|
108
|
+
return formatter_command
|
shrinkray/passes/bytes.py
CHANGED
|
@@ -1,5 +1,26 @@
|
|
|
1
|
+
"""Byte-level reduction passes.
|
|
2
|
+
|
|
3
|
+
This module provides reduction passes that operate on raw bytes.
|
|
4
|
+
These are the foundation of Shrink Ray's reduction strategy, as
|
|
5
|
+
all file formats ultimately reduce to bytes.
|
|
6
|
+
|
|
7
|
+
Key passes:
|
|
8
|
+
- hollow: Keeps only start/end of bracketed regions
|
|
9
|
+
- lift_braces: Replaces {...} with its content
|
|
10
|
+
- debracket: Removes matching bracket pairs
|
|
11
|
+
- delete_byte_spans: Deletes contiguous byte ranges
|
|
12
|
+
- short_deletions: Deletes small (1-10 byte) sequences
|
|
13
|
+
- remove_indents/remove_whitespace: Whitespace normalization
|
|
14
|
+
- lower_bytes: Reduces byte values toward 0
|
|
15
|
+
- lexeme_based_deletions: Deletes between repeated patterns
|
|
16
|
+
|
|
17
|
+
Formats:
|
|
18
|
+
- Split(delimiter): Parses bytes into list of segments
|
|
19
|
+
- Tokenize(): Parses bytes into tokens (identifiers, numbers, etc.)
|
|
20
|
+
"""
|
|
21
|
+
|
|
1
22
|
from collections import defaultdict, deque
|
|
2
|
-
from
|
|
23
|
+
from collections.abc import Sequence
|
|
3
24
|
|
|
4
25
|
from attrs import define
|
|
5
26
|
|
|
@@ -9,6 +30,8 @@ from shrinkray.passes.patching import Cuts, Patches, apply_patches
|
|
|
9
30
|
|
|
10
31
|
@define(frozen=True)
|
|
11
32
|
class Encoding(Format[bytes, str]):
|
|
33
|
+
"""Format that decodes/encodes bytes using a character encoding."""
|
|
34
|
+
|
|
12
35
|
encoding: str
|
|
13
36
|
|
|
14
37
|
def __repr__(self) -> str:
|
|
@@ -27,6 +50,18 @@ class Encoding(Format[bytes, str]):
|
|
|
27
50
|
|
|
28
51
|
@define(frozen=True)
|
|
29
52
|
class Split(Format[bytes, list[bytes]]):
|
|
53
|
+
"""Format that splits bytes by a delimiter.
|
|
54
|
+
|
|
55
|
+
This enables sequence-based passes to work on lines, statements, etc.
|
|
56
|
+
|
|
57
|
+
Example:
|
|
58
|
+
# Delete duplicate lines
|
|
59
|
+
compose(Split(b"\\n"), delete_duplicates)
|
|
60
|
+
|
|
61
|
+
# Delete blocks of 1-10 semicolon-separated statements
|
|
62
|
+
compose(Split(b";"), block_deletion(1, 10))
|
|
63
|
+
"""
|
|
64
|
+
|
|
30
65
|
splitter: bytes
|
|
31
66
|
|
|
32
67
|
def __repr__(self) -> str:
|
|
@@ -44,6 +79,16 @@ class Split(Format[bytes, list[bytes]]):
|
|
|
44
79
|
|
|
45
80
|
|
|
46
81
|
def find_ngram_endpoints(value: bytes) -> list[tuple[int, list[int]]]:
|
|
82
|
+
"""Find repeated byte patterns and their positions.
|
|
83
|
+
|
|
84
|
+
This is used by lexeme_based_deletions to identify regions between
|
|
85
|
+
repeated patterns that might be deletable. For example, in code like:
|
|
86
|
+
print("hello"); print("world"); print("test")
|
|
87
|
+
The repeated "print" patterns suggest the semicolon-separated regions
|
|
88
|
+
might be independently deletable.
|
|
89
|
+
|
|
90
|
+
Returns a list of (ngram_length, [positions]) tuples.
|
|
91
|
+
"""
|
|
47
92
|
if len(set(value)) <= 1:
|
|
48
93
|
return []
|
|
49
94
|
queue: deque[tuple[int, Sequence[int]]] = deque([(0, range(len(value)))])
|
|
@@ -80,12 +125,24 @@ def find_ngram_endpoints(value: bytes) -> list[tuple[int, list[int]]]:
|
|
|
80
125
|
|
|
81
126
|
|
|
82
127
|
def tokenize(text: bytes) -> list[bytes]:
|
|
128
|
+
"""Split bytes into tokens: identifiers, numbers, and other characters.
|
|
129
|
+
|
|
130
|
+
This is a simple tokenizer that groups:
|
|
131
|
+
- Identifiers: [A-Za-z][A-Za-z0-9_]*
|
|
132
|
+
- Numbers: [0-9]+ (with optional decimal point)
|
|
133
|
+
- Spaces: runs of spaces
|
|
134
|
+
- Everything else: individual characters
|
|
135
|
+
|
|
136
|
+
Example:
|
|
137
|
+
tokenize(b"foo = 123") -> [b"foo", b" ", b"=", b" ", b"123"]
|
|
138
|
+
"""
|
|
83
139
|
result: list[bytes] = []
|
|
84
140
|
i = 0
|
|
85
141
|
while i < len(text):
|
|
86
142
|
c = bytes([text[i]])
|
|
87
143
|
j = i + 1
|
|
88
144
|
if b"A" <= c <= b"z":
|
|
145
|
+
# Identifier: consume alphanumeric and underscore
|
|
89
146
|
while j < len(text) and (
|
|
90
147
|
b"A"[0] <= text[j] <= b"z"[0]
|
|
91
148
|
or text[j] == b"_"[0]
|
|
@@ -93,11 +150,13 @@ def tokenize(text: bytes) -> list[bytes]:
|
|
|
93
150
|
):
|
|
94
151
|
j += 1
|
|
95
152
|
elif b"0" <= c <= b"9":
|
|
153
|
+
# Number: consume digits and decimal point
|
|
96
154
|
while j < len(text) and (
|
|
97
155
|
text[j] == b"."[0] or b"0"[0] <= text[j] <= b"9"[0]
|
|
98
156
|
):
|
|
99
157
|
j += 1
|
|
100
158
|
elif c == b" ":
|
|
159
|
+
# Space run: consume consecutive spaces
|
|
101
160
|
while j < len(text) and (text[j] == b" "[0]):
|
|
102
161
|
j += 1
|
|
103
162
|
result.append(text[i:j])
|
|
@@ -112,11 +171,25 @@ MAX_DELETE_INTERVAL = 8
|
|
|
112
171
|
async def lexeme_based_deletions(
|
|
113
172
|
problem: ReductionProblem[bytes], min_size: int = 8
|
|
114
173
|
) -> None:
|
|
174
|
+
"""Delete regions between repeated byte patterns.
|
|
175
|
+
|
|
176
|
+
This pass finds repeated patterns (like repeated keywords or punctuation)
|
|
177
|
+
and tries to delete the regions between them. For code like:
|
|
178
|
+
|
|
179
|
+
print("a"); print("b"); print("c")
|
|
180
|
+
|
|
181
|
+
The repeated "print(" pattern suggests each print statement might be
|
|
182
|
+
independently deletable. This pass identifies such regions and tries
|
|
183
|
+
to delete them.
|
|
184
|
+
|
|
185
|
+
Only regions >= min_size bytes are considered to avoid tiny deletions.
|
|
186
|
+
"""
|
|
115
187
|
intervals_by_k: dict[int, set[tuple[int, int]]] = defaultdict(set)
|
|
116
188
|
|
|
117
189
|
for k, endpoints in find_ngram_endpoints(problem.current_test_case):
|
|
118
|
-
intervals_by_k[k].update(zip(endpoints, endpoints[1:]))
|
|
190
|
+
intervals_by_k[k].update(zip(endpoints, endpoints[1:], strict=False))
|
|
119
191
|
|
|
192
|
+
# Sort by ngram length (longer patterns first) then by interval size
|
|
120
193
|
intervals_to_delete = [
|
|
121
194
|
t
|
|
122
195
|
for _, intervals in sorted(intervals_by_k.items(), reverse=True)
|
|
@@ -132,10 +205,21 @@ async def delete_intervals(
|
|
|
132
205
|
intervals_to_delete: list[tuple[int, int]],
|
|
133
206
|
shuffle: bool = False,
|
|
134
207
|
) -> None:
|
|
208
|
+
"""Try to delete each of the given byte intervals.
|
|
209
|
+
|
|
210
|
+
Each interval (start, end) represents a contiguous region to try deleting.
|
|
211
|
+
The patch applier will find which intervals can be deleted independently
|
|
212
|
+
and combine compatible deletions.
|
|
213
|
+
"""
|
|
135
214
|
await apply_patches(problem, Cuts(), [[t] for t in intervals_to_delete])
|
|
136
215
|
|
|
137
216
|
|
|
138
217
|
def brace_intervals(target: bytes, brace: bytes) -> list[tuple[int, int]]:
|
|
218
|
+
"""Find all intervals enclosed by matching brace pairs.
|
|
219
|
+
|
|
220
|
+
Given a two-byte brace string like b"{}", returns intervals for content
|
|
221
|
+
between each matched open/close pair. Handles nesting correctly.
|
|
222
|
+
"""
|
|
139
223
|
open, close = brace
|
|
140
224
|
intervals: list[tuple[int, int]] = []
|
|
141
225
|
stack: list[int] = []
|
|
@@ -151,6 +235,16 @@ def brace_intervals(target: bytes, brace: bytes) -> list[tuple[int, int]]:
|
|
|
151
235
|
|
|
152
236
|
|
|
153
237
|
async def debracket(problem: ReductionProblem[bytes]) -> None:
|
|
238
|
+
"""Remove matching bracket pairs, keeping their content.
|
|
239
|
+
|
|
240
|
+
Example transformations:
|
|
241
|
+
"(x + y)" -> "x + y"
|
|
242
|
+
"[1, 2]" -> "1, 2"
|
|
243
|
+
"{foo}" -> "foo"
|
|
244
|
+
|
|
245
|
+
This is useful when brackets become unnecessary after other reductions,
|
|
246
|
+
e.g., if a function call was simplified to just its first argument.
|
|
247
|
+
"""
|
|
154
248
|
cuts = [
|
|
155
249
|
[(u - 1, u), (v, v + 1)]
|
|
156
250
|
for brackets in [b"{}", b"()", b"[]"]
|
|
@@ -164,6 +258,10 @@ async def debracket(problem: ReductionProblem[bytes]) -> None:
|
|
|
164
258
|
|
|
165
259
|
|
|
166
260
|
def quote_intervals(target: bytes) -> list[tuple[int, int]]:
|
|
261
|
+
"""Find all intervals enclosed by matching quote pairs.
|
|
262
|
+
|
|
263
|
+
Returns intervals between consecutive single or double quotes.
|
|
264
|
+
"""
|
|
167
265
|
indices: dict[int, list[int]] = defaultdict(list)
|
|
168
266
|
for i, c in enumerate(target):
|
|
169
267
|
indices[c].append(i)
|
|
@@ -178,6 +276,20 @@ def quote_intervals(target: bytes) -> list[tuple[int, int]]:
|
|
|
178
276
|
|
|
179
277
|
|
|
180
278
|
async def hollow(problem: ReductionProblem[bytes]) -> None:
|
|
279
|
+
"""Delete the contents of bracketed and quoted regions.
|
|
280
|
+
|
|
281
|
+
Example transformations:
|
|
282
|
+
'{"lots": "of json"}' -> '{}'
|
|
283
|
+
"[1, 2, 3, 4, 5]" -> "[]"
|
|
284
|
+
'"long string here"' -> '""'
|
|
285
|
+
|
|
286
|
+
This is one of the most effective early passes: it quickly removes
|
|
287
|
+
large chunks of content from structured data, keeping only the
|
|
288
|
+
"skeleton" of brackets and quotes.
|
|
289
|
+
|
|
290
|
+
Intervals are sorted by size (smallest first) to maximize the chance
|
|
291
|
+
of finding independent deletions that can be combined.
|
|
292
|
+
"""
|
|
181
293
|
target = problem.current_test_case
|
|
182
294
|
intervals: list[tuple[int, int]] = []
|
|
183
295
|
for b in [
|
|
@@ -194,6 +306,15 @@ async def hollow(problem: ReductionProblem[bytes]) -> None:
|
|
|
194
306
|
|
|
195
307
|
|
|
196
308
|
async def short_deletions(problem: ReductionProblem[bytes]) -> None:
|
|
309
|
+
"""Try deleting every small (1-10 byte) substring.
|
|
310
|
+
|
|
311
|
+
This is a brute-force pass that tries all possible small deletions.
|
|
312
|
+
It's expensive but effective for cleaning up small syntax elements
|
|
313
|
+
that other passes miss.
|
|
314
|
+
|
|
315
|
+
Example: After other passes simplify "foo(x, y)" to "foo(x)", this
|
|
316
|
+
pass might find that deleting ", y" or "x, " works.
|
|
317
|
+
"""
|
|
197
318
|
target = problem.current_test_case
|
|
198
319
|
await delete_intervals(
|
|
199
320
|
problem,
|
|
@@ -206,6 +327,19 @@ async def short_deletions(problem: ReductionProblem[bytes]) -> None:
|
|
|
206
327
|
|
|
207
328
|
|
|
208
329
|
async def lift_braces(problem: ReductionProblem[bytes]) -> None:
|
|
330
|
+
"""Replace outer braces with inner braces' content.
|
|
331
|
+
|
|
332
|
+
For nested braces like {A{B}C}, this tries to replace the outer
|
|
333
|
+
braces with just the inner content: {A{B}C} -> {B}
|
|
334
|
+
|
|
335
|
+
Example transformations:
|
|
336
|
+
"if (x) { if (y) { z } }" -> "if (x) { z }"
|
|
337
|
+
"{ outer { inner } more }" -> "{ inner }"
|
|
338
|
+
|
|
339
|
+
This is useful for eliminating wrapper blocks while keeping the
|
|
340
|
+
essential nested structure. It complements debracket (which removes
|
|
341
|
+
brackets entirely) and hollow (which empties brackets).
|
|
342
|
+
"""
|
|
209
343
|
target = problem.current_test_case
|
|
210
344
|
|
|
211
345
|
open_brace, close_brace = b"{}"
|
|
@@ -214,6 +348,7 @@ async def lift_braces(problem: ReductionProblem[bytes]) -> None:
|
|
|
214
348
|
|
|
215
349
|
results: list[tuple[int, int, list[tuple[int, int]]]] = []
|
|
216
350
|
|
|
351
|
+
# Track brace nesting and record parent-child relationships
|
|
217
352
|
for i, c in enumerate(target):
|
|
218
353
|
if c == open_brace:
|
|
219
354
|
start_stack.append(i)
|
|
@@ -227,6 +362,7 @@ async def lift_braces(problem: ReductionProblem[bytes]) -> None:
|
|
|
227
362
|
if end > start:
|
|
228
363
|
results.append((start, end, children))
|
|
229
364
|
|
|
365
|
+
# For each parent-child pair, try deleting parent content around child
|
|
230
366
|
cuts: list[list[tuple[int, int]]] = []
|
|
231
367
|
for start, end, children in results:
|
|
232
368
|
for child_start, child_end in children:
|
|
@@ -252,6 +388,12 @@ class Tokenize(Format[bytes, list[bytes]]):
|
|
|
252
388
|
|
|
253
389
|
|
|
254
390
|
async def delete_byte_spans(problem: ReductionProblem[bytes]) -> None:
|
|
391
|
+
"""Delete spans between occurrences of the same byte value.
|
|
392
|
+
|
|
393
|
+
For each byte value that appears multiple times, tries to delete
|
|
394
|
+
regions from the start to the first occurrence, between consecutive
|
|
395
|
+
occurrences, and from the last occurrence to the end.
|
|
396
|
+
"""
|
|
255
397
|
indices: dict[int, list[int]] = defaultdict(list)
|
|
256
398
|
target = problem.current_test_case
|
|
257
399
|
for i, c in enumerate(target):
|
|
@@ -262,13 +404,18 @@ async def delete_byte_spans(problem: ReductionProblem[bytes]) -> None:
|
|
|
262
404
|
for c, ix in sorted(indices.items()):
|
|
263
405
|
if len(ix) > 1:
|
|
264
406
|
spans.append((0, ix[0] + 1))
|
|
265
|
-
spans.extend(zip(ix, ix[1:]))
|
|
407
|
+
spans.extend(zip(ix, ix[1:], strict=False))
|
|
266
408
|
spans.append((ix[-1], len(target)))
|
|
267
409
|
|
|
268
410
|
await apply_patches(problem, Cuts(), [[s] for s in spans])
|
|
269
411
|
|
|
270
412
|
|
|
271
413
|
async def remove_indents(problem: ReductionProblem[bytes]) -> None:
|
|
414
|
+
"""Remove leading spaces from lines.
|
|
415
|
+
|
|
416
|
+
Finds runs of spaces following newlines and tries to delete them.
|
|
417
|
+
Useful for normalizing indentation in code.
|
|
418
|
+
"""
|
|
272
419
|
target = problem.current_test_case
|
|
273
420
|
spans: list[list[tuple[int, int]]] = []
|
|
274
421
|
|
|
@@ -288,6 +435,12 @@ async def remove_indents(problem: ReductionProblem[bytes]) -> None:
|
|
|
288
435
|
|
|
289
436
|
|
|
290
437
|
async def remove_whitespace(problem: ReductionProblem[bytes]) -> None:
|
|
438
|
+
"""Collapse runs of whitespace.
|
|
439
|
+
|
|
440
|
+
Finds consecutive whitespace characters and tries to remove all but
|
|
441
|
+
the first, or all but the first two. Complements remove_indents by
|
|
442
|
+
handling whitespace anywhere in the file.
|
|
443
|
+
"""
|
|
291
444
|
target = problem.current_test_case
|
|
292
445
|
spans: list[list[tuple[int, int]]] = []
|
|
293
446
|
|
|
@@ -332,6 +485,11 @@ class NewlineReplacer(Patches[frozenset[int], bytes]):
|
|
|
332
485
|
|
|
333
486
|
|
|
334
487
|
async def replace_space_with_newlines(problem: ReductionProblem[bytes]) -> None:
|
|
488
|
+
"""Replace spaces and tabs with newlines.
|
|
489
|
+
|
|
490
|
+
Tries replacing each space or tab with a newline. This can help
|
|
491
|
+
normalize formatting and may enable other line-based reductions.
|
|
492
|
+
"""
|
|
335
493
|
await apply_patches(
|
|
336
494
|
problem,
|
|
337
495
|
NewlineReplacer(),
|
|
@@ -372,6 +530,12 @@ class ByteReplacement(Patches[ReplacementPatch, bytes]):
|
|
|
372
530
|
|
|
373
531
|
|
|
374
532
|
async def lower_bytes(problem: ReductionProblem[bytes]) -> None:
|
|
533
|
+
"""Globally replace byte values with smaller ones.
|
|
534
|
+
|
|
535
|
+
For each distinct byte value in the input, tries replacing all
|
|
536
|
+
occurrences with smaller values (0, 1, half, value-1, whitespace).
|
|
537
|
+
Also tries replacing pairs of bytes with the same smaller value.
|
|
538
|
+
"""
|
|
375
539
|
sources = sorted(set(problem.current_test_case))
|
|
376
540
|
|
|
377
541
|
patches = [
|
|
@@ -417,6 +581,12 @@ class IndividualByteReplacement(Patches[ReplacementPatch, bytes]):
|
|
|
417
581
|
|
|
418
582
|
|
|
419
583
|
async def lower_individual_bytes(problem: ReductionProblem[bytes]) -> None:
|
|
584
|
+
"""Replace individual bytes at specific positions with smaller values.
|
|
585
|
+
|
|
586
|
+
Unlike lower_bytes (which replaces all occurrences of a byte value),
|
|
587
|
+
this tries reducing individual byte positions. Also handles carry-like
|
|
588
|
+
patterns where decrementing one byte allows the next to become 255.
|
|
589
|
+
"""
|
|
420
590
|
initial = problem.current_test_case
|
|
421
591
|
patches = [
|
|
422
592
|
{i: r}
|
|
@@ -434,18 +604,18 @@ async def lower_individual_bytes(problem: ReductionProblem[bytes]) -> None:
|
|
|
434
604
|
RegionReplacementPatch = list[tuple[int, int, int]]
|
|
435
605
|
|
|
436
606
|
|
|
437
|
-
class RegionReplacement(Patches[
|
|
607
|
+
class RegionReplacement(Patches[RegionReplacementPatch, bytes]):
|
|
438
608
|
@property
|
|
439
|
-
def empty(self) ->
|
|
609
|
+
def empty(self) -> RegionReplacementPatch:
|
|
440
610
|
return []
|
|
441
611
|
|
|
442
|
-
def combine(self, *patches:
|
|
443
|
-
result = []
|
|
612
|
+
def combine(self, *patches: RegionReplacementPatch) -> RegionReplacementPatch:
|
|
613
|
+
result: RegionReplacementPatch = []
|
|
444
614
|
for p in patches:
|
|
445
615
|
result.extend(p)
|
|
446
616
|
return result
|
|
447
617
|
|
|
448
|
-
def apply(self, patch:
|
|
618
|
+
def apply(self, patch: RegionReplacementPatch, target: bytes) -> bytes:
|
|
449
619
|
result = bytearray(target)
|
|
450
620
|
for i, j, d in patch:
|
|
451
621
|
if d < result[i]:
|
|
@@ -453,11 +623,16 @@ class RegionReplacement(Patches[ReplacementPatch, bytes]):
|
|
|
453
623
|
result[k] = d
|
|
454
624
|
return bytes(result)
|
|
455
625
|
|
|
456
|
-
def size(self, patch:
|
|
626
|
+
def size(self, patch: RegionReplacementPatch) -> int:
|
|
457
627
|
return 0
|
|
458
628
|
|
|
459
629
|
|
|
460
630
|
async def short_replacements(problem: ReductionProblem[bytes]) -> None:
|
|
631
|
+
"""Replace short regions with uniform byte values.
|
|
632
|
+
|
|
633
|
+
Tries replacing 1-4 byte regions with uniform values like 0, 1,
|
|
634
|
+
space, newline, or period. Useful for simplifying small sequences.
|
|
635
|
+
"""
|
|
461
636
|
target = problem.current_test_case
|
|
462
637
|
patches = [
|
|
463
638
|
[(i, j, c)]
|
|
@@ -505,7 +680,7 @@ async def sort_whitespace(problem: ReductionProblem[bytes]) -> None:
|
|
|
505
680
|
i += 1
|
|
506
681
|
continue
|
|
507
682
|
|
|
508
|
-
async def can_move_to_whitespace(k):
|
|
683
|
+
async def can_move_to_whitespace(k: int) -> bool:
|
|
509
684
|
if i + k > len(problem.current_test_case):
|
|
510
685
|
return False
|
|
511
686
|
|
|
@@ -534,6 +709,11 @@ STANDARD_SUBSTITUTIONS = [(b"\0\0", b"\1"), (b"\0\0", b"\xff")]
|
|
|
534
709
|
|
|
535
710
|
|
|
536
711
|
async def standard_substitutions(problem: ReductionProblem[bytes]):
|
|
712
|
+
"""Apply standard byte sequence substitutions.
|
|
713
|
+
|
|
714
|
+
Tries some specific byte sequence replacements that are sometimes
|
|
715
|
+
helpful, primarily for handling edge cases in artificial test inputs.
|
|
716
|
+
"""
|
|
537
717
|
i = 0
|
|
538
718
|
while i < len(problem.current_test_case):
|
|
539
719
|
for k, v in STANDARD_SUBSTITUTIONS:
|
|
@@ -545,3 +725,30 @@ async def standard_substitutions(problem: ReductionProblem[bytes]):
|
|
|
545
725
|
break
|
|
546
726
|
else:
|
|
547
727
|
i += 1
|
|
728
|
+
|
|
729
|
+
|
|
730
|
+
async def line_sorter(problem: ReductionProblem[bytes]):
    """Sort lines into a more canonical order.

    Performs an insertion sort over the lines of the current test case,
    swapping adjacent lines only when the swap both yields a
    lexicographically smaller concatenation and keeps the test case
    interesting. This normalizes line order for reproducibility.
    """
    lines = problem.current_test_case.split(b"\n")
    # Swaps never change the number of lines, so a fixed range is safe.
    for i in range(1, len(lines)):
        j = i
        while j > 0:
            prev_line, line = lines[j - 1], lines[j]
            # Only swap when it makes the pair lexicographically smaller.
            if line + prev_line >= prev_line + line:
                break
            candidate = list(lines)
            candidate[j - 1], candidate[j] = candidate[j], candidate[j - 1]
            if not await problem.is_interesting(b"\n".join(candidate)):
                break
            lines = candidate
            j -= 1
|