wcgw-5.5.4-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wcgw/__init__.py +4 -0
- wcgw/client/__init__.py +0 -0
- wcgw/client/bash_state/bash_state.py +1426 -0
- wcgw/client/bash_state/parser/__init__.py +7 -0
- wcgw/client/bash_state/parser/bash_statement_parser.py +181 -0
- wcgw/client/common.py +51 -0
- wcgw/client/diff-instructions.txt +73 -0
- wcgw/client/encoder/__init__.py +47 -0
- wcgw/client/file_ops/diff_edit.py +619 -0
- wcgw/client/file_ops/extensions.py +137 -0
- wcgw/client/file_ops/search_replace.py +212 -0
- wcgw/client/mcp_server/Readme.md +3 -0
- wcgw/client/mcp_server/__init__.py +32 -0
- wcgw/client/mcp_server/server.py +184 -0
- wcgw/client/memory.py +103 -0
- wcgw/client/modes.py +240 -0
- wcgw/client/repo_ops/display_tree.py +116 -0
- wcgw/client/repo_ops/file_stats.py +152 -0
- wcgw/client/repo_ops/path_prob.py +58 -0
- wcgw/client/repo_ops/paths_model.vocab +20000 -0
- wcgw/client/repo_ops/paths_tokens.model +80042 -0
- wcgw/client/repo_ops/repo_context.py +289 -0
- wcgw/client/schema_generator.py +63 -0
- wcgw/client/tool_prompts.py +98 -0
- wcgw/client/tools.py +1432 -0
- wcgw/py.typed +0 -0
- wcgw/types_.py +318 -0
- wcgw-5.5.4.dist-info/METADATA +339 -0
- wcgw-5.5.4.dist-info/RECORD +38 -0
- wcgw-5.5.4.dist-info/WHEEL +4 -0
- wcgw-5.5.4.dist-info/entry_points.txt +4 -0
- wcgw-5.5.4.dist-info/licenses/LICENSE +213 -0
- wcgw_cli/__init__.py +1 -0
- wcgw_cli/__main__.py +3 -0
- wcgw_cli/anthropic_client.py +486 -0
- wcgw_cli/cli.py +40 -0
- wcgw_cli/openai_client.py +404 -0
- wcgw_cli/openai_utils.py +67 -0
wcgw/client/file_ops/diff_edit.py
@@ -0,0 +1,619 @@
import re
from dataclasses import dataclass, field
from difflib import SequenceMatcher
from typing import Callable, DefaultDict, Literal, Optional

TOLERANCE_TYPES = Literal["SILENT", "WARNING", "ERROR"]


class SearchReplaceMatchError(Exception):
    def __init__(self, message: str):
        message = f"""
{message}
---
Last edit failed, no changes were applied. None of the search/replace blocks in the last tool call matched.
Recommendations:
- Retry immediately with the same "percentage_to_change", using search/replace blocks that fix the above error.
- If you are still unsure, you may re-read the file and then proceed accordingly.

If your search failed due to updates in the file content, remember that these
are the changes that the user made and you should preserve them
by updating your replace blocks too.
"""
        super().__init__(message)


@dataclass
class Tolerance:
    line_process: Callable[[str], str]
    severity_cat: TOLERANCE_TYPES
    score_multiplier: float
    error_name: str


@dataclass
class TolerancesHit(Tolerance):
    count: int


@dataclass
class FileEditOutput:
    original_content: list[str]
    orig_search_blocks: list[list[str]]
    edited_with_tolerances: list[
        tuple[slice, list[TolerancesHit], list[str]]
    ]  # Need not be equal to orig_search_blocks when early exit

    def replace_or_throw(
        self,
        max_errors: int,
    ) -> tuple[list[str], set[str]]:
        new_lines = list[str]()
        last_idx = 0
        errors = []
        warnings = set[str]()
        info = set[str]()
        score = 0.0
        for (span, tolerances, replace_with), search_ in zip(
            self.edited_with_tolerances, self.orig_search_blocks
        ):
            for tol in tolerances:
                score += tol.count * tol.score_multiplier
                if tol.count > 0:
                    if tol.severity_cat == "WARNING":
                        warnings.add(tol.error_name)
                    elif tol.severity_cat == "ERROR":
                        search__ = "\n".join(search_)
                        errors.append(f"""
Got error while processing the following search block:
---
```
{search__}
```
---
Error:
{tol.error_name}
---
""")
                    else:
                        info.add(tol.error_name)
            if len(errors) >= max_errors:
                raise SearchReplaceMatchError("\n".join(errors))
            if last_idx < span.start:
                new_lines.extend(self.original_content[last_idx : span.start])

            new_lines.extend(replace_with)
            last_idx = span.stop

        if last_idx < len(self.original_content):
            new_lines.extend(self.original_content[last_idx:])

        if errors:
            raise SearchReplaceMatchError("\n".join(errors))

        if score > 1000:
            display = (list(warnings) + list(info))[:max_errors]
            raise SearchReplaceMatchError(
                "Too many warnings generated, not applying the edits\n"
                + "\n".join(display)
            )

        return new_lines, set(warnings)

    @staticmethod
    def get_best_match(
        outputs: list["FileEditOutput"],
    ) -> list["FileEditOutput"]:
        best_hits: list[FileEditOutput] = []
        best_score = float("-inf")
        assert outputs
        for output in outputs:
            hit_score = 0.0
            for _, tols, _ in output.edited_with_tolerances:
                for tol in tols:
                    hit_score += tol.count * tol.score_multiplier
            if not best_hits:
                best_hits.append(output)
                best_score = hit_score
            else:
                if hit_score < best_score:
                    best_hits = [output]
                    best_score = hit_score
                elif abs(hit_score - best_score) < 1e-3:
                    best_hits.append(output)
        return best_hits
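

# --- Illustrative usage (editor's sketch, not part of the released file) ---
# get_best_match keeps the candidate outputs with the LOWEST accumulated
# penalty (count * score_multiplier over all tolerance hits); near-ties within
# 1e-3 are all retained. The demo helper below is hypothetical and only
# demonstrates the selection rule.
def _demo_get_best_match() -> None:
    def make_hit(count: int, multiplier: float) -> TolerancesHit:
        return TolerancesHit(
            line_process=str.rstrip,
            severity_cat="WARNING",
            score_multiplier=multiplier,
            error_name="demo warning",
            count=count,
        )

    exact = FileEditOutput(["a"], [["a"]], [(slice(0, 1), [], ["b"])])
    fuzzy = FileEditOutput(["a"], [["a"]], [(slice(0, 1), [make_hit(2, 10.0)], ["b"])])
    # The zero-penalty (exact) candidate wins over the 20.0-penalty one.
    assert FileEditOutput.get_best_match([exact, fuzzy]) == [exact]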


def line_process_max_space_tolerance(line: str) -> str:
    line = line.strip()
    return re.sub(r"\s", "", line)


REMOVE_INDENTATION = (
    "Warning: matching without considering indentation (leading spaces)."
)
REMOVE_LINE_NUMS = "Warning: you gave search/replace blocks with leading line numbers; do not include them next time."

COMMON_MISTAKE_TRANSLATION = str.maketrans(
    {
        "‘": "'",
        "’": "'",
        "‚": ",",
        "‛": "'",
        "′": "'",
        "“": '"',
        "”": '"',
        "‟": '"',
        "″": '"',
        "‹": "<",
        "›": ">",
        "‐": "-",
        "‑": "-",
        "‒": "-",
        "–": "-",
        "—": "-",
        "―": "-",
        "−": "-",
        "…": "...",
    }
)


def remove_leading_linenums(string: str) -> str:
    return re.sub(r"^\d+ ", "", string).rstrip()


def normalize_common_mistakes(string: str) -> str:
    """Normalize unicode chars which are commonly confused with their ASCII variants."""
    return string.translate(COMMON_MISTAKE_TRANSLATION).rstrip()
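

# --- Illustrative usage (editor's sketch, not part of the released file) ---
# Each normalizer above canonicalizes one class of near-miss before lines are
# compared: stray leading line numbers, smart quotes/dashes, and whitespace.
def _demo_normalizers() -> None:
    assert remove_leading_linenums("12 x = 1") == "x = 1"
    assert normalize_common_mistakes("“hello” — world") == '"hello" - world'
    assert line_process_max_space_tolerance("  x =  1 ") == "x=1"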


DEFAULT_TOLERANCES = [
    Tolerance(
        line_process=str.rstrip,
        severity_cat="SILENT",
        score_multiplier=1,
        error_name="",
    ),
    Tolerance(
        line_process=str.lstrip,
        severity_cat="WARNING",
        score_multiplier=10,
        error_name=REMOVE_INDENTATION,
    ),
    Tolerance(
        line_process=remove_leading_linenums,
        severity_cat="WARNING",
        score_multiplier=5,
        error_name=REMOVE_LINE_NUMS,
    ),
    Tolerance(
        line_process=normalize_common_mistakes,
        severity_cat="WARNING",
        score_multiplier=5,
        error_name="Warning: matching after normalizing commonly confused characters (quotes, dashes, ellipsis).",
    ),
    Tolerance(
        line_process=line_process_max_space_tolerance,
        severity_cat="WARNING",
        score_multiplier=50,
        error_name="Warning: matching after removing all spaces in lines.",
    ),
]


def fix_line_nums(
    matched_lines: list[str], searched_lines: list[str], replaced_lines: list[str]
) -> list[str]:
    return [remove_leading_linenums(line) for line in replaced_lines]


def fix_indentation(
    matched_lines: list[str], searched_lines: list[str], replaced_lines: list[str]
) -> list[str]:
    if not matched_lines or not searched_lines or not replaced_lines:
        return replaced_lines

    def get_indentation(line: str) -> str:
        match = re.match(r"^(\s*)", line)
        assert match
        return match.group(0)

    matched_indents = [get_indentation(line) for line in matched_lines if line.strip()]
    searched_indents = [
        get_indentation(line) for line in searched_lines if line.strip()
    ]
    if len(matched_indents) != len(searched_indents):
        return replaced_lines
    diffs: list[int] = [
        len(searched) - len(matched)
        for matched, searched in zip(matched_indents, searched_indents)
    ]
    if not all(diff == diffs[0] for diff in diffs):
        return replaced_lines
    if diffs[0] == 0:
        return replaced_lines

    # At this point we have same number of non-empty lines and the same indentation difference
    # We can now adjust the indentation of the replaced lines
    def adjust_indentation(line: str, diff: int) -> str:
        if diff < 0:
            return matched_indents[0][:-diff] + line
        return line[diff:]

    if diffs[0] > 0:
        if not (all(not line[: diffs[0]].strip() for line in replaced_lines)):
            return replaced_lines
    return [adjust_indentation(line, diffs[0]) for line in replaced_lines]
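

# --- Illustrative usage (editor's sketch, not part of the released file) ---
# When every searched line is uniformly under-indented relative to the matched
# file lines, fix_indentation shifts the replacement block by the same amount.
def _demo_fix_indentation() -> None:
    matched = ["    def f():", "        return 1"]
    searched = ["def f():", "    return 1"]
    replaced = ["def f():", "    return 2"]
    # The diff is -4 for both lines, so 4 spaces are prepended to each replaced line.
    assert fix_indentation(matched, searched, replaced) == [
        "    def f():",
        "        return 2",
    ]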


def remove_leading_trailing_empty_lines(lines: list[str]) -> list[str]:
    start = 0
    end = len(lines) - 1
    if end < start:
        return lines
    while not lines[start].strip():
        start += 1
        if start >= len(lines):
            break
    while not lines[end].strip():
        end -= 1
        if end < 0:
            break
    return lines[start : end + 1]


@dataclass
class FileEditInput:
    file_lines: list[str]
    file_line_offset: int
    search_replace_blocks: list[tuple[list[str], list[str]]]
    search_replace_offset: int
    tolerances: list["Tolerance"] = field(default_factory=lambda: DEFAULT_TOLERANCES)

    def edit_file(self) -> list[FileEditOutput]:
        n_file_lines = len(self.file_lines)
        n_blocks = len(self.search_replace_blocks)

        # Boundary conditions
        no_match_output = FileEditOutput(
            original_content=self.file_lines,
            orig_search_blocks=[x[0] for x in self.search_replace_blocks],
            edited_with_tolerances=[
                (
                    slice(0, 0),
                    [
                        TolerancesHit(
                            line_process=lambda x: x,
                            severity_cat="ERROR",
                            score_multiplier=float("inf"),
                            error_name="The blocks couldn't be matched, maybe the sequence of search blocks was incorrect?",
                            count=max(1, len(search_lines)),
                        )
                        for search_lines, _ in self.search_replace_blocks[
                            self.search_replace_offset :
                        ]
                    ],
                    [],
                )
            ],
        )
        if (
            self.file_line_offset >= n_file_lines
            and self.search_replace_offset < n_blocks
        ):
            return [no_match_output]
        elif self.file_line_offset >= n_file_lines:
            return [
                FileEditOutput(
                    self.file_lines,
                    [x[0] for x in self.search_replace_blocks],
                    [(slice(0, 0), [], [])],
                )
            ]
        elif self.search_replace_offset >= n_blocks:
            return [
                FileEditOutput(
                    self.file_lines,
                    [x[0] for x in self.search_replace_blocks],
                    [(slice(0, 0), [], [])],
                )
            ]

        # search for first block
        first_block = self.search_replace_blocks[self.search_replace_offset]
        replace_by = first_block[1]

        # Try exact match
        matches = match_exact(self.file_lines, self.file_line_offset, first_block[0])

        all_outputs = list[list[tuple[slice, list[TolerancesHit], list[str]]]]()

        if not matches:
            # Try tolerances
            matches_with_tolerances = match_with_tolerance(
                self.file_lines, self.file_line_offset, first_block[0], self.tolerances
            )
            if not matches_with_tolerances:
                # Try with no empty lines
                matches_with_tolerances = match_with_tolerance_empty_line(
                    self.file_lines,
                    self.file_line_offset,
                    first_block[0],
                    self.tolerances,
                )
                replace_by = remove_leading_trailing_empty_lines(first_block[1])

            if not matches_with_tolerances:
                # Report edit distance
                sim_match, sim_sim, sim_context = (
                    find_least_edit_distance_substring(
                        self.file_lines, self.file_line_offset, first_block[0]
                    )
                )
                if sim_match:
                    matches_with_tolerances = [
                        (
                            sim_match,
                            [
                                TolerancesHit(
                                    lambda x: x,
                                    "ERROR",
                                    float("inf"),
                                    "Couldn't find match. Here's the latest snippet from the file which might be relevant for you to consider:\n```"
                                    + sim_context
                                    + "\n```",
                                    int(len(first_block[0]) // sim_sim),
                                )
                            ],
                        )
                    ]

        else:
            matches_with_tolerances = [(match, []) for match in matches]

        for match, tolerances in matches_with_tolerances:
            if any(
                tolerance.error_name == REMOVE_LINE_NUMS for tolerance in tolerances
            ):
                replace_by = fix_line_nums(
                    self.file_lines[match.start : match.stop],
                    first_block[0],
                    replace_by,
                )
            if any(
                tolerance.error_name == REMOVE_INDENTATION for tolerance in tolerances
            ):
                replace_by = fix_indentation(
                    self.file_lines[match.start : match.stop],
                    first_block[0],
                    replace_by,
                )

            file_edit_input = FileEditInput(
                self.file_lines,
                match.stop,
                self.search_replace_blocks,
                self.search_replace_offset + 1,
                self.tolerances,
            )

            if any(tolerance.severity_cat == "ERROR" for tolerance in tolerances):
                # Exit early
                all_outputs.append(
                    [
                        (match, tolerances, replace_by),
                    ]
                )
            else:
                remaining_output = file_edit_input.edit_file()
                for rem_output in remaining_output:
                    all_outputs.append(
                        [
                            (match, tolerances, replace_by),
                            *rem_output.edited_with_tolerances,
                        ]
                    )

        if not all_outputs:
            return [no_match_output]

        return [
            FileEditOutput(
                self.file_lines, [x[0] for x in self.search_replace_blocks], output
            )
            for output in all_outputs
        ]
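

# --- Illustrative usage (editor's sketch, not part of the released file) ---
# End-to-end flow: build a FileEditInput over the whole file, collect candidate
# edits, pick the best-scoring ones, then materialize the new file content.
# (Relies on the match_* helpers defined below, so call it only after the whole
# module is loaded.)
def _demo_edit_file() -> None:
    file_lines = ["def add(a, b):", "    return a + b", ""]
    blocks = [(["    return a + b"], ["    return a + b  # checked"])]
    outputs = FileEditInput(file_lines, 0, blocks, 0).edit_file()
    best = FileEditOutput.get_best_match(outputs)
    new_lines, warnings = best[0].replace_or_throw(max_errors=5)
    assert new_lines == ["def add(a, b):", "    return a + b  # checked", ""]
    assert not warnings  # exact match, so no tolerance warnings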


def find_contiguous_match(search_line_positions: list[set[int]]) -> list[slice]:
    n_search_lines = len(search_line_positions)

    def search_in_dictionary(search_offset: int, search_index: int) -> bool:
        if search_offset >= n_search_lines:
            return True

        if search_index in search_line_positions[search_offset]:
            return search_in_dictionary(search_offset + 1, search_index + 1)
        return False

    matched_slices = []
    for index in search_line_positions[0]:
        if search_in_dictionary(1, index + 1):
            matched_slices.append(slice(index, index + n_search_lines, 1))
    return matched_slices


def match_exact(
    content: list[str], content_offset: int, search: list[str]
) -> list[slice]:
    n_search_lines = len(search)
    n_content = len(content) - content_offset
    if n_search_lines > n_content:
        return []
    if n_search_lines == 0:
        return []
    if n_content == 0:
        return []
    content_positions = DefaultDict[str, set[int]](set)
    for i in range(content_offset, n_content):
        content_positions[content[i]].add(i)
    search_line_positions = [content_positions[line] for line in search]

    matched_slices = find_contiguous_match(search_line_positions)

    return matched_slices
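

# --- Illustrative usage (editor's sketch, not part of the released file) ---
# match_exact indexes every content line by its positions, then
# find_contiguous_match looks for runs of consecutive positions covering the
# whole search block; every occurrence is returned as a slice.
def _demo_match_exact() -> None:
    content = ["a", "b", "a", "b"]
    found = match_exact(content, 0, ["a", "b"])
    assert sorted(found, key=lambda s: s.start) == [slice(0, 2, 1), slice(2, 4, 1)]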


def match_with_tolerance(
    content: list[str],
    content_offset: int,
    search: list[str],
    tolerances: list[Tolerance],
) -> list[tuple[slice, list[TolerancesHit]]]:
    n_search_lines = len(search)
    n_content = len(content) - content_offset
    if n_search_lines > n_content:
        return []
    if n_search_lines == 0:
        return []
    if n_content == 0:
        return []
    content_positions = DefaultDict[str, set[int]](set)
    for i in range(content_offset, n_content):
        content_positions[content[i]].add(i)
    search_line_positions = [content_positions[line] for line in search]

    tolerance_index_by_content_line: list[dict[int, int]] = [
        {} for _ in range(len(search))
    ]
    for tidx, tolerance in enumerate(tolerances):
        content_positions = DefaultDict[str, set[int]](set)
        for i in range(content_offset, n_content):
            line = content[i]
            content_positions[tolerance.line_process(line)].add(i)
        for i, line in enumerate(search):
            new_lines = content_positions[tolerance.line_process(line)]
            new_indices = new_lines - search_line_positions[i]
            search_line_positions[i].update(new_indices)
            tolerance_index_by_content_line[i].update(
                {idx: tidx for idx in new_indices}
            )
    matched_slices = find_contiguous_match(search_line_positions)

    tolerances_counts: list[list[TolerancesHit]] = [
        [
            TolerancesHit(
                line_process=tol.line_process,
                severity_cat=tol.severity_cat,
                score_multiplier=tol.score_multiplier,
                count=0,
                error_name=tol.error_name,
            )
            for tol in tolerances
        ]
        for _ in range(len(matched_slices))
    ]
    for sidx, slice_ in enumerate(matched_slices):
        for search_idx, content_idx in enumerate(
            range(slice_.start, slice_.stop, slice_.step)
        ):
            if content_idx in tolerance_index_by_content_line[search_idx]:
                tolerances_counts[sidx][
                    tolerance_index_by_content_line[search_idx][content_idx]
                ].count += 1

    # Remove 0 counts
    tolerances_counts = [[x for x in y if x.count > 0] for y in tolerances_counts]

    return list(zip(matched_slices, tolerances_counts))
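

# --- Illustrative usage (editor's sketch, not part of the released file) ---
# A search line that differs only by indentation is matched via the str.lstrip
# tolerance, and the hit is reported back as a REMOVE_INDENTATION warning.
def _demo_match_with_tolerance() -> None:
    matches = match_with_tolerance(["    x = 1"], 0, ["x = 1"], DEFAULT_TOLERANCES)
    [(span, hits)] = matches
    assert span == slice(0, 1, 1)
    assert [hit.error_name for hit in hits] == [REMOVE_INDENTATION]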


def match_with_tolerance_empty_line(
    content: list[str],
    content_offset: int,
    search: list[str],
    tolerances: list[Tolerance],
) -> list[tuple[slice, list[TolerancesHit]]]:
    new_content = list[str]()
    new_to_original = dict[int, int]()
    for i in range(content_offset, len(content)):
        line = content[i]
        if line.strip():
            new_to_original[len(new_content)] = i
            new_content.append(line)

    search = [line for line in search if line.strip()]

    matches_with_tolerances = match_with_tolerance(new_content, 0, search, tolerances)

    new_matches_with_tolerances = list[tuple[slice, list[TolerancesHit]]]()
    for matches, tolerance_counts in matches_with_tolerances:
        matches = slice(
            new_to_original[matches.start], new_to_original[matches.stop - 1] + 1, 1
        )
        new_matches_with_tolerances.append((matches, tolerance_counts))
    return new_matches_with_tolerances
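

# --- Illustrative usage (editor's sketch, not part of the released file) ---
# Blank lines are dropped on both sides before matching, then the resulting
# slice is mapped back to original line numbers, so interior blank lines in
# the file stay inside the matched span.
def _demo_match_empty_lines() -> None:
    matches = match_with_tolerance_empty_line(
        ["a", "", "b"], 0, ["a", "b"], DEFAULT_TOLERANCES
    )
    [(span, hits)] = matches
    assert span == slice(0, 3, 1) and not hits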


def find_least_edit_distance_substring(
    orig_content_lines: list[str], offset: int, find_lines: list[str]
) -> tuple[Optional[slice], float, str]:
    # Prepare content lines, stripping whitespace and keeping track of original indices
    content_lines = [
        orig_content_lines[i].strip() for i in range(offset, len(orig_content_lines))
    ]
    new_to_original_indices = {}
    new_content_lines = []
    for i, line in enumerate(content_lines):
        if not line:
            continue
        new_content_lines.append(line)
        new_to_original_indices[len(new_content_lines) - 1] = i
    content_lines = new_content_lines

    # Prepare find lines, removing empty lines
    find_lines = [line.strip() for line in find_lines if line.strip()]

    # Initialize variables for best match tracking
    max_similarity = 0.0
    min_edit_distance_lines = None
    context_lines = []

    # For each possible starting position in content
    for i in range(max(1, len(content_lines) - len(find_lines) + 1)):
        # Calculate similarity for the block starting at position i
        block_similarity = 0.0
        for j in range(len(find_lines)):
            if (i + j) < len(content_lines):
                # Use SequenceMatcher for more efficient similarity calculation
                similarity = SequenceMatcher(
                    None, content_lines[i + j], find_lines[j]
                ).ratio()
                block_similarity += similarity

        # If this block is more similar than previous best
        if block_similarity > max_similarity:
            max_similarity = block_similarity
            # Map back to original line indices
            orig_start_index = new_to_original_indices[i]
            orig_end_index = (
                new_to_original_indices.get(
                    i + len(find_lines) - 1, len(orig_content_lines) - 1
                )
                + 1
            )
            # Get the original lines
            min_edit_distance_lines = slice(
                orig_start_index + offset, orig_end_index + offset
            )
            # Get context (10 lines before and after)
            context_lines = orig_content_lines[
                max(0, orig_start_index - 10 + offset) : (orig_end_index + 10 + offset)
            ]

    return (
        min_edit_distance_lines,
        max_similarity,
        "\n".join(context_lines),
    )
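

# --- Illustrative usage (editor's sketch, not part of the released file) ---
# When nothing matches even with tolerances, this fuzzy pass reports the most
# similar region (sum of per-line SequenceMatcher ratios) plus surrounding
# context so the caller can surface a helpful error message.
def _demo_fuzzy_report() -> None:
    content = ["def f():", "    return 1", "print(f())"]
    span, similarity, context = find_least_edit_distance_substring(
        content, 0, ["def f():", "    return 2"]
    )
    assert span == slice(0, 2)
    assert 0.0 < similarity <= 2.0  # at most 1.0 per searched line
    assert "return 1" in context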