wcgw 2.6.3-py3-none-any.whl → 2.7.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of wcgw might be problematic.
- wcgw/client/anthropic_client.py +59 -51
- wcgw/client/diff-instructions.txt +0 -1
- wcgw/client/file_ops/diff_edit.py +482 -0
- wcgw/client/file_ops/search_replace.py +119 -0
- wcgw/client/mcp_server/server.py +45 -2
- wcgw/client/memory.py +78 -0
- wcgw/client/openai_client.py +38 -18
- wcgw/client/tools.py +83 -169
- wcgw/relay/serve.py +41 -12
- wcgw/types_.py +8 -0
- {wcgw-2.6.3.dist-info → wcgw-2.7.1.dist-info}/METADATA +14 -3
- {wcgw-2.6.3.dist-info → wcgw-2.7.1.dist-info}/RECORD +15 -12
- {wcgw-2.6.3.dist-info → wcgw-2.7.1.dist-info}/WHEEL +0 -0
- {wcgw-2.6.3.dist-info → wcgw-2.7.1.dist-info}/entry_points.txt +0 -0
- {wcgw-2.6.3.dist-info → wcgw-2.7.1.dist-info}/licenses/LICENSE +0 -0
wcgw/client/anthropic_client.py
CHANGED
@@ -1,59 +1,51 @@
 import base64
 import json
 import mimetypes
-
-import
+import os
+import subprocess
+import tempfile
 import traceback
-
-import
+import uuid
+from pathlib import Path
+from typing import Literal, Optional, cast
+
+import rich
 from anthropic import Anthropic
 from anthropic.types import (
-
+    ImageBlockParam,
     MessageParam,
+    TextBlockParam,
+    ToolParam,
     ToolResultBlockParam,
     ToolUseBlockParam,
-    ImageBlockParam,
-    TextBlockParam,
 )
-
-import rich
-import petname  # type: ignore[import-untyped]
+from dotenv import load_dotenv
 from typer import Typer
-import uuid
 
 from ..types_ import (
     BashCommand,
     BashInteraction,
-
-    FileEditFindReplace,
+    ContextSave,
     FileEdit,
+    GetScreenInfo,
     Keyboard,
     Mouse,
     ReadFiles,
     ReadImage,
     ResetShell,
     ScreenShot,
-
+    WriteIfEmpty,
+)
+from .common import discard_input
+from .memory import load_memory
+from .tools import (
+    DoneFlag,
+    ImageData,
+    default_enc,
+    get_tool_output,
+    initialize,
+    which_tool_name,
 )
-
-from .common import Models, discard_input
-from .common import CostData
-from .tools import ImageData
-from .computer_use import Computer
-
-from .tools import DoneFlag, get_tool_output, which_tool_name, default_enc
-
-from urllib import parse
-import subprocess
-import os
-import tempfile
-
-import toml
-from pydantic import BaseModel
-
-
-from dotenv import load_dotenv
-
 
 History = list[MessageParam]
 
@@ -135,19 +127,28 @@ def loop(
 
     history: History = []
     waiting_for_assistant = False
+    memory = None
     if resume:
-
-
-
-
-
-
-
-
-
-
-
-
+        try:
+            _, memory = load_memory(
+                resume,
+                8000,
+                lambda x: default_enc.encode(x).ids,
+                lambda x: default_enc.decode(x),
+            )
+        except OSError:
+            if resume == "latest":
+                resume_path = sorted(Path(".wcgw").iterdir(), key=os.path.getmtime)[-1]
+            else:
+                resume_path = Path(resume)
+            if not resume_path.exists():
+                raise FileNotFoundError(f"File {resume} not found")
+            with resume_path.open() as f:
+                history = json.load(f)
+            if len(history) <= 2:
+                raise ValueError("Invalid history file")
+            first_message = ""
+            waiting_for_assistant = history[-1]["role"] != "assistant"
 
     limit = 1
 
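With this change, resuming first tries saved task memory via load_memory and only falls back to reading a raw chat-history JSON file on OSError. Below is a minimal sketch of that flow under stated assumptions: load_memory, encode, and decode are passed in as parameters and merely stand in for wcgw's own load_memory and default_enc pair; the function name resume_history is illustrative, not part of the package.

# Sketch of the resume fallback mirroring the hunk above (illustrative only).
import json
import os
from pathlib import Path
from typing import Any, Callable

def resume_history(
    resume: str,
    load_memory: Callable[..., tuple[Any, str]],
    encode: Callable[[str], list[int]],
    decode: Callable[[list[int]], str],
) -> tuple[list[dict], bool]:
    try:
        # Preferred path: restore saved task memory, truncated to ~8000 tokens.
        _, memory = load_memory(resume, 8000, encode, decode)
        return [], False
    except OSError:
        # Fallback: treat `resume` as a chat-history file; "latest" picks
        # the most recently modified file under .wcgw/.
        if resume == "latest":
            path = sorted(Path(".wcgw").iterdir(), key=os.path.getmtime)[-1]
        else:
            path = Path(resume)
        if not path.exists():
            raise FileNotFoundError(f"File {resume} not found")
        history = json.loads(path.read_text())
        if len(history) <= 2:
            raise ValueError("Invalid history file")
        # Resume mid-turn when the last stored message isn't from the assistant.
        return history, history[-1]["role"] != "assistant"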
@@ -216,6 +217,15 @@ def loop(
 - Use absolute file path only.
 - Use SEARCH/REPLACE blocks to edit the file.
 - If the edit fails due to block not matching, please retry with correct block till it matches. Re-read the file to ensure you've all the lines correct.
+""",
+        ),
+        ToolParam(
+            input_schema=ContextSave.model_json_schema(),
+            name="ContextSave",
+            description="""
+Saves provided description and file contents of all the relevant file paths or globs in a single text file.
+- Provide random unqiue id or whatever user provided.
+- Leave project path as empty string if no project path
 """,
         ),
     ]
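Because the tool's input_schema is generated from the Pydantic ContextSave model, a call is plain JSON validated against that model. A hypothetical invocation, with field names inferred from the description above rather than verified against wcgw/types_.py, might look like:

# Hypothetical ContextSave arguments; every field name here is an assumption.
context_save_args = {
    "id": "a1b2c3",                # random unique id, or whatever the user provided
    "project_root_path": "",       # empty string when there is no project path
    "description": "State of the file-edit refactor before switching tasks",
    "relevant_file_globs": ["wcgw/client/file_ops/*.py"],
}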
@@ -270,9 +280,10 @@ def loop(
 """,
         ),
     ]
-    uname_sysname = os.uname().sysname
-    uname_machine = os.uname().machine
 
+    initial_info = initialize(
+        os.getcwd(), [], resume if (memory and resume) else "", 8000
+    )
     system = f"""
 You're an expert software engineer with shell and code knowledge.
 
@@ -284,10 +295,7 @@ Instructions:
 - Do not provide code snippets unless asked by the user, instead directly add/edit the code.
 - Do not install new tools/packages before ensuring no such tools/package or an alternative already exists.
 
-
-- System: {uname_sysname}
-- Machine: {uname_machine}
-- Current directory: {os.getcwd()}
+{initial_info}
 """
 
     with open(os.path.join(os.path.dirname(__file__), "diff-instructions.txt")) as f:
wcgw/client/diff-instructions.txt
CHANGED
@@ -47,7 +47,6 @@ Every *SEARCH/REPLACE block* must use this format:
 
 Every "<<<<<<< SEARCH" section must *EXACTLY MATCH* the existing file content, character for character, including all comments, docstrings, whitespaces, etc.
 
-*SEARCH/REPLACE* blocks will *only* replace the first match occurrence.
 Including multiple unique *SEARCH/REPLACE* blocks if needed.
 Include enough lines in each SEARCH section to uniquely match each set of lines that need to change.
 
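For reference, a complete block in this format looks like the sketch below, assuming the usual ======= divider and >>>>>>> REPLACE closer that pair with the "<<<<<<< SEARCH" marker quoted in the instructions above; the Python content is illustrative.

<<<<<<< SEARCH
def greet():
    print("hello")
=======
def greet():
    print("hello, world")
>>>>>>> REPLACE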
wcgw/client/file_ops/diff_edit.py
ADDED
@@ -0,0 +1,482 @@
+import re
+from dataclasses import dataclass, field
+from difflib import SequenceMatcher
+from typing import Callable, DefaultDict, Literal, Optional
+
+TOLERANCE_TYPES = Literal["SILENT", "WARNING", "ERROR"]
+
+
+@dataclass
+class Tolerance:
+    line_process: Callable[[str], str]
+    severity_cat: TOLERANCE_TYPES
+    score_multiplier: float
+    error_name: str
+
+
+@dataclass
+class TolerancesHit(Tolerance):
+    count: int
+
+
+@dataclass
+class FileEditOutput:
+    original_content: list[str]
+    orig_search_blocks: list[list[str]]
+    edited_with_tolerances: list[tuple[slice, list[TolerancesHit], list[str]]]
+
+    def replace_or_throw(
+        self,
+        max_errors: int,
+    ) -> tuple[list[str], set[str]]:
+        new_lines = list[str]()
+        last_idx = 0
+        errors = []
+        warnings = set[str]()
+        for (span, tolerances, replace_with), search_ in zip(
+            self.edited_with_tolerances, self.orig_search_blocks
+        ):
+            for tol in tolerances:
+                if tol.count > 0:
+                    if tol.severity_cat == "WARNING":
+                        warnings.add(tol.error_name)
+                    elif tol.severity_cat == "ERROR":
+                        errors.append(f"""
+Got error while processing the following search block:
+---
+```
+{'\n'.join(search_)}
+```
+---
+Error:
+{tol.error_name}
+---
+""")
+                    if len(errors) >= max_errors:
+                        raise Exception("\n".join(errors))
+            if last_idx < span.start:
+                new_lines.extend(self.original_content[last_idx : span.start])
+
+            new_lines.extend(replace_with)
+            last_idx = span.stop
+
+        if last_idx < len(self.original_content):
+            new_lines.extend(self.original_content[last_idx:])
+
+        if errors:
+            raise Exception("\n".join(errors))
+
+        return new_lines, set(warnings)
+
+    @staticmethod
+    def get_best_match(
+        outputs: list["FileEditOutput"],
+    ) -> tuple[list["FileEditOutput"], bool]:
+        best_hits: list[FileEditOutput] = []
+        best_score = float("-inf")
+        assert outputs
+        for output in outputs:
+            hit_score = 0.0
+            for _, tols, _ in output.edited_with_tolerances:
+                for tol in tols:
+                    hit_score += tol.count * tol.score_multiplier
+            if not best_hits:
+                best_hits.append(output)
+                best_score = hit_score
+            else:
+                if hit_score < best_score:
+                    best_hits = [output]
+                    best_score = hit_score
+                elif abs(hit_score - best_score) < 1e-3:
+                    best_hits.append(output)
+
+        return best_hits, best_score < 0
+
+
+def line_process_max_space_tolerance(line: str) -> str:
+    line = line.strip()
+    return re.sub(r"\s", "", line)
+
+
+DEFAULT_TOLERANCES = [
+    Tolerance(
+        line_process=str.rstrip,
+        severity_cat="SILENT",
+        score_multiplier=1,
+        error_name="",
+    ),
+    Tolerance(
+        line_process=str.lstrip,
+        severity_cat="WARNING",
+        score_multiplier=10,
+        error_name="Warning: matching without considering indentation (leading spaces).",
+    ),
+    Tolerance(
+        line_process=line_process_max_space_tolerance,
+        severity_cat="WARNING",
+        score_multiplier=50,
+        error_name="Warning: matching after removing all spaces in lines.",
+    ),
+]
+
+
+def remove_leading_trailing_empty_lines(lines: list[str]) -> list[str]:
+    start = 0
+    end = len(lines) - 1
+    if end < start:
+        return lines
+    while not lines[start].strip():
+        start += 1
+        if start >= len(lines):
+            break
+    while not lines[end].strip():
+        end -= 1
+        if end < 0:
+            break
+    return lines[start : end + 1]
+
+
+@dataclass
+class FileEditInput:
+    file_lines: list[str]
+    file_line_offset: int
+    search_replace_blocks: list[tuple[list[str], list[str]]]
+    search_replace_offset: int
+    tolerances: list["Tolerance"] = field(default_factory=lambda: DEFAULT_TOLERANCES)
+
+    def edit_file(self) -> list[FileEditOutput]:
+        n_file_lines = len(self.file_lines)
+        n_blocks = len(self.search_replace_blocks)
+
+        # Boundary conditions
+        no_match_output = FileEditOutput(
+            original_content=self.file_lines,
+            orig_search_blocks=[x[0] for x in self.search_replace_blocks],
+            edited_with_tolerances=[
+                (
+                    slice(0, 0),
+                    [
+                        TolerancesHit(
+                            line_process=lambda x: x,
+                            severity_cat="ERROR",
+                            score_multiplier=float("-inf"),
+                            error_name="The blocks couldn't be matched, maybe the sequence of search blocks was incorrect?",
+                            count=max(1, len(search_lines)),
+                        )
+                        for search_lines, _ in self.search_replace_blocks[
+                            self.search_replace_offset :
+                        ]
+                    ],
+                    [],
+                )
+            ],
+        )
+        if (
+            self.file_line_offset >= n_file_lines
+            and self.search_replace_offset < n_blocks
+        ):
+            return [no_match_output]
+        elif self.file_line_offset >= n_file_lines:
+            return [
+                FileEditOutput(
+                    self.file_lines,
+                    [x[0] for x in self.search_replace_blocks],
+                    [(slice(0, 0), [], [])],
+                )
+            ]
+        elif self.search_replace_offset >= n_blocks:
+            return [
+                FileEditOutput(
+                    self.file_lines,
+                    [x[0] for x in self.search_replace_blocks],
+                    [(slice(0, 0), [], [])],
+                )
+            ]
+
+        # search for first block
+        first_block = self.search_replace_blocks[self.search_replace_offset]
+
+        # Try exact match
+        matches = match_exact(self.file_lines, self.file_line_offset, first_block[0])
+
+        all_outputs = list[list[tuple[slice, list[TolerancesHit], list[str]]]]()
+
+        if not matches:
+            # Try tolerances
+            matches_with_tolerances = match_with_tolerance(
+                self.file_lines, self.file_line_offset, first_block[0], self.tolerances
+            )
+            replace_by = first_block[1]
+            if not matches_with_tolerances:
+                # Try with no empty lines
+                matches_with_tolerances = match_with_tolerance_empty_line(
+                    self.file_lines,
+                    self.file_line_offset,
+                    first_block[0],
+                    self.tolerances,
+                )
+                replace_by = remove_leading_trailing_empty_lines(first_block[1])
+
+            if not matches_with_tolerances:
+                # Report edit distance
+                sim_match, sim_sim, sim_context = (
+                    find_least_edit_distance_substring(
+                        self.file_lines, self.file_line_offset, first_block[0]
+                    )
+                )
+                if sim_match:
+                    matches_with_tolerances = [
+                        (
+                            sim_match,
+                            [
+                                TolerancesHit(
+                                    lambda x: x,
+                                    "ERROR",
+                                    -1,
+                                    "Couldn't find match. Do you mean to match the lines in the following context?\n```"
+                                    + sim_context
+                                    + "\n```",
+                                    int(len(first_block[0]) // sim_sim),
+                                )
+                            ],
+                        )
+                    ]
+
+            for match, tolerances in matches_with_tolerances:
+                file_edit_input = FileEditInput(
+                    self.file_lines,
+                    match.stop,
+                    self.search_replace_blocks,
+                    self.search_replace_offset + 1,
+                    self.tolerances,
+                )
+
+                remaining_output = file_edit_input.edit_file()
+                for rem_output in remaining_output:
+                    all_outputs.append(
+                        [
+                            (match, tolerances, replace_by),
+                            *rem_output.edited_with_tolerances,
+                        ]
+                    )
+        else:
+            for match in matches:
+                file_edit_input = FileEditInput(
+                    self.file_lines,
+                    match.stop,
+                    self.search_replace_blocks,
+                    self.search_replace_offset + 1,
+                    self.tolerances,
+                )
+                remaining_output = file_edit_input.edit_file()
+                for rem_output in remaining_output:
+                    all_outputs.append(
+                        [
+                            (
+                                match,
+                                [],
+                                first_block[1],
+                            ),
+                            *rem_output.edited_with_tolerances,
+                        ]
+                    )
+
+        if not all_outputs:
+            return [no_match_output]
+
+        return [
+            FileEditOutput(
+                self.file_lines, [x[0] for x in self.search_replace_blocks], output
+            )
+            for output in all_outputs
+        ]
+
+
+def find_contiguous_match(search_line_positions: list[set[int]]) -> list[slice]:
+    n_search_lines = len(search_line_positions)
+
+    def search_in_dictionary(search_offset: int, search_index: int) -> bool:
+        if search_offset >= n_search_lines:
+            return True
+
+        if search_index in search_line_positions[search_offset]:
+            return search_in_dictionary(search_offset + 1, search_index + 1)
+        return False
+
+    matched_slices = []
+    for index in search_line_positions[0]:
+        if search_in_dictionary(1, index + 1):
+            matched_slices.append(slice(index, index + n_search_lines, 1))
+    return matched_slices
+
+
+def match_exact(
+    content: list[str], content_offset: int, search: list[str]
+) -> list[slice]:
+    n_search_lines = len(search)
+    n_content = len(content) - content_offset
+    if n_search_lines > n_content:
+        return []
+    if n_search_lines == 0:
+        return []
+    if n_content == 0:
+        return []
+    content_positions = DefaultDict[str, set[int]](set)
+    for i in range(content_offset, n_content):
+        content_positions[content[i]].add(i)
+    search_line_positions = [content_positions[line] for line in search]
+
+    matched_slices = find_contiguous_match(search_line_positions)
+
+    return matched_slices
+
+
+def match_with_tolerance(
+    content: list[str],
+    content_offset: int,
+    search: list[str],
+    tolerances: list[Tolerance],
+) -> list[tuple[slice, list[TolerancesHit]]]:
+    n_search_lines = len(search)
+    n_content = len(content) - content_offset
+    if n_search_lines > n_content:
+        return []
+    if n_search_lines == 0:
+        return []
+    if n_content == 0:
+        return []
+    content_positions = DefaultDict[str, set[int]](set)
+    for i in range(content_offset, n_content):
+        content_positions[content[i]].add(i)
+    search_line_positions = [content_positions[line] for line in search]
+
+    tolerance_index_by_content_line: list[dict[int, int]] = [
+        {} for _ in range(len(search))
+    ]
+    for tidx, tolerance in enumerate(tolerances):
+        content_positions = DefaultDict[str, set[int]](set)
+        for i in range(content_offset, n_content):
+            line = content[i]
+            content_positions[tolerance.line_process(line)].add(i)
+        for i, line in enumerate(search):
+            new_lines = content_positions[tolerance.line_process(line)]
+            new_indices = new_lines - search_line_positions[i]
+            search_line_positions[i].update(new_indices)
+            tolerance_index_by_content_line[i].update(
+                {idx: tidx for idx in new_indices}
+            )
+    matched_slices = find_contiguous_match(search_line_positions)
+
+    tolerances_counts: list[list[TolerancesHit]] = [
+        [
+            TolerancesHit(
+                line_process=tol.line_process,
+                severity_cat=tol.severity_cat,
+                score_multiplier=tol.score_multiplier,
+                count=0,
+                error_name=tol.error_name,
+            )
+            for tol in tolerances
+        ]
+        for _ in range(len(matched_slices))
+    ]
+    for sidx, slice in enumerate(matched_slices):
+        for search_idx, content_idx in enumerate(
+            range(slice.start, slice.stop, slice.step)
+        ):
+            if content_idx in tolerance_index_by_content_line[search_idx]:
+                tolerances_counts[sidx][
+                    tolerance_index_by_content_line[search_idx][content_idx]
+                ].count += 1
+
+    return list(zip(matched_slices, tolerances_counts))
+
+
+def match_with_tolerance_empty_line(
+    content: list[str],
+    content_offset: int,
+    search: list[str],
+    tolerances: list[Tolerance],
+) -> list[tuple[slice, list[TolerancesHit]]]:
+    new_content = list[str]()
+    new_to_original = dict[int, int]()
+    for i in range(content_offset, len(content)):
+        line = content[i]
+        if line.strip():
+            new_to_original[len(new_content)] = i
+            new_content.append(line)
+
+    search = [line for line in search if line.strip()]
+
+    matches_with_tolerancs = match_with_tolerance(new_content, 0, search, tolerances)
+
+    new_matches_with_tolerances = list[tuple[slice, list[TolerancesHit]]]()
+    for matches, tolerance_counts in matches_with_tolerancs:
+        matches = slice(
+            new_to_original[matches.start], new_to_original[matches.stop - 1] + 1, 1
+        )
+        new_matches_with_tolerances.append((matches, tolerance_counts))
+    return new_matches_with_tolerances
+
+
+def find_least_edit_distance_substring(
+    orig_content_lines: list[str], offset: int, find_lines: list[str]
+) -> tuple[Optional[slice], float, str]:
+    # Prepare content lines, stripping whitespace and keeping track of original indices
+    content_lines = [
+        orig_content_lines[i].strip() for i in range(offset, len(orig_content_lines))
+    ]
+    new_to_original_indices = {}
+    new_content_lines = []
+    for i, line in enumerate(content_lines):
+        if not line:
+            continue
+        new_content_lines.append(line)
+        new_to_original_indices[len(new_content_lines) - 1] = i
+    content_lines = new_content_lines
+
+    # Prepare find lines, removing empty lines
+    find_lines = [line.strip() for line in find_lines if line.strip()]
+
+    # Initialize variables for best match tracking
+    max_similarity = 0.0
+    min_edit_distance_lines = None
+    context_lines = []
+
+    # For each possible starting position in content
+    for i in range(max(1, len(content_lines) - len(find_lines) + 1)):
+        # Calculate similarity for the block starting at position i
+        block_similarity = 0.0
+        for j in range(len(find_lines)):
+            if (i + j) < len(content_lines):
+                # Use SequenceMatcher for more efficient similarity calculation
+                similarity = SequenceMatcher(
+                    None, content_lines[i + j], find_lines[j]
+                ).ratio()
+                block_similarity += similarity
+
+        # If this block is more similar than previous best
+        if block_similarity > max_similarity:
+            max_similarity = block_similarity
+            # Map back to original line indices
+            orig_start_index = new_to_original_indices[i]
+            orig_end_index = (
+                new_to_original_indices.get(
+                    i + len(find_lines) - 1, len(orig_content_lines) - 1
+                )
+                + 1
+            )
+            # Get the original lines
+            min_edit_distance_lines = slice(
+                orig_start_index + offset, orig_end_index + offset
+            )
+            # Get context (10 lines before and after)
+            context_lines = orig_content_lines[
+                max(0, orig_start_index - 10 + offset) : (orig_end_index + 10 + offset)
+            ]
+
+    return (
+        min_edit_distance_lines,
+        max_similarity,
+        "\n".join(context_lines),
+    )
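Taken together, the new module is a tolerant SEARCH/REPLACE engine: FileEditInput.edit_file matches each search block exactly first, then with the whitespace tolerances, then ignoring empty lines; FileEditOutput.get_best_match keeps the candidate edits with the lowest penalty score; and replace_or_throw applies the winner or raises with context. A minimal usage sketch of that pipeline, with illustrative inputs (the file content here deliberately uses odd indentation so only the leading-space tolerance can match it):

# Minimal sketch driving the new diff_edit API end to end.
from wcgw.client.file_ops.diff_edit import FileEditInput, FileEditOutput

file_lines = [
    "def greet():",
    "        print('hello')",  # note: 8-space indent in the actual file
]
# One search/replace block; its SEARCH side uses 4-space indentation,
# so the exact matcher fails and the lstrip tolerance kicks in.
blocks = [(
    ["def greet():", "    print('hello')"],
    ["def greet():", "    print('hello, world')"],
)]

outputs = FileEditInput(file_lines, 0, blocks, 0).edit_file()
best, has_errors = FileEditOutput.get_best_match(outputs)
new_lines, warnings = best[0].replace_or_throw(max_errors=1)
print("\n".join(new_lines))
for w in warnings:
    print(w)  # e.g. "Warning: matching without considering indentation (leading spaces)."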