patch-fixer 0.3.4-py3-none-any.whl → 0.4.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

patch_fixer/patch_fixer.py
@@ -7,22 +7,64 @@ from pathlib import Path
 
 from git import Repo
 
-path_regex = r'(?:[A-Za-z0-9_.-]+/?)+'
+path_regex = r'[^ \n\t]+(?: [^ \n\t]+)*'
 regexes = {
-    "DIFF_LINE": re.compile(rf'diff --git (a/{path_regex}) (b/{path_regex})'),
-    "MODE_LINE": re.compile(r'(new|deleted) file mode [0-7]{6}'),
-    "INDEX_LINE": re.compile(r'index [0-9a-f]{7,64}\.\.[0-9a-f]{7,64}(?: [0-7]{6})?|similarity index ([0-9]+)%'),
-    "BINARY_LINE": re.compile(rf'Binary files (a/{path_regex}|/dev/null) and (b/{path_regex}|/dev/null) differ'),
-    "RENAME_FROM": re.compile(rf'rename from ({path_regex})'),
-    "RENAME_TO": re.compile(rf'rename to ({path_regex})'),
-    "FILE_HEADER_START": re.compile(rf'--- (a/{path_regex}|/dev/null)'),
-    "FILE_HEADER_END": re.compile(rf'\+\+\+ (b/{path_regex}|/dev/null)'),
-    "HUNK_HEADER": re.compile(r'^@@ -(\d+),(\d+) \+(\d+),(\d+) @@(.*)$'),
-    "END_LINE": re.compile(r'\\ No newline at end of file')
+    "DIFF_LINE": re.compile(rf'^diff --git (a/{path_regex}) (b/{path_regex})$'),
+    "MODE_LINE": re.compile(r'^(new|deleted) file mode [0-7]{6}$'),
+    "INDEX_LINE": re.compile(r'^index [0-9a-f]{7,64}\.\.[0-9a-f]{7,64}(?: [0-7]{6})?$|^similarity index ([0-9]+)%$'),
+    "BINARY_LINE": re.compile(rf'^Binary files (a/{path_regex}|/dev/null) and (b/{path_regex}|/dev/null) differ$'),
+    "RENAME_FROM": re.compile(rf'^rename from ({path_regex})$'),
+    "RENAME_TO": re.compile(rf'^rename to ({path_regex})$'),
+    "FILE_HEADER_START": re.compile(rf'^--- (a/{path_regex}|/dev/null)$'),
+    "FILE_HEADER_END": re.compile(rf'^\+\+\+ (b/{path_regex}|/dev/null)$'),
+    "HUNK_HEADER": re.compile(r'^@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@(.*)$'),
+    "END_LINE": re.compile(r'^\\ No newline at end of file$'),
 }
 
 
-class MissingHunkError(Exception):
+class HunkErrorBase(Exception):
+    def __init__(self, hunk_lines, file="(unknown file)"):
+        super().__init__()
+        self.hunk = "".join(hunk_lines)
+        self.file = file
+
+    def format_hunk_for_error(self):
+        """Format hunk for error messages, showing only context and deletion lines."""
+        error_lines = []
+        for line in self.hunk.splitlines(keepends=True):
+            if line.startswith((' ', '-')):  # context or deletion lines
+                error_lines.append(line)
+            # skip addition lines (+) as they shouldn't be in the original file
+        return ''.join(error_lines)
+
+    def add_file(self, file):
+        self.file = file
+
+
+class MissingHunkError(HunkErrorBase):
+    def __str__(self):
+        return (f"Could not find hunk in {self.file}:"
+                f"\n================================"
+                f"\n{self.format_hunk_for_error()}"
+                f"================================")
+
+
+class OutOfOrderHunk(HunkErrorBase):
+    def __init__(self, hunk_lines, prev_header, file="(unknown file)"):
+        super().__init__(hunk_lines, file)
+        self.prev_header = prev_header
+
+    def __str__(self):
+        return (f"Out of order hunk in {self.file}:"
+                f"\n==============================="
+                f"\n{self.format_hunk_for_error()}"
+                f"==============================="
+                f"\nOccurs before previous hunk with header {self.prev_header}")
+
+
+class EmptyHunk(Exception):
+    # don't inherit from HunkErrorBase since this is a sentinel exception
+    # meant to catch the case where the very last hunk is empty
     pass
 
 
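The HUNK_HEADER change above is the central parsing fix in this release: git emits condensed hunk headers such as `@@ -5 +5,2 @@` when a line count is 1, and the 0.3.4 pattern required both counts. A quick standalone check of the two patterns (not from the package, just an illustration):

```python
import re

old = re.compile(r'^@@ -(\d+),(\d+) \+(\d+),(\d+) @@(.*)$')
new = re.compile(r'^@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@(.*)$')

header = "@@ -5 +5,2 @@ def main():"
print(old.match(header))           # None: 0.3.4 rejected condensed headers
print(new.match(header).groups())  # ('5', None, '5', '2', ' def main():')
```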
@@ -64,11 +106,12 @@ def normalize_line(line):
 
 def fuzzy_line_similarity(line1, line2, threshold=0.8):
     """Calculate similarity between two lines using a simple ratio."""
-    if not line1 or not line2:
-        return 0.0
-
     l1, l2 = line1.strip(), line2.strip()
 
+    # empty strings are identical
+    if len(l1) == 0 and len(l2) == 0:
+        return 1.0
+
     if l1 == l2:
         return 1.0
 
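The effect of this change, assuming `fuzzy_line_similarity` is importable from the `patch_fixer.patch_fixer` module listed in RECORD below (a sketch, not package documentation): blank-vs-falsy inputs now score as identical rather than as a guaranteed mismatch.

```python
from patch_fixer.patch_fixer import fuzzy_line_similarity

fuzzy_line_similarity("", "")    # 0.3.4: 0.0 (falsy input), 0.4.1: 1.0
fuzzy_line_similarity("", "  ")  # 0.3.4: 0.0; 0.4.1: both strip to "" -> 1.0
fuzzy_line_similarity("foo", "foo")  # 1.0 in both versions
```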
@@ -88,7 +131,10 @@ def find_hunk_start(context_lines, original_lines, fuzzy=False):
     """Search original_lines for context_lines and return start line index (0-based)."""
     ctx = []
     for line in context_lines:
-        if line.startswith(" "):
+        if regexes["END_LINE"].match(line):
+            # "\ No newline at end of file" is just git metadata; skip
+            continue
+        elif line.startswith(" "):
             ctx.append(line.lstrip(" "))
         elif line.startswith("-"):
             # can't use lstrip; we want to keep other dashes in the line
@@ -124,7 +170,7 @@ def find_hunk_start(context_lines, original_lines, fuzzy=False):
     if best_match_score > 0.6:
         return best_match_pos
 
-    return 0
+    raise MissingHunkError(context_lines)
 
 
 def match_line(line):
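This is a contract change for callers: 0.3.4 silently returned 0 when the context matched nowhere, anchoring the hunk at the top of the file, while 0.4.1 fails loudly. A minimal sketch of the new calling pattern, with hypothetical file contents and the import path assumed from RECORD:

```python
from patch_fixer.patch_fixer import MissingHunkError, find_hunk_start

original_lines = ["def foo():", "    return 1"]
hunk = [" this context exists nowhere\n"]

try:
    start = find_hunk_start(hunk, original_lines)
except MissingHunkError as e:
    e.add_file("example.py")
    print(e)  # formatted error instead of a silent anchor at line 1
```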
@@ -156,40 +202,131 @@ def reconstruct_file_header(diff_line, header_type):
     raise ValueError(f"Unsupported header type: {header_type}")
 
 
-def capture_hunk(current_hunk, original_lines, offset, last_hunk, hunk_context, fuzzy=False):
+def find_all_hunk_starts(hunk_lines, search_lines, fuzzy=False):
+    """Return all line indices in search_lines where this hunk matches."""
+    matches = []
+    start = 0
+    while True:
+        try:
+            idx = find_hunk_start(hunk_lines, search_lines[start:], fuzzy=fuzzy)
+            matches.append(start + idx)
+            start += idx + 1
+        except MissingHunkError:
+            break
+    return matches
+
+
+def capture_hunk(current_hunk, original_lines, offset, last_hunk, old_header, fuzzy=False):
+    """
+    Try to locate the hunk's true position in the original file.
+
+    If multiple possible matches exist, pick the one closest to the expected
+    (possibly corrupted) line number derived from the old hunk header.
+    """
+    if not current_hunk:
+        raise EmptyHunk
+
+    # extract needed info from old header match groups
+    expected_old_start = int(old_header[0]) if old_header else 0
+    try:
+        hunk_context = old_header[4]
+    except IndexError:
+        hunk_context = ""
+
+    # presence or absence of end line shouldn't affect line counts
+    if regexes["END_LINE"].match(current_hunk[-1]):
+        hunk_len = len(current_hunk) - 1
+    else:
+        hunk_len = len(current_hunk)
+
     # compute line counts
-    old_count = sum(1 for l in current_hunk if l.startswith((' ', '-')))
-    new_count = sum(1 for l in current_hunk if l.startswith((' ', '+')))
+    context_count = sum(1 for l in current_hunk if l.startswith(' '))
+    minus_count = sum(1 for l in current_hunk if l.startswith('-'))
+    plus_count = sum(1 for l in current_hunk if l.startswith('+'))
 
-    if old_count > 0:
-        # compute starting line in original file
-        old_start = find_hunk_start(current_hunk, original_lines, fuzzy=fuzzy) + 1
+    old_count = context_count + minus_count
+    new_count = context_count + plus_count
 
-        # if the line number descends, we either have a bad match or a new file
-        if old_start < last_hunk:
-            raise MissingHunkError
+    if minus_count == hunk_len:  # file deletion
+        old_start = 1
+        new_start = 0
+    elif plus_count == hunk_len:  # file creation
+        old_start = 0
+        new_start = 1
+    else:  # file modification
+        search_index = last_hunk
+        search_lines = original_lines[search_index:]
+
+        # gather *all* possible matches
+        matches = find_all_hunk_starts(current_hunk, search_lines, fuzzy=fuzzy)
+        if matches:
+            # rebase to file line numbers (1-indexed later)
+            candidate_positions = [m + search_index for m in matches]
+
+            if expected_old_start:
+                # choose the one closest to the expected position
+                old_start = min(
+                    candidate_positions,
+                    key=lambda pos: abs(pos + 1 - expected_old_start),
+                ) + 1  # convert to 1-indexed
+            else:
+                # pick first match if no expected line info
+                old_start = candidate_positions[0] + 1
         else:
-            if new_count == 0:
-                # complete deletion of remaining content
-                new_start = 0
+            # try from start of file, excluding lines already searched
+            search_index += hunk_len
+            search_lines = original_lines[:search_index]
+            matches = find_all_hunk_starts(current_hunk, search_lines, fuzzy=fuzzy)
+            if not matches:
+                raise MissingHunkError(current_hunk)
+            if expected_old_start:
+                old_start = (
+                    min(matches, key=lambda pos: abs(pos + 1 - expected_old_start)) + 1
+                )
             else:
-                new_start = old_start + offset
-    else:
-        # old count of zero can only mean file creation, since adding lines to
-        # an existing file requires surrounding context lines without a +
-        old_start = 0
-        new_start = 1  # line numbers are 1-indexed in the real world
+                old_start = matches[0] + 1
+
+    if old_start < last_hunk + 1:
+        raise OutOfOrderHunk(current_hunk, original_lines[last_hunk])
+
+    if new_count == 0:
+        # complete deletion of remaining content
+        new_start = 0
+    else:
+        new_start = old_start + offset
 
     offset += (new_count - old_count)
 
-    last_hunk = old_start
+    last_hunk += (old_start - last_hunk)
 
-    # write corrected header
-    fixed_header = f"@@ -{old_start},{old_count} +{new_start},{new_count} @@{hunk_context}\n"
+    # use condensed header if it's only one line
+    old_part = f"{old_start},{old_count}" if old_count != 1 else f"{old_start}"
+    new_part = f"{new_start},{new_count}" if new_count != 1 else f"{new_start}"
+
+    fixed_header = f"@@ -{old_part} +{new_part} @@{hunk_context}\n"
 
     return fixed_header, offset, last_hunk
 
 
+def read_file_with_fallback_encoding(file_path):
+    """Read file with UTF-8, falling back to other encodings if needed."""
+    encodings = ['utf-8', 'latin-1', 'cp1252', 'iso-8859-1']
+
+    for encoding in encodings:
+        try:
+            with open(file_path, 'r', encoding=encoding) as f:
+                return f.readlines()
+        except UnicodeDecodeError:
+            continue
+
+    # If all encodings fail, read as binary and replace problematic characters
+    with open(file_path, 'rb') as f:
+        content = f.read()
+    # Decode with UTF-8, replacing errors
+    text_content = content.decode('utf-8', errors='replace')
+    return text_content.splitlines(keepends=True)
+
+
 def regenerate_index(old_path, new_path, cur_dir):
     repo = Repo(cur_dir)
 
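Two behaviors of the new `capture_hunk` are worth seeing in isolation: when a hunk's context matches at several positions, the candidate whose 1-indexed position is nearest the line number claimed by the (possibly wrong) old header wins, and the corrected header is condensed when a count is 1, matching git's own output. A standalone sketch of both, with made-up numbers:

```python
# three 0-based positions where the hunk's context matched
candidate_positions = [3, 41, 97]
expected_old_start = 40  # 1-based start claimed by the broken @@ header

old_start = min(
    candidate_positions,
    key=lambda pos: abs(pos + 1 - expected_old_start),
) + 1
print(old_start)  # 42 -- the match closest to the header's claim

# condensed header formatting, as in capture_hunk
old_count, new_start, new_count, hunk_context = 1, 42, 2, " def main():"
old_part = f"{old_start},{old_count}" if old_count != 1 else f"{old_start}"
new_part = f"{new_start},{new_count}" if new_count != 1 else f"{new_start}"
print(f"@@ -{old_part} +{new_part} @@{hunk_context}")  # @@ -42 +42,2 @@ def main():
```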
@@ -235,10 +372,9 @@ def fix_patch(patch_lines, original, remove_binary=False, fuzzy=False, add_newli
     file_start_header = False
     file_end_header = False
     look_for_rename = False
-    similarity_index = None
     missing_index = False
     binary_file = False
-    hunk_context = ""
+    current_hunk_header = ()
     original_lines = []
     file_loaded = False
 
@@ -253,10 +389,10 @@ def fix_patch(patch_lines, original, remove_binary=False, fuzzy=False, add_newli
                        fixed_header,
                        offset,
                        last_hunk
-                    ) = capture_hunk(current_hunk, original_lines, offset, last_hunk, hunk_context, fuzzy=fuzzy)
-                except MissingHunkError:
-                    raise NotImplementedError(f"Could not find hunk in {current_file}:"
-                                              f"\n\n{''.join(current_hunk)}")
+                    ) = capture_hunk(current_hunk, original_lines, offset, last_hunk, current_hunk_header, fuzzy=fuzzy)
+                except (MissingHunkError, OutOfOrderHunk) as e:
+                    e.add_file(current_file)
+                    raise e
                 fixed_lines.append(fixed_header)
                 fixed_lines.extend(current_hunk)
                 current_hunk = []
@@ -310,20 +446,11 @@ def fix_patch(patch_lines, original, remove_binary=False, fuzzy=False, add_newli
                current_path = Path(current_file).absolute()
                offset = 0
                last_hunk = 0
-                if not Path.exists(current_path):
-                    # this is meant to handle cases where the source file
-                    # doesn't exist (e.g., when applying a patch that renames
-                    # a file created earlier in the same patch)
-                    # TODO: but really, does that ever happen???
-                    fixed_lines.append(normalize_line(line))
-                    look_for_rename = True
-                    file_loaded = False
-                    continue
                if not current_path.is_file():
                    raise IsADirectoryError(f"Rename from header points to a directory, not a file: {current_file}")
                if dir_mode or current_path == original_path:
-                    with open(current_path, encoding='utf-8') as f:
-                        original_lines = [l.rstrip('\n') for l in f.readlines()]
+                    file_lines = read_file_with_fallback_encoding(current_path)
+                    original_lines = [l.rstrip('\n') for l in file_lines]
                    fixed_lines.append(normalize_line(line))
                    file_loaded = True
                else:
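The switch to `read_file_with_fallback_encoding` here (and in the file-header path below) is what lets 0.4.1 read patched files that are not valid UTF-8. A quick illustration of the failure the fallback chain absorbs; note that latin-1 can decode any byte sequence, so in practice the chain never gets past its first fallback:

```python
data = "café".encode("latin-1")  # b'caf\xe9' is not valid UTF-8
try:
    data.decode("utf-8")
except UnicodeDecodeError:
    print(data.decode("latin-1"))  # 'café' -- the first fallback succeeds
```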
@@ -335,7 +462,7 @@ def fix_patch(patch_lines, original, remove_binary=False, fuzzy=False, add_newli
                        last_index = i - 2
                    else:
                        raise NotImplementedError("Missing `rename from` header not yet supported.")
-                if not look_for_rename:
+                if not file_loaded:
                    # if we're not looking for a rename but encounter "rename to",
                    # this indicates a malformed patch - log warning but continue
                    warnings.warn(
@@ -382,8 +509,8 @@ def fix_patch(patch_lines, original, remove_binary=False, fuzzy=False, add_newli
                    raise IsADirectoryError(f"File header start points to a directory, not a file: {current_file}")
                if not file_loaded:
                    if dir_mode or Path(current_file) == Path(original):
-                        with open(current_file, encoding='utf-8') as f:
-                            original_lines = [l.rstrip('\n') for l in f.readlines()]
+                        file_lines = read_file_with_fallback_encoding(current_path)
+                        original_lines = [l.rstrip('\n') for l in file_lines]
                        file_loaded = True
                    else:
                        raise FileNotFoundError(f"Filename {current_file} in header does not match argument {original}")
@@ -471,7 +598,7 @@ def fix_patch(patch_lines, original, remove_binary=False, fuzzy=False, add_newli
                # we can't fix the hunk header before we've captured a hunk
                if first_hunk:
                    first_hunk = False
-                    hunk_context = match_groups[4]
+                    current_hunk_header = match_groups
                    continue
 
                try:
  try:
@@ -479,20 +606,20 @@ def fix_patch(patch_lines, original, remove_binary=False, fuzzy=False, add_newli
479
606
  fixed_header,
480
607
  offset,
481
608
  last_hunk
482
- ) = capture_hunk(current_hunk, original_lines, offset, last_hunk, hunk_context, fuzzy=fuzzy)
483
- except MissingHunkError:
484
- raise NotImplementedError(f"Could not find hunk in {current_file}:"
485
- f"\n\n{''.join(current_hunk)}")
609
+ ) = capture_hunk(current_hunk, original_lines, offset, last_hunk, current_hunk_header, fuzzy=fuzzy)
610
+ except (MissingHunkError, OutOfOrderHunk) as e:
611
+ e.add_file(current_file)
612
+ raise e
486
613
  fixed_lines.append(fixed_header)
487
614
  fixed_lines.extend(current_hunk)
488
615
  current_hunk = []
489
- hunk_context = match_groups[4]
616
+ current_hunk_header = match_groups
490
617
  case "END_LINE":
491
618
  # if user requested, add a newline at end of file when this marker is present
492
619
  if add_newline:
493
620
  fixed_lines.append("\n")
494
621
  else:
495
- fixed_lines.append(normalize_line(line))
622
+ current_hunk.append(normalize_line(line))
496
623
  case _:
497
624
  # TODO: fix fuzzy string matching to be less granular
498
625
  # this is a normal line, add to current hunk
@@ -504,15 +631,20 @@ def fix_patch(patch_lines, original, remove_binary=False, fuzzy=False, add_newli
            fixed_header,
            offset,
            last_hunk
-        ) = capture_hunk(current_hunk, original_lines, offset, last_hunk, hunk_context, fuzzy=fuzzy)
-    except MissingHunkError:
-        raise NotImplementedError(f"Could not find hunk in {current_file}:"
-                                  f"\n\n{''.join(current_hunk)}")
+        ) = capture_hunk(current_hunk, original_lines, offset, last_hunk, current_hunk_header, fuzzy=fuzzy)
+    except EmptyHunk:
+        return fixed_lines
+    except (MissingHunkError, OutOfOrderHunk) as e:
+        e.add_file(current_file)
+        raise e
    fixed_lines.append(fixed_header)
    fixed_lines.extend(current_hunk)
 
-    # if original file didn't end with a newline, strip out the newline here
-    if original_lines and not original_lines[-1].endswith("\n"):
+    # if original file didn't end with a newline, strip out the newline here,
+    # unless user explicitly requested to add final newline
+    if (not add_newline and
+            ((original_lines and not original_lines[-1].endswith("\n")) or
+             (fixed_lines and len(original_lines) == 0))):
        fixed_lines[-1] = fixed_lines[-1].rstrip("\n")
 
    return fixed_lines
@@ -539,5 +671,4 @@ def main():
 
 
 if __name__ == "__main__":
-    main()
-
+    main()
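Taken together, the new exception classes replace 0.3.4's bare `NotImplementedError` with structured, file-aware errors. A sketch of what they produce, assuming the classes are importable from `patch_fixer.patch_fixer` (output reconstructed from `__str__` above):

```python
from patch_fixer.patch_fixer import MissingHunkError

err = MissingHunkError([" context line\n", "-deleted line\n", "+added line\n"])
err.add_file("src/app.py")
print(err)
# Could not find hunk in src/app.py:
# ================================
#  context line
# -deleted line
# ================================
```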

patch_fixer-0.4.1.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: patch-fixer
-Version: 0.3.4
+Version: 0.4.1
 Summary: Fixes erroneous git apply patches to the best of its ability.
 Maintainer-email: Alex Mueller <amueller474@gmail.com>
 License-Expression: Apache-2.0
@@ -55,6 +55,11 @@ where:
 - `broken.patch` is the malformed patch generated by the LLM
 - `fixed.patch` is the output file containing the (hopefully) fixed patch
 
+Options:
+- `--fuzzy`: enable fuzzy string matching for better context matching (experimental)
+- `--add-newline`: add final newlines when processing "No newline at end of file" markers
+
+
 #### Splitting patches by file:
 ```bash
 # Split with files specified on command line
@@ -81,9 +86,16 @@ original = "/path/to/original/state"  # file or directory being patched
 with open(patch_file, encoding="utf-8") as f:
     patch_lines = f.readlines()
 
+# basic usage
 fixed_lines = fix_patch(patch_lines, original)
-output_file = "/path/to/fixed.patch"
 
+# with fuzzy matching enabled
+fixed_lines = fix_patch(patch_lines, original, fuzzy=True)
+
+# with final newline addition
+fixed_lines = fix_patch(patch_lines, original, add_newline=True)
+
+output_file = "/path/to/fixed.patch"
 with open(output_file, 'w', encoding='utf-8') as f:
     f.writelines(fixed_lines)
 ```

patch_fixer-0.4.1.dist-info/RECORD
@@ -0,0 +1,10 @@
+patch_fixer/__init__.py,sha256=n5DDMr4jbO3epK3ybBvjDyRddTWlWamN6ao5BC7xHFo,65
+patch_fixer/cli.py,sha256=4zy02FsVrUrcQzsBwQ58PVfJXoG4OsOYKpk2JXGw1cY,3841
+patch_fixer/patch_fixer.py,sha256=OuJkwhOq2Q9zcotxIRlT1kBZaD76JCxY5VCMrcSzWnA,28084
+patch_fixer/split.py,sha256=l0rHM6-ZBuB9Iv6Ng6rxqZH5eKfvk2t87j__nDu67kM,3869
+patch_fixer-0.4.1.dist-info/licenses/LICENSE,sha256=z8d0m5b2O9McPEK1xHG_dWgUBT6EfBDz6wA0F7xSPTA,11358
+patch_fixer-0.4.1.dist-info/METADATA,sha256=4O0lHxiYNuta3IjGfLadnqwITFnJHD-gQVvb-lyXGos,4907
+patch_fixer-0.4.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+patch_fixer-0.4.1.dist-info/entry_points.txt,sha256=ftc6dP6B1zJouSPeCCJLZtx-EEGVSrNEwy4YhtnEoxA,53
+patch_fixer-0.4.1.dist-info/top_level.txt,sha256=yyp3KjFgExJsrFsS9ZBCnkhb05xg8hPYhB7ncdpTOv0,12
+patch_fixer-0.4.1.dist-info/RECORD,,

patch_fixer-0.3.4.dist-info/RECORD
@@ -1,10 +0,0 @@
-patch_fixer/__init__.py,sha256=n5DDMr4jbO3epK3ybBvjDyRddTWlWamN6ao5BC7xHFo,65
-patch_fixer/cli.py,sha256=4zy02FsVrUrcQzsBwQ58PVfJXoG4OsOYKpk2JXGw1cY,3841
-patch_fixer/patch_fixer.py,sha256=eqrqe6jKlEWiCjOiLiFnq9oPi1HZPrZBSEsCcEANeFw,23478
-patch_fixer/split.py,sha256=l0rHM6-ZBuB9Iv6Ng6rxqZH5eKfvk2t87j__nDu67kM,3869
-patch_fixer-0.3.4.dist-info/licenses/LICENSE,sha256=z8d0m5b2O9McPEK1xHG_dWgUBT6EfBDz6wA0F7xSPTA,11358
-patch_fixer-0.3.4.dist-info/METADATA,sha256=cV7wioKTFQulrTUB9R_s_lDfDNJDYfwEp3uSho2fqXc,4521
-patch_fixer-0.3.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-patch_fixer-0.3.4.dist-info/entry_points.txt,sha256=ftc6dP6B1zJouSPeCCJLZtx-EEGVSrNEwy4YhtnEoxA,53
-patch_fixer-0.3.4.dist-info/top_level.txt,sha256=yyp3KjFgExJsrFsS9ZBCnkhb05xg8hPYhB7ncdpTOv0,12
-patch_fixer-0.3.4.dist-info/RECORD,,