patch-fixer 0.3.4-py3-none-any.whl → 0.4.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

patch_fixer/patch_fixer.py
@@ -7,22 +7,64 @@ from pathlib import Path
 
 from git import Repo
 
-path_regex = r'(?:[A-Za-z0-9_.-]+/?)+'
+path_regex = r'[^ \n\t]+(?: [^ \n\t]+)*'
 regexes = {
-    "DIFF_LINE": re.compile(rf'diff --git (a/{path_regex}) (b/{path_regex})'),
-    "MODE_LINE": re.compile(r'(new|deleted) file mode [0-7]{6}'),
-    "INDEX_LINE": re.compile(r'index [0-9a-f]{7,64}\.\.[0-9a-f]{7,64}(?: [0-7]{6})?|similarity index ([0-9]+)%'),
-    "BINARY_LINE": re.compile(rf'Binary files (a/{path_regex}|/dev/null) and (b/{path_regex}|/dev/null) differ'),
-    "RENAME_FROM": re.compile(rf'rename from ({path_regex})'),
-    "RENAME_TO": re.compile(rf'rename to ({path_regex})'),
-    "FILE_HEADER_START": re.compile(rf'--- (a/{path_regex}|/dev/null)'),
-    "FILE_HEADER_END": re.compile(rf'\+\+\+ (b/{path_regex}|/dev/null)'),
-    "HUNK_HEADER": re.compile(r'^@@ -(\d+),(\d+) \+(\d+),(\d+) @@(.*)$'),
-    "END_LINE": re.compile(r'\\ No newline at end of file')
+    "DIFF_LINE": re.compile(rf'^diff --git (a/{path_regex}) (b/{path_regex})$'),
+    "MODE_LINE": re.compile(r'^(new|deleted) file mode [0-7]{6}$'),
+    "INDEX_LINE": re.compile(r'^index [0-9a-f]{7,64}\.\.[0-9a-f]{7,64}(?: [0-7]{6})?$|^similarity index ([0-9]+)%$'),
+    "BINARY_LINE": re.compile(rf'^Binary files (a/{path_regex}|/dev/null) and (b/{path_regex}|/dev/null) differ$'),
+    "RENAME_FROM": re.compile(rf'^rename from ({path_regex})$'),
+    "RENAME_TO": re.compile(rf'^rename to ({path_regex})$'),
+    "FILE_HEADER_START": re.compile(rf'^--- (a/{path_regex}|/dev/null)$'),
+    "FILE_HEADER_END": re.compile(rf'^\+\+\+ (b/{path_regex}|/dev/null)$'),
+    "HUNK_HEADER": re.compile(r'^@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@(.*)$'),
+    "END_LINE": re.compile(r'^\\ No newline at end of file$'),
 }
 
 
-class MissingHunkError(Exception):
+class HunkErrorBase(Exception):
+    def __init__(self, hunk_lines, file="(unknown file)"):
+        super().__init__()
+        self.hunk = "".join(hunk_lines)
+        self.file = file
+
+    def format_hunk_for_error(self):
+        """Format hunk for error messages, showing only context and deletion lines."""
+        error_lines = []
+        for line in self.hunk.splitlines(keepends=True):
+            if line.startswith((' ', '-')):  # context or deletion lines
+                error_lines.append(line)
+            # skip addition lines (+) as they shouldn't be in the original file
+        return ''.join(error_lines)
+
+    def add_file(self, file):
+        self.file = file
+
+
+class MissingHunkError(HunkErrorBase):
+    def __str__(self):
+        return (f"Could not find hunk in {self.file}:"
+                f"\n================================"
+                f"\n{self.format_hunk_for_error()}"
+                f"================================")
+
+
+class OutOfOrderHunk(HunkErrorBase):
+    def __init__(self, hunk_lines, prev_header, file="(unknown file)"):
+        super().__init__(hunk_lines, file)
+        self.prev_header = prev_header
+
+    def __str__(self):
+        return (f"Out of order hunk in {self.file}:"
+                f"\n==============================="
+                f"\n{self.format_hunk_for_error()}"
+                f"==============================="
+                f"\nOccurs before previous hunk with header {self.prev_header}")
+
+
+class EmptyHunk(Exception):
+    # don't inherit from HunkErrorBase since this is a sentinel exception
+    # meant to catch the case where the very last hunk is empty
     pass
 
 
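The HUNK_HEADER change above is the central parsing fix in this release: git emits condensed hunk headers such as `@@ -5 +5,2 @@` when a line count is 1, and the 0.3.4 pattern required both counts. A quick standalone check of the two patterns (not from the package, just an illustration):

```python
import re

old = re.compile(r'^@@ -(\d+),(\d+) \+(\d+),(\d+) @@(.*)$')
new = re.compile(r'^@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@(.*)$')

header = "@@ -5 +5,2 @@ def main():"
print(old.match(header))           # None: 0.3.4 rejected condensed headers
print(new.match(header).groups())  # ('5', None, '5', '2', ' def main():')
```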
@@ -64,11 +106,12 @@ def normalize_line(line):
 
 def fuzzy_line_similarity(line1, line2, threshold=0.8):
     """Calculate similarity between two lines using a simple ratio."""
-    if not line1 or not line2:
-        return 0.0
-
     l1, l2 = line1.strip(), line2.strip()
 
+    # empty strings are identical
+    if len(l1) == 0 and len(l2) == 0:
+        return 1.0
+
     if l1 == l2:
         return 1.0
 
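The effect of this change, assuming `fuzzy_line_similarity` is importable from the `patch_fixer.patch_fixer` module listed in RECORD below (a sketch, not package documentation): blank-vs-falsy inputs now score as identical rather than as a guaranteed mismatch.

```python
from patch_fixer.patch_fixer import fuzzy_line_similarity

fuzzy_line_similarity("", "")    # 0.3.4: 0.0 (falsy input), 0.4.1: 1.0
fuzzy_line_similarity("", "  ")  # 0.3.4: 0.0; 0.4.1: both strip to "" -> 1.0
fuzzy_line_similarity("foo", "foo")  # 1.0 in both versions
```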
@@ -88,7 +131,10 @@ def find_hunk_start(context_lines, original_lines, fuzzy=False):
     """Search original_lines for context_lines and return start line index (0-based)."""
     ctx = []
     for line in context_lines:
-        if line.startswith(" "):
+        if regexes["END_LINE"].match(line):
+            # "\ No newline at end of file" is just git metadata; skip
+            continue
+        elif line.startswith(" "):
             ctx.append(line.lstrip(" "))
         elif line.startswith("-"):
             # can't use lstrip; we want to keep other dashes in the line
@@ -124,7 +170,7 @@ def find_hunk_start(context_lines, original_lines, fuzzy=False):
     if best_match_score > 0.6:
         return best_match_pos
 
-    return 0
+    raise MissingHunkError(context_lines)
 
 
 def match_line(line):
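This is a contract change for callers: 0.3.4 silently returned 0 when the context matched nowhere, anchoring the hunk at the top of the file, while 0.4.1 fails loudly. A minimal sketch of the new calling pattern, with hypothetical file contents and the import path assumed from RECORD:

```python
from patch_fixer.patch_fixer import MissingHunkError, find_hunk_start

original_lines = ["def foo():", "    return 1"]
hunk = [" this context exists nowhere\n"]

try:
    start = find_hunk_start(hunk, original_lines)
except MissingHunkError as e:
    e.add_file("example.py")
    print(e)  # formatted error instead of a silent anchor at line 1
```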
@@ -156,40 +202,131 @@ def reconstruct_file_header(diff_line, header_type):
     raise ValueError(f"Unsupported header type: {header_type}")
 
 
-def capture_hunk(current_hunk, original_lines, offset, last_hunk, hunk_context, fuzzy=False):
+def find_all_hunk_starts(hunk_lines, search_lines, fuzzy=False):
+    """Return all line indices in search_lines where this hunk matches."""
+    matches = []
+    start = 0
+    while True:
+        try:
+            idx = find_hunk_start(hunk_lines, search_lines[start:], fuzzy=fuzzy)
+            matches.append(start + idx)
+            start += idx + 1
+        except MissingHunkError:
+            break
+    return matches
+
+
+def capture_hunk(current_hunk, original_lines, offset, last_hunk, old_header, fuzzy=False):
+    """
+    Try to locate the hunk's true position in the original file.
+
+    If multiple possible matches exist, pick the one closest to the expected
+    (possibly corrupted) line number derived from the old hunk header.
+    """
+    if not current_hunk:
+        raise EmptyHunk
+
+    # extract needed info from old header match groups
+    expected_old_start = int(old_header[0]) if old_header else 0
+    try:
+        hunk_context = old_header[4]
+    except IndexError:
+        hunk_context = ""
+
+    # presence or absence of end line shouldn't affect line counts
+    if regexes["END_LINE"].match(current_hunk[-1]):
+        hunk_len = len(current_hunk) - 1
+    else:
+        hunk_len = len(current_hunk)
+
     # compute line counts
-    old_count = sum(1 for l in current_hunk if l.startswith((' ', '-')))
-    new_count = sum(1 for l in current_hunk if l.startswith((' ', '+')))
+    context_count = sum(1 for l in current_hunk if l.startswith(' '))
+    minus_count = sum(1 for l in current_hunk if l.startswith('-'))
+    plus_count = sum(1 for l in current_hunk if l.startswith('+'))
 
-    if old_count > 0:
-        # compute starting line in original file
-        old_start = find_hunk_start(current_hunk, original_lines, fuzzy=fuzzy) + 1
+    old_count = context_count + minus_count
+    new_count = context_count + plus_count
 
-        # if the line number descends, we either have a bad match or a new file
-        if old_start < last_hunk:
-            raise MissingHunkError
+    if minus_count == hunk_len:  # file deletion
+        old_start = 1
+        new_start = 0
+    elif plus_count == hunk_len:  # file creation
+        old_start = 0
+        new_start = 1
+    else:  # file modification
+        search_index = last_hunk
+        search_lines = original_lines[search_index:]
+
+        # gather *all* possible matches
+        matches = find_all_hunk_starts(current_hunk, search_lines, fuzzy=fuzzy)
+        if matches:
+            # rebase to file line numbers (1-indexed later)
+            candidate_positions = [m + search_index for m in matches]
+
+            if expected_old_start:
+                # choose the one closest to the expected position
+                old_start = min(
+                    candidate_positions,
+                    key=lambda pos: abs(pos + 1 - expected_old_start),
+                ) + 1  # convert to 1-indexed
+            else:
+                # pick first match if no expected line info
+                old_start = candidate_positions[0] + 1
         else:
-            if new_count == 0:
-                # complete deletion of remaining content
-                new_start = 0
+            # try from start of file, excluding lines already searched
+            search_index += hunk_len
+            search_lines = original_lines[:search_index]
+            matches = find_all_hunk_starts(current_hunk, search_lines, fuzzy=fuzzy)
+            if not matches:
+                raise MissingHunkError(current_hunk)
+            if expected_old_start:
+                old_start = (
+                    min(matches, key=lambda pos: abs(pos + 1 - expected_old_start)) + 1
+                )
             else:
-                new_start = old_start + offset
-    else:
-        # old count of zero can only mean file creation, since adding lines to
-        # an existing file requires surrounding context lines without a +
-        old_start = 0
-        new_start = 1  # line numbers are 1-indexed in the real world
+                old_start = matches[0] + 1
+
+    if old_start < last_hunk + 1:
+        raise OutOfOrderHunk(current_hunk, original_lines[last_hunk])
+
+    if new_count == 0:
+        # complete deletion of remaining content
+        new_start = 0
+    else:
+        new_start = old_start + offset
 
     offset += (new_count - old_count)
 
-    last_hunk = old_start
+    last_hunk += (old_start - last_hunk)
 
-    # write corrected header
-    fixed_header = f"@@ -{old_start},{old_count} +{new_start},{new_count} @@{hunk_context}\n"
+    # use condensed header if it's only one line
+    old_part = f"{old_start},{old_count}" if old_count != 1 else f"{old_start}"
+    new_part = f"{new_start},{new_count}" if new_count != 1 else f"{new_start}"
+
+    fixed_header = f"@@ -{old_part} +{new_part} @@{hunk_context}\n"
 
     return fixed_header, offset, last_hunk
 
 
+def read_file_with_fallback_encoding(file_path):
+    """Read file with UTF-8, falling back to other encodings if needed."""
+    encodings = ['utf-8', 'latin-1', 'cp1252', 'iso-8859-1']
+
+    for encoding in encodings:
+        try:
+            with open(file_path, 'r', encoding=encoding) as f:
+                return f.readlines()
+        except UnicodeDecodeError:
+            continue
+
+    # If all encodings fail, read as binary and replace problematic characters
+    with open(file_path, 'rb') as f:
+        content = f.read()
+    # Decode with UTF-8, replacing errors
+    text_content = content.decode('utf-8', errors='replace')
+    return text_content.splitlines(keepends=True)
+
+
 def regenerate_index(old_path, new_path, cur_dir):
     repo = Repo(cur_dir)
 
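Two behaviors of the new `capture_hunk` are worth seeing in isolation: when a hunk's context matches at several positions, the candidate whose 1-indexed position is nearest the line number claimed by the (possibly wrong) old header wins, and the corrected header is condensed when a count is 1, matching git's own output. A standalone sketch of both, with made-up numbers:

```python
# three 0-based positions where the hunk's context matched
candidate_positions = [3, 41, 97]
expected_old_start = 40  # 1-based start claimed by the broken @@ header

old_start = min(
    candidate_positions,
    key=lambda pos: abs(pos + 1 - expected_old_start),
) + 1
print(old_start)  # 42 -- the match closest to the header's claim

# condensed header formatting, as in capture_hunk
old_count, new_start, new_count, hunk_context = 1, 42, 2, " def main():"
old_part = f"{old_start},{old_count}" if old_count != 1 else f"{old_start}"
new_part = f"{new_start},{new_count}" if new_count != 1 else f"{new_start}"
print(f"@@ -{old_part} +{new_part} @@{hunk_context}")  # @@ -42 +42,2 @@ def main():
```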
@@ -235,10 +372,9 @@ def fix_patch(patch_lines, original, remove_binary=False, fuzzy=False, add_newli
     file_start_header = False
     file_end_header = False
     look_for_rename = False
-    similarity_index = None
     missing_index = False
     binary_file = False
-    hunk_context = ""
+    current_hunk_header = ()
     original_lines = []
     file_loaded = False
 
@@ -253,10 +389,10 @@ def fix_patch(patch_lines, original, remove_binary=False, fuzzy=False, add_newli
                        fixed_header,
                        offset,
                        last_hunk
-                    ) = capture_hunk(current_hunk, original_lines, offset, last_hunk, hunk_context, fuzzy=fuzzy)
-                except MissingHunkError:
-                    raise NotImplementedError(f"Could not find hunk in {current_file}:"
-                                              f"\n\n{''.join(current_hunk)}")
+                    ) = capture_hunk(current_hunk, original_lines, offset, last_hunk, current_hunk_header, fuzzy=fuzzy)
+                except (MissingHunkError, OutOfOrderHunk) as e:
+                    e.add_file(current_file)
+                    raise e
                 fixed_lines.append(fixed_header)
                 fixed_lines.extend(current_hunk)
                 current_hunk = []
@@ -310,20 +446,11 @@ def fix_patch(patch_lines, original, remove_binary=False, fuzzy=False, add_newli
                current_path = Path(current_file).absolute()
                offset = 0
                last_hunk = 0
-                if not Path.exists(current_path):
-                    # this is meant to handle cases where the source file
-                    # doesn't exist (e.g., when applying a patch that renames
-                    # a file created earlier in the same patch)
-                    # TODO: but really, does that ever happen???
-                    fixed_lines.append(normalize_line(line))
-                    look_for_rename = True
-                    file_loaded = False
-                    continue
                if not current_path.is_file():
                    raise IsADirectoryError(f"Rename from header points to a directory, not a file: {current_file}")
                if dir_mode or current_path == original_path:
-                    with open(current_path, encoding='utf-8') as f:
-                        original_lines = [l.rstrip('\n') for l in f.readlines()]
+                    file_lines = read_file_with_fallback_encoding(current_path)
+                    original_lines = [l.rstrip('\n') for l in file_lines]
                    fixed_lines.append(normalize_line(line))
                    file_loaded = True
                else:
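The switch to `read_file_with_fallback_encoding` here (and in the file-header path below) is what lets 0.4.1 read patched files that are not valid UTF-8. A quick illustration of the failure the fallback chain absorbs; note that latin-1 can decode any byte sequence, so in practice the chain never gets past its first fallback:

```python
data = "café".encode("latin-1")  # b'caf\xe9' is not valid UTF-8
try:
    data.decode("utf-8")
except UnicodeDecodeError:
    print(data.decode("latin-1"))  # 'café' -- the first fallback succeeds
```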
@@ -335,7 +462,7 @@ def fix_patch(patch_lines, original, remove_binary=False, fuzzy=False, add_newli
                        last_index = i - 2
                    else:
                        raise NotImplementedError("Missing `rename from` header not yet supported.")
-                if not look_for_rename:
+                if not file_loaded:
                    # if we're not looking for a rename but encounter "rename to",
                    # this indicates a malformed patch - log warning but continue
                    warnings.warn(
@@ -382,8 +509,8 @@ def fix_patch(patch_lines, original, remove_binary=False, fuzzy=False, add_newli
                    raise IsADirectoryError(f"File header start points to a directory, not a file: {current_file}")
                if not file_loaded:
                    if dir_mode or Path(current_file) == Path(original):
-                        with open(current_file, encoding='utf-8') as f:
-                            original_lines = [l.rstrip('\n') for l in f.readlines()]
+                        file_lines = read_file_with_fallback_encoding(current_path)
+                        original_lines = [l.rstrip('\n') for l in file_lines]
                        file_loaded = True
                    else:
                        raise FileNotFoundError(f"Filename {current_file} in header does not match argument {original}")
@@ -471,7 +598,7 @@ def fix_patch(patch_lines, original, remove_binary=False, fuzzy=False, add_newli
                # we can't fix the hunk header before we've captured a hunk
                if first_hunk:
                    first_hunk = False
-                    hunk_context = match_groups[4]
+                    current_hunk_header = match_groups
                    continue
 
                try:
  try:
@@ -479,20 +606,20 @@ def fix_patch(patch_lines, original, remove_binary=False, fuzzy=False, add_newli
479
606
  fixed_header,
480
607
  offset,
481
608
  last_hunk
482
- ) = capture_hunk(current_hunk, original_lines, offset, last_hunk, hunk_context, fuzzy=fuzzy)
483
- except MissingHunkError:
484
- raise NotImplementedError(f"Could not find hunk in {current_file}:"
485
- f"\n\n{''.join(current_hunk)}")
609
+ ) = capture_hunk(current_hunk, original_lines, offset, last_hunk, current_hunk_header, fuzzy=fuzzy)
610
+ except (MissingHunkError, OutOfOrderHunk) as e:
611
+ e.add_file(current_file)
612
+ raise e
486
613
  fixed_lines.append(fixed_header)
487
614
  fixed_lines.extend(current_hunk)
488
615
  current_hunk = []
489
- hunk_context = match_groups[4]
616
+ current_hunk_header = match_groups
490
617
  case "END_LINE":
491
618
  # if user requested, add a newline at end of file when this marker is present
492
619
  if add_newline:
493
620
  fixed_lines.append("\n")
494
621
  else:
495
- fixed_lines.append(normalize_line(line))
622
+ current_hunk.append(normalize_line(line))
496
623
  case _:
497
624
  # TODO: fix fuzzy string matching to be less granular
498
625
  # this is a normal line, add to current hunk
@@ -504,15 +631,20 @@ def fix_patch(patch_lines, original, remove_binary=False, fuzzy=False, add_newli
            fixed_header,
            offset,
            last_hunk
-        ) = capture_hunk(current_hunk, original_lines, offset, last_hunk, hunk_context, fuzzy=fuzzy)
-    except MissingHunkError:
-        raise NotImplementedError(f"Could not find hunk in {current_file}:"
-                                  f"\n\n{''.join(current_hunk)}")
+        ) = capture_hunk(current_hunk, original_lines, offset, last_hunk, current_hunk_header, fuzzy=fuzzy)
+    except EmptyHunk:
+        return fixed_lines
+    except (MissingHunkError, OutOfOrderHunk) as e:
+        e.add_file(current_file)
+        raise e
    fixed_lines.append(fixed_header)
    fixed_lines.extend(current_hunk)
 
-    # if original file didn't end with a newline, strip out the newline here
-    if original_lines and not original_lines[-1].endswith("\n"):
+    # if original file didn't end with a newline, strip out the newline here,
+    # unless user explicitly requested to add final newline
+    if (not add_newline and
+            ((original_lines and not original_lines[-1].endswith("\n")) or
+             (fixed_lines and len(original_lines) == 0))):
        fixed_lines[-1] = fixed_lines[-1].rstrip("\n")
 
    return fixed_lines
@@ -539,5 +671,4 @@ def main():
 
 
 if __name__ == "__main__":
-    main()
-
+    main()
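Taken together, the new exception classes replace 0.3.4's bare `NotImplementedError` with structured, file-aware errors. A sketch of what they produce, assuming the classes are importable from `patch_fixer.patch_fixer` (output reconstructed from `__str__` above):

```python
from patch_fixer.patch_fixer import MissingHunkError

err = MissingHunkError([" context line\n", "-deleted line\n", "+added line\n"])
err.add_file("src/app.py")
print(err)
# Could not find hunk in src/app.py:
# ================================
#  context line
# -deleted line
# ================================
```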

patch_fixer-0.4.1.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: patch-fixer
-Version: 0.3.4
+Version: 0.4.1
 Summary: Fixes erroneous git apply patches to the best of its ability.
 Maintainer-email: Alex Mueller <amueller474@gmail.com>
 License-Expression: Apache-2.0
@@ -55,6 +55,11 @@ where:
 - `broken.patch` is the malformed patch generated by the LLM
 - `fixed.patch` is the output file containing the (hopefully) fixed patch
 
+Options:
+- `--fuzzy`: enable fuzzy string matching for better context matching (experimental)
+- `--add-newline`: add final newlines when processing "No newline at end of file" markers
+
+
 #### Splitting patches by file:
 ```bash
 # Split with files specified on command line
@@ -81,9 +86,16 @@ original = "/path/to/original/state"  # file or directory being patched
 with open(patch_file, encoding="utf-8") as f:
     patch_lines = f.readlines()
 
+# basic usage
 fixed_lines = fix_patch(patch_lines, original)
-output_file = "/path/to/fixed.patch"
 
+# with fuzzy matching enabled
+fixed_lines = fix_patch(patch_lines, original, fuzzy=True)
+
+# with final newline addition
+fixed_lines = fix_patch(patch_lines, original, add_newline=True)
+
+output_file = "/path/to/fixed.patch"
 with open(output_file, 'w', encoding='utf-8') as f:
     f.writelines(fixed_lines)
 ```

patch_fixer-0.4.1.dist-info/RECORD
@@ -0,0 +1,10 @@
+patch_fixer/__init__.py,sha256=n5DDMr4jbO3epK3ybBvjDyRddTWlWamN6ao5BC7xHFo,65
+patch_fixer/cli.py,sha256=4zy02FsVrUrcQzsBwQ58PVfJXoG4OsOYKpk2JXGw1cY,3841
+patch_fixer/patch_fixer.py,sha256=OuJkwhOq2Q9zcotxIRlT1kBZaD76JCxY5VCMrcSzWnA,28084
+patch_fixer/split.py,sha256=l0rHM6-ZBuB9Iv6Ng6rxqZH5eKfvk2t87j__nDu67kM,3869
+patch_fixer-0.4.1.dist-info/licenses/LICENSE,sha256=z8d0m5b2O9McPEK1xHG_dWgUBT6EfBDz6wA0F7xSPTA,11358
+patch_fixer-0.4.1.dist-info/METADATA,sha256=4O0lHxiYNuta3IjGfLadnqwITFnJHD-gQVvb-lyXGos,4907
+patch_fixer-0.4.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+patch_fixer-0.4.1.dist-info/entry_points.txt,sha256=ftc6dP6B1zJouSPeCCJLZtx-EEGVSrNEwy4YhtnEoxA,53
+patch_fixer-0.4.1.dist-info/top_level.txt,sha256=yyp3KjFgExJsrFsS9ZBCnkhb05xg8hPYhB7ncdpTOv0,12
+patch_fixer-0.4.1.dist-info/RECORD,,

patch_fixer-0.3.4.dist-info/RECORD
@@ -1,10 +0,0 @@
-patch_fixer/__init__.py,sha256=n5DDMr4jbO3epK3ybBvjDyRddTWlWamN6ao5BC7xHFo,65
-patch_fixer/cli.py,sha256=4zy02FsVrUrcQzsBwQ58PVfJXoG4OsOYKpk2JXGw1cY,3841
-patch_fixer/patch_fixer.py,sha256=eqrqe6jKlEWiCjOiLiFnq9oPi1HZPrZBSEsCcEANeFw,23478
-patch_fixer/split.py,sha256=l0rHM6-ZBuB9Iv6Ng6rxqZH5eKfvk2t87j__nDu67kM,3869
-patch_fixer-0.3.4.dist-info/licenses/LICENSE,sha256=z8d0m5b2O9McPEK1xHG_dWgUBT6EfBDz6wA0F7xSPTA,11358
-patch_fixer-0.3.4.dist-info/METADATA,sha256=cV7wioKTFQulrTUB9R_s_lDfDNJDYfwEp3uSho2fqXc,4521
-patch_fixer-0.3.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-patch_fixer-0.3.4.dist-info/entry_points.txt,sha256=ftc6dP6B1zJouSPeCCJLZtx-EEGVSrNEwy4YhtnEoxA,53
-patch_fixer-0.3.4.dist-info/top_level.txt,sha256=yyp3KjFgExJsrFsS9ZBCnkhb05xg8hPYhB7ncdpTOv0,12
-patch_fixer-0.3.4.dist-info/RECORD,,