patch-fixer 0.3.3__tar.gz → 0.3.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: patch-fixer
3
- Version: 0.3.3
3
+ Version: 0.3.4
4
4
  Summary: Fixes erroneous git apply patches to the best of its ability.
5
5
  Maintainer-email: Alex Mueller <amueller474@gmail.com>
6
6
  License-Expression: Apache-2.0
@@ -107,6 +107,13 @@ with open("excluded.patch", 'w', encoding='utf-8') as f:
107
107
  f.writelines(excluded)
108
108
  ```
109
109
 
110
+ ## Known Limitations
111
+
112
+ - When fixing patches with missing `index` lines, the tool requires the files to be in a git repository to regenerate the index. This is only needed for file deletions and renames.
113
+ - `patch-fixer` assumes the patch follows git's unified diff format.
114
+ - Current implementation is not very robust to corrupted hunk content.
115
+ - Much more comprehensive fuzzy string matching is planned.
116
+
110
117
  ## Local Testing
111
118
  ```bash
112
119
  git clone https://github.com/ajcm474/patch-fixer.git
@@ -78,6 +78,13 @@ with open("excluded.patch", 'w', encoding='utf-8') as f:
78
78
  f.writelines(excluded)
79
79
  ```
80
80
 
81
+ ## Known Limitations
82
+
83
+ - When fixing patches with missing `index` lines, the tool requires the files to be in a git repository to regenerate the index. This is only needed for file deletions and renames.
84
+ - `patch-fixer` assumes the patch follows git's unified diff format.
85
+ - Current implementation is not very robust to corrupted hunk content.
86
+ - Much more comprehensive fuzzy string matching is planned.
87
+
81
88
  ## Local Testing
82
89
  ```bash
83
90
  git clone https://github.com/ajcm474/patch-fixer.git
@@ -14,7 +14,12 @@ def fix_command(args):
14
14
  with open(args.broken_patch, encoding='utf-8') as f:
15
15
  patch_lines = f.readlines()
16
16
 
17
- fixed_lines = fix_patch(patch_lines, args.original)
17
+ fixed_lines = fix_patch(
18
+ patch_lines,
19
+ args.original,
20
+ fuzzy=args.fuzzy,
21
+ add_newline=args.add_newline
22
+ )
18
23
 
19
24
  with open(args.output, 'w', encoding='utf-8') as f:
20
25
  f.writelines(fixed_lines)
@@ -77,6 +82,16 @@ def main():
77
82
  'output',
78
83
  help='Path where the fixed patch will be written'
79
84
  )
85
+ fix_parser.add_argument(
86
+ '--fuzzy',
87
+ action='store_true',
88
+ help='Enable fuzzy string matching when finding hunks in original files'
89
+ )
90
+ fix_parser.add_argument(
91
+ '--add-newline',
92
+ action='store_true',
93
+ help='Add final newline when processing "No newline at end of file" markers'
94
+ )
80
95
 
81
96
  # split command
82
97
  split_parser = subparsers.add_parser(
@@ -2,6 +2,7 @@
2
2
  import os
3
3
  import re
4
4
  import sys
5
+ import warnings
5
6
  from pathlib import Path
6
7
 
7
8
  from git import Repo
@@ -61,7 +62,29 @@ def normalize_line(line):
61
62
  return core + "\n"
62
63
 
63
64
 
64
- def find_hunk_start(context_lines, original_lines):
65
+ def fuzzy_line_similarity(line1, line2, threshold=0.8):
66
+ """Calculate similarity between two lines using a simple ratio."""
67
+ if not line1 or not line2:
68
+ return 0.0
69
+
70
+ l1, l2 = line1.strip(), line2.strip()
71
+
72
+ if l1 == l2:
73
+ return 1.0
74
+
75
+ if len(l1) == 0 or len(l2) == 0:
76
+ return 0.0
77
+
78
+ # count common characters
79
+ common = 0
80
+ for char in set(l1) & set(l2):
81
+ common += min(l1.count(char), l2.count(char))
82
+
83
+ total_chars = len(l1) + len(l2)
84
+ return (2.0 * common) / total_chars if total_chars > 0 else 0.0
85
+
86
+
87
+ def find_hunk_start(context_lines, original_lines, fuzzy=False):
65
88
  """Search original_lines for context_lines and return start line index (0-based)."""
66
89
  ctx = []
67
90
  for line in context_lines:
@@ -74,11 +97,33 @@ def find_hunk_start(context_lines, original_lines):
74
97
  ctx.append(line)
75
98
  if not ctx:
76
99
  raise ValueError("Cannot search for empty hunk.")
100
+
101
+ # first try exact matching
77
102
  for i in range(len(original_lines) - len(ctx) + 1):
78
103
  # this part will fail if the diff is malformed beyond hunk header
79
- equal_lines = [original_lines[i+j].strip() == ctx[j].strip() for j in range(len(ctx))]
104
+ equal_lines = [original_lines[i + j].strip() == ctx[j].strip() for j in range(len(ctx))]
80
105
  if all(equal_lines):
81
106
  return i
107
+
108
+ # if fuzzy matching is enabled and exact match failed, try fuzzy match
109
+ if fuzzy:
110
+ best_match_score = 0.0
111
+ best_match_pos = 0
112
+
113
+ for i in range(len(original_lines) - len(ctx) + 1):
114
+ total_similarity = 0.0
115
+ for j in range(len(ctx)):
116
+ similarity = fuzzy_line_similarity(original_lines[i + j], ctx[j])
117
+ total_similarity += similarity
118
+
119
+ avg_similarity = total_similarity / len(ctx)
120
+ if avg_similarity > best_match_score and avg_similarity > 0.6:
121
+ best_match_score = avg_similarity
122
+ best_match_pos = i
123
+
124
+ if best_match_score > 0.6:
125
+ return best_match_pos
126
+
82
127
  return 0
83
128
 
84
129
 
@@ -111,14 +156,14 @@ def reconstruct_file_header(diff_line, header_type):
111
156
  raise ValueError(f"Unsupported header type: {header_type}")
112
157
 
113
158
 
114
- def capture_hunk(current_hunk, original_lines, offset, last_hunk, hunk_context):
159
+ def capture_hunk(current_hunk, original_lines, offset, last_hunk, hunk_context, fuzzy=False):
115
160
  # compute line counts
116
161
  old_count = sum(1 for l in current_hunk if l.startswith((' ', '-')))
117
162
  new_count = sum(1 for l in current_hunk if l.startswith((' ', '+')))
118
163
 
119
164
  if old_count > 0:
120
165
  # compute starting line in original file
121
- old_start = find_hunk_start(current_hunk, original_lines) + 1
166
+ old_start = find_hunk_start(current_hunk, original_lines, fuzzy=fuzzy) + 1
122
167
 
123
168
  # if the line number descends, we either have a bad match or a new file
124
169
  if old_start < last_hunk:
@@ -147,7 +192,11 @@ def capture_hunk(current_hunk, original_lines, offset, last_hunk, hunk_context):
147
192
 
148
193
  def regenerate_index(old_path, new_path, cur_dir):
149
194
  repo = Repo(cur_dir)
150
- mode = " 100644" # TODO: check if mode can be a different number
195
+
196
+ # Common git file modes: 100644 (regular file), 100755 (executable file),
197
+ # 120000 (symbolic link), 160000 (submodule), 040000 (tree/directory)
198
+ # TODO: guess mode based on above information
199
+ mode = " 100644"
151
200
 
152
201
  # file deletion
153
202
  if new_path == "/dev/null":
@@ -164,12 +213,15 @@ def regenerate_index(old_path, new_path, cur_dir):
164
213
  return f"index {old_sha}..{new_sha}{mode}\n"
165
214
 
166
215
 
167
- def fix_patch(patch_lines, original, remove_binary=False):
216
+ def fix_patch(patch_lines, original, remove_binary=False, fuzzy=False, add_newline=False):
168
217
  dir_mode = os.path.isdir(original)
169
218
  original_path = Path(original).absolute()
170
219
 
171
220
  # make relative paths in the diff work
172
- os.chdir(original_path)
221
+ if dir_mode:
222
+ os.chdir(original_path)
223
+ else:
224
+ os.chdir(original_path.parent)
173
225
 
174
226
  fixed_lines = []
175
227
  current_hunk = []
@@ -201,7 +253,7 @@ def fix_patch(patch_lines, original, remove_binary=False):
201
253
  fixed_header,
202
254
  offset,
203
255
  last_hunk
204
- ) = capture_hunk(current_hunk, original_lines, offset, last_hunk, hunk_context)
256
+ ) = capture_hunk(current_hunk, original_lines, offset, last_hunk, hunk_context, fuzzy=fuzzy)
205
257
  except MissingHunkError:
206
258
  raise NotImplementedError(f"Could not find hunk in {current_file}:"
207
259
  f"\n\n{''.join(current_hunk)}")
@@ -224,7 +276,12 @@ def fix_patch(patch_lines, original, remove_binary=False):
224
276
  last_mode = i
225
277
  fixed_lines.append(normalize_line(line))
226
278
  case "INDEX_LINE":
227
- # TODO: verify that mode is present for anything but deletion
279
+ # mode should be present in index line for all operations except file deletion
280
+ # for deletions, the mode is omitted since the file no longer exists
281
+ index_line = normalize_line(line).strip()
282
+ if not index_line.endswith("..0000000") and not re.search(r' [0-7]{6}$', index_line):
283
+ # TODO: this is the right idea, but a poor implementation
284
+ pass
228
285
  last_index = i
229
286
  similarity_index = match_groups[0]
230
287
  if similarity_index:
@@ -238,7 +295,9 @@ def fix_patch(patch_lines, original, remove_binary=False):
238
295
  fixed_lines.append(normalize_line(line))
239
296
  case "RENAME_FROM":
240
297
  if not look_for_rename:
241
- pass # TODO: handle missing index line
298
+ # handle case where rename from appears without corresponding index line
299
+ # this may indicate a malformed patch, but we can try to continue
300
+ warnings.warn(f"Warning: 'rename from' found without expected index line at line {i+1}")
242
301
  if binary_file:
243
302
  raise NotImplementedError("Renaming binary files not yet supported")
244
303
  if last_index != i - 1:
@@ -252,7 +311,10 @@ def fix_patch(patch_lines, original, remove_binary=False):
252
311
  offset = 0
253
312
  last_hunk = 0
254
313
  if not Path.exists(current_path):
255
- # TODO: verify whether this block is necessary at all
314
+ # this is meant to handle cases where the source file
315
+ # doesn't exist (e.g., when applying a patch that renames
316
+ # a file created earlier in the same patch)
317
+ # TODO: but really, does that ever happen???
256
318
  fixed_lines.append(normalize_line(line))
257
319
  look_for_rename = True
258
320
  file_loaded = False
@@ -273,7 +335,12 @@ def fix_patch(patch_lines, original, remove_binary=False):
273
335
  last_index = i - 2
274
336
  else:
275
337
  raise NotImplementedError("Missing `rename from` header not yet supported.")
276
- # TODO: do something sensible if `look_for_rename` is false
338
+ if not look_for_rename:
339
+ # if we're not looking for a rename but encounter "rename to",
340
+ # this indicates a malformed patch - log warning but continue
341
+ warnings.warn(
342
+ f"Warning: unexpected 'rename to' found at line {i + 1} without corresponding 'rename from'"
343
+ )
277
344
  current_file = match_groups[0]
278
345
  current_path = Path(current_file).absolute()
279
346
  if current_file and current_path.is_dir():
@@ -412,7 +479,7 @@ def fix_patch(patch_lines, original, remove_binary=False):
412
479
  fixed_header,
413
480
  offset,
414
481
  last_hunk
415
- ) = capture_hunk(current_hunk, original_lines, offset, last_hunk, hunk_context)
482
+ ) = capture_hunk(current_hunk, original_lines, offset, last_hunk, hunk_context, fuzzy=fuzzy)
416
483
  except MissingHunkError:
417
484
  raise NotImplementedError(f"Could not find hunk in {current_file}:"
418
485
  f"\n\n{''.join(current_hunk)}")
@@ -421,10 +488,13 @@ def fix_patch(patch_lines, original, remove_binary=False):
421
488
  current_hunk = []
422
489
  hunk_context = match_groups[4]
423
490
  case "END_LINE":
424
- # TODO: add newline at end of file if user requests
425
- fixed_lines.append(normalize_line(line))
491
+ # if user requested, add a newline at end of file when this marker is present
492
+ if add_newline:
493
+ fixed_lines.append("\n")
494
+ else:
495
+ fixed_lines.append(normalize_line(line))
426
496
  case _:
427
- # TODO: fuzzy string matching
497
+ # TODO: fix fuzzy string matching to be less granular
428
498
  # this is a normal line, add to current hunk
429
499
  current_hunk.append(normalize_line(line))
430
500
 
@@ -434,7 +504,7 @@ def fix_patch(patch_lines, original, remove_binary=False):
434
504
  fixed_header,
435
505
  offset,
436
506
  last_hunk
437
- ) = capture_hunk(current_hunk, original_lines, offset, last_hunk, hunk_context)
507
+ ) = capture_hunk(current_hunk, original_lines, offset, last_hunk, hunk_context, fuzzy=fuzzy)
438
508
  except MissingHunkError:
439
509
  raise NotImplementedError(f"Could not find hunk in {current_file}:"
440
510
  f"\n\n{''.join(current_hunk)}")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: patch-fixer
3
- Version: 0.3.3
3
+ Version: 0.3.4
4
4
  Summary: Fixes erroneous git apply patches to the best of its ability.
5
5
  Maintainer-email: Alex Mueller <amueller474@gmail.com>
6
6
  License-Expression: Apache-2.0
@@ -107,6 +107,13 @@ with open("excluded.patch", 'w', encoding='utf-8') as f:
107
107
  f.writelines(excluded)
108
108
  ```
109
109
 
110
+ ## Known Limitations
111
+
112
+ - When fixing patches with missing `index` lines, the tool requires the files to be in a git repository to regenerate the index. This is only needed for file deletions and renames.
113
+ - `patch-fixer` assumes the patch follows git's unified diff format.
114
+ - Current implementation is not very robust to corrupted hunk content.
115
+ - Much more comprehensive fuzzy string matching is planned.
116
+
110
117
  ## Local Testing
111
118
  ```bash
112
119
  git clone https://github.com/ajcm474/patch-fixer.git
@@ -11,6 +11,8 @@ patch_fixer.egg-info/dependency_links.txt
11
11
  patch_fixer.egg-info/entry_points.txt
12
12
  patch_fixer.egg-info/requires.txt
13
13
  patch_fixer.egg-info/top_level.txt
14
+ tests/test_cli.py
15
+ tests/test_fuzzy.py
14
16
  tests/test_norm.py
15
17
  tests/test_repos.py
16
18
  tests/test_split.py
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "patch-fixer"
7
- version = "0.3.3"
7
+ version = "0.3.4"
8
8
  description = "Fixes erroneous git apply patches to the best of its ability."
9
9
  maintainers = [
10
10
  {name = "Alex Mueller", email="amueller474@gmail.com"},
@@ -0,0 +1,149 @@
1
+ """Tests for the CLI module."""
2
+
3
+ import os
4
+ import tempfile
5
+ from unittest.mock import patch
6
+
7
+ import pytest
8
+
9
+ from patch_fixer.cli import main
10
+
11
+
12
+ class TestCLI:
13
+ """Test cases for CLI functionality."""
14
+
15
+ def test_no_command(self, capsys):
16
+ """Test that help is shown when no command is provided."""
17
+ with patch('sys.argv', ['patch-fixer']):
18
+ result = main()
19
+ assert result == 1
20
+ captured = capsys.readouterr()
21
+ assert 'usage: patch-fixer' in captured.out
22
+ assert 'Available commands' in captured.out
23
+
24
+ def test_fix_command(self):
25
+ """Test the fix command in directory mode."""
26
+ with tempfile.TemporaryDirectory() as tmpdir:
27
+ # create test files
28
+ original_file = os.path.join(tmpdir, 'original.txt')
29
+ with open(original_file, 'w') as f:
30
+ f.write("line1\nline2\nline3\n")
31
+
32
+ broken_patch = os.path.join(tmpdir, 'broken.patch')
33
+ with open(broken_patch, 'w') as f:
34
+ f.write("""diff --git a/original.txt b/original.txt
35
+ --- a/original.txt
36
+ +++ b/original.txt
37
+ @@ -1,3 +1,3 @@
38
+ line1
39
+ -line2
40
+ +modified line2
41
+ line3
42
+ """)
43
+
44
+ output_patch = os.path.join(tmpdir, 'fixed.patch')
45
+
46
+ # use directory mode to work around bug in file mode
47
+ with patch('sys.argv', ['patch-fixer', 'fix', tmpdir, broken_patch, output_patch]):
48
+ result = main()
49
+
50
+ assert result == 0
51
+ assert os.path.exists(output_patch)
52
+
53
+ with open(output_patch) as f:
54
+ content = f.read()
55
+ assert 'diff --git' in content
56
+ assert 'modified line2' in content
57
+
58
+ def test_split_command_with_files(self):
59
+ """Test the split command with files specified on command line."""
60
+ with tempfile.TemporaryDirectory() as tmpdir:
61
+ input_patch = os.path.join(tmpdir, 'input.patch')
62
+ with open(input_patch, 'w') as f:
63
+ f.write("""diff --git a/file1.txt b/file1.txt
64
+ --- a/file1.txt
65
+ +++ b/file1.txt
66
+ @@ -1,1 +1,1 @@
67
+ -old1
68
+ +new1
69
+ diff --git a/file2.txt b/file2.txt
70
+ --- a/file2.txt
71
+ +++ b/file2.txt
72
+ @@ -1,1 +1,1 @@
73
+ -old2
74
+ +new2
75
+ """)
76
+
77
+ included = os.path.join(tmpdir, 'included.patch')
78
+ excluded = os.path.join(tmpdir, 'excluded.patch')
79
+
80
+ with patch('sys.argv', ['patch-fixer', 'split', input_patch, included, excluded,
81
+ '-f', 'file1.txt']):
82
+ result = main()
83
+
84
+ assert result == 0
85
+ assert os.path.exists(included)
86
+ assert os.path.exists(excluded)
87
+
88
+ with open(included) as f:
89
+ content = f.read()
90
+ assert 'file1.txt' in content
91
+ assert 'new1' in content
92
+ assert 'file2.txt' not in content
93
+
94
+ with open(excluded) as f:
95
+ content = f.read()
96
+ assert 'file2.txt' in content
97
+ assert 'new2' in content
98
+ assert 'file1.txt' not in content
99
+
100
+ def test_split_command_with_include_file(self):
101
+ """Test the split command with include file."""
102
+ with tempfile.TemporaryDirectory() as tmpdir:
103
+ # create include file
104
+ include_list = os.path.join(tmpdir, 'include.txt')
105
+ with open(include_list, 'w') as f:
106
+ f.write("file1.txt\n")
107
+
108
+ input_patch = os.path.join(tmpdir, 'input.patch')
109
+ with open(input_patch, 'w') as f:
110
+ f.write("""diff --git a/file1.txt b/file1.txt
111
+ --- a/file1.txt
112
+ +++ b/file1.txt
113
+ @@ -1,1 +1,1 @@
114
+ -old1
115
+ +new1
116
+ diff --git a/file2.txt b/file2.txt
117
+ --- a/file2.txt
118
+ +++ b/file2.txt
119
+ @@ -1,1 +1,1 @@
120
+ -old2
121
+ +new2
122
+ """)
123
+
124
+ included = os.path.join(tmpdir, 'included.patch')
125
+ excluded = os.path.join(tmpdir, 'excluded.patch')
126
+
127
+ with patch('sys.argv', ['patch-fixer', 'split', input_patch, included, excluded,
128
+ '-i', include_list]):
129
+ result = main()
130
+
131
+ assert result == 0
132
+ assert os.path.exists(included)
133
+ assert os.path.exists(excluded)
134
+
135
+ with open(included) as f:
136
+ content = f.read()
137
+ assert 'file1.txt' in content
138
+
139
+ with open(excluded) as f:
140
+ content = f.read()
141
+ assert 'file2.txt' in content
142
+
143
+ def test_error_handling(self, capsys):
144
+ """Test error handling in CLI."""
145
+ with patch('sys.argv', ['patch-fixer', 'fix', 'nonexistent', 'nonexistent', 'out']):
146
+ result = main()
147
+ assert result == 1
148
+ captured = capsys.readouterr()
149
+ assert 'Error:' in captured.err
@@ -0,0 +1,112 @@
1
+ #!/usr/bin/env python3
2
+
3
+ import pytest
4
+ from patch_fixer.patch_fixer import fuzzy_line_similarity, find_hunk_start
5
+
6
+
7
+ class TestFuzzyMatching:
8
+ """Test fuzzy string matching functionality."""
9
+
10
+ def test_fuzzy_line_similarity_exact_match(self):
11
+ """Test fuzzy similarity with exact matches."""
12
+ assert fuzzy_line_similarity("hello world", "hello world") == 1.0
13
+ assert fuzzy_line_similarity("", "") == 1.0
14
+
15
+ def test_fuzzy_line_similarity_no_match(self):
16
+ """Test fuzzy similarity with no common characters."""
17
+ assert fuzzy_line_similarity("abc", "xyz") == 0.0
18
+ assert fuzzy_line_similarity("", "xyz") == 0.0
19
+ assert fuzzy_line_similarity("abc", "") == 0.0
20
+
21
+ def test_fuzzy_line_similarity_partial_match(self):
22
+ """Test fuzzy similarity with partial matches."""
23
+ # "hello" and "hell" share 4 characters
24
+ similarity = fuzzy_line_similarity("hello", "hell")
25
+ assert 0.7 < similarity < 1.0
26
+
27
+ # common characters but different order
28
+ similarity = fuzzy_line_similarity("abc", "bac")
29
+ assert similarity > 0.5
30
+
31
+ def test_fuzzy_line_similarity_whitespace(self):
32
+ """Test fuzzy similarity handles whitespace correctly."""
33
+ assert fuzzy_line_similarity(" hello ", "hello") == 1.0
34
+ assert fuzzy_line_similarity("\thello\n", "hello") == 1.0
35
+
36
+ def test_find_hunk_start_exact_match(self):
37
+ """Test exact matching in find_hunk_start."""
38
+ original_lines = [
39
+ "line 1\n",
40
+ "line 2\n",
41
+ "line 3\n",
42
+ "line 4\n"
43
+ ]
44
+ context_lines = [
45
+ " line 2\n",
46
+ " line 3\n"
47
+ ]
48
+
49
+ result = find_hunk_start(context_lines, original_lines, fuzzy=False)
50
+ assert result == 1 # should find match at line 1 (0-indexed)
51
+
52
+ def test_find_hunk_start_fuzzy_match(self):
53
+ """Test fuzzy matching in find_hunk_start."""
54
+ original_lines = [
55
+ "line 1\n",
56
+ "line two\n", # slightly different
57
+ "line 3\n",
58
+ "line 4\n"
59
+ ]
60
+ context_lines = [
61
+ " line 2\n", # different from "line two"
62
+ " line 3\n"
63
+ ]
64
+
65
+ # exact match should fail
66
+ result_exact = find_hunk_start(context_lines, original_lines, fuzzy=False)
67
+ assert result_exact == 0 # should return 0 when no exact match
68
+
69
+ # fuzzy match should succeed
70
+ result_fuzzy = find_hunk_start(context_lines, original_lines, fuzzy=True)
71
+ assert result_fuzzy == 1 # should find fuzzy match at line 1
72
+
73
+ def test_find_hunk_start_with_deletions(self):
74
+ """Test hunk finding with deletion context."""
75
+ original_lines = [
76
+ "line 1\n",
77
+ "line 2\n",
78
+ "line 3\n",
79
+ "line 4\n"
80
+ ]
81
+ context_lines = [
82
+ " line 1\n", # context
83
+ "-line 2\n", # deletion - should match original
84
+ " line 3\n" # context
85
+ ]
86
+
87
+ result = find_hunk_start(context_lines, original_lines, fuzzy=False)
88
+ assert result == 0 # should find match at line 0
89
+
90
+ def test_find_hunk_start_empty_context(self):
91
+ """Test that empty context raises ValueError."""
92
+ original_lines = ["line 1\n", "line 2\n"]
93
+
94
+ with pytest.raises(ValueError, match="Cannot search for empty hunk"):
95
+ find_hunk_start([], original_lines)
96
+
97
+ def test_find_hunk_start_fuzzy_threshold(self):
98
+ """Test fuzzy matching threshold behavior."""
99
+ original_lines = [
100
+ "completely different content\n",
101
+ "another different line\n",
102
+ "line 3\n",
103
+ "line 4\n"
104
+ ]
105
+ context_lines = [
106
+ " line 1\n", # very different from original
107
+ " line 2\n" # very different from original
108
+ ]
109
+
110
+ # even with fuzzy matching, very different content should not match
111
+ result = find_hunk_start(context_lines, original_lines, fuzzy=True)
112
+ assert result == 0 # should return 0 when similarity is too low
@@ -44,6 +44,7 @@ REPOS = {
44
44
  }
45
45
 
46
46
  CACHE_DIR = Path.home() / ".patch-testing"
47
+ DIFF_CACHE_DIR = CACHE_DIR / "diffs"
47
48
 
48
49
 
49
50
  class DeletedBranchError(ValueError):
@@ -69,8 +70,7 @@ def download_commit_zip(repo_url, commit_hash: str, dest_path: Path) -> None:
69
70
  try:
70
71
  r = requests.get(url, stream=True)
71
72
  r.raise_for_status()
72
- except Exception as e:
73
- # TODO: don't use bare except
73
+ except (requests.RequestException, requests.HTTPError) as e:
74
74
  print(f"Failed to download commit snapshot: {e}")
75
75
  sys.exit(1)
76
76
 
@@ -102,11 +102,19 @@ def clone_repos(repo_group, repo_name, old_commit, new_commit):
102
102
  if not new_exists:
103
103
  shutil.copytree(repo_old_path, repo_new_path)
104
104
 
105
- # TODO: handle deleted branches here too
106
105
  repo_old = Repo(repo_old_path)
107
106
  repo_new = Repo(repo_new_path)
108
- repo_old.git.reset("--hard", old_commit)
109
- repo_new.git.reset("--hard", new_commit)
107
+ try:
108
+ verify_commit_exists(repo_old, old_commit)
109
+ repo_old.git.reset("--hard", old_commit)
110
+ except DeletedBranchError:
111
+ download_commit_zip(f"https://github.com/{repo_group}/{repo_name}", old_commit, repo_old_path)
112
+
113
+ try:
114
+ verify_commit_exists(repo_new, new_commit)
115
+ repo_new.git.reset("--hard", new_commit)
116
+ except DeletedBranchError:
117
+ download_commit_zip(f"https://github.com/{repo_group}/{repo_name}", new_commit, repo_new_path)
110
118
 
111
119
  # otherwise, clone it and make a copy for each commit
112
120
  else:
@@ -133,20 +141,39 @@ def clone_repos(repo_group, repo_name, old_commit, new_commit):
133
141
  return repo_old, repo_old_path, repo_new, repo_new_path
134
142
 
135
143
 
144
+ def get_cached_diff(repo_group, repo_name, old_commit, new_commit):
145
+ """Get diff from cache or generate and cache it."""
146
+ DIFF_CACHE_DIR.mkdir(parents=True, exist_ok=True)
147
+
148
+ diff_filename = f"{repo_group}_{repo_name}_{old_commit}_{new_commit}.diff"
149
+ diff_path = DIFF_CACHE_DIR / diff_filename
150
+
151
+ if diff_path.exists():
152
+ with open(diff_path, 'r', encoding='utf-8') as f:
153
+ return f.read()
154
+
155
+ # generate diff and cache it
156
+ (repo_old, repo_old_path, repo_new, repo_new_path) = clone_repos(repo_group, repo_name, old_commit, new_commit)
157
+ diff_content = repo_new.git.diff(old_commit, new_commit)
158
+
159
+ with open(diff_path, 'w', encoding='utf-8') as f:
160
+ f.write(diff_content)
161
+
162
+ return diff_content
163
+
164
+
136
165
  @pytest.mark.parametrize(
137
166
  "repo_group, repo_name, old_commit, new_commit",
138
167
  [(*repo, *commits) for repo, commits in REPOS.items()]
139
168
  )
140
169
  def test_integration_equality(repo_group, repo_name, old_commit, new_commit):
141
170
  """ Make sure the patch fixer doesn't corrupt valid diffs. """
142
- (
143
- repo_old,
144
- repo_old_path,
145
- repo_new,
146
- repo_new_path
147
- ) = clone_repos(repo_group, repo_name, old_commit, new_commit)
148
-
149
- expected = repo_new.git.diff(old_commit, new_commit)
171
+ # use cached diff if available, otherwise generate and cache it
172
+ expected = get_cached_diff(repo_group, repo_name, old_commit, new_commit)
173
+
174
+ # we still need the old repo path for the patch fixer
175
+ (repo_old, repo_old_path, _, _) = clone_repos(repo_group, repo_name, old_commit, new_commit)
176
+
150
177
  input_lines = expected.splitlines(keepends=True)
151
178
  fixed_lines = fix_patch(input_lines, repo_old_path)
152
179
  actual = "".join(fixed_lines)
File without changes
File without changes