difflicious 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1005 @@
1
+ """Secure git command execution wrapper for Difflicious."""
2
+
3
+ import logging
4
+ import os
5
+ import re
6
+ import subprocess
7
+ from pathlib import Path
8
+ from typing import Any, Optional, cast
9
+
10
logger = logging.getLogger(__name__)


# Branch names probed, in this order, when the remote does not tell us
# which branch is the default one.
COMMON_DEFAULT_BRANCHES = ["main", "master", "trunk"]

# Long ("--xyz") git options accepted by the argument validator. Anything
# starting with "-" that is not listed here (or below) is rejected.
SAFE_GIT_OPTIONS = {
    # status / branch reporting
    "--porcelain",
    "--short",
    "--branch",
    "--ahead-behind",
    "--show-current",
    "--all",
    # rename detection
    "--no-renames",
    "--find-renames",
    # diff output shape
    "--name-only",
    "--name-status",
    "--numstat",
    "--stat",
    "--patch",
    "--no-patch",
    "--raw",
    "--word-diff",
    "--unified",
    "--context",
    "--cached",
    # log formatting
    "--format",
    "--oneline",
    "--graph",
    "--decorate",
    # colour control
    "--color",
    "--no-color",
    # rev-parse
    "--verify",
}

# Short ("-x") git options accepted by the argument validator.
SAFE_SHORT_OPTIONS = {
    "-a",
    "-b",
    "-n",
    "-p",
    "-s",
    "-u",
    "-w",
    "-z",
}
48
+
49
+
50
class GitOperationError(Exception):
    """Signal that a git operation could not be carried out.

    Raised for invalid repositories, rejected arguments or paths, timeouts,
    and failures to launch or run the git executable.
    """
54
+
55
+
56
class GitRepository:
    """Secure wrapper for git operations with subprocess sanitization.

    Every git invocation goes through ``_execute_git_command``, which runs
    ``git`` as an argument list (never through a shell) with ``repo_path`` as
    the working directory, after the arguments were checked by
    ``_sanitize_args``.
    """

    # Characters refused in any git argument. No shell is involved, so this
    # is defence in depth rather than the primary protection.
    _DANGEROUS_CHARS = (";", "|", "&", "`", "$", "(", ")", ">", "<")

    # First letter of a ``git diff --name-status`` code -> readable name.
    # Rename/copy codes carry a similarity score ("R100"), hence first letter.
    _STATUS_NAMES = {
        "M": "modified",
        "A": "added",
        "D": "deleted",
        "R": "renamed",
        "C": "copied",
        "T": "type changed",
        "U": "unmerged",
        "X": "unknown",
    }

    def __init__(self, repo_path: Optional[str] = None):
        """Initialize git repository wrapper.

        Args:
            repo_path: Path to git repository. Defaults to current working directory.

        Raises:
            GitOperationError: If the path does not exist or has no ``.git`` entry.
        """
        self.repo_path = Path(repo_path) if repo_path else Path.cwd()
        self._validate_repository()

    def _validate_repository(self) -> None:
        """Validate that the path contains a git repository.

        Raises:
            GitOperationError: If ``repo_path`` is missing or has no ``.git`` entry.
        """
        if not self.repo_path.exists():
            raise GitOperationError(f"Repository path does not exist: {self.repo_path}")

        # ``.git`` may be a directory or a plain file (both satisfy exists()).
        if not (self.repo_path / ".git").exists():
            raise GitOperationError(f"Not a git repository: {self.repo_path}")

    def _execute_git_command(
        self, args: list[str], timeout: int = 30
    ) -> tuple[str, str, int]:
        """Execute a git command with proper security and error handling.

        Args:
            args: List of git command arguments (without 'git' prefix)
            timeout: Command timeout in seconds

        Returns:
            Tuple of (stdout, stderr, return_code). A non-zero return code is
            returned to the caller, not raised.

        Raises:
            GitOperationError: If the repository or arguments are invalid, git
                cannot be started, or the command times out
        """
        self._validate_repository()
        # Validate command arguments (no shell quoting; pass as list)
        cmd = ["git"] + self._sanitize_args(args)

        logger.debug("Executing git command: %s", " ".join(cmd))

        try:
            # GIT_OPTIONAL_LOCKS=0 asks git to skip optional lock-taking work.
            env = {**os.environ, "GIT_OPTIONAL_LOCKS": "0"}
            result = subprocess.run(
                cmd,
                cwd=self.repo_path,
                capture_output=True,
                text=True,
                timeout=timeout,
                check=False,  # Return codes are handled by the callers
                env=env,
            )
        except subprocess.TimeoutExpired:
            raise GitOperationError(
                f"Git command timed out after {timeout}s: {' '.join(cmd)}"
            ) from None
        except FileNotFoundError:
            raise GitOperationError(
                "Git executable not found. Please ensure git is installed."
            ) from None
        except Exception as e:
            raise GitOperationError(f"Failed to execute git command: {e}") from e

        logger.debug("Git command completed with return code: %s", result.returncode)
        return result.stdout, result.stderr, result.returncode

    def _sanitize_args(self, args: list[str]) -> list[str]:
        """Validate git command arguments to prevent injection attacks.

        Args:
            args: Raw command arguments

        Returns:
            Validated arguments safe for subprocess execution

        Raises:
            GitOperationError: If an argument is not a string, contains a
                refused character, or is an option outside the allow-lists
        """
        validated_args: list[str] = []
        for arg in args:
            if not isinstance(arg, str):
                raise GitOperationError(f"Invalid argument type: {type(arg)}")

            if any(char in arg for char in self._DANGEROUS_CHARS):
                raise GitOperationError(
                    f"Dangerous characters detected in argument: {arg}"
                )

            # Anything that looks like an option must be explicitly allowed.
            if arg.startswith("-") and not self._is_safe_git_option(arg):
                raise GitOperationError(f"Unsafe git option: {arg}")

            validated_args.append(arg)

        return validated_args

    def _is_safe_git_option(self, option: str) -> bool:
        """Check if a git option is safe to use.

        Args:
            option: Git command option to validate

        Returns:
            True if option is safe, False otherwise
        """
        # -U<number>: unified diff with that many context lines
        if re.match(r"^-U\d+$", option):
            return True

        return option in SAFE_GIT_OPTIONS or option in SAFE_SHORT_OPTIONS

    def get_status(self) -> dict[str, Any]:
        """Get git repository status information.

        Returns:
            Dictionary with ``git_available``, ``current_branch``,
            ``files_changed``, ``repository_path`` and ``is_clean``; an
            ``error`` key is added when the status command itself fails.
        """
        try:
            current_branch = self.get_current_branch()

            status_stdout, _, status_code = self._execute_git_command(
                ["status", "--porcelain"]
            )

            # One non-empty porcelain line per changed path.
            files_changed = 0
            if status_code == 0:
                files_changed = len(
                    [line for line in status_stdout.strip().split("\n") if line.strip()]
                )

            # get_current_branch reports "error" when git could not be run.
            git_available = current_branch != "error" or status_code == 0

            return {
                "git_available": git_available,
                "current_branch": current_branch,
                "files_changed": files_changed,
                "repository_path": str(self.repo_path),
                "is_clean": files_changed == 0,
            }

        except GitOperationError as e:
            logger.error("Failed to get git status: %s", e)
            return {
                "git_available": False,
                "current_branch": "error",
                "files_changed": 0,
                "repository_path": str(self.repo_path),
                "is_clean": True,
                "error": str(e),
            }

    def _resolve_base_ref(
        self, use_head: bool = False, preferred_ref: Optional[str] = None
    ) -> str:
        """Resolve the base reference for comparisons.

        If use_head is True, return "HEAD". Otherwise, use preferred_ref if provided
        and valid; fall back to repository default branch; finally to "HEAD".
        """
        if use_head:
            return "HEAD"

        # If an explicit ref is provided and resolves, use it
        if preferred_ref and self._is_safe_commit_sha(preferred_ref):
            return str(preferred_ref)

        branches_info = self.get_branches()
        reference_point = branches_info.get("default_branch", "main") or "main"
        if not self._is_safe_commit_sha(str(reference_point)):
            reference_point = "HEAD"
        return str(reference_point)

    def summarize_changes(
        self, include_unstaged: bool = True, include_untracked: bool = True
    ) -> dict[str, Any]:
        """Return counts of changed files without fetching diff contents.

        Returns a dict with the same group keys as get_diff, but only 'count' fields
        populated. This is designed to be efficient for status endpoints. On a
        git failure the counts gathered so far are returned and a warning is
        logged.
        """
        summary: dict[str, dict[str, int]] = {
            "untracked": {"count": 0},
            "unstaged": {"count": 0},
            "staged": {"count": 0},
        }

        try:
            # Untracked files ("??" lines of the porcelain status)
            if include_untracked:
                stdout, _, rc = self._execute_git_command(["status", "--porcelain"])
                if rc == 0 and stdout:
                    summary["untracked"]["count"] = sum(
                        1 for line in stdout.split("\n") if line.startswith("??")
                    )

            # Unstaged changes (working tree vs index)
            if include_unstaged:
                stdout, _, rc = self._execute_git_command(["diff", "--name-only"])
                if rc == 0 and stdout:
                    summary["unstaged"]["count"] = sum(
                        1 for line in stdout.split("\n") if line.strip()
                    )

            # Staged changes (index vs HEAD)
            stdout, _, rc = self._execute_git_command(
                ["diff", "--cached", "--name-only", "HEAD"]
            )
            if rc == 0 and stdout:
                summary["staged"]["count"] = sum(
                    1 for line in stdout.split("\n") if line.strip()
                )

        except GitOperationError as e:
            logger.warning("summarize_changes failed: %s", e)

        return summary

    def get_current_branch(self) -> str:
        """Get the currently checked-out branch.

        Returns:
            The output of ``git branch --show-current``, "unknown" when git
            exits non-zero, or "error" when the command could not be run.
        """
        try:
            stdout, _, return_code = self._execute_git_command(
                ["branch", "--show-current"]
            )
            if return_code == 0:
                return stdout.strip()
            return "unknown"
        except GitOperationError as e:
            logger.error("Failed to get current branch: %s", e)
            return "error"

    def get_repository_name(self) -> str:
        """Get the repository name.

        Returns:
            Repository name derived from remote URL or directory name
        """
        try:
            # First try to get from remote origin URL
            stdout, _, return_code = self._execute_git_command(
                ["remote", "get-url", "origin"]
            )
            if return_code == 0 and stdout.strip():
                # Extract repo name from various URL formats:
                # https://github.com/user/repo.git -> repo
                # git@github.com:user/repo.git -> repo
                # /path/to/repo -> repo
                # A trailing "/" is dropped first so ".../repo/" still yields "repo".
                remote_url = stdout.strip().rstrip("/")
                if remote_url.endswith(".git"):
                    remote_url = remote_url[:-4]
                repo_name = remote_url.split("/")[-1]
                if repo_name:
                    return repo_name
        except GitOperationError as e:
            logger.warning("Failed to get repository name from remote: %s", e)

        # Fallback to directory name. Resolve first so relative paths such as
        # "." give the real directory name rather than ".".
        return self.repo_path.resolve().name

    def get_branches(self) -> dict[str, Any]:
        """Get a list of all local and remote branches.

        Returns:
            Dictionary with ``branches`` (sorted, de-duplicated names with the
            ``remotes/origin/`` prefix removed) and ``default_branch``. Both
            are empty/None when git fails.
        """
        try:
            stdout, _, return_code = self._execute_git_command(["branch", "-a"])
            if return_code != 0:
                return {"branches": [], "default_branch": None}

            branches: list[str] = []
            for raw_line in stdout.strip().split("\n"):
                line = raw_line.strip()
                if not line:
                    continue
                # Skip symbolic-refs like "origin/HEAD -> origin/main"
                if "->" in line:
                    continue

                # Remove common leading decorations from some git configs (e.g., '*', '+', '!')
                # and normalize by taking the first whitespace-delimited token (drops verbose/commit parts)
                cleaned = re.sub(r"^[*+!\s]+", "", line)
                if not cleaned:
                    continue
                token = cleaned.split()[0]

                # Clean up remote branch names
                if token.startswith("remotes/origin/"):
                    token = token[len("remotes/origin/") :]

                if token and token not in branches:
                    branches.append(token)

            default_branch = self.get_main_branch(branches)
            return {"branches": sorted(set(branches)), "default_branch": default_branch}
        except GitOperationError as e:
            logger.error("Failed to get branches: %s", e)
            return {"branches": [], "default_branch": None}

    def get_main_branch(self, branches: list[str]) -> Optional[str]:
        """Determine the main branch from a list of branches.

        First tries to get the actual default branch from the remote,
        then falls back to common naming conventions. A successful answer is
        remembered on the instance and reused while it is still in ``branches``.

        Args:
            branches: Known branch names; the result is always one of them.

        Returns:
            The default branch name, or None if none could be determined.
        """
        # Prefer cached default branch if available
        cached = cast(Optional[str], getattr(self, "_cached_default_branch", None))
        if cached and cached in branches:
            return cached

        # Method 1: git remote show origin
        try:
            stdout, _, return_code = self._execute_git_command(
                ["remote", "show", "origin"]
            )
            if return_code == 0 and stdout:
                for line in stdout.split("\n"):
                    if "HEAD branch:" in line:
                        default_branch = line.split("HEAD branch:")[1].strip()
                        if default_branch in branches:
                            self._cached_default_branch = default_branch
                            return str(default_branch)
        except GitOperationError:
            pass

        # Method 2: git symbolic-ref for remote HEAD
        try:
            stdout, _, return_code = self._execute_git_command(
                ["symbolic-ref", "refs/remotes/origin/HEAD"]
            )
            if return_code == 0 and stdout:
                # Output format: refs/remotes/origin/main. Strip only the
                # prefix so branch names containing "/" stay intact.
                ref = stdout.strip()
                origin_prefix = "refs/remotes/origin/"
                if ref.startswith(origin_prefix):
                    default_branch = ref[len(origin_prefix) :]
                else:
                    default_branch = ref.split("/")[-1]
                if default_branch in branches:
                    self._cached_default_branch = default_branch
                    return str(default_branch)
        except GitOperationError:
            pass

        # Method 3: Check for origin/HEAD in remote branches
        try:
            stdout, _, return_code = self._execute_git_command(["branch", "-r"])
            if return_code == 0 and stdout:
                for line in stdout.split("\n"):
                    if "origin/HEAD" in line:
                        # "origin/HEAD -> origin/main": take the target
                        parts = line.strip().split(" -> ")
                        if len(parts) == 2:
                            target = parts[1]
                            if target.startswith("origin/"):
                                target = target[len("origin/") :]
                            default_branch = target
                            if default_branch in branches:
                                self._cached_default_branch = default_branch
                                return str(default_branch)
        except GitOperationError:
            pass

        # Fallback to common naming conventions
        for default_branch in COMMON_DEFAULT_BRANCHES:
            if default_branch in branches:
                self._cached_default_branch = default_branch
                return str(default_branch)

        # Final fallback: look for a branch with a remote counterpart
        for branch in branches:
            if f"remotes/origin/{branch}" in branches:
                return str(branch)

        return None

    def get_diff(
        self,
        use_head: bool = False,
        include_unstaged: bool = True,
        include_untracked: bool = False,
        file_path: Optional[str] = None,
        base_ref: Optional[str] = None,
    ) -> dict[str, Any]:
        """Get git diff information comparing working directory to a reference point.

        Args:
            use_head: If True, compare against HEAD. If False, compare against default branch.
            include_unstaged: If True, include unstaged changes in the output.
            include_untracked: If True, include untracked files in the output.
            file_path: Optional specific file to diff
            base_ref: Optional reference to compare against instead of the
                default branch (ignored when use_head is True)

        Returns:
            Dictionary containing grouped diff information ("untracked",
            "unstaged", "staged"), each with "files" and "count". All groups
            are empty when a git operation fails.
        """
        try:
            groups: dict[str, dict[str, Any]] = {
                "untracked": {"files": [], "count": 0},
                "unstaged": {"files": [], "count": 0},
                "staged": {"files": [], "count": 0},
            }

            # Resolve base reference
            reference_point = self._resolve_base_ref(
                use_head=use_head, preferred_ref=base_ref
            )

            # Untracked files
            if include_untracked:
                stdout, _, rc = self._execute_git_command(["status", "--porcelain"])
                if rc == 0 and stdout:
                    for line in stdout.strip().split("\n"):
                        if line.startswith("??"):
                            fname = line[3:].strip()
                            # file_path acts as a substring filter here
                            if not file_path or file_path in fname:
                                groups["untracked"]["files"].append(
                                    {
                                        "path": fname,
                                        "additions": 0,
                                        "deletions": 0,
                                        "changes": 0,
                                        "status": "added",
                                        "content": f"New untracked file: {fname}",
                                    }
                                )
                groups["untracked"]["count"] = len(groups["untracked"]["files"])

            # Unstaged (working tree) vs base
            if include_unstaged:
                base_args_unstaged: list[str] = [] if use_head else [reference_point]
                unstaged_files = self._collect_diff_metadata(
                    base_args_unstaged, file_path
                )
                groups["unstaged"]["files"].extend(unstaged_files)
                groups["unstaged"]["count"] = len(unstaged_files)

            # Staged (index) vs base (HEAD or branch)
            base_args_staged: list[str] = (
                ["--cached", "HEAD"] if use_head else ["--cached", reference_point]
            )
            staged_files = self._collect_diff_metadata(base_args_staged, file_path)
            groups["staged"]["files"].extend(staged_files)
            groups["staged"]["count"] = len(staged_files)

            # Preserve previous behavior: include content strings in results
            for group_name in ("unstaged", "staged"):
                for diff_info in groups[group_name]["files"]:
                    if use_head and group_name == "unstaged":
                        content = self._get_file_diff(
                            diff_info["path"], None, None, False
                        )
                    elif group_name == "staged":
                        content = self._get_file_diff(
                            diff_info["path"], None, None, True
                        )
                    else:
                        content = self._get_file_diff(
                            diff_info["path"], reference_point, None, False
                        )
                    diff_info["content"] = content

            return groups

        except GitOperationError as e:
            logger.error("Failed to get git diff: %s", e)
            return {
                "untracked": {"files": [], "count": 0},
                "unstaged": {"files": [], "count": 0},
                "staged": {"files": [], "count": 0},
            }

    def _is_safe_commit_sha(self, sha: str) -> bool:
        """Validate that a commit SHA is safe to use.

        Args:
            sha: Commit SHA or reference to validate

        Returns:
            True if the value has no refused characters, is 1-100 characters
            long and ``git rev-parse --verify`` accepts it; False otherwise
        """
        if not isinstance(sha, str):
            return False

        # Allow branch names, tag names, and SHAs; reject dangerous characters
        if any(char in sha for char in self._DANGEROUS_CHARS + (" ",)):
            return False

        # Must be reasonable length (branch names, tags, or SHAs)
        if len(sha) < 1 or len(sha) > 100:
            return False

        # Check if it's a valid git reference
        try:
            _, _, return_code = self._execute_git_command(
                ["rev-parse", "--verify", sha]
            )
            return return_code == 0
        except GitOperationError:
            return False

    def _is_safe_file_path(self, file_path: str) -> bool:
        """Validate that a file path is safe and within the repository.

        Args:
            file_path: File path to validate

        Returns:
            True if path is safe, False otherwise
        """
        try:
            repo_root = self.repo_path.resolve()
            candidate = (self.repo_path / file_path).resolve()
            # Compare path components, not string prefixes: "/x/repo-evil"
            # starts with "/x/repo" as text but is not inside it.
            candidate.relative_to(repo_root)
            return True
        except Exception:
            # Fail closed on anything unexpected (bad type, unresolvable path).
            return False

    def _parse_diff_output(self, output: str) -> list[dict[str, Any]]:
        """Parse git diff --numstat output.

        Args:
            output: Raw git diff output

        Returns:
            List of file diff information; "status" is always "modified" and
            "content" is empty, both are meant to be filled in by the caller
        """
        diffs: list[dict[str, Any]] = []

        if not output.strip():
            return diffs

        for line in output.strip().split("\n"):
            if not line.strip():
                continue

            # Parse numstat format: "additions\tdeletions\tfilename"
            parts = line.split("\t")
            if len(parts) < 3:
                continue
            try:
                # "-" is what numstat prints for binary files
                additions = int(parts[0]) if parts[0] != "-" else 0
                deletions = int(parts[1]) if parts[1] != "-" else 0
            except ValueError:
                # Skip lines that don't parse correctly
                continue

            diffs.append(
                {
                    "path": parts[2],
                    "additions": additions,
                    "deletions": deletions,
                    "changes": additions + deletions,
                    "status": "modified",  # Will be updated by caller with actual status
                    "content": "",  # Will be filled by caller
                }
            )

        return diffs

    @staticmethod
    def _normalize_numstat_path(path: str) -> str:
        """Return the new path for a ``--numstat`` path column.

        For renames git prints ``old => new`` or ``dir/{old => new}/file`` in
        this column, while ``--name-status`` prints the plain new path; this
        converts the former to the latter. Other paths are returned unchanged.
        """
        braced = re.match(r"^(.*)\{(.*) => (.*)\}(.*)$", path)
        if braced:
            prefix, _old, new, suffix = braced.groups()
            # "src/{lib => }/a.py" must become "src/a.py", not "src//a.py".
            if not new and prefix.endswith("/") and suffix.startswith("/"):
                suffix = suffix[1:]
            return prefix + new + suffix
        if " => " in path:
            return path.rsplit(" => ", 1)[-1]
        return path

    @classmethod
    def _parse_name_status(cls, output: str) -> dict[str, str]:
        """Parse ``git diff --name-status`` output into ``{path: status}``.

        The path is the last column (the new path for renames/copies) and the
        status comes from the first letter of the code, so "R100" maps to
        "renamed". Unrecognised codes map to "modified".
        """
        statuses: dict[str, str] = {}
        for line in output.strip().split("\n"):
            if not line.strip():
                continue
            parts = line.split("\t")
            if len(parts) >= 2:
                statuses[parts[-1]] = cls._STATUS_NAMES.get(parts[0][:1], "modified")
        return statuses

    def _get_file_status_map(
        self, use_head: bool = False, reference_point: str = "HEAD"
    ) -> dict[str, str]:
        """Get a mapping of file paths to their git status.

        Args:
            use_head: If True, get status for HEAD comparison, otherwise for branch comparison
            reference_point: The git reference to compare against (e.g. "HEAD", "main")

        Returns:
            Dictionary mapping file paths to status strings (added, modified, deleted, etc.).
            Failures are logged and yield whatever was collected so far.
        """
        status_map: dict[str, str] = {}

        try:
            if use_head:
                # Unstaged changes (working directory vs index)
                stdout, _, return_code = self._execute_git_command(
                    ["diff", "--name-status", "--find-renames"]
                )
                if return_code == 0:
                    status_map.update(self._parse_name_status(stdout))

                # Staged changes; an unstaged status for the same file wins
                stdout, _, return_code = self._execute_git_command(
                    ["diff", "--cached", "--name-status", "--find-renames"]
                )
                if return_code == 0:
                    for filename, status in self._parse_name_status(stdout).items():
                        status_map.setdefault(filename, status)
            else:
                # Working directory vs reference branch
                stdout, _, return_code = self._execute_git_command(
                    ["diff", "--name-status", "--find-renames", reference_point]
                )
                if return_code == 0:
                    status_map.update(self._parse_name_status(stdout))

        except Exception as e:
            logger.warning("Failed to get file status map: %s", e)

        return status_map

    def _collect_diff_metadata(
        self, base_args: list[str], file_path: Optional[str] = None
    ) -> list[dict[str, Any]]:
        """Collect per-file additions/deletions and status for a diff invocation.

        Runs git diff twice (numstat and name-status) with the same arguments
        and merges the results.

        Args:
            base_args: Extra ``git diff`` arguments (e.g. ``["--cached", "HEAD"]``)
            file_path: Optional path to restrict the diff to

        Returns:
            One dict per file with path, additions, deletions, changes, status,
            an empty content, and ``old_path`` for renames

        Raises:
            GitOperationError: If file_path is outside the repository or git
                cannot be run
        """
        # Note: Avoid --find-renames to maintain compatibility with tests and
        # mocked expectations that rely on minimal arg lists.
        numstat_args = ["diff", "--numstat", *base_args]
        namestat_args = ["diff", "--name-status", *base_args]

        if file_path:
            if not self._is_safe_file_path(file_path):
                raise GitOperationError(f"Unsafe file path: {file_path}")
            numstat_args.append(file_path)
            namestat_args.append(file_path)

        # Parse numstat output
        numstat_stdout, _, rc_num = self._execute_git_command(numstat_args)
        files: dict[str, dict[str, Any]] = {}
        if rc_num == 0 and numstat_stdout:
            for line in numstat_stdout.strip().split("\n"):
                if not line.strip():
                    continue
                parts = line.split("\t")
                if len(parts) < 3:
                    continue
                add_str, del_str = parts[0], parts[1]
                # Key by the new path so rename rows line up with name-status.
                path = self._normalize_numstat_path(parts[2])
                try:
                    additions = int(add_str) if add_str != "-" else 0
                    deletions = int(del_str) if del_str != "-" else 0
                except ValueError:
                    additions, deletions = 0, 0
                files[path] = {
                    "path": path,
                    "additions": additions,
                    "deletions": deletions,
                    "changes": additions + deletions,
                    "status": "modified",
                    "content": "",
                }

        # Parse name-status output
        old_paths_from_renames: set[str] = set()
        namestat_stdout, _, rc_ns = self._execute_git_command(namestat_args)
        if rc_ns == 0 and namestat_stdout:
            for line in namestat_stdout.strip().split("\n"):
                if not line.strip():
                    continue
                parts = line.split("\t")
                if len(parts) < 2:
                    continue
                status_code = parts[0]
                # Handle renames/copies: last column is the new path
                path = parts[-1]

                # For renames, remember the old path (second column)
                old_path_for_file = None
                if status_code.startswith("R") and len(parts) >= 3:
                    old_path_for_file = parts[1]
                    old_paths_from_renames.add(old_path_for_file)

                status = self._STATUS_NAMES.get(status_code[:1], "modified")
                if path in files:
                    files[path]["status"] = status
                else:
                    files[path] = {
                        "path": path,
                        "additions": 0,
                        "deletions": 0,
                        "changes": 0,
                        "status": status,
                        "content": "",
                    }
                if old_path_for_file:
                    files[path]["old_path"] = old_path_for_file

        # Filter out old paths from renames (they would show as deleted)
        return [
            data for path, data in files.items() if path not in old_paths_from_renames
        ]

    def _get_file_diff(
        self,
        file_path: str,
        base_commit: Optional[str] = None,
        target_commit: Optional[str] = None,
        use_cached: bool = False,
        context_lines: int = 3,
    ) -> str:
        """Get detailed diff content for a specific file.

        Args:
            file_path: Path to the file
            base_commit: Base commit to compare from
            target_commit: Target commit to compare to (only used together
                with base_commit)
            use_cached: Whether to get staged diff (used when no commits specified)
            context_lines: Number of context lines to include (default: 3)

        Returns:
            Diff content as string. Failures are reported in-band as a string
            starting with "Error" rather than raised.
        """
        try:
            if not self._is_safe_file_path(file_path):
                return f"Error: Unsafe file path: {file_path}"

            diff_args = ["diff", f"-U{context_lines}"]

            if base_commit or target_commit:
                if base_commit and target_commit:
                    diff_args.extend([base_commit, target_commit])
                elif base_commit:
                    diff_args.append(base_commit)
                # target_commit on its own adds nothing (original behavior)
            elif use_cached:
                diff_args.append("--cached")

            diff_args.extend(["--no-color", file_path])

            stdout, stderr, return_code = self._execute_git_command(diff_args)

            if return_code != 0 and stderr:
                return f"Error getting diff: {stderr}"

            return stdout

        except GitOperationError as e:
            return f"Error: {e}"

    def get_full_file_diff(
        self,
        file_path: str,
        base_ref: Optional[str] = None,
        use_head: bool = False,
        use_cached: bool = False,
    ) -> str:
        """Get complete diff content for a specific file with unlimited context.

        Args:
            file_path: Path to the file
            base_ref: Base reference for comparison (branch name or commit)
            use_head: If True (and use_cached is False), run a plain
                ``git diff`` with no reference argument
            use_cached: Whether to get staged diff

        Returns:
            Complete diff content as string with unlimited context

        Raises:
            GitOperationError: If diff operation fails
        """
        try:
            if not self._is_safe_file_path(file_path):
                raise GitOperationError(f"Unsafe file path: {file_path}")

            # A million context lines effectively means "the whole file"
            diff_args = ["diff", "-U1000000"]

            # Determine comparison mode
            if use_cached:
                diff_args.append("--cached")
            elif use_head:
                # No reference argument: plain `git diff` compares the working
                # tree with the index (staged content), not with HEAD itself.
                pass
            elif base_ref:
                # Compare working directory vs specified reference
                if not self._is_safe_commit_sha(base_ref):
                    raise GitOperationError(f"Unsafe base reference: {base_ref}")
                diff_args.append(base_ref)
            else:
                # Default to main branch comparison
                branches_info = self.get_branches()
                default_branch = branches_info.get("default_branch", "main")
                if default_branch and self._is_safe_commit_sha(default_branch):
                    diff_args.append(default_branch)

            diff_args.extend(["--no-color", file_path])

            stdout, stderr, return_code = self._execute_git_command(diff_args)

            if return_code != 0 and stderr:
                raise GitOperationError(f"Git diff failed: {stderr}")

            return stdout

        except GitOperationError:
            raise
        except Exception as e:
            raise GitOperationError(f"Failed to get full file diff: {e}") from e

    def get_file_line_count(self, file_path: str) -> int:
        """Get the total number of lines in a file.

        Args:
            file_path: Path to the file

        Returns:
            Number of lines in the file

        Raises:
            GitOperationError: If file cannot be read or counted
        """
        try:
            if not self._is_safe_file_path(file_path):
                raise GitOperationError(f"Unsafe file path: {file_path}")

            full_path = (self.repo_path / file_path).resolve()
            if not full_path.exists():
                raise GitOperationError(f"File does not exist: {file_path}")

            # Stream in binary mode: no decoding, constant memory
            with full_path.open("rb") as f:
                return sum(1 for _ in f)

        except Exception as e:
            raise GitOperationError(f"Failed to get file line count: {e}") from e

    def get_file_lines(
        self, file_path: str, start_line: int, end_line: int
    ) -> list[str]:
        """Get specific lines from a file by streaming it in pure Python.

        Args:
            file_path: Path to the file relative to repository root
            start_line: Starting line number (1-based, inclusive)
            end_line: Ending line number (1-based, inclusive)

        Returns:
            List of lines from the file without their trailing newline;
            undecodable bytes are replaced rather than raising

        Raises:
            GitOperationError: If the range or path is invalid, the file is
                missing, or reading fails
        """
        if start_line < 1 or end_line < start_line:
            raise GitOperationError(f"Invalid line range: {start_line}-{end_line}")

        if not self._is_safe_file_path(file_path):
            raise GitOperationError(f"Unsafe file path: {file_path}")

        full_path = (self.repo_path / file_path).resolve()
        if not full_path.is_file():
            raise GitOperationError(f"File not found: {file_path}")

        try:
            lines: list[str] = []
            with full_path.open("r", encoding="utf-8", errors="replace") as f:
                for idx, line in enumerate(f, start=1):
                    if idx < start_line:
                        continue
                    if idx > end_line:
                        break  # stop reading once past the requested range
                    lines.append(line.rstrip("\n"))
            return lines

        except Exception as e:
            raise GitOperationError(
                f"Failed to get file lines {start_line}-{end_line}: {e}"
            ) from e
991
+
992
+
993
def get_git_repository(repo_path: Optional[str] = None) -> GitRepository:
    """Build a GitRepository for the given location.

    Args:
        repo_path: Optional path to git repository; passed through unchanged
            to the GitRepository constructor

    Returns:
        GitRepository instance

    Raises:
        GitOperationError: If repository is invalid
    """
    repository = GitRepository(repo_path)
    return repository