iam-policy-validator 1.13.1__py3-none-any.whl → 1.14.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. {iam_policy_validator-1.13.1.dist-info → iam_policy_validator-1.14.1.dist-info}/METADATA +1 -1
  2. {iam_policy_validator-1.13.1.dist-info → iam_policy_validator-1.14.1.dist-info}/RECORD +45 -39
  3. iam_validator/__version__.py +1 -1
  4. iam_validator/checks/action_condition_enforcement.py +6 -0
  5. iam_validator/checks/action_resource_matching.py +12 -12
  6. iam_validator/checks/action_validation.py +1 -0
  7. iam_validator/checks/condition_key_validation.py +2 -0
  8. iam_validator/checks/condition_type_mismatch.py +3 -0
  9. iam_validator/checks/full_wildcard.py +1 -0
  10. iam_validator/checks/mfa_condition_check.py +2 -0
  11. iam_validator/checks/policy_structure.py +9 -0
  12. iam_validator/checks/policy_type_validation.py +11 -0
  13. iam_validator/checks/principal_validation.py +5 -0
  14. iam_validator/checks/resource_validation.py +4 -0
  15. iam_validator/checks/sensitive_action.py +1 -0
  16. iam_validator/checks/service_wildcard.py +6 -3
  17. iam_validator/checks/set_operator_validation.py +3 -0
  18. iam_validator/checks/sid_uniqueness.py +2 -0
  19. iam_validator/checks/trust_policy_validation.py +3 -0
  20. iam_validator/checks/utils/__init__.py +16 -0
  21. iam_validator/checks/utils/action_parser.py +149 -0
  22. iam_validator/checks/wildcard_action.py +1 -0
  23. iam_validator/checks/wildcard_resource.py +231 -4
  24. iam_validator/commands/analyze.py +19 -1
  25. iam_validator/commands/completion.py +6 -2
  26. iam_validator/commands/validate.py +231 -12
  27. iam_validator/core/aws_service/fetcher.py +21 -9
  28. iam_validator/core/codeowners.py +245 -0
  29. iam_validator/core/config/check_documentation.py +390 -0
  30. iam_validator/core/config/config_loader.py +199 -0
  31. iam_validator/core/config/defaults.py +25 -0
  32. iam_validator/core/constants.py +1 -0
  33. iam_validator/core/diff_parser.py +8 -4
  34. iam_validator/core/finding_fingerprint.py +131 -0
  35. iam_validator/core/formatters/sarif.py +370 -128
  36. iam_validator/core/ignore_processor.py +309 -0
  37. iam_validator/core/ignored_findings.py +400 -0
  38. iam_validator/core/models.py +54 -4
  39. iam_validator/core/policy_loader.py +313 -4
  40. iam_validator/core/pr_commenter.py +223 -22
  41. iam_validator/core/report.py +22 -6
  42. iam_validator/integrations/github_integration.py +881 -123
  43. {iam_policy_validator-1.13.1.dist-info → iam_policy_validator-1.14.1.dist-info}/WHEEL +0 -0
  44. {iam_policy_validator-1.13.1.dist-info → iam_policy_validator-1.14.1.dist-info}/entry_points.txt +0 -0
  45. {iam_policy_validator-1.13.1.dist-info → iam_policy_validator-1.14.1.dist-info}/licenses/LICENSE +0 -0
iam_validator/integrations/github_integration.py

@@ -4,19 +4,53 @@ This module provides functionality to interact with GitHub,
 including posting PR comments, line comments, labels, and retrieving PR information.
 """
 
+import asyncio
+import base64
 import logging
 import os
 import re
+import time
 from enum import Enum
-from typing import Any
+from typing import TYPE_CHECKING, Any
 
 import httpx
 
 from iam_validator.core import constants
 
+if TYPE_CHECKING:
+    from iam_validator.core.codeowners import CodeOwnersParser
+
 logger = logging.getLogger(__name__)
 
 
+class GitHubRateLimitError(Exception):
+    """Raised when GitHub API rate limit is exceeded."""
+
+    def __init__(self, reset_time: int, message: str = "GitHub API rate limit exceeded"):
+        self.reset_time = reset_time
+        super().__init__(message)
+
+
+class GitHubRetryableError(Exception):
+    """Raised for transient GitHub API errors that should be retried."""
+
+    pass
+
+
+# Retry configuration
+MAX_RETRIES = 3
+INITIAL_BACKOFF_SECONDS = 1.0
+MAX_BACKOFF_SECONDS = 30.0
+BACKOFF_MULTIPLIER = 2.0
+
+# HTTP status codes that should trigger retry
+RETRYABLE_STATUS_CODES = {408, 429, 500, 502, 503, 504}
+
+# Concurrency limit for parallel API operations (deletions, updates)
+# This prevents hitting GitHub's secondary rate limits while still being fast
+MAX_CONCURRENT_API_CALLS = 10
+
+
 class PRState(str, Enum):
     """GitHub PR state."""
 
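Taken together, the new retry constants define a bounded exponential backoff schedule. A minimal standalone sketch (illustration only, not code from the package) of the delays they imply:

    MAX_RETRIES = 3
    INITIAL_BACKOFF_SECONDS = 1.0
    MAX_BACKOFF_SECONDS = 30.0
    BACKOFF_MULTIPLIER = 2.0

    # Delay before each retry attempt, capped at MAX_BACKOFF_SECONDS.
    delays = []
    backoff = INITIAL_BACKOFF_SECONDS
    for _ in range(MAX_RETRIES):
        delays.append(min(backoff, MAX_BACKOFF_SECONDS))
        backoff *= BACKOFF_MULTIPLIER

    print(delays)  # [1.0, 2.0, 4.0]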
@@ -66,6 +100,12 @@ class GitHubIntegration:
             os.environ.get("GITHUB_API_URL", "https://api.github.com")
         )
         self._client: httpx.AsyncClient | None = None
+        # Cache for team memberships: (org, team_slug) -> list[str]
+        # Reduces API calls when checking multiple users against same team
+        self._team_cache: dict[tuple[str, str], list[str]] = {}
+        # Cache for CODEOWNERS content (fetched once per instance)
+        self._codeowners_cache: str | None = None
+        self._codeowners_loaded: bool = False
 
     def _validate_token(self, token: str | None) -> str | None:
         """Validate and sanitize GitHub token.
@@ -262,7 +302,114 @@
     async def _make_request(
         self, method: str, endpoint: str, **kwargs: Any
     ) -> dict[str, Any] | None:
-        """Make an HTTP request to GitHub API.
+        """Make an HTTP request to GitHub API with retry and rate limit handling.
+
+        Implements exponential backoff for transient errors (5xx, 429) and
+        respects GitHub's rate limit headers.
+
+        Args:
+            method: HTTP method (GET, POST, PATCH, DELETE)
+            endpoint: API endpoint path
+            **kwargs: Additional arguments to pass to httpx
+
+        Returns:
+            Response JSON or None on error
+        """
+        if not self.is_configured():
+            logger.error("GitHub integration not configured")
+            return None
+
+        url = f"{self.api_url}/repos/{self.repository}/{endpoint}"
+        backoff = INITIAL_BACKOFF_SECONDS
+        last_error: Exception | None = None
+
+        for attempt in range(MAX_RETRIES + 1):
+            try:
+                if self._client:
+                    response = await self._client.request(method, url, **kwargs)
+                else:
+                    async with httpx.AsyncClient(headers=self._get_headers()) as client:
+                        response = await client.request(method, url, **kwargs)
+
+                # Handle rate limiting (429)
+                if response.status_code == 429:
+                    # Get reset time from headers
+                    reset_time = response.headers.get("X-RateLimit-Reset")
+                    retry_after = response.headers.get("Retry-After")
+
+                    if retry_after:
+                        wait_time = int(retry_after)
+                    elif reset_time:
+                        wait_time = max(0, int(reset_time) - int(time.time()))
+                    else:
+                        wait_time = min(backoff, MAX_BACKOFF_SECONDS)
+
+                    if attempt < MAX_RETRIES:
+                        logger.warning(
+                            f"Rate limited on {method} {endpoint}, "
+                            f"waiting {wait_time}s (attempt {attempt + 1}/{MAX_RETRIES + 1})"
+                        )
+                        await asyncio.sleep(wait_time)
+                        backoff = min(backoff * BACKOFF_MULTIPLIER, MAX_BACKOFF_SECONDS)
+                        continue
+                    else:
+                        raise GitHubRateLimitError(
+                            int(reset_time or 0),
+                            f"Rate limit exceeded after {MAX_RETRIES + 1} attempts",
+                        )
+
+                # Handle retryable server errors (5xx)
+                if response.status_code in RETRYABLE_STATUS_CODES and attempt < MAX_RETRIES:
+                    logger.warning(
+                        f"Retryable error {response.status_code} on {method} {endpoint}, "
+                        f"retrying in {backoff:.1f}s (attempt {attempt + 1}/{MAX_RETRIES + 1})"
+                    )
+                    await asyncio.sleep(backoff)
+                    backoff = min(backoff * BACKOFF_MULTIPLIER, MAX_BACKOFF_SECONDS)
+                    continue
+
+                response.raise_for_status()
+                return response.json() if response.text else {}
+
+            except httpx.HTTPStatusError as e:
+                last_error = e
+                # Don't retry client errors (4xx) except rate limit
+                if 400 <= e.response.status_code < 500 and e.response.status_code != 429:
+                    logger.error(f"HTTP error: {e.response.status_code} - {e.response.text}")
+                    return None
+                # For server errors, continue to retry logic
+                if attempt < MAX_RETRIES:
+                    logger.warning(
+                        f"HTTP error {e.response.status_code}, retrying in {backoff:.1f}s"
+                    )
+                    await asyncio.sleep(backoff)
+                    backoff = min(backoff * BACKOFF_MULTIPLIER, MAX_BACKOFF_SECONDS)
+                    continue
+
+            except (httpx.ConnectError, httpx.TimeoutException) as e:
+                last_error = e
+                if attempt < MAX_RETRIES:
+                    logger.warning(
+                        f"Connection error on {method} {endpoint}: {e}, "
+                        f"retrying in {backoff:.1f}s (attempt {attempt + 1}/{MAX_RETRIES + 1})"
+                    )
+                    await asyncio.sleep(backoff)
+                    backoff = min(backoff * BACKOFF_MULTIPLIER, MAX_BACKOFF_SECONDS)
+                    continue
+
+            except Exception as e:  # pylint: disable=broad-exception-caught
+                logger.error(f"Unexpected error on {method} {endpoint}: {e}")
+                return None
+
+        # All retries exhausted
+        if last_error:
+            logger.error(f"Request failed after {MAX_RETRIES + 1} attempts: {last_error}")
+        return None
+
+    async def _make_request_no_retry(
+        self, method: str, endpoint: str, **kwargs: Any
+    ) -> dict[str, Any] | None:
+        """Make an HTTP request without retry (for non-critical operations).
 
         Args:
             method: HTTP method (GET, POST, PATCH, DELETE)
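The 429 branch above derives its wait time from three sources in priority order: the Retry-After header, then X-RateLimit-Reset, then the current backoff value. A standalone sketch of that decision (header names as GitHub sends them; the helper itself is illustrative, not from the package):

    import time

    def rate_limit_wait(headers: dict[str, str], backoff: float, cap: float = 30.0) -> float:
        # Priority order mirrors the retry handler above.
        if "Retry-After" in headers:
            return float(headers["Retry-After"])
        if "X-RateLimit-Reset" in headers:
            return max(0.0, int(headers["X-RateLimit-Reset"]) - time.time())
        return min(backoff, cap)

    print(rate_limit_wait({"Retry-After": "7"}, backoff=2.0))  # 7.0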
@@ -295,6 +442,89 @@
             logger.error(f"Request failed: {e}")
             return None
 
+    async def _make_paginated_request(
+        self, endpoint: str, max_pages: int = 100
+    ) -> list[dict[str, Any]]:
+        """Make a paginated GET request to GitHub API, fetching all pages.
+
+        GitHub API returns at most 100 items per page for list endpoints.
+        This method follows pagination links to fetch ALL items.
+
+        Args:
+            endpoint: API endpoint path (e.g., "pulls/123/comments")
+            max_pages: Maximum number of pages to fetch (safety limit)
+
+        Returns:
+            Combined list of all items across all pages
+        """
+        if not self.is_configured():
+            logger.error("GitHub integration not configured")
+            return []
+
+        all_items: list[dict[str, Any]] = []
+        url: str | None = f"{self.api_url}/repos/{self.repository}/{endpoint}"
+        page_count = 0
+
+        # Add per_page=100 to maximize items per request
+        if "?" in endpoint:
+            url = f"{url}&per_page=100"
+        else:
+            url = f"{url}?per_page=100"
+
+        while url and page_count < max_pages:
+            page_count += 1
+            try:
+                if self._client:
+                    response = await self._client.request("GET", url)
+                else:
+                    async with httpx.AsyncClient(
+                        timeout=httpx.Timeout(30.0), headers=self._get_headers()
+                    ) as client:
+                        response = await client.request("GET", url)
+
+                response.raise_for_status()
+                items = response.json()
+
+                if isinstance(items, list):
+                    all_items.extend(items)
+                    logger.debug(
+                        f"Fetched page {page_count} with {len(items)} items "
+                        f"(total: {len(all_items)})"
+                    )
+                else:
+                    # Not a list response, shouldn't happen for list endpoints
+                    logger.warning(f"Unexpected response type on page {page_count}")
+                    break
+
+                # Check for next page in Link header
+                # Format: <url>; rel="next", <url>; rel="last"
+                link_header = response.headers.get("Link", "")
+                url = None  # Reset for next iteration
+
+                if link_header:
+                    for link in link_header.split(","):
+                        if 'rel="next"' in link:
+                            # Extract URL from <url>
+                            match = re.search(r"<([^>]+)>", link)
+                            if match:
+                                url = match.group(1)
+                            break
+
+            except httpx.HTTPStatusError as e:
+                logger.error(f"HTTP error during pagination: {e.response.status_code}")
+                break
+            except Exception as e:  # pylint: disable=broad-exception-caught
+                logger.error(f"Error during pagination: {e}")
+                break
+
+        if page_count >= max_pages:
+            logger.warning(f"Reached max pages limit ({max_pages}), results may be incomplete")
+
+        logger.debug(
+            f"Paginated request complete: {len(all_items)} total items from {page_count} page(s)"
+        )
+        return all_items
+
     # ==================== PR Comments ====================
 
     async def post_comment(self, comment_body: str) -> bool:
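Pagination stops when the Link header no longer advertises a rel="next" URL. A standalone sketch of the header parsing used in the loop above (the sample header value is made up):

    import re

    link_header = (
        '<https://api.github.com/repos/acme/repo/pulls/1/comments?per_page=100&page=2>; rel="next", '
        '<https://api.github.com/repos/acme/repo/pulls/1/comments?per_page=100&page=5>; rel="last"'
    )

    next_url = None
    for link in link_header.split(","):
        if 'rel="next"' in link:
            match = re.search(r"<([^>]+)>", link)
            if match:
                next_url = match.group(1)
            break

    print(next_url)  # https://api.github.com/repos/acme/repo/pulls/1/comments?per_page=100&page=2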
@@ -350,7 +580,11 @@
     ) -> bool:
         """Post or update multiple related comments (for large reports).
 
-        This method will:
+        For single-part comments (most common case), this will UPDATE the
+        existing comment in place rather than delete and recreate it.
+        This preserves comment history and avoids PR timeline noise.
+
+        For multi-part comments:
         1. Delete all old comments with the identifier
         2. Post new comments in sequence with part indicators
         3. Validate each part stays under GitHub's limit
@@ -365,16 +599,45 @@
         # GitHub's actual limit
         github_comment_limit = 65536
 
-        # Delete all existing comments with this identifier
+        total_parts = len(comment_parts)
+
+        # Optimization: For single-part comments, use update-or-create
+        # This preserves the existing comment and avoids PR timeline noise
+        if total_parts == 1:
+            part_body = comment_parts[0]
+            full_body = f"{identifier}\n\n{part_body}"
+
+            # Safety check: ensure we don't exceed GitHub's limit
+            if len(full_body) > github_comment_limit:
+                logger.error(
+                    f"Comment exceeds GitHub's limit ({len(full_body)} > {github_comment_limit} chars). "
+                    f"Comment will be truncated."
+                )
+                available_space = github_comment_limit - 500
+                truncated_body = part_body[:available_space]
+                truncation_warning = (
+                    "\n\n---\n\n"
+                    "> ⚠️ **This comment was truncated to fit GitHub's size limit**\n"
+                    ">\n"
+                    "> Download the full report using `--output report.json` or "
+                    "`--format markdown --output report.md`\n"
+                )
+                full_body = f"{identifier}\n\n{truncated_body}{truncation_warning}"
+
+            success = await self.update_or_create_comment(full_body, identifier)
+            if success:
+                logger.info("Successfully updated summary comment")
+            return success
+
+        # Multi-part: Delete all existing comments with this identifier first
         await self._delete_comments_with_identifier(identifier)
 
         # Post each part
         success = True
-        total_parts = len(comment_parts)
 
         for part_num, part_body in enumerate(comment_parts, 1):
             # Add identifier and part indicator
-            part_indicator = f"**(Part {part_num}/{total_parts})**" if total_parts > 1 else ""
+            part_indicator = f"**(Part {part_num}/{total_parts})**"
             full_body = f"{identifier}\n{part_indicator}\n\n{part_body}"
 
             # Safety check: ensure we don't exceed GitHub's limit
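The truncation path reserves a flat 500 characters below the 65,536-character limit to absorb the identifier and the warning footer. A standalone sketch of that arithmetic (illustrative helper, not the package's function):

    GITHUB_COMMENT_LIMIT = 65536

    def truncate(part_body: str, identifier: str) -> str:
        full = f"{identifier}\n\n{part_body}"
        if len(full) <= GITHUB_COMMENT_LIMIT:
            return full
        # Keep the first (limit - 500) chars; the margin absorbs identifier + footer.
        kept = part_body[: GITHUB_COMMENT_LIMIT - 500]
        footer = "\n\n---\n\n> ⚠️ **This comment was truncated to fit GitHub's size limit**\n"
        return f"{identifier}\n\n{kept}{footer}"

    print(len(truncate("x" * 70_000, "<!-- bot -->")) <= GITHUB_COMMENT_LIMIT)  # True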
@@ -472,19 +735,15 @@
     # ==================== PR Review Comments (Line-specific) ====================
 
     async def get_review_comments(self) -> list[dict[str, Any]]:
-        """Get all review comments on the PR.
+        """Get all review comments on the PR with pagination.
+
+        Fetches ALL review comments across all pages. This is critical for
+        proper comment deduplication and cleanup when there are many findings.
 
         Returns:
-            List of review comment dicts
+            List of all review comment dicts
         """
-        result = await self._make_request(
-            "GET",
-            f"pulls/{self.pr_number}/comments",
-        )
-
-        if result and isinstance(result, list):
-            return result
-        return []
+        return await self._make_paginated_request(f"pulls/{self.pr_number}/comments")
 
     async def get_bot_review_comments_with_location(
         self, identifier: str = constants.BOT_IDENTIFIER
@@ -552,29 +811,52 @@
         )
 
         if result is not None:  # DELETE returns empty dict on success
-            logger.info(f"Successfully deleted review comment {comment_id}")
+            logger.debug(f"Successfully deleted review comment {comment_id}")
             return True
         return False
 
-    async def resolve_review_comment(self, comment_id: int) -> bool:
-        """Resolve a specific review comment.
+    async def _delete_comments_parallel(
+        self, comment_ids: list[int], max_concurrent: int = MAX_CONCURRENT_API_CALLS
+    ) -> tuple[int, int]:
+        """Delete multiple review comments in parallel with controlled concurrency.
+
+        Uses a semaphore to limit concurrent API calls, preventing rate limit issues
+        while still being much faster than sequential deletion.
 
         Args:
-            comment_id: ID of the comment to resolve
+            comment_ids: List of comment IDs to delete
+            max_concurrent: Maximum number of concurrent deletions (default: 10)
 
         Returns:
-            True if successful, False otherwise
+            Tuple of (successful_count, failed_count)
         """
-        result = await self._make_request(
-            "PATCH",
-            f"pulls/comments/{comment_id}",
-            json={"state": "resolved"},
+        if not comment_ids:
+            return (0, 0)
+
+        semaphore = asyncio.Semaphore(max_concurrent)
+
+        async def delete_with_limit(comment_id: int) -> bool:
+            async with semaphore:
+                return await self.delete_review_comment(comment_id)
+
+        # Run all deletions in parallel (semaphore controls actual concurrency)
+        results = await asyncio.gather(
+            *[delete_with_limit(cid) for cid in comment_ids],
+            return_exceptions=True,
         )
 
-        if result is not None:
-            logger.info(f"Successfully resolved review comment {comment_id}")
-            return True
-        return False
+        successful = sum(1 for r in results if r is True)
+        failed = len(results) - successful
+
+        if successful > 0:
+            logger.info(f"Parallel deletion: {successful} deleted, {failed} failed")
+
+        return (successful, failed)
+
+    # NOTE: resolve_review_comment was removed because GitHub REST API doesn't support
+    # resolving review comments via {"state": "resolved"}. Resolving review threads
+    # requires the GraphQL API with the resolveReviewThread mutation.
+    # See: https://docs.github.com/en/graphql/reference/mutations#resolvereviewthread
 
     async def update_review_comment(self, comment_id: int, new_body: str) -> bool:
         """Update the body text of an existing review comment.
@@ -601,6 +883,7 @@
         """Delete all review comments from the bot (from previous runs).
 
         This ensures old/outdated comments are removed before posting new ones.
+        Uses parallel deletion for speed when there are many comments.
 
         Args:
            identifier: String to identify bot comments
@@ -609,8 +892,9 @@
             Number of comments deleted
         """
         comments = await self.get_review_comments()
-        deleted_count = 0
 
+        # Collect all bot comment IDs to delete
+        comment_ids_to_delete: list[int] = []
         for comment in comments:
             if not isinstance(comment, dict):
                 continue
@@ -620,47 +904,23 @@
 
             # Check if this is a bot comment
             if identifier in str(body) and isinstance(comment_id, int):
-                if await self.delete_review_comment(comment_id):
-                    deleted_count += 1
-
-        if deleted_count > 0:
-            logger.info(f"Cleaned up {deleted_count} old review comments")
-
-        return deleted_count
-
-    async def cleanup_bot_review_comments_by_resolving(
-        self, identifier: str = constants.BOT_IDENTIFIER
-    ) -> int:
-        """Resolve all review comments from the bot (from previous runs).
-
-        This marks old/outdated comments as resolved instead of deleting them,
-        preserving them in the PR for audit trail purposes.
-
-        Args:
-            identifier: String to identify bot comments
-
-        Returns:
-            Number of comments resolved
-        """
-        comments = await self.get_review_comments()
-        resolved_count = 0
+                comment_ids_to_delete.append(comment_id)
 
-        for comment in comments:
-            if not isinstance(comment, dict):
-                continue
+        if not comment_ids_to_delete:
+            return 0
 
-            body = comment.get("body", "")
-            comment_id = comment.get("id")
+        # Delete all bot comments in parallel
+        successful, _failed = await self._delete_comments_parallel(comment_ids_to_delete)
 
-            # Check if this is a bot comment
-            if identifier in str(body) and isinstance(comment_id, int):
-                if await self.resolve_review_comment(comment_id):
-                    resolved_count += 1
+        if successful > 0:
+            logger.info(f"Cleaned up {successful} old review comments")
 
-        if resolved_count > 0:
-            logger.info(f"Resolved {resolved_count} old review comments")
+        return successful
 
-        return resolved_count
+    # NOTE: cleanup_bot_review_comments_by_resolving was removed because it depended on
+    # resolve_review_comment which doesn't work with GitHub REST API.
+    # Use cleanup_bot_review_comments (deletion) instead, or implement GraphQL-based
+    # resolution if audit trail preservation is needed.
 
     async def create_review_comment(
         self,
@@ -778,19 +1038,37 @@
         body: str = "",
         event: ReviewEvent = ReviewEvent.COMMENT,
         identifier: str = constants.REVIEW_IDENTIFIER,
+        validated_files: set[str] | None = None,
+        skip_cleanup: bool = False,
     ) -> bool:
-        """Smart comment management: update existing, create new, delete resolved.
+        """Smart comment management using fingerprint-based matching.
+
+        This method uses finding fingerprints (stable IDs) as the PRIMARY key
+        for matching comments, with location as SECONDARY for new comments.
+
+        Strategy:
+        1. Index existing comments by finding_id (from HTML comment)
+        2. For each new comment:
+           - If finding_id exists: UPDATE (even if line changed)
+           - If new: CREATE at specified line
+        3. Delete comments whose finding_id is not in new set (resolved)
+           (unless skip_cleanup=True)
 
-        This method implements a three-step process:
-        1. Fetch existing bot comments at each location
-        2. For each new comment: update if exists, create if new
-        3. Delete old comments where issues have been resolved
+        Note: Comments stay at their original line even if the issue moved,
+        because GitHub doesn't support moving review comments. The comment
+        body is updated to reflect any changes.
 
         Args:
             comments: List of comment dicts with keys: path, line, body, (optional) side
             body: The overall review body text
             event: The review event type (APPROVE, REQUEST_CHANGES, COMMENT)
             identifier: String to identify bot comments (for matching existing)
+            validated_files: Set of all file paths that were validated in this run.
+                Used to clean up comments for files that no longer have findings.
+                If None, only files with current findings are considered.
+            skip_cleanup: If True, skip the cleanup phase (deleting resolved comments).
+                Use this in streaming mode where files are processed one at a time
+                to avoid deleting comments from files processed earlier.
 
         Returns:
             True if successful, False otherwise
@@ -798,29 +1076,31 @@
         Example:
            # First run: Creates 3 comments
            comments = [
-               {"path": "policy.json", "line": 5, "body": "Issue A"},
-               {"path": "policy.json", "line": 10, "body": "Issue B"},
-               {"path": "policy.json", "line": 15, "body": "Issue C"},
+               {"path": "policy.json", "line": 5, "body": "<!-- finding-id: abc123 -->Issue A"},
+               {"path": "policy.json", "line": 10, "body": "<!-- finding-id: def456 -->Issue B"},
            ]
 
-           # Second run: Updates Issue A, keeps B, deletes C (resolved), adds D
+           # Second run: Same findings, even if lines shifted
            comments = [
-               {"path": "policy.json", "line": 5, "body": "Issue A (updated)"},
-               {"path": "policy.json", "line": 10, "body": "Issue B"},  # Same = no update
-               {"path": "policy.json", "line": 20, "body": "Issue D"},  # New
+               {"path": "policy.json", "line": 8, "body": "<!-- finding-id: abc123 -->Issue A (updated)"},
+               {"path": "policy.json", "line": 15, "body": "<!-- finding-id: def456 -->Issue B"},
            ]
-           # Result: line 15 comment deleted (resolved), line 5 updated, line 20 created
+           # Result: Both comments UPDATED in place (not recreated), preserving conversation history
         """
-        # Step 1: Get existing bot comments mapped by location
-        existing_comments = await self.get_bot_review_comments_with_location(identifier)
-        logger.debug(f"Found {len(existing_comments)} existing bot comments")
+        # Step 1: Get existing bot comments indexed by fingerprint
+        existing_by_fingerprint = await self._get_bot_comments_by_fingerprint(identifier)
+        logger.debug(
+            f"Found {len(existing_by_fingerprint)} existing bot comments with fingerprints"
+        )
+
+        # Also get location-based index for fallback (comments without fingerprints)
+        existing_by_location = await self.get_bot_review_comments_with_location(identifier)
 
-        # Track which existing comments we've seen (to know what to delete later)
+        seen_fingerprints: set[str] = set()
         seen_locations: set[tuple[str, int, str]] = set()
+        # Track comment IDs that were updated/matched - these should NOT be deleted
+        matched_comment_ids: set[int] = set()
         updated_count = 0
-        created_count = 0
-
-        # Step 2: Update or create each new comment
         new_comments_for_review: list[dict[str, Any]] = []
 
         for comment in comments:
@@ -828,33 +1108,59 @@
             path = comment["path"]
             line = comment["line"]
             new_body = comment["body"]
 
-            # Extract issue type from comment body HTML comment
+            # Try fingerprint-based matching first
+            finding_id = self._extract_finding_id(new_body)
+
+            if finding_id:
+                seen_fingerprints.add(finding_id)
+
+                if finding_id in existing_by_fingerprint:
+                    existing = existing_by_fingerprint[finding_id]
+                    matched_comment_ids.add(existing["id"])
+                    # Check if update needed (body changed)
+                    if existing["body"] != new_body:
+                        success = await self.update_review_comment(existing["id"], new_body)
+                        if success:
+                            updated_count += 1
+                            logger.debug(
+                                f"Updated comment for finding {finding_id[:8]}... "
+                                f"(was at {existing['path']}:{existing['line']})"
+                            )
+                    else:
+                        logger.debug(f"Comment for finding {finding_id[:8]}... unchanged")
+                    continue
+
+            # Fallback: location-based matching
+            # This handles both:
+            # 1. Legacy comments without fingerprints
+            # 2. Comments with fingerprints that don't match (e.g., path changed)
             issue_type_match = re.search(r"<!-- issue-type: (\w+) -->", new_body)
             issue_type = issue_type_match.group(1) if issue_type_match else "unknown"
-
             location = (path, line, issue_type)
             seen_locations.add(location)
 
-            existing = existing_comments.get(location)
-
-            if existing:
-                # Comment exists at this location - check if body changed
-                if existing["body"] != new_body:
-                    # Update the existing comment
-                    success = await self.update_review_comment(existing["id"], new_body)
+            existing_loc = existing_by_location.get(location)
+            if existing_loc:
+                # Found existing comment at same location with same issue type
+                # Update it (this handles both legacy comments and fingerprint mismatches)
+                matched_comment_ids.add(existing_loc["id"])
+                if existing_loc["body"] != new_body:
+                    success = await self.update_review_comment(existing_loc["id"], new_body)
                     if success:
                         updated_count += 1
-                        logger.debug(f"Updated comment at {path}:{line}")
-                    else:
-                        logger.warning(f"Failed to update comment at {path}:{line}")
-                else:
-                    # Body unchanged, skip update
-                    logger.debug(f"Comment at {path}:{line} unchanged, skipping update")
-            else:
-                # New comment - collect for batch creation
-                new_comments_for_review.append(comment)
+                        if finding_id:
+                            logger.debug(
+                                f"Updated comment at {path}:{line} (fingerprint mismatch, location match)"
+                            )
+                        else:
+                            logger.debug(f"Updated legacy comment at {path}:{line}")
+                continue
+
+            # New comment - collect for batch creation
+            new_comments_for_review.append(comment)
 
-        # Step 3: Create new comments via review API (if any)
+        # Step 2: Create new comments via review API (if any)
+        created_count = 0
         if new_comments_for_review:
             success = await self.create_review_with_comments(
                 new_comments_for_review,
@@ -868,26 +1174,92 @@
                 logger.error("Failed to create new review comments")
                 return False
 
-        # Step 4: Delete comments for resolved issues (not in new comment set)
-        # IMPORTANT: Only delete comments for files that are in the current batch
-        # to avoid deleting comments from other files processed in the same run
+        # Step 3: Delete resolved comments (unless skip_cleanup is True)
+        # In streaming mode, we skip cleanup because we're processing files one at a time
+        # and don't want to delete comments from files processed earlier in the stream
         deleted_count = 0
-        files_in_batch = {comment["path"] for comment in comments}
-
-        for location, existing in existing_comments.items():
-            # Only delete if:
-            # 1. This location is not in the new comment set (resolved issue)
-            # 2. AND this file is in the current batch (don't touch other files' comments)
-            if location not in seen_locations and existing["path"] in files_in_batch:
-                # This comment location is no longer in the new issues - delete it
-                success = await self.delete_review_comment(existing["id"])
-                if success:
-                    deleted_count += 1
-                    logger.debug(
-                        f"Deleted resolved comment at {existing['path']}:{existing['line']}"
-                    )
 
-        # Summary
+        if skip_cleanup:
+            logger.debug("Skipping cleanup phase (streaming mode)")
+        else:
+            # Priority: fingerprint-based deletion, then location-based for legacy
+            # Also clean up comments for files removed from the PR or files that were
+            # validated but no longer have findings
+            files_with_findings = {c["path"] for c in comments}
+
+            # Use validated_files if provided, otherwise fall back to files_with_findings
+            # This ensures we clean up comments for files that were validated but have no findings
+            files_in_scope = validated_files if validated_files is not None else files_with_findings
+
+            # Get current PR files to detect removed files
+            # Note: get_pr_files() returns [] on error, so we check for non-empty result
+            pr_files = await self.get_pr_files()
+            if pr_files:
+                current_pr_files: set[str] | None = {f["filename"] for f in pr_files}
+            else:
+                # Empty result could be an API error - fall back to batch-only cleanup
+                # to avoid accidentally deleting valid comments
+                logger.debug("Could not fetch PR files for cleanup, using batch-only mode")
+                current_pr_files = None
+
+            def should_delete_comment(existing_path: str) -> bool:
+                """Check if a comment should be deleted based on file status.
+
+                A comment should be deleted if the file is part of this PR.
+                The fingerprint check (done by caller) ensures we only delete
+                comments for findings that are no longer present.
+
+                This aggressive cleanup ensures stale comments are removed even if:
+                - The file was fixed but not re-validated in this specific run
+                - The validation runs on a subset of PR files
+
+                We preserve comments for files NOT in the PR to avoid accidentally
+                deleting comments from other branches/PRs.
+                """
+                # If we successfully fetched PR files, delete comments for any PR file
+                # whose finding is no longer present (fingerprint check done by caller)
+                if current_pr_files is not None:
+                    return existing_path in current_pr_files
+
+                # Fallback: if we couldn't fetch PR files, only clean up validated files
+                # to avoid accidentally deleting valid comments
+                return existing_path in files_in_scope
+
+            # Collect all comment IDs to delete
+            # Delete by fingerprint (primary) - comments that:
+            # 1. Were NOT matched (updated) in this run
+            # 2. Have a fingerprint not in the new findings
+            # 3. Are in files that are part of this PR/validation
+            comment_ids_to_delete: list[int] = []
+
+            for fingerprint, existing in existing_by_fingerprint.items():
+                comment_id = existing["id"]
+                # Skip if this comment was matched/updated via location fallback
+                if comment_id in matched_comment_ids:
+                    continue
+                if fingerprint not in seen_fingerprints and should_delete_comment(existing["path"]):
+                    comment_ids_to_delete.append(comment_id)
+                    logger.debug(f"Marking for deletion: resolved comment {fingerprint[:8]}...")
+
+            # Delete by location (legacy comments without fingerprints)
+            for location, existing in existing_by_location.items():
+                comment_id = existing["id"]
+                # Skip if already matched/updated
+                if comment_id in matched_comment_ids:
+                    continue
+                # Skip if already marked for deletion by fingerprint above
+                existing_fingerprint = self._extract_finding_id(existing.get("body", ""))
+                if existing_fingerprint:
+                    continue  # Already handled above
+
+                if location not in seen_locations and should_delete_comment(existing["path"]):
+                    comment_ids_to_delete.append(comment_id)
+                    logger.debug(f"Marking for deletion: resolved legacy comment at {location}")
+
+            # Delete all collected comments in parallel
+            if comment_ids_to_delete:
+                deleted_count, _failed = await self._delete_comments_parallel(comment_ids_to_delete)
+
         logger.info(
             f"Review comment management: {updated_count} updated, "
             f"{created_count} created, {deleted_count} deleted (resolved)"
@@ -895,6 +1267,50 @@
 
         return True
 
+    def _extract_finding_id(self, body: str) -> str | None:
+        """Extract finding ID from comment body HTML comment.
+
+        Args:
+            body: Comment body text
+
+        Returns:
+            16-character finding ID hash, or None if not found
+        """
+        match = re.search(r"<!-- finding-id: ([a-f0-9]{16}) -->", body)
+        return match.group(1) if match else None
+
+    async def _get_bot_comments_by_fingerprint(self, identifier: str) -> dict[str, dict[str, Any]]:
+        """Index existing bot comments by their finding fingerprint.
+
+        Args:
+            identifier: String to identify bot comments
+
+        Returns:
+            Dict mapping finding_id to comment metadata dict
+            Comment dict contains: id, body, path, line
+        """
+        comments = await self.get_review_comments()
+        indexed: dict[str, dict[str, Any]] = {}
+
+        for comment in comments:
+            if not isinstance(comment, dict):
+                continue
+
+            body = comment.get("body", "")
+            if identifier not in str(body):
+                continue
+
+            finding_id = self._extract_finding_id(body)
+            if finding_id:
+                indexed[finding_id] = {
+                    "id": comment["id"],
+                    "body": body,
+                    "path": comment.get("path", ""),
+                    "line": comment.get("line") or comment.get("original_line"),
+                }
+
+        return indexed
+
     # ==================== PR Labels ====================
 
     async def add_labels(self, labels: list[str]) -> bool:
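The fingerprint is carried in an HTML comment that GitHub renders invisibly, so it survives round-trips through the UI. A quick standalone demonstration of the extraction regex:

    import re

    body = "<!-- finding-id: 0123456789abcdef -->\n**Wildcard action** detected."
    match = re.search(r"<!-- finding-id: ([a-f0-9]{16}) -->", body)
    print(match.group(1) if match else None)  # 0123456789abcdef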
@@ -1061,3 +1477,345 @@
             logger.info(f"Successfully set commit status: {state}")
             return True
         return False
+
+    # ==================== CODEOWNERS and Ignore Commands ====================
+
+    async def get_codeowners_content(self) -> str | None:
+        """Fetch CODEOWNERS file content from repository.
+
+        Results are cached per instance to avoid redundant API calls.
+
+        Searches in standard CODEOWNERS locations:
+        - CODEOWNERS
+        - .github/CODEOWNERS
+        - docs/CODEOWNERS
+
+        Returns:
+            CODEOWNERS file content as string, or None if not found
+        """
+        # Return cached result if already loaded
+        if self._codeowners_loaded:
+            return self._codeowners_cache
+
+        from iam_validator.core.codeowners import (  # pylint: disable=import-outside-toplevel
+            CodeOwnersParser,
+        )
+
+        for path in CodeOwnersParser.CODEOWNERS_PATHS:
+            result = await self._make_request(
+                "GET",
+                f"contents/{path}",
+            )
+
+            if result and isinstance(result, dict) and "content" in result:
+                try:
+                    content = base64.b64decode(result["content"]).decode("utf-8")
+                    logger.debug(f"Found CODEOWNERS at {path}")
+                    # Cache the result
+                    self._codeowners_cache = content
+                    self._codeowners_loaded = True
+                    return content
+                except (ValueError, UnicodeDecodeError) as e:
+                    logger.warning(f"Failed to decode CODEOWNERS at {path}: {e}")
+                    continue
+
+        logger.debug("No CODEOWNERS file found in repository")
+        # Cache the negative result too
+        self._codeowners_cache = None
+        self._codeowners_loaded = True
+        return None
+
+    async def get_team_members(self, org: str, team_slug: str) -> list[str]:
+        """Get members of a GitHub team.
+
+        Results are cached per instance to avoid redundant API calls
+        when checking multiple users against the same team.
+
+        Note: This requires the token to have `read:org` scope for
+        organization teams.
+
+        Args:
+            org: Organization name
+            team_slug: Team slug (URL-friendly name)
+
+        Returns:
+            List of team member usernames (lowercase)
+        """
+        # Check cache first
+        cache_key = (org.lower(), team_slug.lower())
+        if cache_key in self._team_cache:
+            logger.debug(f"Using cached team members for {org}/{team_slug}")
+            return self._team_cache[cache_key]
+
+        url = f"{self.api_url}/orgs/{org}/teams/{team_slug}/members"
+
+        try:
+            if self._client:
+                response = await self._client.request("GET", url)
+            else:
+                async with httpx.AsyncClient(
+                    headers=self._get_headers(), timeout=httpx.Timeout(30.0)
+                ) as client:
+                    response = await client.request("GET", url)
+
+            response.raise_for_status()
+            result = response.json()
+
+            if isinstance(result, list):
+                members = [
+                    member.get("login", "").lower()
+                    for member in result
+                    if isinstance(member, dict) and member.get("login")
+                ]
+                # Cache the result
+                self._team_cache[cache_key] = members
+                logger.debug(f"Found {len(members)} members in team {org}/{team_slug}")
+                return members
+
+        except httpx.HTTPStatusError as e:
+            logger.warning(
+                f"Failed to get team members for {org}/{team_slug}: HTTP {e.response.status_code}"
+            )
+        except Exception as e:  # pylint: disable=broad-exception-caught
+            logger.warning(f"Failed to get team members for {org}/{team_slug}: {e}")
+
+        # Cache empty result to avoid repeated failed API calls
+        self._team_cache[cache_key] = []
+        return []
+
+    async def is_user_codeowner(
+        self,
+        username: str,
+        file_path: str,
+        codeowners_parser: "CodeOwnersParser | None" = None,
+        allowed_users: list[str] | None = None,
+    ) -> bool:
+        """Check if a user is authorized to ignore findings for a file.
+
+        Authorization is granted if:
+        1. User is listed directly in CODEOWNERS for the file
+        2. User is a member of a team listed in CODEOWNERS for the file
+        3. User is in the allowed_users fallback list (when no CODEOWNERS)
+
+        Performance: Team membership checks are executed in parallel.
+
+        Args:
+            username: GitHub username to check
+            file_path: Path to the file being checked
+            codeowners_parser: Pre-parsed CODEOWNERS (for caching)
+            allowed_users: Fallback list of allowed users (when no CODEOWNERS)
+
+        Returns:
+            True if user is authorized, False otherwise
+        """
+        username_lower = username.lower()
+
+        # Check fallback allowed_users first (always applies if configured)
+        if allowed_users:
+            if username_lower in [u.lower() for u in allowed_users]:
+                logger.debug(f"User {username} authorized via allowed_users config")
+                return True
+
+        # Get or parse CODEOWNERS
+        parser = codeowners_parser
+        if parser is None:
+            content = await self.get_codeowners_content()
+            if content is None:
+                # No CODEOWNERS and no allowed_users match = deny
+                logger.debug(f"No CODEOWNERS file found, user {username} not in allowed_users")
+                return False
+
+            from iam_validator.core.codeowners import (  # pylint: disable=import-outside-toplevel
+                CodeOwnersParser,
+            )
+
+            parser = CodeOwnersParser(content)
+
+        # Check direct user ownership
+        if parser.is_owner(username, file_path):
+            logger.debug(f"User {username} is direct owner of {file_path}")
+            return True
+
+        # Check team membership - fetch all teams in parallel for speed
+        teams = parser.get_teams_for_file(file_path)
+        if not teams:
+            logger.debug(f"User {username} is not authorized for {file_path}")
+            return False
+
+        # Fetch all team memberships concurrently
+
+        async def check_team(org: str, team_slug: str) -> tuple[str, str, bool]:
+            members = await self.get_team_members(org, team_slug)
+            return (org, team_slug, username_lower in members)
+
+        results = await asyncio.gather(*[check_team(org, team_slug) for org, team_slug in teams])
+
+        for org, team_slug, is_member in results:
+            if is_member:
+                logger.debug(f"User {username} authorized via team {org}/{team_slug}")
+                return True
+
+        logger.debug(f"User {username} is not authorized for {file_path}")
+        return False
+
+    async def get_issue_comments(self) -> list[dict[str, Any]]:
+        """Get all issue comments (general PR comments, not review comments) with pagination.
+
+        Fetches ALL issue comments across all pages. This ensures proper
+        comment management when there are many comments on a PR.
+
+        Returns:
+            List of all issue comment dicts
+        """
+        return await self._make_paginated_request(f"issues/{self.pr_number}/comments")
+
+    async def get_comment_by_id(self, comment_id: int) -> dict[str, Any] | None:
+        """Get a specific review comment by ID.
+
+        Used for verifying that ignore command replies still exist
+        (tamper-resistant verification).
+
+        Args:
+            comment_id: The ID of the review comment to fetch
+
+        Returns:
+            Comment dict if found, None if deleted or error
+        """
+        result = await self._make_request(
+            "GET",
+            f"pulls/comments/{comment_id}",
+        )
+
+        if result and isinstance(result, dict):
+            return result
+        return None
+
+    async def post_reply_to_review_comment(
+        self,
+        comment_id: int,
+        body: str,
+    ) -> bool:
+        """Post a reply to a review comment thread.
+
+        Args:
+            comment_id: The ID of the review comment to reply to
+            body: The reply text (markdown supported)
+
+        Returns:
+            True if successful, False otherwise
+        """
+        result = await self._make_request(
+            "POST",
+            f"pulls/{self.pr_number}/comments",
+            json={
+                "body": body,
+                "in_reply_to": comment_id,
+            },
+        )
+
+        if result:
+            logger.debug(f"Successfully posted reply to comment {comment_id}")
+            return True
+        return False
+
+    async def scan_for_ignore_commands(
+        self,
+        identifier: str = constants.BOT_IDENTIFIER,
+    ) -> list[tuple[dict[str, Any], dict[str, Any]]]:
+        """Scan for ignore commands in replies to bot review comments.
+
+        Looks for replies to bot comments that contain ignore commands.
+        Supports formats: "ignore", "/ignore", "@iam-validator ignore",
+        "skip", "suppress", and "ignore: reason here".
+
+        Args:
+            identifier: String to identify bot comments
+
+        Returns:
+            List of (bot_comment, reply_comment) tuples where reply
+            contains an ignore command
+        """
+        all_comments = await self.get_review_comments()
+        ignore_commands: list[tuple[dict[str, Any], dict[str, Any]]] = []
+
+        # Index bot comments by ID for O(1) lookup
+        bot_comments_by_id: dict[int, dict[str, Any]] = {}
+        for comment in all_comments:
+            if not isinstance(comment, dict):
+                continue
+            body = comment.get("body", "")
+            comment_id = comment.get("id")
+            if identifier in str(body) and isinstance(comment_id, int):
+                bot_comments_by_id[comment_id] = comment
+
+        # Find replies with ignore commands
+        for comment in all_comments:
+            if not isinstance(comment, dict):
+                continue
+
+            reply_to_id = comment.get("in_reply_to_id")
+            if reply_to_id and reply_to_id in bot_comments_by_id:
+                body = comment.get("body", "")
+                if self._is_ignore_command(body):
+                    ignore_commands.append((bot_comments_by_id[reply_to_id], comment))
+
+        logger.debug(f"Found {len(ignore_commands)} ignore command(s) in PR comments")
+        return ignore_commands
+
+    def _is_ignore_command(self, text: str) -> bool:
+        """Check if text is an ignore command.
+
+        Supports:
+        - "ignore" (case insensitive)
+        - "/ignore"
+        - "@iam-validator ignore"
+        - "skip", "suppress"
+        - "ignore: reason here" (with optional reason)
+
+        Args:
+            text: Comment text to check
+
+        Returns:
+            True if text is an ignore command
+        """
+        if not text:
+            return False
+
+        text = text.strip().lower()
+
+        ignore_patterns = [
+            r"^\s*ignore\s*$",
+            r"^\s*/ignore\s*$",
+            r"^\s*@?iam-validator\s+ignore\s*$",
+            r"^\s*ignore\s*:\s*.+$",  # With reason
+            r"^\s*skip\s*$",
+            r"^\s*suppress\s*$",
+        ]
+
+        return any(re.match(pattern, text, re.IGNORECASE) for pattern in ignore_patterns)
+
+    @staticmethod
+    def extract_finding_id(comment_body: str) -> str | None:
+        """Extract finding ID from a bot comment.
+
+        Args:
+            comment_body: The comment body text
+
+        Returns:
+            Finding ID hash, or None if not found
+        """
+        match = re.search(r"<!-- finding-id: ([a-f0-9]+) -->", comment_body)
+        return match.group(1) if match else None
+
+    @staticmethod
+    def extract_ignore_reason(text: str) -> str | None:
+        """Extract reason from ignore command.
+
+        Args:
+            text: The ignore command text
+
+        Returns:
+            Reason string, or None if no reason provided
+        """
+        match = re.search(r"ignore\s*:\s*(.+)$", text.strip(), re.IGNORECASE)
+        return match.group(1).strip() if match else None
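The ignore-command patterns are anchored full-line matches, so a sentence that merely mentions the word is not treated as a command. A standalone sketch exercising them:

    import re

    PATTERNS = [
        r"^\s*ignore\s*$",
        r"^\s*/ignore\s*$",
        r"^\s*@?iam-validator\s+ignore\s*$",
        r"^\s*ignore\s*:\s*.+$",  # with a reason
        r"^\s*skip\s*$",
        r"^\s*suppress\s*$",
    ]

    def is_ignore(text: str) -> bool:
        return any(re.match(p, text.strip().lower(), re.IGNORECASE) for p in PATTERNS)

    for sample in ["ignore", "/ignore", "ignore: accepted risk", "please ignore this later"]:
        print(sample, "->", is_ignore(sample))
    # The last sample prints False: extra words break the full-line match.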