iam-policy-validator 1.13.1__py3-none-any.whl → 1.14.1__py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as published to one of the supported registries. It is provided for informational purposes only.
- {iam_policy_validator-1.13.1.dist-info → iam_policy_validator-1.14.1.dist-info}/METADATA +1 -1
- {iam_policy_validator-1.13.1.dist-info → iam_policy_validator-1.14.1.dist-info}/RECORD +45 -39
- iam_validator/__version__.py +1 -1
- iam_validator/checks/action_condition_enforcement.py +6 -0
- iam_validator/checks/action_resource_matching.py +12 -12
- iam_validator/checks/action_validation.py +1 -0
- iam_validator/checks/condition_key_validation.py +2 -0
- iam_validator/checks/condition_type_mismatch.py +3 -0
- iam_validator/checks/full_wildcard.py +1 -0
- iam_validator/checks/mfa_condition_check.py +2 -0
- iam_validator/checks/policy_structure.py +9 -0
- iam_validator/checks/policy_type_validation.py +11 -0
- iam_validator/checks/principal_validation.py +5 -0
- iam_validator/checks/resource_validation.py +4 -0
- iam_validator/checks/sensitive_action.py +1 -0
- iam_validator/checks/service_wildcard.py +6 -3
- iam_validator/checks/set_operator_validation.py +3 -0
- iam_validator/checks/sid_uniqueness.py +2 -0
- iam_validator/checks/trust_policy_validation.py +3 -0
- iam_validator/checks/utils/__init__.py +16 -0
- iam_validator/checks/utils/action_parser.py +149 -0
- iam_validator/checks/wildcard_action.py +1 -0
- iam_validator/checks/wildcard_resource.py +231 -4
- iam_validator/commands/analyze.py +19 -1
- iam_validator/commands/completion.py +6 -2
- iam_validator/commands/validate.py +231 -12
- iam_validator/core/aws_service/fetcher.py +21 -9
- iam_validator/core/codeowners.py +245 -0
- iam_validator/core/config/check_documentation.py +390 -0
- iam_validator/core/config/config_loader.py +199 -0
- iam_validator/core/config/defaults.py +25 -0
- iam_validator/core/constants.py +1 -0
- iam_validator/core/diff_parser.py +8 -4
- iam_validator/core/finding_fingerprint.py +131 -0
- iam_validator/core/formatters/sarif.py +370 -128
- iam_validator/core/ignore_processor.py +309 -0
- iam_validator/core/ignored_findings.py +400 -0
- iam_validator/core/models.py +54 -4
- iam_validator/core/policy_loader.py +313 -4
- iam_validator/core/pr_commenter.py +223 -22
- iam_validator/core/report.py +22 -6
- iam_validator/integrations/github_integration.py +881 -123
- {iam_policy_validator-1.13.1.dist-info → iam_policy_validator-1.14.1.dist-info}/WHEEL +0 -0
- {iam_policy_validator-1.13.1.dist-info → iam_policy_validator-1.14.1.dist-info}/entry_points.txt +0 -0
- {iam_policy_validator-1.13.1.dist-info → iam_policy_validator-1.14.1.dist-info}/licenses/LICENSE +0 -0
iam_validator/integrations/github_integration.py

@@ -4,19 +4,53 @@ This module provides functionality to interact with GitHub,
 including posting PR comments, line comments, labels, and retrieving PR information.
 """

+import asyncio
+import base64
 import logging
 import os
 import re
+import time
 from enum import Enum
-from typing import Any
+from typing import TYPE_CHECKING, Any

 import httpx

 from iam_validator.core import constants

+if TYPE_CHECKING:
+    from iam_validator.core.codeowners import CodeOwnersParser
+
 logger = logging.getLogger(__name__)


+class GitHubRateLimitError(Exception):
+    """Raised when GitHub API rate limit is exceeded."""
+
+    def __init__(self, reset_time: int, message: str = "GitHub API rate limit exceeded"):
+        self.reset_time = reset_time
+        super().__init__(message)
+
+
+class GitHubRetryableError(Exception):
+    """Raised for transient GitHub API errors that should be retried."""
+
+    pass
+
+
+# Retry configuration
+MAX_RETRIES = 3
+INITIAL_BACKOFF_SECONDS = 1.0
+MAX_BACKOFF_SECONDS = 30.0
+BACKOFF_MULTIPLIER = 2.0
+
+# HTTP status codes that should trigger retry
+RETRYABLE_STATUS_CODES = {408, 429, 500, 502, 503, 504}
+
+# Concurrency limit for parallel API operations (deletions, updates)
+# This prevents hitting GitHub's secondary rate limits while still being fast
+MAX_CONCURRENT_API_CALLS = 10
+
+
 class PRState(str, Enum):
     """GitHub PR state."""

@@ -66,6 +100,12 @@ class GitHubIntegration:
             os.environ.get("GITHUB_API_URL", "https://api.github.com")
         )
         self._client: httpx.AsyncClient | None = None
+        # Cache for team memberships: (org, team_slug) -> list[str]
+        # Reduces API calls when checking multiple users against same team
+        self._team_cache: dict[tuple[str, str], list[str]] = {}
+        # Cache for CODEOWNERS content (fetched once per instance)
+        self._codeowners_cache: str | None = None
+        self._codeowners_loaded: bool = False

     def _validate_token(self, token: str | None) -> str | None:
         """Validate and sanitize GitHub token.
@@ -262,7 +302,114 @@ class GitHubIntegration:
     async def _make_request(
         self, method: str, endpoint: str, **kwargs: Any
     ) -> dict[str, Any] | None:
-        """Make an HTTP request to GitHub API.
+        """Make an HTTP request to GitHub API with retry and rate limit handling.
+
+        Implements exponential backoff for transient errors (5xx, 429) and
+        respects GitHub's rate limit headers.
+
+        Args:
+            method: HTTP method (GET, POST, PATCH, DELETE)
+            endpoint: API endpoint path
+            **kwargs: Additional arguments to pass to httpx
+
+        Returns:
+            Response JSON or None on error
+        """
+        if not self.is_configured():
+            logger.error("GitHub integration not configured")
+            return None
+
+        url = f"{self.api_url}/repos/{self.repository}/{endpoint}"
+        backoff = INITIAL_BACKOFF_SECONDS
+        last_error: Exception | None = None
+
+        for attempt in range(MAX_RETRIES + 1):
+            try:
+                if self._client:
+                    response = await self._client.request(method, url, **kwargs)
+                else:
+                    async with httpx.AsyncClient(headers=self._get_headers()) as client:
+                        response = await client.request(method, url, **kwargs)
+
+                # Handle rate limiting (429)
+                if response.status_code == 429:
+                    # Get reset time from headers
+                    reset_time = response.headers.get("X-RateLimit-Reset")
+                    retry_after = response.headers.get("Retry-After")
+
+                    if retry_after:
+                        wait_time = int(retry_after)
+                    elif reset_time:
+                        wait_time = max(0, int(reset_time) - int(time.time()))
+                    else:
+                        wait_time = min(backoff, MAX_BACKOFF_SECONDS)
+
+                    if attempt < MAX_RETRIES:
+                        logger.warning(
+                            f"Rate limited on {method} {endpoint}, "
+                            f"waiting {wait_time}s (attempt {attempt + 1}/{MAX_RETRIES + 1})"
+                        )
+                        await asyncio.sleep(wait_time)
+                        backoff = min(backoff * BACKOFF_MULTIPLIER, MAX_BACKOFF_SECONDS)
+                        continue
+                    else:
+                        raise GitHubRateLimitError(
+                            int(reset_time or 0),
+                            f"Rate limit exceeded after {MAX_RETRIES + 1} attempts",
+                        )
+
+                # Handle retryable server errors (5xx)
+                if response.status_code in RETRYABLE_STATUS_CODES and attempt < MAX_RETRIES:
+                    logger.warning(
+                        f"Retryable error {response.status_code} on {method} {endpoint}, "
+                        f"retrying in {backoff:.1f}s (attempt {attempt + 1}/{MAX_RETRIES + 1})"
+                    )
+                    await asyncio.sleep(backoff)
+                    backoff = min(backoff * BACKOFF_MULTIPLIER, MAX_BACKOFF_SECONDS)
+                    continue
+
+                response.raise_for_status()
+                return response.json() if response.text else {}
+
+            except httpx.HTTPStatusError as e:
+                last_error = e
+                # Don't retry client errors (4xx) except rate limit
+                if 400 <= e.response.status_code < 500 and e.response.status_code != 429:
+                    logger.error(f"HTTP error: {e.response.status_code} - {e.response.text}")
+                    return None
+                # For server errors, continue to retry logic
+                if attempt < MAX_RETRIES:
+                    logger.warning(
+                        f"HTTP error {e.response.status_code}, retrying in {backoff:.1f}s"
+                    )
+                    await asyncio.sleep(backoff)
+                    backoff = min(backoff * BACKOFF_MULTIPLIER, MAX_BACKOFF_SECONDS)
+                    continue
+
+            except (httpx.ConnectError, httpx.TimeoutException) as e:
+                last_error = e
+                if attempt < MAX_RETRIES:
+                    logger.warning(
+                        f"Connection error on {method} {endpoint}: {e}, "
+                        f"retrying in {backoff:.1f}s (attempt {attempt + 1}/{MAX_RETRIES + 1})"
+                    )
+                    await asyncio.sleep(backoff)
+                    backoff = min(backoff * BACKOFF_MULTIPLIER, MAX_BACKOFF_SECONDS)
+                    continue
+
+            except Exception as e:  # pylint: disable=broad-exception-caught
+                logger.error(f"Unexpected error on {method} {endpoint}: {e}")
+                return None
+
+        # All retries exhausted
+        if last_error:
+            logger.error(f"Request failed after {MAX_RETRIES + 1} attempts: {last_error}")
+        return None
+
+    async def _make_request_no_retry(
+        self, method: str, endpoint: str, **kwargs: Any
+    ) -> dict[str, Any] | None:
+        """Make an HTTP request without retry (for non-critical operations).

         Args:
             method: HTTP method (GET, POST, PATCH, DELETE)
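A note on the retry hunk above: with the constants introduced in this version, the backoff before successive retries is 1s, 2s, 4s (capped at 30s), after which the request gives up. The standalone sketch below is illustrative only (it is not part of the package) and just reproduces that schedule so the numbers are easy to verify.

```python
# Illustrative sketch only; mirrors the retry constants added in this release.
MAX_RETRIES = 3
INITIAL_BACKOFF_SECONDS = 1.0
MAX_BACKOFF_SECONDS = 30.0
BACKOFF_MULTIPLIER = 2.0


def backoff_schedule() -> list[float]:
    """Delay applied before each retry attempt (there are MAX_RETRIES retries)."""
    delays, backoff = [], INITIAL_BACKOFF_SECONDS
    for _ in range(MAX_RETRIES):
        delays.append(backoff)
        backoff = min(backoff * BACKOFF_MULTIPLIER, MAX_BACKOFF_SECONDS)
    return delays


print(backoff_schedule())  # [1.0, 2.0, 4.0]
```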
@@ -295,6 +442,89 @@ class GitHubIntegration:
             logger.error(f"Request failed: {e}")
             return None

+    async def _make_paginated_request(
+        self, endpoint: str, max_pages: int = 100
+    ) -> list[dict[str, Any]]:
+        """Make a paginated GET request to GitHub API, fetching all pages.
+
+        GitHub API returns at most 100 items per page for list endpoints.
+        This method follows pagination links to fetch ALL items.
+
+        Args:
+            endpoint: API endpoint path (e.g., "pulls/123/comments")
+            max_pages: Maximum number of pages to fetch (safety limit)
+
+        Returns:
+            Combined list of all items across all pages
+        """
+        if not self.is_configured():
+            logger.error("GitHub integration not configured")
+            return []
+
+        all_items: list[dict[str, Any]] = []
+        url: str | None = f"{self.api_url}/repos/{self.repository}/{endpoint}"
+        page_count = 0
+
+        # Add per_page=100 to maximize items per request
+        if "?" in endpoint:
+            url = f"{url}&per_page=100"
+        else:
+            url = f"{url}?per_page=100"
+
+        while url and page_count < max_pages:
+            page_count += 1
+            try:
+                if self._client:
+                    response = await self._client.request("GET", url)
+                else:
+                    async with httpx.AsyncClient(
+                        timeout=httpx.Timeout(30.0), headers=self._get_headers()
+                    ) as client:
+                        response = await client.request("GET", url)
+
+                response.raise_for_status()
+                items = response.json()
+
+                if isinstance(items, list):
+                    all_items.extend(items)
+                    logger.debug(
+                        f"Fetched page {page_count} with {len(items)} items "
+                        f"(total: {len(all_items)})"
+                    )
+                else:
+                    # Not a list response, shouldn't happen for list endpoints
+                    logger.warning(f"Unexpected response type on page {page_count}")
+                    break
+
+                # Check for next page in Link header
+                # Format: <url>; rel="next", <url>; rel="last"
+                link_header = response.headers.get("Link", "")
+                url = None  # Reset for next iteration
+
+                if link_header:
+                    for link in link_header.split(","):
+                        if 'rel="next"' in link:
+                            # Extract URL from <url>
+                            match = re.search(r"<([^>]+)>", link)
+                            if match:
+                                url = match.group(1)
+                            break
+
+            except httpx.HTTPStatusError as e:
+                logger.error(f"HTTP error during pagination: {e.response.status_code}")
+                break
+            except Exception as e:  # pylint: disable=broad-exception-caught
+                logger.error(f"Error during pagination: {e}")
+                break
+
+        if page_count >= max_pages:
+            logger.warning(f"Reached max pages limit ({max_pages}), results may be incomplete")
+
+        logger.debug(
+            f"Paginated request complete: {len(all_items)} total items from {page_count} page(s)"
+        )
+        return all_items
+
     # ==================== PR Comments ====================

     async def post_comment(self, comment_body: str) -> bool:
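The pagination helper above keeps following GitHub's `Link` header until no `rel="next"` entry remains. A minimal, self-contained sketch of that header parsing (the header value below is invented for the example):

```python
import re

# Sample Link header in GitHub's format; URLs are made up for illustration.
link_header = (
    '<https://api.github.com/repos/o/r/pulls/1/comments?per_page=100&page=2>; rel="next", '
    '<https://api.github.com/repos/o/r/pulls/1/comments?per_page=100&page=5>; rel="last"'
)

next_url = None
for link in link_header.split(","):
    if 'rel="next"' in link:
        match = re.search(r"<([^>]+)>", link)  # the URL sits between angle brackets
        if match:
            next_url = match.group(1)
        break

print(next_url)  # ...page=2; when this stays None, the while loop above terminates
```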
@@ -350,7 +580,11 @@ class GitHubIntegration:
     ) -> bool:
         """Post or update multiple related comments (for large reports).

-
+        For single-part comments (most common case), this will UPDATE the
+        existing comment in place rather than delete and recreate it.
+        This preserves comment history and avoids PR timeline noise.
+
+        For multi-part comments:
         1. Delete all old comments with the identifier
         2. Post new comments in sequence with part indicators
         3. Validate each part stays under GitHub's limit
@@ -365,16 +599,45 @@ class GitHubIntegration:
         # GitHub's actual limit
         github_comment_limit = 65536

-
+        total_parts = len(comment_parts)
+
+        # Optimization: For single-part comments, use update-or-create
+        # This preserves the existing comment and avoids PR timeline noise
+        if total_parts == 1:
+            part_body = comment_parts[0]
+            full_body = f"{identifier}\n\n{part_body}"
+
+            # Safety check: ensure we don't exceed GitHub's limit
+            if len(full_body) > github_comment_limit:
+                logger.error(
+                    f"Comment exceeds GitHub's limit ({len(full_body)} > {github_comment_limit} chars). "
+                    f"Comment will be truncated."
+                )
+                available_space = github_comment_limit - 500
+                truncated_body = part_body[:available_space]
+                truncation_warning = (
+                    "\n\n---\n\n"
+                    "> ⚠️ **This comment was truncated to fit GitHub's size limit**\n"
+                    ">\n"
+                    "> Download the full report using `--output report.json` or "
+                    "`--format markdown --output report.md`\n"
+                )
+                full_body = f"{identifier}\n\n{truncated_body}{truncation_warning}"
+
+            success = await self.update_or_create_comment(full_body, identifier)
+            if success:
+                logger.info("Successfully updated summary comment")
+            return success
+
+        # Multi-part: Delete all existing comments with this identifier first
         await self._delete_comments_with_identifier(identifier)

         # Post each part
         success = True
-        total_parts = len(comment_parts)

         for part_num, part_body in enumerate(comment_parts, 1):
             # Add identifier and part indicator
-            part_indicator = f"**(Part {part_num}/{total_parts})**"
+            part_indicator = f"**(Part {part_num}/{total_parts})**"
             full_body = f"{identifier}\n{part_indicator}\n\n{part_body}"

             # Safety check: ensure we don't exceed GitHub's limit
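For reference, GitHub caps comment bodies at 65,536 characters, and the single-part path above truncates the body to leave roughly 500 characters of headroom for the warning footer. A hedged sketch of that guard (the identifier string and constant names are placeholders, not the package's API):

```python
# Illustrative sketch of the truncation guard; identifier and constants are placeholders.
GITHUB_COMMENT_LIMIT = 65536
HEADROOM = 500


def build_comment(identifier: str, part_body: str) -> str:
    full_body = f"{identifier}\n\n{part_body}"
    if len(full_body) <= GITHUB_COMMENT_LIMIT:
        return full_body
    warning = "\n\n---\n\n> ⚠️ **This comment was truncated to fit GitHub's size limit**\n"
    return f"{identifier}\n\n{part_body[:GITHUB_COMMENT_LIMIT - HEADROOM]}{warning}"


print(len(build_comment("<!-- iam-validator -->", "x" * 100_000)) <= GITHUB_COMMENT_LIMIT)  # True
```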
@@ -472,19 +735,15 @@ class GitHubIntegration:
     # ==================== PR Review Comments (Line-specific) ====================

     async def get_review_comments(self) -> list[dict[str, Any]]:
-        """Get all review comments on the PR.
+        """Get all review comments on the PR with pagination.
+
+        Fetches ALL review comments across all pages. This is critical for
+        proper comment deduplication and cleanup when there are many findings.

         Returns:
-            List of review comment dicts
+            List of all review comment dicts
         """
-
-            "GET",
-            f"pulls/{self.pr_number}/comments",
-        )
-
-        if result and isinstance(result, list):
-            return result
-        return []
+        return await self._make_paginated_request(f"pulls/{self.pr_number}/comments")

     async def get_bot_review_comments_with_location(
         self, identifier: str = constants.BOT_IDENTIFIER
@@ -552,29 +811,52 @@ class GitHubIntegration:
         )

         if result is not None:  # DELETE returns empty dict on success
-            logger.
+            logger.debug(f"Successfully deleted review comment {comment_id}")
             return True
         return False

-    async def
-
+    async def _delete_comments_parallel(
+        self, comment_ids: list[int], max_concurrent: int = MAX_CONCURRENT_API_CALLS
+    ) -> tuple[int, int]:
+        """Delete multiple review comments in parallel with controlled concurrency.
+
+        Uses a semaphore to limit concurrent API calls, preventing rate limit issues
+        while still being much faster than sequential deletion.

         Args:
-
+            comment_ids: List of comment IDs to delete
+            max_concurrent: Maximum number of concurrent deletions (default: 10)

         Returns:
-
+            Tuple of (successful_count, failed_count)
         """
-
-
-
-
+        if not comment_ids:
+            return (0, 0)
+
+        semaphore = asyncio.Semaphore(max_concurrent)
+
+        async def delete_with_limit(comment_id: int) -> bool:
+            async with semaphore:
+                return await self.delete_review_comment(comment_id)
+
+        # Run all deletions in parallel (semaphore controls actual concurrency)
+        results = await asyncio.gather(
+            *[delete_with_limit(cid) for cid in comment_ids],
+            return_exceptions=True,
         )

-        if
-
-
-
+        successful = sum(1 for r in results if r is True)
+        failed = len(results) - successful
+
+        if successful > 0:
+            logger.info(f"Parallel deletion: {successful} deleted, {failed} failed")
+
+        return (successful, failed)
+
+    # NOTE: resolve_review_comment was removed because GitHub REST API doesn't support
+    # resolving review comments via {"state": "resolved"}. Resolving review threads
+    # requires the GraphQL API with resolveReviewThread mutation.
+    # See: https://docs.github.com/en/graphql/reference/mutations#resolvereviewthread

     async def update_review_comment(self, comment_id: int, new_body: str) -> bool:
         """Update the body text of an existing review comment.
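The semaphore-plus-gather pattern in `_delete_comments_parallel` bounds concurrency at `MAX_CONCURRENT_API_CALLS` while still issuing requests in parallel. A runnable sketch of the same pattern, with a dummy coroutine standing in for the real DELETE call:

```python
import asyncio

MAX_CONCURRENT_API_CALLS = 10  # mirrors the constant added in this release


async def fake_delete(comment_id: int) -> bool:
    await asyncio.sleep(0.01)  # stand-in for the HTTP DELETE request
    return True


async def delete_all(comment_ids: list[int]) -> tuple[int, int]:
    semaphore = asyncio.Semaphore(MAX_CONCURRENT_API_CALLS)

    async def delete_with_limit(cid: int) -> bool:
        async with semaphore:  # at most 10 deletions in flight at once
            return await fake_delete(cid)

    results = await asyncio.gather(
        *[delete_with_limit(cid) for cid in comment_ids], return_exceptions=True
    )
    successful = sum(1 for r in results if r is True)
    return successful, len(results) - successful


print(asyncio.run(delete_all(list(range(25)))))  # (25, 0)
```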
@@ -601,6 +883,7 @@ class GitHubIntegration:
         """Delete all review comments from the bot (from previous runs).

         This ensures old/outdated comments are removed before posting new ones.
+        Uses parallel deletion for speed when there are many comments.

         Args:
             identifier: String to identify bot comments
@@ -609,8 +892,9 @@ class GitHubIntegration:
             Number of comments deleted
         """
         comments = await self.get_review_comments()
-        deleted_count = 0

+        # Collect all bot comment IDs to delete
+        comment_ids_to_delete: list[int] = []
         for comment in comments:
             if not isinstance(comment, dict):
                 continue
@@ -620,47 +904,23 @@ class GitHubIntegration:

             # Check if this is a bot comment
             if identifier in str(body) and isinstance(comment_id, int):
-
-                    deleted_count += 1
-
-        if deleted_count > 0:
-            logger.info(f"Cleaned up {deleted_count} old review comments")
-
-        return deleted_count
-
-    async def cleanup_bot_review_comments_by_resolving(
-        self, identifier: str = constants.BOT_IDENTIFIER
-    ) -> int:
-        """Resolve all review comments from the bot (from previous runs).
-
-        This marks old/outdated comments as resolved instead of deleting them,
-        preserving them in the PR for audit trail purposes.
-
-        Args:
-            identifier: String to identify bot comments
-
-        Returns:
-            Number of comments resolved
-        """
-        comments = await self.get_review_comments()
-        resolved_count = 0
+                comment_ids_to_delete.append(comment_id)

-
-
-                continue
+        if not comment_ids_to_delete:
+            return 0

-
-
+        # Delete all bot comments in parallel
+        successful, _failed = await self._delete_comments_parallel(comment_ids_to_delete)

-
-
-            if await self.resolve_review_comment(comment_id):
-                resolved_count += 1
+        if successful > 0:
+            logger.info(f"Cleaned up {successful} old review comments")

-
-        logger.info(f"Resolved {resolved_count} old review comments")
+        return successful

-
+    # NOTE: cleanup_bot_review_comments_by_resolving was removed because it depended on
+    # resolve_review_comment which doesn't work with GitHub REST API.
+    # Use cleanup_bot_review_comments (deletion) instead, or implement GraphQL-based
+    # resolution if audit trail preservation is needed.

     async def create_review_comment(
         self,
@@ -778,19 +1038,37 @@ class GitHubIntegration:
         body: str = "",
         event: ReviewEvent = ReviewEvent.COMMENT,
         identifier: str = constants.REVIEW_IDENTIFIER,
+        validated_files: set[str] | None = None,
+        skip_cleanup: bool = False,
     ) -> bool:
-        """Smart comment management
+        """Smart comment management using fingerprint-based matching.
+
+        This method uses finding fingerprints (stable IDs) as the PRIMARY key
+        for matching comments, with location as SECONDARY for new comments.
+
+        Strategy:
+        1. Index existing comments by finding_id (from HTML comment)
+        2. For each new comment:
+           - If finding_id exists: UPDATE (even if line changed)
+           - If new: CREATE at specified line
+        3. Delete comments whose finding_id is not in new set (resolved)
+           (unless skip_cleanup=True)

-
-
-
-        3. Delete old comments where issues have been resolved
+        Note: Comments stay at their original line even if the issue moved,
+        because GitHub doesn't support moving review comments. The comment
+        body is updated to reflect any changes.

         Args:
             comments: List of comment dicts with keys: path, line, body, (optional) side
             body: The overall review body text
             event: The review event type (APPROVE, REQUEST_CHANGES, COMMENT)
             identifier: String to identify bot comments (for matching existing)
+            validated_files: Set of all file paths that were validated in this run.
+                Used to clean up comments for files that no longer have findings.
+                If None, only files with current findings are considered.
+            skip_cleanup: If True, skip the cleanup phase (deleting resolved comments).
+                Use this in streaming mode where files are processed one at a time
+                to avoid deleting comments from files processed earlier.

         Returns:
             True if successful, False otherwise
@@ -798,29 +1076,31 @@ class GitHubIntegration:
         Example:
             # First run: Creates 3 comments
             comments = [
-                {"path": "policy.json", "line": 5, "body": "Issue A"},
-                {"path": "policy.json", "line": 10, "body": "Issue B"},
-                {"path": "policy.json", "line": 15, "body": "Issue C"},
+                {"path": "policy.json", "line": 5, "body": "<!-- finding-id: abc123 -->Issue A"},
+                {"path": "policy.json", "line": 10, "body": "<!-- finding-id: def456 -->Issue B"},
             ]

-            # Second run:
+            # Second run: Same findings, even if lines shifted
             comments = [
-                {"path": "policy.json", "line":
-                {"path": "policy.json", "line":
-                {"path": "policy.json", "line": 20, "body": "Issue D"},  # New
+                {"path": "policy.json", "line": 8, "body": "<!-- finding-id: abc123 -->Issue A (updated)"},
+                {"path": "policy.json", "line": 15, "body": "<!-- finding-id: def456 -->Issue B"},
             ]
-            # Result:
+            # Result: Both comments UPDATED in place (not recreated), preserving conversation history
         """
-        # Step 1: Get existing bot comments
-
-        logger.debug(
+        # Step 1: Get existing bot comments indexed by fingerprint
+        existing_by_fingerprint = await self._get_bot_comments_by_fingerprint(identifier)
+        logger.debug(
+            f"Found {len(existing_by_fingerprint)} existing bot comments with fingerprints"
+        )
+
+        # Also get location-based index for fallback (comments without fingerprints)
+        existing_by_location = await self.get_bot_review_comments_with_location(identifier)

-
+        seen_fingerprints: set[str] = set()
         seen_locations: set[tuple[str, int, str]] = set()
+        # Track comment IDs that were updated/matched - these should NOT be deleted
+        matched_comment_ids: set[int] = set()
         updated_count = 0
-        created_count = 0
-
-        # Step 2: Update or create each new comment
         new_comments_for_review: list[dict[str, Any]] = []

         for comment in comments:
@@ -828,33 +1108,59 @@ class GitHubIntegration:
             line = comment["line"]
             new_body = comment["body"]

-            #
+            # Try fingerprint-based matching first
+            finding_id = self._extract_finding_id(new_body)
+
+            if finding_id:
+                seen_fingerprints.add(finding_id)
+
+                if finding_id in existing_by_fingerprint:
+                    existing = existing_by_fingerprint[finding_id]
+                    matched_comment_ids.add(existing["id"])
+                    # Check if update needed (body changed)
+                    if existing["body"] != new_body:
+                        success = await self.update_review_comment(existing["id"], new_body)
+                        if success:
+                            updated_count += 1
+                            logger.debug(
+                                f"Updated comment for finding {finding_id[:8]}... "
+                                f"(was at {existing['path']}:{existing['line']})"
+                            )
+                    else:
+                        logger.debug(f"Comment for finding {finding_id[:8]}... unchanged")
+                    continue
+
+            # Fallback: location-based matching
+            # This handles both:
+            # 1. Legacy comments without fingerprints
+            # 2. Comments with fingerprints that don't match (e.g., path changed)
             issue_type_match = re.search(r"<!-- issue-type: (\w+) -->", new_body)
             issue_type = issue_type_match.group(1) if issue_type_match else "unknown"
-
             location = (path, line, issue_type)
             seen_locations.add(location)

-
-
-
-            #
-
-
-                success = await self.update_review_comment(
+            existing_loc = existing_by_location.get(location)
+            if existing_loc:
+                # Found existing comment at same location with same issue type
+                # Update it (this handles both legacy comments and fingerprint mismatches)
+                matched_comment_ids.add(existing_loc["id"])
+                if existing_loc["body"] != new_body:
+                    success = await self.update_review_comment(existing_loc["id"], new_body)
                     if success:
                         updated_count += 1
-
-
-
-
-
-
-
-
-
+                        if finding_id:
+                            logger.debug(
+                                f"Updated comment at {path}:{line} (fingerprint mismatch, location match)"
+                            )
+                        else:
+                            logger.debug(f"Updated legacy comment at {path}:{line}")
+                continue
+
+            # New comment - collect for batch creation
+            new_comments_for_review.append(comment)

-            # Step
+        # Step 2: Create new comments via review API (if any)
+        created_count = 0
         if new_comments_for_review:
             success = await self.create_review_with_comments(
                 new_comments_for_review,
@@ -868,26 +1174,92 @@ class GitHubIntegration:
                 logger.error("Failed to create new review comments")
                 return False

-        # Step
-        #
-        # to
+        # Step 3: Delete resolved comments (unless skip_cleanup is True)
+        # In streaming mode, we skip cleanup because we're processing files one at a time
+        # and don't want to delete comments from files processed earlier in the stream
         deleted_count = 0
-        files_in_batch = {comment["path"] for comment in comments}
-
-        for location, existing in existing_comments.items():
-            # Only delete if:
-            # 1. This location is not in the new comment set (resolved issue)
-            # 2. AND this file is in the current batch (don't touch other files' comments)
-            if location not in seen_locations and existing["path"] in files_in_batch:
-                # This comment location is no longer in the new issues - delete it
-                success = await self.delete_review_comment(existing["id"])
-                if success:
-                    deleted_count += 1
-                    logger.debug(
-                        f"Deleted resolved comment at {existing['path']}:{existing['line']}"
-                    )

-
+        if skip_cleanup:
+            logger.debug("Skipping cleanup phase (streaming mode)")
+        else:
+            # Priority: fingerprint-based deletion, then location-based for legacy
+            # Also clean up comments for files removed from the PR or files that were
+            # validated but no longer have findings
+            files_with_findings = {c["path"] for c in comments}
+
+            # Use validated_files if provided, otherwise fall back to files_with_findings
+            # This ensures we clean up comments for files that were validated but have no findings
+            files_in_scope = validated_files if validated_files is not None else files_with_findings
+
+            # Get current PR files to detect removed files
+            # Note: get_pr_files() returns [] on error, so we check for non-empty result
+            pr_files = await self.get_pr_files()
+            if pr_files:
+                current_pr_files: set[str] | None = {f["filename"] for f in pr_files}
+            else:
+                # Empty result could be an API error - fall back to batch-only cleanup
+                # to avoid accidentally deleting valid comments
+                logger.debug("Could not fetch PR files for cleanup, using batch-only mode")
+                current_pr_files = None
+
+            def should_delete_comment(existing_path: str) -> bool:
+                """Check if a comment should be deleted based on file status.
+
+                A comment should be deleted if the file is part of this PR.
+                The fingerprint check (done by caller) ensures we only delete
+                comments for findings that are no longer present.
+
+                This aggressive cleanup ensures stale comments are removed even if:
+                - The file was fixed but not re-validated in this specific run
+                - The validation runs on a subset of PR files
+
+                We preserve comments for files NOT in the PR to avoid accidentally
+                deleting comments from other branches/PRs.
+                """
+                # If we successfully fetched PR files, delete comments for any PR file
+                # whose finding is no longer present (fingerprint check done by caller)
+                if current_pr_files is not None:
+                    return existing_path in current_pr_files
+
+                # Fallback: if we couldn't fetch PR files, only clean up validated files
+                # to avoid accidentally deleting valid comments
+                return existing_path in files_in_scope
+
+            # Collect all comment IDs to delete
+            # Delete by fingerprint (primary) - comments that:
+            # 1. Were NOT matched (updated) in this run
+            # 2. Have a fingerprint not in the new findings
+            # 3. Are in files that are part of this PR/validation
+            comment_ids_to_delete: list[int] = []
+
+            for fingerprint, existing in existing_by_fingerprint.items():
+                comment_id = existing["id"]
+                # Skip if this comment was matched/updated via location fallback
+                if comment_id in matched_comment_ids:
+                    continue
+                if fingerprint not in seen_fingerprints and should_delete_comment(existing["path"]):
+                    comment_ids_to_delete.append(comment_id)
+                    logger.debug(f"Marking for deletion: resolved comment {fingerprint[:8]}...")
+
+            # Delete by location (legacy comments without fingerprints)
+            for location, existing in existing_by_location.items():
+                comment_id = existing["id"]
+                # Skip if already matched/updated
+                if comment_id in matched_comment_ids:
+                    continue
+                # Skip if already marked for deletion by fingerprint above
+                existing_fingerprint = self._extract_finding_id(existing.get("body", ""))
+                if existing_fingerprint:
+                    continue  # Already handled above
+
+                if location not in seen_locations and should_delete_comment(existing["path"]):
+                    comment_ids_to_delete.append(comment_id)
+                    logger.debug(f"Marking for deletion: resolved legacy comment at {location}")
+
+            # Delete all collected comments in parallel
+            if comment_ids_to_delete:
+                deleted_count, _failed = await self._delete_comments_parallel(comment_ids_to_delete)
+
         logger.info(
             f"Review comment management: {updated_count} updated, "
             f"{created_count} created, {deleted_count} deleted (resolved)"
@@ -895,6 +1267,50 @@ class GitHubIntegration:

         return True

+    def _extract_finding_id(self, body: str) -> str | None:
+        """Extract finding ID from comment body HTML comment.
+
+        Args:
+            body: Comment body text
+
+        Returns:
+            16-character finding ID hash, or None if not found
+        """
+        match = re.search(r"<!-- finding-id: ([a-f0-9]{16}) -->", body)
+        return match.group(1) if match else None
+
+    async def _get_bot_comments_by_fingerprint(self, identifier: str) -> dict[str, dict[str, Any]]:
+        """Index existing bot comments by their finding fingerprint.
+
+        Args:
+            identifier: String to identify bot comments
+
+        Returns:
+            Dict mapping finding_id to comment metadata dict
+            Comment dict contains: id, body, path, line
+        """
+        comments = await self.get_review_comments()
+        indexed: dict[str, dict[str, Any]] = {}
+
+        for comment in comments:
+            if not isinstance(comment, dict):
+                continue
+
+            body = comment.get("body", "")
+            if identifier not in str(body):
+                continue
+
+            finding_id = self._extract_finding_id(body)
+            if finding_id:
+                indexed[finding_id] = {
+                    "id": comment["id"],
+                    "body": body,
+                    "path": comment.get("path", ""),
+                    "line": comment.get("line") or comment.get("original_line"),
+                }
+
+        return indexed
+
     # ==================== PR Labels ====================

     async def add_labels(self, labels: list[str]) -> bool:
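The fingerprint matching introduced in this release keys review comments on a hidden `<!-- finding-id: ... -->` HTML comment embedded in the body (the ids themselves come from the new core/finding_fingerprint.py module). A small sketch of the extraction round trip; the 16-character hex id below is invented for the example:

```python
import re

# The id is made up; real ids are produced by the finding fingerprint module.
body = "<!-- finding-id: 0123456789abcdef -->\n**Wildcard action** `s3:*` in statement 0"

match = re.search(r"<!-- finding-id: ([a-f0-9]{16}) -->", body)
print(match.group(1) if match else None)  # 0123456789abcdef
```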
@@ -1061,3 +1477,345 @@ class GitHubIntegration:
             logger.info(f"Successfully set commit status: {state}")
             return True
         return False
+
+    # ==================== CODEOWNERS and Ignore Commands ====================
+
+    async def get_codeowners_content(self) -> str | None:
+        """Fetch CODEOWNERS file content from repository.
+
+        Results are cached per instance to avoid redundant API calls.
+
+        Searches in standard CODEOWNERS locations:
+        - CODEOWNERS
+        - .github/CODEOWNERS
+        - docs/CODEOWNERS
+
+        Returns:
+            CODEOWNERS file content as string, or None if not found
+        """
+        # Return cached result if already loaded
+        if self._codeowners_loaded:
+            return self._codeowners_cache
+
+        from iam_validator.core.codeowners import (  # pylint: disable=import-outside-toplevel
+            CodeOwnersParser,
+        )
+
+        for path in CodeOwnersParser.CODEOWNERS_PATHS:
+            result = await self._make_request(
+                "GET",
+                f"contents/{path}",
+            )
+
+            if result and isinstance(result, dict) and "content" in result:
+                try:
+                    content = base64.b64decode(result["content"]).decode("utf-8")
+                    logger.debug(f"Found CODEOWNERS at {path}")
+                    # Cache the result
+                    self._codeowners_cache = content
+                    self._codeowners_loaded = True
+                    return content
+                except (ValueError, UnicodeDecodeError) as e:
+                    logger.warning(f"Failed to decode CODEOWNERS at {path}: {e}")
+                    continue
+
+        logger.debug("No CODEOWNERS file found in repository")
+        # Cache the negative result too
+        self._codeowners_cache = None
+        self._codeowners_loaded = True
+        return None
+
+    async def get_team_members(self, org: str, team_slug: str) -> list[str]:
+        """Get members of a GitHub team.
+
+        Results are cached per instance to avoid redundant API calls
+        when checking multiple users against the same team.
+
+        Note: This requires the token to have `read:org` scope for
+        organization teams.
+
+        Args:
+            org: Organization name
+            team_slug: Team slug (URL-friendly name)
+
+        Returns:
+            List of team member usernames (lowercase)
+        """
+        # Check cache first
+        cache_key = (org.lower(), team_slug.lower())
+        if cache_key in self._team_cache:
+            logger.debug(f"Using cached team members for {org}/{team_slug}")
+            return self._team_cache[cache_key]
+
+        url = f"{self.api_url}/orgs/{org}/teams/{team_slug}/members"
+
+        try:
+            if self._client:
+                response = await self._client.request("GET", url)
+            else:
+                async with httpx.AsyncClient(
+                    headers=self._get_headers(), timeout=httpx.Timeout(30.0)
+                ) as client:
+                    response = await client.request("GET", url)
+
+            response.raise_for_status()
+            result = response.json()
+
+            if isinstance(result, list):
+                members = [
+                    member.get("login", "").lower()
+                    for member in result
+                    if isinstance(member, dict) and member.get("login")
+                ]
+                # Cache the result
+                self._team_cache[cache_key] = members
+                logger.debug(f"Found {len(members)} members in team {org}/{team_slug}")
+                return members
+
+        except httpx.HTTPStatusError as e:
+            logger.warning(
+                f"Failed to get team members for {org}/{team_slug}: HTTP {e.response.status_code}"
+            )
+        except Exception as e:  # pylint: disable=broad-exception-caught
+            logger.warning(f"Failed to get team members for {org}/{team_slug}: {e}")
+
+        # Cache empty result to avoid repeated failed API calls
+        self._team_cache[cache_key] = []
+        return []
+
+    async def is_user_codeowner(
+        self,
+        username: str,
+        file_path: str,
+        codeowners_parser: "CodeOwnersParser | None" = None,
+        allowed_users: list[str] | None = None,
+    ) -> bool:
+        """Check if a user is authorized to ignore findings for a file.
+
+        Authorization is granted if:
+        1. User is listed directly in CODEOWNERS for the file
+        2. User is a member of a team listed in CODEOWNERS for the file
+        3. User is in the allowed_users fallback list (when no CODEOWNERS)
+
+        Performance: Team membership checks are executed in parallel.
+
+        Args:
+            username: GitHub username to check
+            file_path: Path to the file being checked
+            codeowners_parser: Pre-parsed CODEOWNERS (for caching)
+            allowed_users: Fallback list of allowed users (when no CODEOWNERS)
+
+        Returns:
+            True if user is authorized, False otherwise
+        """
+        username_lower = username.lower()
+
+        # Check fallback allowed_users first (always applies if configured)
+        if allowed_users:
+            if username_lower in [u.lower() for u in allowed_users]:
+                logger.debug(f"User {username} authorized via allowed_users config")
+                return True
+
+        # Get or parse CODEOWNERS
+        parser = codeowners_parser
+        if parser is None:
+            content = await self.get_codeowners_content()
+            if content is None:
+                # No CODEOWNERS and no allowed_users match = deny
+                logger.debug(f"No CODEOWNERS file found, user {username} not in allowed_users")
+                return False
+
+            from iam_validator.core.codeowners import (  # pylint: disable=import-outside-toplevel
+                CodeOwnersParser,
+            )
+
+            parser = CodeOwnersParser(content)
+
+        # Check direct user ownership
+        if parser.is_owner(username, file_path):
+            logger.debug(f"User {username} is direct owner of {file_path}")
+            return True
+
+        # Check team membership - fetch all teams in parallel for speed
+        teams = parser.get_teams_for_file(file_path)
+        if not teams:
+            logger.debug(f"User {username} is not authorized for {file_path}")
+            return False
+
+        # Fetch all team memberships concurrently
+
+        async def check_team(org: str, team_slug: str) -> tuple[str, str, bool]:
+            members = await self.get_team_members(org, team_slug)
+            return (org, team_slug, username_lower in members)
+
+        results = await asyncio.gather(*[check_team(org, team_slug) for org, team_slug in teams])
+
+        for org, team_slug, is_member in results:
+            if is_member:
+                logger.debug(f"User {username} authorized via team {org}/{team_slug}")
+                return True
+
+        logger.debug(f"User {username} is not authorized for {file_path}")
+        return False
+
+    async def get_issue_comments(self) -> list[dict[str, Any]]:
+        """Get all issue comments (general PR comments, not review comments) with pagination.
+
+        Fetches ALL issue comments across all pages. This ensures proper
+        comment management when there are many comments on a PR.
+
+        Returns:
+            List of all issue comment dicts
+        """
+        return await self._make_paginated_request(f"issues/{self.pr_number}/comments")
+
+    async def get_comment_by_id(self, comment_id: int) -> dict[str, Any] | None:
+        """Get a specific review comment by ID.
+
+        Used for verifying that ignore command replies still exist
+        (tamper-resistant verification).
+
+        Args:
+            comment_id: The ID of the review comment to fetch
+
+        Returns:
+            Comment dict if found, None if deleted or error
+        """
+        result = await self._make_request(
+            "GET",
+            f"pulls/comments/{comment_id}",
+        )
+
+        if result and isinstance(result, dict):
+            return result
+        return None
+
+    async def post_reply_to_review_comment(
+        self,
+        comment_id: int,
+        body: str,
+    ) -> bool:
+        """Post a reply to a review comment thread.
+
+        Args:
+            comment_id: The ID of the review comment to reply to
+            body: The reply text (markdown supported)
+
+        Returns:
+            True if successful, False otherwise
+        """
+        result = await self._make_request(
+            "POST",
+            f"pulls/{self.pr_number}/comments",
+            json={
+                "body": body,
+                "in_reply_to": comment_id,
+            },
+        )
+
+        if result:
+            logger.debug(f"Successfully posted reply to comment {comment_id}")
+            return True
+        return False
+
+    async def scan_for_ignore_commands(
+        self,
+        identifier: str = constants.BOT_IDENTIFIER,
+    ) -> list[tuple[dict[str, Any], dict[str, Any]]]:
+        """Scan for ignore commands in replies to bot review comments.
+
+        Looks for replies to bot comments that contain ignore commands.
+        Supports formats: "ignore", "/ignore", "@iam-validator ignore",
+        "skip", "suppress", and "ignore: reason here".
+
+        Args:
+            identifier: String to identify bot comments
+
+        Returns:
+            List of (bot_comment, reply_comment) tuples where reply
+            contains an ignore command
+        """
+        all_comments = await self.get_review_comments()
+        ignore_commands: list[tuple[dict[str, Any], dict[str, Any]]] = []
+
+        # Index bot comments by ID for O(1) lookup
+        bot_comments_by_id: dict[int, dict[str, Any]] = {}
+        for comment in all_comments:
+            if not isinstance(comment, dict):
+                continue
+            body = comment.get("body", "")
+            comment_id = comment.get("id")
+            if identifier in str(body) and isinstance(comment_id, int):
+                bot_comments_by_id[comment_id] = comment
+
+        # Find replies with ignore commands
+        for comment in all_comments:
+            if not isinstance(comment, dict):
+                continue
+
+            reply_to_id = comment.get("in_reply_to_id")
+            if reply_to_id and reply_to_id in bot_comments_by_id:
+                body = comment.get("body", "")
+                if self._is_ignore_command(body):
+                    ignore_commands.append((bot_comments_by_id[reply_to_id], comment))
+
+        logger.debug(f"Found {len(ignore_commands)} ignore command(s) in PR comments")
+        return ignore_commands
+
+    def _is_ignore_command(self, text: str) -> bool:
+        """Check if text is an ignore command.
+
+        Supports:
+        - "ignore" (case insensitive)
+        - "/ignore"
+        - "@iam-validator ignore"
+        - "skip", "suppress"
+        - "ignore: reason here" (with optional reason)
+
+        Args:
+            text: Comment text to check
+
+        Returns:
+            True if text is an ignore command
+        """
+        if not text:
+            return False
+
+        text = text.strip().lower()
+
+        ignore_patterns = [
+            r"^\s*ignore\s*$",
+            r"^\s*/ignore\s*$",
+            r"^\s*@?iam-validator\s+ignore\s*$",
+            r"^\s*ignore\s*:\s*.+$",  # With reason
+            r"^\s*skip\s*$",
+            r"^\s*suppress\s*$",
+        ]
+
+        return any(re.match(pattern, text, re.IGNORECASE) for pattern in ignore_patterns)
+
+    @staticmethod
+    def extract_finding_id(comment_body: str) -> str | None:
+        """Extract finding ID from a bot comment.
+
+        Args:
+            comment_body: The comment body text
+
+        Returns:
+            Finding ID hash, or None if not found
+        """
+        match = re.search(r"<!-- finding-id: ([a-f0-9]+) -->", comment_body)
+        return match.group(1) if match else None
+
+    @staticmethod
+    def extract_ignore_reason(text: str) -> str | None:
+        """Extract reason from ignore command.
+
+        Args:
+            text: The ignore command text
+
+        Returns:
+            Reason string, or None if no reason provided
+        """
+        match = re.search(r"ignore\s*:\s*(.+)$", text.strip(), re.IGNORECASE)
+        return match.group(1).strip() if match else None