github2gerrit 0.1.0__py3-none-any.whl → 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,542 @@
1
+ # SPDX-FileCopyrightText: 2024 Matthew Watkins
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ """
5
+ Duplicate change detection for github2gerrit.
6
+
7
+ This module provides functionality to detect potentially duplicate changes
8
+ before submitting them to Gerrit, helping to prevent spam and redundant
9
+ submissions from automated tools like Dependabot.
10
+ """
11
+
12
+ import hashlib
13
+ import logging
14
+ import os
15
+ import re
16
+ import urllib.parse
17
+ import urllib.request
18
+ from datetime import UTC
19
+ from datetime import datetime
20
+ from datetime import timedelta
21
+ from pathlib import Path
22
+
23
+ from .github_api import GhPullRequest
24
+ from .github_api import GhRepository
25
+ from .github_api import build_client
26
+ from .github_api import get_repo_from_env
27
+ from .models import GitHubContext
28
+
29
+
30
+ # Optional Gerrit REST API support
31
+ try:
32
+ from pygerrit2 import GerritRestAPI
33
+ from pygerrit2 import HTTPBasicAuth
34
+ except ImportError:
35
+ GerritRestAPI = None
36
+ HTTPBasicAuth = None
37
+
38
+
39
+ log = logging.getLogger(__name__)
40
+
41
+ __all__ = [
42
+ "ChangeFingerprint",
43
+ "DuplicateChangeError",
44
+ "DuplicateDetector",
45
+ "check_for_duplicates",
46
+ ]
47
+
48
+
49
class DuplicateChangeError(Exception):
    """Error signalling that a change duplicates an existing one.

    Attributes:
        existing_prs: The list of PR numbers handed in by the raiser; it is
            stored as given, without copying or validation.
    """

    def __init__(self, message: str, existing_prs: list[int]) -> None:
        # Keep the caller's list on the instance, then let Exception record
        # the human-readable message in ``args``.
        self.existing_prs = existing_prs
        Exception.__init__(self, message)
55
+
56
+
57
+ class ChangeFingerprint:
58
+ """Represents a fingerprint of a change for duplicate detection."""
59
+
60
+ def __init__(
61
+ self, title: str, body: str = "", files_changed: list[str] | None = None
62
+ ):
63
+ self.title = title.strip()
64
+ self.body = (body or "").strip()
65
+ self.files_changed = sorted(files_changed or [])
66
+ self._normalized_title = self._normalize_title(title)
67
+ self._content_hash = self._compute_content_hash()
68
+
69
+ def _normalize_title(self, title: str) -> str:
70
+ """Normalize PR title for comparison."""
71
+ # Remove common prefixes/suffixes
72
+ normalized = title.strip()
73
+
74
+ # Remove conventional commit prefixes like "feat:", "fix:", etc.
75
+ normalized = re.sub(
76
+ r"^(feat|fix|docs|style|refactor|test|chore|ci|build|perf)"
77
+ r"(\(.+?\))?: ",
78
+ "",
79
+ normalized,
80
+ flags=re.IGNORECASE,
81
+ )
82
+
83
+ # Remove markdown formatting
84
+ normalized = re.sub(r"[*_`]", "", normalized)
85
+
86
+ # Remove version number variations for dependency updates
87
+ # E.g., "from 0.6 to 0.8" -> "from x.y.z to x.y.z"
88
+ # Handle v-prefixed versions first, then plain versions
89
+ normalized = re.sub(r"\bv\d+(\.\d+)*(-\w+)?\b", "vx.y.z", normalized)
90
+ normalized = re.sub(r"\b\d+(\.\d+)+(-\w+)?\b", "x.y.z", normalized)
91
+ normalized = re.sub(r"\b\d+\.\d+\b", "x.y.z", normalized)
92
+
93
+ # Remove specific commit hashes
94
+ normalized = re.sub(r"\b[a-f0-9]{7,40}\b", "commit_hash", normalized)
95
+
96
+ # Normalize whitespace
97
+ normalized = re.sub(r"\s+", " ", normalized).strip()
98
+
99
+ return normalized.lower()
100
+
101
+ def _compute_content_hash(self) -> str:
102
+ """Compute a hash of the change content."""
103
+ content = (
104
+ f"{self._normalized_title}\n{self.body}\n"
105
+ f"{','.join(self.files_changed)}"
106
+ )
107
+ return hashlib.sha256(content.encode("utf-8")).hexdigest()[:16]
108
+
109
+ def is_similar_to(
110
+ self, other: "ChangeFingerprint", similarity_threshold: float = 0.8
111
+ ) -> bool:
112
+ """Check if this fingerprint is similar to another."""
113
+ # Exact normalized title match
114
+ if self._normalized_title == other._normalized_title:
115
+ return True
116
+
117
+ # Content hash match
118
+ if self._content_hash == other._content_hash:
119
+ return True
120
+
121
+ # Check for similar file changes (for dependency updates)
122
+ if self.files_changed and other.files_changed:
123
+ common_files = set(self.files_changed) & set(other.files_changed)
124
+ union_files = set(self.files_changed) | set(other.files_changed)
125
+ if common_files and union_files:
126
+ overlap_ratio = len(common_files) / len(union_files)
127
+ # If files overlap, check title similarity (lower threshold)
128
+ if overlap_ratio > 0:
129
+ return self._titles_similar(other, 0.6)
130
+
131
+ # Check title similarity even without file changes
132
+ return self._titles_similar(other, similarity_threshold)
133
+
134
+ def _titles_similar(
135
+ self, other: "ChangeFingerprint", threshold: float
136
+ ) -> bool:
137
+ """Check if titles are similar using simple string similarity."""
138
+ title1 = self._normalized_title
139
+ title2 = other._normalized_title
140
+
141
+ if not title1 or not title2:
142
+ return False
143
+
144
+ # Simple Jaccard similarity on words
145
+ words1 = set(title1.split())
146
+ words2 = set(title2.split())
147
+
148
+ if not words1 or not words2:
149
+ return False
150
+
151
+ intersection = len(words1 & words2)
152
+ union = len(words1 | words2)
153
+
154
+ return (intersection / union) >= threshold
155
+
156
+ def __str__(self) -> str:
157
+ return (
158
+ f"ChangeFingerprint(title='{self.title[:50]}...', "
159
+ f"hash={self._content_hash})"
160
+ )
161
+
162
+
163
+ class DuplicateDetector:
164
+ """Detects duplicate Gerrit changes for GitHub pull requests."""
165
+
166
+ def __init__(self, repo: GhRepository, lookback_days: int = 7):
167
+ self.repo = repo
168
+ self.lookback_days = lookback_days
169
+ self._cutoff_date = datetime.now(UTC) - timedelta(days=lookback_days)
170
+
171
+ def _match_first_group(self, pattern: str, text: str) -> str:
172
+ """Extract first regex group match from text."""
173
+ match = re.search(pattern, text)
174
+ return match.group(1) if match else ""
175
+
176
+ def _resolve_gerrit_info_from_env_or_gitreview(
177
+ self, gh: GitHubContext
178
+ ) -> tuple[str, str] | None:
179
+ """Resolve Gerrit host and project from environment or .gitreview file.
180
+
181
+ Returns:
182
+ Tuple of (host, project) if found, None otherwise
183
+ """
184
+ # First try environment variables (same as core module)
185
+ gerrit_host = os.getenv("GERRIT_SERVER", "").strip()
186
+ gerrit_project = os.getenv("GERRIT_PROJECT", "").strip()
187
+
188
+ if gerrit_host and gerrit_project:
189
+ return (gerrit_host, gerrit_project)
190
+
191
+ # Try to read .gitreview file locally first
192
+ gitreview_path = Path(".gitreview")
193
+ if gitreview_path.exists():
194
+ try:
195
+ text = gitreview_path.read_text(encoding="utf-8")
196
+ host = self._match_first_group(r"(?m)^host=(.+)$", text)
197
+ proj = self._match_first_group(r"(?m)^project=(.+)$", text)
198
+ if host and proj:
199
+ project = proj.removesuffix(".git")
200
+ return (host.strip(), project.strip())
201
+ except Exception as exc:
202
+ log.debug("Failed to read local .gitreview: %s", exc)
203
+
204
+ # Try to fetch .gitreview remotely (simplified version of core logic)
205
+ try:
206
+ repo_full = gh.repository.strip() if gh.repository else ""
207
+ if not repo_full:
208
+ return None
209
+
210
+ # Try a few common branches
211
+ branches = []
212
+ if gh.head_ref:
213
+ branches.append(gh.head_ref)
214
+ if gh.base_ref:
215
+ branches.append(gh.base_ref)
216
+ branches.extend(["master", "main"])
217
+
218
+ for branch in branches:
219
+ if not branch:
220
+ continue
221
+
222
+ url = (
223
+ f"https://raw.githubusercontent.com/"
224
+ f"{repo_full}/refs/heads/{branch}/.gitreview"
225
+ )
226
+
227
+ parsed = urllib.parse.urlparse(url)
228
+ if (
229
+ parsed.scheme != "https"
230
+ or parsed.netloc != "raw.githubusercontent.com"
231
+ ):
232
+ continue
233
+
234
+ try:
235
+ log.debug("Fetching .gitreview from: %s", url)
236
+ with urllib.request.urlopen(url, timeout=5) as resp: # noqa: S310
237
+ text_remote = resp.read().decode("utf-8")
238
+
239
+ host = self._match_first_group(
240
+ r"(?m)^host=(.+)$", text_remote
241
+ )
242
+ proj = self._match_first_group(
243
+ r"(?m)^project=(.+)$", text_remote
244
+ )
245
+
246
+ if host and proj:
247
+ project = proj.removesuffix(".git")
248
+ return (host.strip(), project.strip())
249
+
250
+ except Exception as exc:
251
+ log.debug(
252
+ "Failed to fetch .gitreview from %s: %s", url, exc
253
+ )
254
+ continue
255
+
256
+ except Exception as exc:
257
+ log.debug("Failed to resolve .gitreview remotely: %s", exc)
258
+
259
+ return None
260
+
261
+ def _build_gerrit_rest_client(self, gerrit_host: str) -> object | None:
262
+ """Build a Gerrit REST API client if pygerrit2 is available."""
263
+ if GerritRestAPI is None:
264
+ log.debug(
265
+ "pygerrit2 not available, skipping Gerrit duplicate check"
266
+ )
267
+ return None
268
+
269
+ base_path = os.getenv("GERRIT_HTTP_BASE_PATH", "").strip().strip("/")
270
+ base_url = (
271
+ f"https://{gerrit_host}/"
272
+ if not base_path
273
+ else f"https://{gerrit_host}/{base_path}/"
274
+ )
275
+
276
+ http_user = (
277
+ os.getenv("GERRIT_HTTP_USER", "").strip()
278
+ or os.getenv("GERRIT_SSH_USER_G2G", "").strip()
279
+ )
280
+ http_pass = os.getenv("GERRIT_HTTP_PASSWORD", "").strip()
281
+
282
+ try:
283
+ if http_user and http_pass:
284
+ if HTTPBasicAuth is None:
285
+ log.debug("pygerrit2 HTTPBasicAuth not available")
286
+ return None
287
+ # Type ignore needed for dynamic import returning Any
288
+ return GerritRestAPI( # type: ignore[no-any-return]
289
+ url=base_url, auth=HTTPBasicAuth(http_user, http_pass)
290
+ )
291
+ else:
292
+ # Type ignore needed for dynamic import returning Any
293
+ return GerritRestAPI(url=base_url) # type: ignore[no-any-return]
294
+ except Exception as exc:
295
+ log.debug("Failed to create Gerrit REST client: %s", exc)
296
+ return None
297
+
298
+ def _build_gerrit_rest_client_with_r_path(
299
+ self, gerrit_host: str
300
+ ) -> object | None:
301
+ """Build a Gerrit REST API client with /r/ base path for fallback."""
302
+ if GerritRestAPI is None:
303
+ return None
304
+
305
+ fallback_url = f"https://{gerrit_host}/r/"
306
+ http_user = (
307
+ os.getenv("GERRIT_HTTP_USER", "").strip()
308
+ or os.getenv("GERRIT_SSH_USER_G2G", "").strip()
309
+ )
310
+ http_pass = os.getenv("GERRIT_HTTP_PASSWORD", "").strip()
311
+
312
+ try:
313
+ if http_user and http_pass:
314
+ if HTTPBasicAuth is None:
315
+ return None
316
+ # Type ignore needed for dynamic import returning Any
317
+ return GerritRestAPI( # type: ignore[no-any-return]
318
+ url=fallback_url, auth=HTTPBasicAuth(http_user, http_pass)
319
+ )
320
+ else:
321
+ # Type ignore needed for dynamic import returning Any
322
+ return GerritRestAPI(url=fallback_url) # type: ignore[no-any-return]
323
+ except Exception as exc:
324
+ log.debug(
325
+ "Failed to create Gerrit REST client with /r/ path: %s", exc
326
+ )
327
+ return None
328
+
329
+ def check_gerrit_for_existing_change(self, gh: GitHubContext) -> bool:
330
+ """Check if a Gerrit change already exists for the given GitHub PR.
331
+
332
+ Args:
333
+ gh: GitHub context containing PR and repository information
334
+
335
+ Returns:
336
+ True if a Gerrit change already exists for this PR, False otherwise
337
+ """
338
+ if not gh.pr_number:
339
+ return False
340
+
341
+ # Resolve Gerrit host and project
342
+ gerrit_info = self._resolve_gerrit_info_from_env_or_gitreview(gh)
343
+ if not gerrit_info:
344
+ log.debug(
345
+ "Cannot resolve Gerrit host/project, "
346
+ "skipping Gerrit duplicate check"
347
+ )
348
+ return False
349
+
350
+ gerrit_host, gerrit_project = gerrit_info
351
+
352
+ rest = self._build_gerrit_rest_client(gerrit_host)
353
+ if rest is None:
354
+ log.debug(
355
+ "Cannot check Gerrit for duplicates, REST client unavailable"
356
+ )
357
+ return False
358
+
359
+ # Generate the GitHub change hash for this PR
360
+ github_hash = DuplicateDetector._generate_github_change_hash(gh)
361
+
362
+ try:
363
+ # Search for changes that contain the GitHub hash in commit messages
364
+ # This is more reliable than comment-based searches
365
+ query = (
366
+ f'project:{gerrit_project} message:"GitHub-Hash: {github_hash}"'
367
+ )
368
+ path = f"/changes/?q={query}&n=10"
369
+
370
+ log.debug(
371
+ "Searching Gerrit for existing changes with GitHub hash %s, "
372
+ "query: %s",
373
+ github_hash,
374
+ query,
375
+ )
376
+ # Use getattr for dynamic method access to avoid type checking
377
+ changes = rest.get(path) # type: ignore[attr-defined]
378
+
379
+ if changes:
380
+ log.info(
381
+ "Found %d existing Gerrit change(s) for GitHub PR #%d: %s",
382
+ len(changes),
383
+ gh.pr_number,
384
+ [f"{c.get('_number', '?')}" for c in changes],
385
+ )
386
+ return True
387
+ else:
388
+ log.debug(
389
+ "No existing Gerrit changes found for GitHub PR #%d",
390
+ gh.pr_number,
391
+ )
392
+ return False
393
+
394
+ except Exception as exc:
395
+ # Check if this is a 404 error and try /r/ fallback
396
+ status = getattr(
397
+ getattr(exc, "response", None), "status_code", None
398
+ )
399
+ if status == 404:
400
+ try:
401
+ log.debug("Trying /r/ fallback for Gerrit API")
402
+ fallback_rest = self._build_gerrit_rest_client_with_r_path(
403
+ gerrit_host
404
+ )
405
+ if fallback_rest:
406
+ changes = fallback_rest.get(path) # type: ignore[attr-defined]
407
+ if changes:
408
+ log.info(
409
+ "Found %d existing Gerrit change(s) for PR #%d "
410
+ "via /r/ fallback: %s",
411
+ len(changes),
412
+ gh.pr_number,
413
+ [f"{c.get('_number', '?')}" for c in changes],
414
+ )
415
+ return True
416
+ else:
417
+ log.debug(
418
+ "No existing Gerrit changes found for PR #%d "
419
+ "via /r/ fallback",
420
+ gh.pr_number,
421
+ )
422
+ return False
423
+ except Exception as exc2:
424
+ log.warning(
425
+ "Failed to query Gerrit via /r/ fallback: %s", exc2
426
+ )
427
+ return False
428
+
429
+ log.warning("Failed to query Gerrit for existing changes: %s", exc)
430
+ # If we can't check Gerrit, err on the side of caution
431
+ return False
432
+
433
+ @staticmethod
434
+ def _generate_github_change_hash(gh: GitHubContext) -> str:
435
+ """Generate a deterministic hash for a GitHub PR to identify duplicates.
436
+
437
+ This creates a SHA256 hash based on stable PR metadata that uniquely
438
+ identifies the change content, making duplicate detection reliable
439
+ regardless of comment formatting or API issues.
440
+
441
+ Args:
442
+ gh: GitHub context containing PR information
443
+
444
+ Returns:
445
+ Hex-encoded SHA256 hash string (first 16 characters for readability)
446
+ """
447
+ import hashlib
448
+
449
+ # Build hash input from stable, unique PR identifiers
450
+ # Use server_url + repository + pr_number for global uniqueness
451
+ hash_input = f"{gh.server_url}/{gh.repository}/pull/{gh.pr_number}"
452
+
453
+ # Create SHA256 hash and take first 16 characters for readability
454
+ hash_bytes = hashlib.sha256(hash_input.encode("utf-8")).digest()
455
+ hash_hex = hash_bytes.hex()[:16]
456
+
457
+ log.debug(
458
+ "Generated GitHub change hash for %s: %s", hash_input, hash_hex
459
+ )
460
+ return hash_hex
461
+
462
+ def check_for_duplicates(
463
+ self,
464
+ target_pr: GhPullRequest,
465
+ allow_duplicates: bool = False,
466
+ gh: GitHubContext | None = None,
467
+ ) -> None:
468
+ """Check if the target PR is a duplicate in Gerrit.
469
+
470
+ Args:
471
+ target_pr: The PR to check for duplicates
472
+ allow_duplicates: If True, only log warnings; if False, raise error
473
+ gh: GitHub context for Gerrit duplicate checking
474
+
475
+ Raises:
476
+ DuplicateChangeError: If duplicates found and allow_duplicates=False
477
+ """
478
+ pr_number = getattr(target_pr, "number", 0)
479
+
480
+ log.debug("Checking PR #%d for Gerrit duplicates", pr_number)
481
+
482
+ # Check if this PR already has a corresponding Gerrit change
483
+ if gh and self.check_gerrit_for_existing_change(gh):
484
+ full_message = (
485
+ f"PR #{pr_number} already has an existing Gerrit change. "
486
+ f"Skipping duplicate submission. "
487
+ f"Target PR title: '{getattr(target_pr, 'title', '')[:100]}'"
488
+ )
489
+
490
+ if allow_duplicates:
491
+ log.warning(
492
+ "GERRIT DUPLICATE DETECTED (allowed): %s", full_message
493
+ )
494
+ return
495
+ else:
496
+ raise DuplicateChangeError(full_message, [])
497
+
498
+ log.debug("No existing Gerrit change found for PR #%d", pr_number)
499
+
500
+
501
def check_for_duplicates(
    gh: GitHubContext,
    allow_duplicates: bool = False,
    lookback_days: int = 7,
) -> None:
    """Run the duplicate check for the pull request described by ``gh``.

    Builds a GitHub client, loads the repository and the target PR, and
    delegates to ``DuplicateDetector.check_for_duplicates``. Apart from a
    detected duplicate, every failure is logged as a warning and swallowed.

    Args:
        gh: GitHub context containing PR information; nothing is checked
            when it carries no PR number
        allow_duplicates: Forwarded to the detector; if True duplicates are
            only logged, if False they raise
        lookback_days: Forwarded to the ``DuplicateDetector`` constructor

    Raises:
        DuplicateChangeError: If duplicates found and allow_duplicates=False
    """
    if not gh.pr_number:
        log.debug("No PR number provided, skipping duplicate check")
        return

    try:
        repository = get_repo_from_env(build_client())
        pull_request = repository.get_pull(gh.pr_number)

        checker = DuplicateDetector(repository, lookback_days=lookback_days)
        checker.check_for_duplicates(
            pull_request, allow_duplicates=allow_duplicates, gh=gh
        )

        log.info("Duplicate check completed for PR #%d", gh.pr_number)
    except DuplicateChangeError:
        # A detected duplicate must reach the caller unchanged.
        raise
    except Exception as exc:
        # Any other problem is non-fatal: the overall process continues
        # even though the duplicate check itself could not be completed.
        log.warning(
            "Duplicate detection failed for PR #%d: %s", gh.pr_number, exc
        )