github2gerrit 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- github2gerrit/cli.py +196 -108
- github2gerrit/config.py +207 -5
- github2gerrit/core.py +542 -398
- github2gerrit/duplicate_detection.py +375 -193
- github2gerrit/gerrit_urls.py +256 -0
- github2gerrit/github_api.py +15 -20
- github2gerrit/gitutils.py +49 -13
- github2gerrit/models.py +1 -0
- github2gerrit/similarity.py +458 -0
- github2gerrit/ssh_discovery.py +365 -0
- {github2gerrit-0.1.4.dist-info → github2gerrit-0.1.6.dist-info}/METADATA +24 -25
- github2gerrit-0.1.6.dist-info/RECORD +17 -0
- github2gerrit-0.1.4.dist-info/RECORD +0 -14
- {github2gerrit-0.1.4.dist-info → github2gerrit-0.1.6.dist-info}/WHEEL +0 -0
- {github2gerrit-0.1.4.dist-info → github2gerrit-0.1.6.dist-info}/entry_points.txt +0 -0
- {github2gerrit-0.1.4.dist-info → github2gerrit-0.1.6.dist-info}/licenses/LICENSE +0 -0
- {github2gerrit-0.1.4.dist-info → github2gerrit-0.1.6.dist-info}/top_level.txt +0 -0
@@ -10,16 +10,19 @@ submissions from automated tools like Dependabot.
|
|
10
10
|
"""
|
11
11
|
|
12
12
|
import hashlib
|
13
|
+
import json
|
13
14
|
import logging
|
14
15
|
import os
|
15
16
|
import re
|
16
17
|
import urllib.parse
|
17
18
|
import urllib.request
|
19
|
+
from collections.abc import Iterable
|
18
20
|
from datetime import UTC
|
19
21
|
from datetime import datetime
|
20
22
|
from datetime import timedelta
|
21
23
|
from pathlib import Path
|
22
24
|
|
25
|
+
from .gerrit_urls import create_gerrit_url_builder
|
23
26
|
from .github_api import GhPullRequest
|
24
27
|
from .github_api import GhRepository
|
25
28
|
from .github_api import build_client
|
@@ -57,9 +60,7 @@ class DuplicateChangeError(Exception):
|
|
57
60
|
class ChangeFingerprint:
|
58
61
|
"""Represents a fingerprint of a change for duplicate detection."""
|
59
62
|
|
60
|
-
def __init__(
|
61
|
-
self, title: str, body: str = "", files_changed: list[str] | None = None
|
62
|
-
):
|
63
|
+
def __init__(self, title: str, body: str = "", files_changed: list[str] | None = None):
|
63
64
|
self.title = title.strip()
|
64
65
|
self.body = (body or "").strip()
|
65
66
|
self.files_changed = sorted(files_changed or [])
|
@@ -100,15 +101,10 @@ class ChangeFingerprint:
|
|
100
101
|
|
101
102
|
def _compute_content_hash(self) -> str:
|
102
103
|
"""Compute a hash of the change content."""
|
103
|
-
content = (
|
104
|
-
f"{self._normalized_title}\n{self.body}\n"
|
105
|
-
f"{','.join(self.files_changed)}"
|
106
|
-
)
|
104
|
+
content = f"{self._normalized_title}\n{self.body}\n{','.join(self.files_changed)}"
|
107
105
|
return hashlib.sha256(content.encode("utf-8")).hexdigest()[:16]
|
108
106
|
|
109
|
-
def is_similar_to(
|
110
|
-
self, other: "ChangeFingerprint", similarity_threshold: float = 0.8
|
111
|
-
) -> bool:
|
107
|
+
def is_similar_to(self, other: "ChangeFingerprint", similarity_threshold: float = 0.8) -> bool:
|
112
108
|
"""Check if this fingerprint is similar to another."""
|
113
109
|
# Exact normalized title match
|
114
110
|
if self._normalized_title == other._normalized_title:
|
@@ -131,9 +127,7 @@ class ChangeFingerprint:
|
|
131
127
|
# Check title similarity even without file changes
|
132
128
|
return self._titles_similar(other, similarity_threshold)
|
133
129
|
|
134
|
-
def _titles_similar(
|
135
|
-
self, other: "ChangeFingerprint", threshold: float
|
136
|
-
) -> bool:
|
130
|
+
def _titles_similar(self, other: "ChangeFingerprint", threshold: float) -> bool:
|
137
131
|
"""Check if titles are similar using simple string similarity."""
|
138
132
|
title1 = self._normalized_title
|
139
133
|
title2 = other._normalized_title
|
@@ -154,28 +148,29 @@ class ChangeFingerprint:
|
|
154
148
|
return (intersection / union) >= threshold
|
155
149
|
|
156
150
|
def __str__(self) -> str:
|
157
|
-
return (
|
158
|
-
f"ChangeFingerprint(title='{self.title[:50]}...', "
|
159
|
-
f"hash={self._content_hash})"
|
160
|
-
)
|
151
|
+
return f"ChangeFingerprint(title='{self.title[:50]}...', hash={self._content_hash})"
|
161
152
|
|
162
153
|
|
163
154
|
class DuplicateDetector:
|
164
155
|
"""Detects duplicate Gerrit changes for GitHub pull requests."""
|
165
156
|
|
166
|
-
def __init__(
|
157
|
+
def __init__(
|
158
|
+
self,
|
159
|
+
repo: GhRepository,
|
160
|
+
lookback_days: int = 7,
|
161
|
+
duplicates_filter: str = "open",
|
162
|
+
):
|
167
163
|
self.repo = repo
|
168
164
|
self.lookback_days = lookback_days
|
169
165
|
self._cutoff_date = datetime.now(UTC) - timedelta(days=lookback_days)
|
166
|
+
self.duplicates_filter = duplicates_filter
|
170
167
|
|
171
168
|
def _match_first_group(self, pattern: str, text: str) -> str:
|
172
169
|
"""Extract first regex group match from text."""
|
173
170
|
match = re.search(pattern, text)
|
174
171
|
return match.group(1) if match else ""
|
175
172
|
|
176
|
-
def _resolve_gerrit_info_from_env_or_gitreview(
|
177
|
-
self, gh: GitHubContext
|
178
|
-
) -> tuple[str, str] | None:
|
173
|
+
def _resolve_gerrit_info_from_env_or_gitreview(self, gh: GitHubContext) -> tuple[str, str] | None:
|
179
174
|
"""Resolve Gerrit host and project from environment or .gitreview file.
|
180
175
|
|
181
176
|
Returns:
|
@@ -198,6 +193,8 @@ class DuplicateDetector:
|
|
198
193
|
if host and proj:
|
199
194
|
project = proj.removesuffix(".git")
|
200
195
|
return (host.strip(), project.strip())
|
196
|
+
if host and not proj:
|
197
|
+
return (host.strip(), "")
|
201
198
|
except Exception as exc:
|
202
199
|
log.debug("Failed to read local .gitreview: %s", exc)
|
203
200
|
|
@@ -219,38 +216,28 @@ class DuplicateDetector:
|
|
219
216
|
if not branch:
|
220
217
|
continue
|
221
218
|
|
222
|
-
url =
|
223
|
-
f"https://raw.githubusercontent.com/"
|
224
|
-
f"{repo_full}/refs/heads/{branch}/.gitreview"
|
225
|
-
)
|
219
|
+
url = f"https://raw.githubusercontent.com/{repo_full}/{branch}/.gitreview"
|
226
220
|
|
227
221
|
parsed = urllib.parse.urlparse(url)
|
228
|
-
if
|
229
|
-
parsed.scheme != "https"
|
230
|
-
or parsed.netloc != "raw.githubusercontent.com"
|
231
|
-
):
|
222
|
+
if parsed.scheme != "https" or parsed.netloc != "raw.githubusercontent.com":
|
232
223
|
continue
|
233
224
|
|
234
225
|
try:
|
235
226
|
log.debug("Fetching .gitreview from: %s", url)
|
236
|
-
with urllib.request.urlopen(url, timeout=5) as resp:
|
227
|
+
with urllib.request.urlopen(url, timeout=5) as resp:
|
237
228
|
text_remote = resp.read().decode("utf-8")
|
238
229
|
|
239
|
-
host = self._match_first_group(
|
240
|
-
|
241
|
-
)
|
242
|
-
proj = self._match_first_group(
|
243
|
-
r"(?m)^project=(.+)$", text_remote
|
244
|
-
)
|
230
|
+
host = self._match_first_group(r"(?m)^host=(.+)$", text_remote)
|
231
|
+
proj = self._match_first_group(r"(?m)^project=(.+)$", text_remote)
|
245
232
|
|
246
233
|
if host and proj:
|
247
234
|
project = proj.removesuffix(".git")
|
248
235
|
return (host.strip(), project.strip())
|
236
|
+
if host and not proj:
|
237
|
+
return (host.strip(), "")
|
249
238
|
|
250
239
|
except Exception as exc:
|
251
|
-
log.debug(
|
252
|
-
"Failed to fetch .gitreview from %s: %s", url, exc
|
253
|
-
)
|
240
|
+
log.debug("Failed to fetch .gitreview from %s: %s", url, exc)
|
254
241
|
continue
|
255
242
|
|
256
243
|
except Exception as exc:
|
@@ -261,22 +248,14 @@ class DuplicateDetector:
|
|
261
248
|
def _build_gerrit_rest_client(self, gerrit_host: str) -> object | None:
|
262
249
|
"""Build a Gerrit REST API client if pygerrit2 is available."""
|
263
250
|
if GerritRestAPI is None:
|
264
|
-
log.debug(
|
265
|
-
"pygerrit2 not available, skipping Gerrit duplicate check"
|
266
|
-
)
|
251
|
+
log.debug("pygerrit2 not available, skipping Gerrit duplicate check")
|
267
252
|
return None
|
268
253
|
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
if not base_path
|
273
|
-
else f"https://{gerrit_host}/{base_path}/"
|
274
|
-
)
|
254
|
+
# Create centralized URL builder
|
255
|
+
url_builder = create_gerrit_url_builder(gerrit_host)
|
256
|
+
base_url = url_builder.api_url()
|
275
257
|
|
276
|
-
http_user = (
|
277
|
-
os.getenv("GERRIT_HTTP_USER", "").strip()
|
278
|
-
or os.getenv("GERRIT_SSH_USER_G2G", "").strip()
|
279
|
-
)
|
258
|
+
http_user = os.getenv("GERRIT_HTTP_USER", "").strip() or os.getenv("GERRIT_SSH_USER_G2G", "").strip()
|
280
259
|
http_pass = os.getenv("GERRIT_HTTP_PASSWORD", "").strip()
|
281
260
|
|
282
261
|
try:
|
@@ -295,18 +274,16 @@ class DuplicateDetector:
|
|
295
274
|
log.debug("Failed to create Gerrit REST client: %s", exc)
|
296
275
|
return None
|
297
276
|
|
298
|
-
def _build_gerrit_rest_client_with_r_path(
|
299
|
-
self, gerrit_host: str
|
300
|
-
) -> object | None:
|
277
|
+
def _build_gerrit_rest_client_with_r_path(self, gerrit_host: str) -> object | None:
|
301
278
|
"""Build a Gerrit REST API client with /r/ base path for fallback."""
|
302
279
|
if GerritRestAPI is None:
|
303
280
|
return None
|
304
281
|
|
305
|
-
|
306
|
-
|
307
|
-
|
308
|
-
|
309
|
-
)
|
282
|
+
# Create centralized URL builder with /r/ base path override
|
283
|
+
url_builder = create_gerrit_url_builder(gerrit_host, "r")
|
284
|
+
fallback_url = url_builder.api_url()
|
285
|
+
|
286
|
+
http_user = os.getenv("GERRIT_HTTP_USER", "").strip() or os.getenv("GERRIT_SSH_USER_G2G", "").strip()
|
310
287
|
http_pass = os.getenv("GERRIT_HTTP_PASSWORD", "").strip()
|
311
288
|
|
312
289
|
try:
|
@@ -321,114 +298,17 @@ class DuplicateDetector:
|
|
321
298
|
# Type ignore needed for dynamic import returning Any
|
322
299
|
return GerritRestAPI(url=fallback_url) # type: ignore[no-any-return]
|
323
300
|
except Exception as exc:
|
324
|
-
log.debug(
|
325
|
-
"Failed to create Gerrit REST client with /r/ path: %s", exc
|
326
|
-
)
|
301
|
+
log.debug("Failed to create Gerrit REST client with /r/ path: %s", exc)
|
327
302
|
return None
|
328
303
|
|
329
304
|
def check_gerrit_for_existing_change(self, gh: GitHubContext) -> bool:
|
330
|
-
"""
|
331
|
-
|
332
|
-
Args:
|
333
|
-
gh: GitHub context containing PR and repository information
|
305
|
+
"""Deprecated: GitHub-Hash/Gerrit REST based duplicate detection disabled.
|
334
306
|
|
335
|
-
|
336
|
-
|
307
|
+
Always returns False. Scoring-based duplicate detection will be implemented
|
308
|
+
in check_for_duplicates.
|
337
309
|
"""
|
338
|
-
|
339
|
-
|
340
|
-
|
341
|
-
# Resolve Gerrit host and project
|
342
|
-
gerrit_info = self._resolve_gerrit_info_from_env_or_gitreview(gh)
|
343
|
-
if not gerrit_info:
|
344
|
-
log.debug(
|
345
|
-
"Cannot resolve Gerrit host/project, "
|
346
|
-
"skipping Gerrit duplicate check"
|
347
|
-
)
|
348
|
-
return False
|
349
|
-
|
350
|
-
gerrit_host, gerrit_project = gerrit_info
|
351
|
-
|
352
|
-
rest = self._build_gerrit_rest_client(gerrit_host)
|
353
|
-
if rest is None:
|
354
|
-
log.debug(
|
355
|
-
"Cannot check Gerrit for duplicates, REST client unavailable"
|
356
|
-
)
|
357
|
-
return False
|
358
|
-
|
359
|
-
# Generate the GitHub change hash for this PR
|
360
|
-
github_hash = DuplicateDetector._generate_github_change_hash(gh)
|
361
|
-
|
362
|
-
try:
|
363
|
-
# Search for changes that contain the GitHub hash in commit messages
|
364
|
-
# This is more reliable than comment-based searches
|
365
|
-
query = (
|
366
|
-
f'project:{gerrit_project} message:"GitHub-Hash: {github_hash}"'
|
367
|
-
)
|
368
|
-
path = f"/changes/?q={query}&n=10"
|
369
|
-
|
370
|
-
log.debug(
|
371
|
-
"Searching Gerrit for existing changes with GitHub hash %s, "
|
372
|
-
"query: %s",
|
373
|
-
github_hash,
|
374
|
-
query,
|
375
|
-
)
|
376
|
-
# Use getattr for dynamic method access to avoid type checking
|
377
|
-
changes = rest.get(path) # type: ignore[attr-defined]
|
378
|
-
|
379
|
-
if changes:
|
380
|
-
log.info(
|
381
|
-
"Found %d existing Gerrit change(s) for GitHub PR #%d: %s",
|
382
|
-
len(changes),
|
383
|
-
gh.pr_number,
|
384
|
-
[f"{c.get('_number', '?')}" for c in changes],
|
385
|
-
)
|
386
|
-
return True
|
387
|
-
else:
|
388
|
-
log.debug(
|
389
|
-
"No existing Gerrit changes found for GitHub PR #%d",
|
390
|
-
gh.pr_number,
|
391
|
-
)
|
392
|
-
return False
|
393
|
-
|
394
|
-
except Exception as exc:
|
395
|
-
# Check if this is a 404 error and try /r/ fallback
|
396
|
-
status = getattr(
|
397
|
-
getattr(exc, "response", None), "status_code", None
|
398
|
-
)
|
399
|
-
if status == 404:
|
400
|
-
try:
|
401
|
-
log.debug("Trying /r/ fallback for Gerrit API")
|
402
|
-
fallback_rest = self._build_gerrit_rest_client_with_r_path(
|
403
|
-
gerrit_host
|
404
|
-
)
|
405
|
-
if fallback_rest:
|
406
|
-
changes = fallback_rest.get(path) # type: ignore[attr-defined]
|
407
|
-
if changes:
|
408
|
-
log.info(
|
409
|
-
"Found %d existing Gerrit change(s) for PR #%d "
|
410
|
-
"via /r/ fallback: %s",
|
411
|
-
len(changes),
|
412
|
-
gh.pr_number,
|
413
|
-
[f"{c.get('_number', '?')}" for c in changes],
|
414
|
-
)
|
415
|
-
return True
|
416
|
-
else:
|
417
|
-
log.debug(
|
418
|
-
"No existing Gerrit changes found for PR #%d "
|
419
|
-
"via /r/ fallback",
|
420
|
-
gh.pr_number,
|
421
|
-
)
|
422
|
-
return False
|
423
|
-
except Exception as exc2:
|
424
|
-
log.warning(
|
425
|
-
"Failed to query Gerrit via /r/ fallback: %s", exc2
|
426
|
-
)
|
427
|
-
return False
|
428
|
-
|
429
|
-
log.warning("Failed to query Gerrit for existing changes: %s", exc)
|
430
|
-
# If we can't check Gerrit, err on the side of caution
|
431
|
-
return False
|
310
|
+
log.debug("Gerrit REST duplicate check disabled")
|
311
|
+
return False
|
432
312
|
|
433
313
|
@staticmethod
|
434
314
|
def _generate_github_change_hash(gh: GitHubContext) -> str:
|
@@ -454,9 +334,7 @@ class DuplicateDetector:
|
|
454
334
|
hash_bytes = hashlib.sha256(hash_input.encode("utf-8")).digest()
|
455
335
|
hash_hex = hash_bytes.hex()[:16]
|
456
336
|
|
457
|
-
log.debug(
|
458
|
-
"Generated GitHub change hash for %s: %s", hash_input, hash_hex
|
459
|
-
)
|
337
|
+
log.debug("Generated GitHub change hash for %s: %s", hash_input, hash_hex)
|
460
338
|
return hash_hex
|
461
339
|
|
462
340
|
def check_for_duplicates(
|
@@ -465,37 +343,341 @@ class DuplicateDetector:
|
|
465
343
|
allow_duplicates: bool = False,
|
466
344
|
gh: GitHubContext | None = None,
|
467
345
|
) -> None:
|
468
|
-
"""Check if the target PR is a duplicate
|
469
|
-
|
470
|
-
Args:
|
471
|
-
target_pr: The PR to check for duplicates
|
472
|
-
allow_duplicates: If True, only log warnings; if False, raise error
|
473
|
-
gh: GitHub context for Gerrit duplicate checking
|
346
|
+
"""Check if the target PR is a duplicate via subject equality against Gerrit.
|
474
347
|
|
475
|
-
|
476
|
-
|
348
|
+
Implements a robust, dependency-free subject-first duplicate check:
|
349
|
+
- Resolve Gerrit host/project from env or .gitreview
|
350
|
+
- Query Gerrit changes updated within the lookback window (excluding abandoned)
|
351
|
+
- Compare normalized subjects (first line) for exact equality
|
352
|
+
- If any match, treat as duplicate and either warn or raise
|
477
353
|
"""
|
478
354
|
pr_number = getattr(target_pr, "number", 0)
|
355
|
+
pr_title = (getattr(target_pr, "title", "") or "").strip()
|
479
356
|
|
480
|
-
log.debug(
|
357
|
+
log.debug(
|
358
|
+
"Checking PR #%d for duplicates via subject equality against Gerrit",
|
359
|
+
pr_number,
|
360
|
+
)
|
481
361
|
|
482
|
-
|
483
|
-
|
484
|
-
|
485
|
-
|
486
|
-
|
487
|
-
|
362
|
+
if not pr_title:
|
363
|
+
log.debug("PR #%d has empty title; skipping duplicate check", pr_number)
|
364
|
+
return
|
365
|
+
if gh is None:
|
366
|
+
log.debug("No GitHub context provided; skipping duplicate check")
|
367
|
+
return
|
368
|
+
|
369
|
+
# Resolve Gerrit target (host/project)
|
370
|
+
gerrit_info = self._resolve_gerrit_info_from_env_or_gitreview(gh)
|
371
|
+
if not gerrit_info:
|
372
|
+
log.debug("Unable to resolve Gerrit host/project; skipping duplicate check")
|
373
|
+
return
|
374
|
+
gerrit_host, gerrit_project = gerrit_info
|
375
|
+
|
376
|
+
# Helper: normalize subject like our existing title normalization
|
377
|
+
def _normalize_subject(title: str) -> str:
|
378
|
+
normalized = title.strip()
|
379
|
+
normalized = re.sub(
|
380
|
+
r"^(feat|fix|docs|style|refactor|test|chore|ci|build|perf)(\(.+?\))?: ",
|
381
|
+
"",
|
382
|
+
normalized,
|
383
|
+
flags=re.IGNORECASE,
|
488
384
|
)
|
385
|
+
normalized = re.sub(r"[*_`]", "", normalized)
|
386
|
+
normalized = re.sub(r"\bv\d+(\.\d+)*(-\w+)?\b", "vx.y.z", normalized)
|
387
|
+
normalized = re.sub(r"\b\d+(\.\d+)+(-\w+)?\b", "x.y.z", normalized)
|
388
|
+
normalized = re.sub(r"\b\d+\.\d+\b", "x.y.z", normalized)
|
389
|
+
normalized = re.sub(r"\b[a-f0-9]{7,40}\b", "commit_hash", normalized)
|
390
|
+
normalized = re.sub(r"\s+", " ", normalized).strip()
|
391
|
+
return normalized.lower()
|
392
|
+
|
393
|
+
normalized_pr_subject = _normalize_subject(pr_title)
|
394
|
+
log.debug(
|
395
|
+
"Normalized PR subject for duplicate check: %s",
|
396
|
+
normalized_pr_subject,
|
397
|
+
)
|
489
398
|
|
490
|
-
|
491
|
-
|
492
|
-
|
399
|
+
# Build Gerrit REST URL using centralized URL builder
|
400
|
+
url_builder = create_gerrit_url_builder(gerrit_host)
|
401
|
+
api_base = url_builder.api_url().rstrip("/")
|
402
|
+
|
403
|
+
# Track which base path actually works for constructing display URLs
|
404
|
+
successful_base_path = url_builder.base_path
|
405
|
+
|
406
|
+
# Build query: limit to recent changes, exclude abandoned; prefer open
|
407
|
+
cutoff_date = self._cutoff_date.date().isoformat()
|
408
|
+
q_parts = []
|
409
|
+
if gerrit_project:
|
410
|
+
q_parts.append(f"project:{gerrit_project}")
|
411
|
+
# Build status clause from DUPLICATES filter (default: open)
|
412
|
+
dup_filter = (self.duplicates_filter or "open").strip().lower()
|
413
|
+
selected = [s.strip() for s in dup_filter.split(",") if s.strip()]
|
414
|
+
valid = {
|
415
|
+
"open": "status:open",
|
416
|
+
"merged": "status:merged",
|
417
|
+
"abandoned": "status:abandoned",
|
418
|
+
}
|
419
|
+
status_terms = [valid[s] for s in selected if s in valid]
|
420
|
+
if not status_terms:
|
421
|
+
status_clause = "status:open"
|
422
|
+
elif len(status_terms) == 1:
|
423
|
+
status_clause = status_terms[0]
|
424
|
+
else:
|
425
|
+
status_clause = "(" + " OR ".join(status_terms) + ")"
|
426
|
+
q_parts.append(status_clause)
|
427
|
+
q_parts.append(f"after:{cutoff_date}")
|
428
|
+
query = " ".join(q_parts)
|
429
|
+
encoded_q = urllib.parse.quote(query, safe="")
|
430
|
+
|
431
|
+
# Request current commit metadata so we get 'subject'
|
432
|
+
# Use a modest page size
|
433
|
+
url = f"{api_base}/changes/?q={encoded_q}&n=50&o=CURRENT_COMMIT&o=CURRENT_FILES"
|
434
|
+
|
435
|
+
def _load_gerrit_json(url_: str) -> list[dict[str, object]]:
|
436
|
+
try:
|
437
|
+
log.debug("Querying Gerrit for duplicates: %s", url_)
|
438
|
+
# Ensure we only fetch over HTTPS to avoid unsafe schemes
|
439
|
+
parsed = urllib.parse.urlparse(url_)
|
440
|
+
if parsed.scheme != "https":
|
441
|
+
log.debug("Skipping non-HTTPS URL for Gerrit query: %s", url_)
|
442
|
+
return []
|
443
|
+
headers: dict[str, str] = {}
|
444
|
+
http_user = os.getenv("GERRIT_HTTP_USER", "").strip()
|
445
|
+
http_pass = os.getenv("GERRIT_HTTP_PASSWORD", "").strip()
|
446
|
+
if http_user and http_pass:
|
447
|
+
import base64 as _b64 # localized import to avoid global import edit
|
448
|
+
|
449
|
+
basic = _b64.b64encode(f"{http_user}:{http_pass}".encode()).decode("ascii")
|
450
|
+
headers["Authorization"] = f"Basic {basic}"
|
451
|
+
req = urllib.request.Request(url_, headers=headers)
|
452
|
+
with urllib.request.urlopen(req, timeout=8) as resp:
|
453
|
+
raw = resp.read().decode("utf-8", errors="replace")
|
454
|
+
# Strip Gerrit's XSSI prefix if present
|
455
|
+
if raw.startswith(")]}'"):
|
456
|
+
raw = raw.split("\n", 1)[1] if "\n" in raw else ""
|
457
|
+
data = json.loads(raw or "[]")
|
458
|
+
if isinstance(data, list):
|
459
|
+
return data
|
460
|
+
else:
|
461
|
+
return []
|
462
|
+
except urllib.error.HTTPError as exc:
|
463
|
+
if exc.code == 404:
|
464
|
+
# Try with /r/ base path fallback using centralized URL builder
|
465
|
+
fallback_builder = create_gerrit_url_builder(gerrit_host, "r")
|
466
|
+
fallback_api_base = fallback_builder.api_url().rstrip("/")
|
467
|
+
fallback_url = url_.replace(api_base, fallback_api_base)
|
468
|
+
if fallback_url != url_:
|
469
|
+
log.debug(
|
470
|
+
"Trying Gerrit query with /r/ base path: %s",
|
471
|
+
fallback_url,
|
472
|
+
)
|
473
|
+
try:
|
474
|
+
req_fallback = urllib.request.Request(fallback_url, headers=headers)
|
475
|
+
with urllib.request.urlopen(req_fallback, timeout=8) as resp:
|
476
|
+
raw = resp.read().decode("utf-8", errors="replace")
|
477
|
+
# Strip Gerrit's XSSI prefix if present
|
478
|
+
if raw.startswith(")]}'"):
|
479
|
+
raw = raw.split("\n", 1)[1] if "\n" in raw else ""
|
480
|
+
data = json.loads(raw or "[]")
|
481
|
+
if isinstance(data, list):
|
482
|
+
# Update successful base path for display URL construction
|
483
|
+
nonlocal successful_base_path
|
484
|
+
successful_base_path = "r"
|
485
|
+
return data
|
486
|
+
else:
|
487
|
+
return []
|
488
|
+
except Exception as fallback_exc:
|
489
|
+
log.debug(
|
490
|
+
"Gerrit fallback query also failed for %s: %s",
|
491
|
+
fallback_url,
|
492
|
+
fallback_exc,
|
493
|
+
)
|
494
|
+
log.debug("Gerrit query failed for %s: %s", url_, exc)
|
495
|
+
return []
|
496
|
+
except Exception as exc:
|
497
|
+
log.debug("Gerrit query failed for %s: %s", url_, exc)
|
498
|
+
return []
|
499
|
+
|
500
|
+
log.debug(
|
501
|
+
"Gerrit duplicate query: host=%s project=%s filter=%s cutoff=%s url=%s",
|
502
|
+
gerrit_host,
|
503
|
+
gerrit_project or "(any)",
|
504
|
+
dup_filter,
|
505
|
+
cutoff_date,
|
506
|
+
url,
|
507
|
+
)
|
508
|
+
changes = _load_gerrit_json(url)
|
509
|
+
log.debug(
|
510
|
+
"Gerrit query returned %d change(s) for project=%s filter=%s after=%s",
|
511
|
+
len(changes),
|
512
|
+
gerrit_project or "(any)",
|
513
|
+
dup_filter,
|
514
|
+
cutoff_date,
|
515
|
+
)
|
516
|
+
if changes:
|
517
|
+
sample_subjects = ", ".join(str(c.get("subject") or "")[:60] for c in changes[:5])
|
518
|
+
log.debug("Sample subjects: %s", sample_subjects)
|
519
|
+
|
520
|
+
# Compare normalized subjects for exact equality
|
521
|
+
matched: list[tuple[int, str]] = []
|
522
|
+
for c in changes:
|
523
|
+
subj = str(c.get("subject") or "").strip()
|
524
|
+
if not subj:
|
525
|
+
continue
|
526
|
+
if _normalize_subject(subj) == normalized_pr_subject:
|
527
|
+
num = c.get("_number")
|
528
|
+
proj = str(c.get("project") or gerrit_project or "")
|
529
|
+
if isinstance(num, int):
|
530
|
+
matched.append((num, proj))
|
531
|
+
|
532
|
+
if not matched:
|
533
|
+
# No exact subject match; proceed with similarity scoring across candidates
|
534
|
+
log.debug("No exact-subject matches found; entering similarity scoring")
|
535
|
+
from .similarity import ScoringConfig
|
536
|
+
from .similarity import aggregate_scores
|
537
|
+
from .similarity import remove_commit_trailers
|
538
|
+
from .similarity import score_bodies
|
539
|
+
from .similarity import score_files
|
540
|
+
from .similarity import score_subjects
|
541
|
+
|
542
|
+
config = ScoringConfig()
|
543
|
+
# Source features from the PR
|
544
|
+
src_subjects = [pr_title]
|
545
|
+
src_body = str(getattr(target_pr, "body", "") or "")
|
546
|
+
src_files: list[str] = []
|
547
|
+
try:
|
548
|
+
get_files = getattr(target_pr, "get_files", None)
|
549
|
+
if callable(get_files):
|
550
|
+
files_obj = get_files()
|
551
|
+
if isinstance(files_obj, Iterable):
|
552
|
+
for f in files_obj:
|
553
|
+
fname = getattr(f, "filename", None)
|
554
|
+
if fname:
|
555
|
+
src_files.append(str(fname))
|
556
|
+
except Exception as exc:
|
557
|
+
# Best-effort; if files cannot be retrieved, proceed without them
|
558
|
+
log.debug("Failed to retrieve PR files for scoring: %s", exc)
|
559
|
+
|
560
|
+
best_score = 0.0
|
561
|
+
best_reasons: list[str] = []
|
562
|
+
hits: list[tuple[float, str, int | None]] = []
|
563
|
+
all_nums: list[int] = []
|
564
|
+
for c in changes:
|
565
|
+
subj = str(c.get("subject") or "").strip()
|
566
|
+
if not subj:
|
567
|
+
continue
|
568
|
+
# Extract commit message and files from revisions (CURRENT_COMMIT, CURRENT_FILES)
|
569
|
+
rev = str(c.get("current_revision") or "")
|
570
|
+
revs_obj = c.get("revisions")
|
571
|
+
revs = revs_obj if isinstance(revs_obj, dict) else {}
|
572
|
+
cur_obj = revs.get(rev)
|
573
|
+
cur = cur_obj if isinstance(cur_obj, dict) else {}
|
574
|
+
commit = cur.get("commit") or {}
|
575
|
+
msg = str(commit.get("message") or "")
|
576
|
+
cand_body_raw = ""
|
577
|
+
if "\n" in msg:
|
578
|
+
cand_body_raw = msg.split("\n", 1)[1]
|
579
|
+
cand_body = remove_commit_trailers(cand_body_raw)
|
580
|
+
files_dict = cur.get("files") or {}
|
581
|
+
cand_files = [p for p in files_dict if isinstance(p, str) and not p.startswith("/")]
|
582
|
+
|
583
|
+
# Compute component scores
|
584
|
+
s_res = score_subjects(src_subjects, subj)
|
585
|
+
f_res = score_files(
|
586
|
+
src_files,
|
587
|
+
cand_files,
|
588
|
+
workflow_min_floor=config.workflow_min_floor,
|
589
|
+
)
|
590
|
+
b_res = score_bodies(src_body, cand_body)
|
591
|
+
|
592
|
+
# Aggregate
|
593
|
+
agg = aggregate_scores(s_res.score, f_res.score, b_res.score, config=config)
|
594
|
+
log.debug(
|
595
|
+
"Aggregate score computed: %.2f (s=%.2f f=%.2f b=%.2f)",
|
596
|
+
agg,
|
597
|
+
s_res.score,
|
598
|
+
f_res.score,
|
599
|
+
b_res.score,
|
493
600
|
)
|
494
|
-
|
601
|
+
|
602
|
+
# Build candidate reference and number using successful base path
|
603
|
+
num_obj = c.get("_number")
|
604
|
+
num = int(num_obj) if isinstance(num_obj, int) else None
|
605
|
+
proj = str(c.get("project") or gerrit_project or "")
|
606
|
+
|
607
|
+
# Use the base path that actually worked for API calls
|
608
|
+
display_url_builder = create_gerrit_url_builder(gerrit_host, successful_base_path)
|
609
|
+
ref = (
|
610
|
+
display_url_builder.change_url(proj, num)
|
611
|
+
if proj and isinstance(num, int)
|
612
|
+
else (f"change {num}" if isinstance(num, int) else "")
|
613
|
+
)
|
614
|
+
log.debug(
|
615
|
+
"Scoring candidate: ref=%s agg=%.2f (s=%.2f f=%.2f b=%.2f) subj='%s'",
|
616
|
+
ref or "(none)",
|
617
|
+
agg,
|
618
|
+
s_res.score,
|
619
|
+
f_res.score,
|
620
|
+
b_res.score,
|
621
|
+
subj[:200],
|
622
|
+
)
|
623
|
+
|
624
|
+
# Track best (for reasons)
|
625
|
+
if agg > best_score:
|
626
|
+
best_score = agg
|
627
|
+
# Deduplicate reasons preserving order
|
628
|
+
best_reasons = list(dict.fromkeys(s_res.reasons + f_res.reasons + b_res.reasons))
|
629
|
+
|
630
|
+
# Collect all candidates above threshold
|
631
|
+
if agg >= config.similarity_threshold and ref:
|
632
|
+
hits.append((agg, ref, num))
|
633
|
+
if isinstance(num, int):
|
634
|
+
all_nums.append(num)
|
635
|
+
|
636
|
+
log.debug(
|
637
|
+
"Similarity scoring found %d hit(s) (threshold=%.2f)",
|
638
|
+
len(hits),
|
639
|
+
config.similarity_threshold,
|
640
|
+
)
|
641
|
+
if hits:
|
642
|
+
hits_sorted = sorted(hits, key=lambda t: t[0], reverse=True)
|
643
|
+
|
644
|
+
# Log each matching change individually
|
645
|
+
for s, u, _ in hits_sorted:
|
646
|
+
if u:
|
647
|
+
log.info("Score: %.2f URL: %s", s, u)
|
648
|
+
msg = f"Similar Gerrit change(s) detected [≥ {config.similarity_threshold:.2f}]"
|
649
|
+
if best_reasons:
|
650
|
+
msg += f" (Reasons: {', '.join(best_reasons)})"
|
651
|
+
if allow_duplicates:
|
652
|
+
log.warning("GERRIT DUPLICATE DETECTED (allowed): %s", msg)
|
653
|
+
return
|
654
|
+
raise DuplicateChangeError(msg, all_nums)
|
655
|
+
|
656
|
+
# Construct human-friendly references for logs
|
657
|
+
matching_numbers: list[int] = []
|
658
|
+
match_lines: list[str] = []
|
659
|
+
for n, proj in matched:
|
660
|
+
if proj:
|
661
|
+
# Use the base path that actually worked for API calls
|
662
|
+
display_url_builder = create_gerrit_url_builder(gerrit_host, successful_base_path)
|
663
|
+
url = display_url_builder.change_url(proj, n)
|
664
|
+
match_lines.append(f"Score: 1.0 URL: {url}")
|
665
|
+
log.info("Score: 1.0 URL: %s", url)
|
495
666
|
else:
|
496
|
-
|
667
|
+
match_lines.append(f"Score: 1.0 URL: change {n}")
|
668
|
+
log.info("Score: 1.0 URL: change %s", n)
|
669
|
+
matching_numbers.append(n)
|
497
670
|
|
498
|
-
|
671
|
+
if not matched:
|
672
|
+
log.debug("No exact subject matches and no similarity matches; duplicate check passes")
|
673
|
+
return
|
674
|
+
|
675
|
+
# Remove PR number from message since cli.py already includes it
|
676
|
+
full_message = "subject matches existing Gerrit change(s)"
|
677
|
+
if allow_duplicates:
|
678
|
+
log.warning("GERRIT DUPLICATE DETECTED (allowed): %s", full_message)
|
679
|
+
return
|
680
|
+
raise DuplicateChangeError(full_message, matching_numbers)
|
499
681
|
|
500
682
|
|
501
683
|
def check_for_duplicates(
|
@@ -525,10 +707,12 @@ def check_for_duplicates(
|
|
525
707
|
target_pr = repo.get_pull(gh.pr_number)
|
526
708
|
|
527
709
|
# Create detector and check
|
528
|
-
detector = DuplicateDetector(
|
529
|
-
|
530
|
-
|
710
|
+
detector = DuplicateDetector(
|
711
|
+
repo,
|
712
|
+
lookback_days=lookback_days,
|
713
|
+
duplicates_filter=os.getenv("DUPLICATES", "open"),
|
531
714
|
)
|
715
|
+
detector.check_for_duplicates(target_pr, allow_duplicates=allow_duplicates, gh=gh)
|
532
716
|
|
533
717
|
log.info("Duplicate check completed for PR #%d", gh.pr_number)
|
534
718
|
|
@@ -536,7 +720,5 @@ def check_for_duplicates(
|
|
536
720
|
# Re-raise duplicate errors
|
537
721
|
raise
|
538
722
|
except Exception as exc:
|
539
|
-
log.warning(
|
540
|
-
"Duplicate detection failed for PR #%d: %s", gh.pr_number, exc
|
541
|
-
)
|
723
|
+
log.warning("Duplicate detection failed for PR #%d: %s", gh.pr_number, exc)
|
542
724
|
# Don't fail the entire process if duplicate detection has issues
|