github2gerrit 0.1.10__py3-none-any.whl → 0.1.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- github2gerrit/cli.py +793 -198
- github2gerrit/commit_normalization.py +44 -15
- github2gerrit/config.py +76 -30
- github2gerrit/core.py +1571 -267
- github2gerrit/duplicate_detection.py +222 -98
- github2gerrit/external_api.py +76 -25
- github2gerrit/gerrit_query.py +286 -0
- github2gerrit/gerrit_rest.py +53 -18
- github2gerrit/gerrit_urls.py +90 -33
- github2gerrit/github_api.py +19 -6
- github2gerrit/gitutils.py +43 -14
- github2gerrit/mapping_comment.py +345 -0
- github2gerrit/models.py +15 -1
- github2gerrit/orchestrator/__init__.py +25 -0
- github2gerrit/orchestrator/reconciliation.py +589 -0
- github2gerrit/pr_content_filter.py +65 -17
- github2gerrit/reconcile_matcher.py +595 -0
- github2gerrit/rich_display.py +502 -0
- github2gerrit/rich_logging.py +316 -0
- github2gerrit/similarity.py +65 -19
- github2gerrit/ssh_agent_setup.py +59 -22
- github2gerrit/ssh_common.py +30 -11
- github2gerrit/ssh_discovery.py +67 -20
- github2gerrit/trailers.py +340 -0
- github2gerrit/utils.py +6 -2
- {github2gerrit-0.1.10.dist-info → github2gerrit-0.1.11.dist-info}/METADATA +76 -24
- github2gerrit-0.1.11.dist-info/RECORD +31 -0
- {github2gerrit-0.1.10.dist-info → github2gerrit-0.1.11.dist-info}/WHEEL +1 -2
- github2gerrit-0.1.10.dist-info/RECORD +0 -24
- github2gerrit-0.1.10.dist-info/top_level.txt +0 -1
- {github2gerrit-0.1.10.dist-info → github2gerrit-0.1.11.dist-info}/entry_points.txt +0 -0
- {github2gerrit-0.1.10.dist-info → github2gerrit-0.1.11.dist-info}/licenses/LICENSE +0 -0
--- github2gerrit/duplicate_detection.py (0.1.10)
+++ github2gerrit/duplicate_detection.py (0.1.11)
@@ -10,7 +10,6 @@ submissions from automated tools like Dependabot.
 """
 
 import hashlib
-import json
 import logging
 import os
 import re
@@ -22,6 +21,7 @@ from datetime import UTC
 from datetime import datetime
 from datetime import timedelta
 from pathlib import Path
+from typing import Any
 
 from .gerrit_urls import create_gerrit_url_builder
 from .github_api import GhPullRequest
@@ -29,6 +29,7 @@ from .github_api import GhRepository
 from .github_api import build_client
 from .github_api import get_repo_from_env
 from .models import GitHubContext
+from .trailers import extract_github_metadata
 
 
 # Optional Gerrit REST API support
@@ -53,15 +54,23 @@ __all__ = [
 class DuplicateChangeError(Exception):
     """Raised when a duplicate change is detected."""
 
-    def __init__(self, message: str, existing_prs: list[int]) -> None:
+    def __init__(
+        self,
+        message: str,
+        existing_prs: list[int],
+        urls: list[str] | None = None,
+    ) -> None:
         super().__init__(message)
         self.existing_prs = existing_prs
+        self.urls = urls or []
 
 
 class ChangeFingerprint:
     """Represents a fingerprint of a change for duplicate detection."""
 
-    def __init__(self, title: str, body: str = "", files_changed: list[str] | None = None):
+    def __init__(
+        self, title: str, body: str = "", files_changed: list[str] | None = None
+    ):
         self.title = title.strip()
         self.body = (body or "").strip()
         self.files_changed = sorted(files_changed or [])
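`DuplicateChangeError` now optionally carries the URLs of the matching Gerrit changes alongside their numbers. A minimal sketch of how a caller can consume the new attribute (the class body is taken from the hunk above; the handler and URL are illustrative):

```python
class DuplicateChangeError(Exception):
    """Raised when a duplicate change is detected."""

    def __init__(
        self,
        message: str,
        existing_prs: list[int],
        urls: list[str] | None = None,
    ) -> None:
        super().__init__(message)
        self.existing_prs = existing_prs
        self.urls = urls or []


try:
    raise DuplicateChangeError(
        "subject matches existing Gerrit change(s)",
        [12345],
        ["https://gerrit.example.org/c/project/+/12345"],  # illustrative URL
    )
except DuplicateChangeError as exc:
    # The new .urls attribute lets callers print direct links to the
    # existing changes instead of bare change numbers.
    for url in exc.urls:
        print(f"Existing change: {url}")
```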
@@ -102,10 +111,15 @@ class ChangeFingerprint:
 
     def _compute_content_hash(self) -> str:
         """Compute a hash of the change content."""
-        content = f"{self._normalized_title}\n{self.body}\n{','.join(self.files_changed)}"
+        content = (
+            f"{self._normalized_title}\n{self.body}\n"
+            f"{','.join(self.files_changed)}"
+        )
         return hashlib.sha256(content.encode("utf-8")).hexdigest()[:16]
 
-    def is_similar_to(self, other: "ChangeFingerprint", similarity_threshold: float = 0.8) -> bool:
+    def is_similar_to(
+        self, other: "ChangeFingerprint", similarity_threshold: float = 0.8
+    ) -> bool:
         """Check if this fingerprint is similar to another."""
         # Exact normalized title match
         if self._normalized_title == other._normalized_title:
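The fingerprint hash is the first 16 hex characters of a SHA-256 over the normalized title, body, and sorted file list, exactly as the wrapped `content` expression shows. A standalone sketch with made-up field values:

```python
import hashlib

# Illustrative values; in the class these come from the PR being checked.
normalized_title = "bump example-lib from x.y.z to x.y.z"
body = "Bumps example-lib to the latest release."
files_changed = sorted(["requirements.txt"])

content = (
    f"{normalized_title}\n{body}\n"
    f"{','.join(files_changed)}"
)
# 16 hex chars keep the fingerprint short while remaining collision-resistant
# enough for duplicate detection within a bounded lookback window.
print(hashlib.sha256(content.encode("utf-8")).hexdigest()[:16])
```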
@@ -128,7 +142,9 @@ class ChangeFingerprint:
         # Check title similarity even without file changes
         return self._titles_similar(other, similarity_threshold)
 
-    def _titles_similar(self, other: "ChangeFingerprint", threshold: float) -> bool:
+    def _titles_similar(
+        self, other: "ChangeFingerprint", threshold: float
+    ) -> bool:
         """Check if titles are similar using simple string similarity."""
         title1 = self._normalized_title
         title2 = other._normalized_title
@@ -149,7 +165,10 @@ class ChangeFingerprint:
         return (intersection / union) >= threshold
 
     def __str__(self) -> str:
-        return f"ChangeFingerprint(title='{self.title[:50]}...', hash={self._content_hash})"
+        return (
+            f"ChangeFingerprint(title='{self.title[:50]}...', "
+            f"hash={self._content_hash})"
+        )
 
 
 class DuplicateDetector:
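`_titles_similar` returns `(intersection / union) >= threshold`, i.e. a Jaccard ratio; the hunk does not show how the two sets are built, so the word-set construction below is an assumption based on that return expression:

```python
def titles_similar(title1: str, title2: str, threshold: float = 0.8) -> bool:
    # Assumed word-set Jaccard similarity, matching the
    # `(intersection / union) >= threshold` return seen in the diff.
    words1, words2 = set(title1.split()), set(title2.split())
    if not words1 or not words2:
        return False
    intersection = len(words1 & words2)
    union = len(words1 | words2)
    return (intersection / union) >= threshold

# 4 shared words out of 5 total -> 0.8, which meets the default threshold.
print(titles_similar("bump foo to x.y.z", "bump foo to x.y.z latest"))
```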
@@ -171,7 +190,9 @@ class DuplicateDetector:
         match = re.search(pattern, text)
         return match.group(1) if match else ""
 
-    def _resolve_gerrit_info_from_env_or_gitreview(self, gh: GitHubContext) -> tuple[str, str] | None:
+    def _resolve_gerrit_info_from_env_or_gitreview(
+        self, gh: GitHubContext
+    ) -> tuple[str, str] | None:
         """Resolve Gerrit host and project from environment or .gitreview file.
 
         Returns:
@@ -220,7 +241,10 @@ class DuplicateDetector:
             url = f"https://raw.githubusercontent.com/{repo_full}/{branch}/.gitreview"
 
             parsed = urllib.parse.urlparse(url)
-            if parsed.scheme != "https" or parsed.netloc != "raw.githubusercontent.com":
+            if (
+                parsed.scheme != "https"
+                or parsed.netloc != "raw.githubusercontent.com"
+            ):
                 continue
 
             try:
@@ -228,8 +252,12 @@ class DuplicateDetector:
                 with urllib.request.urlopen(url, timeout=5) as resp:
                     text_remote = resp.read().decode("utf-8")
 
-                host = self._match_first_group(r"(?m)^host=(.+)$", text_remote)
-                proj = self._match_first_group(r"(?m)^project=(.+)$", text_remote)
+                host = self._match_first_group(
+                    r"(?m)^host=(.+)$", text_remote
+                )
+                proj = self._match_first_group(
+                    r"(?m)^project=(.+)$", text_remote
+                )
 
                 if host and proj:
                     project = proj.removesuffix(".git")
@@ -238,7 +266,9 @@ class DuplicateDetector:
                     return (host.strip(), "")
 
             except Exception as exc:
-                log.debug("Failed to fetch .gitreview from %s: %s", url, exc)
+                log.debug(
+                    "Failed to fetch .gitreview from %s: %s", url, exc
+                )
                 continue
 
         except Exception as exc:
@@ -246,31 +276,24 @@ class DuplicateDetector:
 
         return None
 
-    def _build_gerrit_rest_client(self, gerrit_host: str) ->
-        """Build a Gerrit REST API client
-
-        log.debug("pygerrit2 not available, skipping Gerrit duplicate check")
-        return None
+    def _build_gerrit_rest_client(self, gerrit_host: str) -> Any | None:
+        """Build a Gerrit REST API client using centralized framework."""
+        from .gerrit_rest import build_client_for_host
 
-
-
-
-
-        http_user = os.getenv("GERRIT_HTTP_USER", "").strip() or os.getenv("GERRIT_SSH_USER_G2G", "").strip()
+        http_user = (
+            os.getenv("GERRIT_HTTP_USER", "").strip()
+            or os.getenv("GERRIT_SSH_USER_G2G", "").strip()
+        )
         http_pass = os.getenv("GERRIT_HTTP_PASSWORD", "").strip()
 
         try:
-
-
-
-
-
-
-
-            )
-            else:
-                # Type ignore needed for dynamic import returning Any
-                return GerritRestAPI(url=base_url)  # type: ignore[no-any-return]
+            return build_client_for_host(
+                gerrit_host,
+                timeout=8.0,
+                max_attempts=3,
+                http_user=http_user or None,
+                http_password=http_pass or None,
+            )
         except Exception as exc:
             log.debug("Failed to create Gerrit REST client: %s", exc)
             return None
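The old inline pygerrit2/urllib plumbing gives way to `build_client_for_host` from the reworked `gerrit_rest.py` (also touched in this release). A hedged usage sketch with the same keyword arguments the diff passes; the host and query path are placeholders:

```python
from github2gerrit.gerrit_rest import build_client_for_host

# Keyword arguments as used in the hunk above; timeout and max_attempts
# feed the centralized retry framework (see external_api.py in this release).
client = build_client_for_host(
    "gerrit.example.org",  # placeholder host
    timeout=8.0,
    max_attempts=3,
    http_user=None,      # None falls back to anonymous queries
    http_password=None,
)
# The detector issues query paths like this one via client.get(...).
changes = client.get("/changes/?q=status:open&n=5")
```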
@@ -299,7 +322,9 @@ class DuplicateDetector:
         hash_bytes = hashlib.sha256(hash_input.encode("utf-8")).digest()
         hash_hex = hash_bytes.hex()[:16]
 
-        log.debug("Generated GitHub change hash for %s: %s", hash_input, hash_hex)
+        log.debug(
+            "Generated GitHub change hash for %s: %s", hash_input, hash_hex
+        )
         return hash_hex
 
     def check_for_duplicates(
@@ -307,25 +332,41 @@ class DuplicateDetector:
         target_pr: GhPullRequest,
         allow_duplicates: bool = False,
         gh: GitHubContext | None = None,
+        expected_github_hash: str | None = None,
     ) -> None:
-        """Check if the target PR is a duplicate via
+        """Check if the target PR is a duplicate via trailer-aware and subject
+        equality against Gerrit.
 
-        Implements a robust, dependency-free
+        Implements a robust, dependency-free duplicate check with trailer
+        awareness:
+        - First check for existing changes with matching GitHub-Hash trailer
+          (short-circuit)
         - Resolve Gerrit host/project from env or .gitreview
-        - Query Gerrit changes updated within the lookback window (excluding abandoned)
+        - Query Gerrit changes updated within the lookback window (excluding
+          abandoned)
         - Compare normalized subjects (first line) for exact equality
         - If any match, treat as duplicate and either warn or raise
+
+        Args:
+            target_pr: The GitHub PR to check for duplicates
+            allow_duplicates: If True, log warnings instead of raising errors
+            gh: GitHub context for resolving Gerrit configuration
+            expected_github_hash: The GitHub-Hash trailer value expected for
+                this PR
         """
         pr_number = getattr(target_pr, "number", 0)
         pr_title = (getattr(target_pr, "title", "") or "").strip()
 
         log.debug(
-            "Checking PR #%d for duplicates via subject equality against Gerrit",
+            "Checking PR #%d for duplicates via subject equality against "
+            "Gerrit",
             pr_number,
         )
 
         if not pr_title:
-            log.debug("PR #%d has empty title; skipping duplicate check", pr_number)
+            log.debug(
+                "PR #%d has empty title; skipping duplicate check", pr_number
+            )
             return
         if gh is None:
             log.debug("No GitHub context provided; skipping duplicate check")
@@ -334,7 +375,10 @@ class DuplicateDetector:
         # Resolve Gerrit target (host/project)
         gerrit_info = self._resolve_gerrit_info_from_env_or_gitreview(gh)
         if not gerrit_info:
-            log.debug("Unable to resolve Gerrit host/project; skipping duplicate check")
+            log.debug(
+                "Unable to resolve Gerrit host/project; skipping duplicate "
+                "check"
+            )
             return
         gerrit_host, gerrit_project = gerrit_info
 
@@ -342,16 +386,21 @@ class DuplicateDetector:
         def _normalize_subject(title: str) -> str:
             normalized = title.strip()
             normalized = re.sub(
-                r"^(feat|fix|docs|style|refactor|test|chore|ci|build|perf)(\(.+?\))?: ",
+                r"^(feat|fix|docs|style|refactor|test|chore|ci|build|perf)"
+                r"(\(.+?\))?: ",
                 "",
                 normalized,
                 flags=re.IGNORECASE,
             )
             normalized = re.sub(r"[*_`]", "", normalized)
-            normalized = re.sub(r"\bv\d+(\.\d+)*(-\w+)?\b", "vx.y.z", normalized)
+            normalized = re.sub(
+                r"\bv\d+(\.\d+)*(-\w+)?\b", "vx.y.z", normalized
+            )
             normalized = re.sub(r"\b\d+(\.\d+)+(-\w+)?\b", "x.y.z", normalized)
             normalized = re.sub(r"\b\d+\.\d+\b", "x.y.z", normalized)
-            normalized = re.sub(r"\b[a-f0-9]{7,40}\b", "commit_hash", normalized)
+            normalized = re.sub(
+                r"\b[a-f0-9]{7,40}\b", "commit_hash", normalized
+            )
             normalized = re.sub(r"\s+", " ", normalized).strip()
             return normalized.lower()
 
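Chained together, the normalization regexes strip conventional-commit prefixes and markdown markers, mask versions and commit hashes, and collapse whitespace, so version-only rebumps normalize to identical subjects. A standalone restatement of the steps from this hunk:

```python
import re

def normalize_subject(title: str) -> str:
    normalized = title.strip()
    normalized = re.sub(
        r"^(feat|fix|docs|style|refactor|test|chore|ci|build|perf)"
        r"(\(.+?\))?: ",
        "",
        normalized,
        flags=re.IGNORECASE,
    )
    normalized = re.sub(r"[*_`]", "", normalized)
    normalized = re.sub(r"\bv\d+(\.\d+)*(-\w+)?\b", "vx.y.z", normalized)
    normalized = re.sub(r"\b\d+(\.\d+)+(-\w+)?\b", "x.y.z", normalized)
    normalized = re.sub(r"\b\d+\.\d+\b", "x.y.z", normalized)
    normalized = re.sub(r"\b[a-f0-9]{7,40}\b", "commit_hash", normalized)
    normalized = re.sub(r"\s+", " ", normalized).strip()
    return normalized.lower()

# Both Dependabot-style bumps collapse to "bump foo from x.y.z to x.y.z":
print(normalize_subject("chore(deps): Bump foo from 1.2.3 to 1.2.4"))
print(normalize_subject("chore(deps): Bump foo from 1.2.4 to 1.3.0"))
```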
@@ -363,7 +412,6 @@ class DuplicateDetector:
 
         # Build Gerrit REST URL using centralized URL builder
         url_builder = create_gerrit_url_builder(gerrit_host)
-        api_base = url_builder.api_url().rstrip("/")
 
         # Track which base path actually works for constructing display URLs
         successful_base_path = url_builder.base_path
@@ -393,64 +441,99 @@ class DuplicateDetector:
         query = " ".join(q_parts)
         encoded_q = urllib.parse.quote(query, safe="")
 
-
-        # Use a modest page size
-        url = f"{api_base}/changes/?q={encoded_q}&n=50&o=CURRENT_COMMIT&o=CURRENT_FILES"
-
-        def _load_gerrit_json(url_: str) -> list[dict[str, object]]:
+        def _load_gerrit_json(query_path: str) -> list[dict[str, object]]:
             try:
-
-
-
-
-
+                # Use centralized client that handles base path and auth
+                client = self._build_gerrit_rest_client(gerrit_host)
+                if client is None:
+                    log.debug(
+                        "Gerrit client not available; skipping duplicate check"
+                    )
                     return []
-
-
-
-                if http_user and http_pass:
-                    import base64 as _b64  # localized import to avoid global import edit
-
-                    basic = _b64.b64encode(f"{http_user}:{http_pass}".encode()).decode("ascii")
-                    headers["Authorization"] = f"Basic {basic}"
-                req = urllib.request.Request(url_, headers=headers)
-                with urllib.request.urlopen(req, timeout=8) as resp:
-                    raw = resp.read().decode("utf-8", errors="replace")
-                # Strip Gerrit's XSSI prefix if present
-                if raw.startswith(")]}'"):
-                    raw = raw.split("\n", 1)[1] if "\n" in raw else ""
-                data = json.loads(raw or "[]")
+
+                log.debug("Querying Gerrit for duplicates: %s", query_path)
+                data = client.get(query_path)
                 if isinstance(data, list):
                     return data
                 else:
                     return []
-            except urllib.error.HTTPError as exc:
-                log.debug("Gerrit query failed for %s: %s", url_, exc)
-                return []
             except Exception as exc:
-                log.debug("Gerrit query failed for %s: %s", url_, exc)
+                log.debug("Gerrit query failed for %s: %s", query_path, exc)
                 return []
 
+        # Build query path for centralized client
+        query_path = (
+            f"/changes/?q={encoded_q}&n=50&o=CURRENT_COMMIT&o=CURRENT_FILES"
+        )
+
         log.debug(
-            "Gerrit duplicate query: host=%s project=%s filter=%s cutoff=%s
+            "Gerrit duplicate query: host=%s project=%s filter=%s cutoff=%s "
+            "path=%s",
             gerrit_host,
             gerrit_project or "(any)",
             dup_filter,
             cutoff_date,
-
+            query_path,
         )
-        changes = _load_gerrit_json(url)
+        changes = _load_gerrit_json(query_path)
         log.debug(
-            "Gerrit query returned %d change(s) for project=%s filter=%s after=%s",
+            "Gerrit query returned %d change(s) for project=%s filter=%s "
+            "after=%s",
             len(changes),
             gerrit_project or "(any)",
             dup_filter,
             cutoff_date,
         )
         if changes:
-            sample_subjects = ", ".join(str(c.get("subject") or "")[:60] for c in changes[:5])
+            sample_subjects = ", ".join(
+                str(c.get("subject") or "")[:60] for c in changes[:5]
+            )
             log.debug("Sample subjects: %s", sample_subjects)
 
+        # First pass: Check for trailer-based matches (GitHub-Hash)
+        if expected_github_hash:
+            log.debug(
+                "Checking for GitHub-Hash trailer matches: %s",
+                expected_github_hash,
+            )
+            trailer_matches: list[tuple[int, str]] = []
+
+            for c in changes:
+                # Extract commit message and check for GitHub trailers
+                rev = str(c.get("current_revision") or "")
+                revs_obj = c.get("revisions")
+                revs = revs_obj if isinstance(revs_obj, dict) else {}
+                cur_obj = revs.get(rev)
+                cur = cur_obj if isinstance(cur_obj, dict) else {}
+                commit = cur.get("commit") or {}
+                msg = str(commit.get("message") or "")
+
+                if msg:
+                    github_metadata = extract_github_metadata(msg)
+                    change_github_hash = github_metadata.get("GitHub-Hash", "")
+
+                    if change_github_hash == expected_github_hash:
+                        num = c.get("_number")
+                        proj = str(c.get("project") or gerrit_project or "")
+                        if isinstance(num, int):
+                            trailer_matches.append((num, proj))
+                            log.debug(
+                                "Found GitHub-Hash trailer match: change %d, "
+                                "hash %s",
+                                num,
+                                change_github_hash,
+                            )
+
+            if trailer_matches:
+                log.debug(
+                    "Found %d change(s) with matching GitHub-Hash trailer - "
+                    "treating as update targets",
+                    len(trailer_matches),
+                )
+                # These are update targets, not duplicates - allow them to
+                # proceed
+                return
+
         # Compare normalized subjects for exact equality
         matched: list[tuple[int, str]] = []
         for c in changes:
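The new first pass asks whether any candidate change already carries the PR's `GitHub-Hash` trailer; a match means the change is an update target for the same PR rather than a duplicate. A sketch of the idea, with a hand-rolled trailer lookup standing in for `trailers.extract_github_metadata` (whose internals are not shown in this diff):

```python
import re

COMMIT_MESSAGE = """Bump foo from x.y.z to x.y.z

Change-Id: I0123456789abcdef0123456789abcdef01234567
GitHub-Hash: 1a2b3c4d5e6f7a8b
"""

def extract_trailer(message: str, key: str) -> str:
    # Stand-in for github2gerrit.trailers.extract_github_metadata:
    # find a "Key: value" trailer line in the commit message.
    match = re.search(rf"(?m)^{re.escape(key)}: (.+)$", message)
    return match.group(1).strip() if match else ""

expected_github_hash = "1a2b3c4d5e6f7a8b"  # made-up example value
if extract_trailer(COMMIT_MESSAGE, "GitHub-Hash") == expected_github_hash:
    # Matching hash: an update target for the same PR, not a duplicate,
    # so the duplicate check short-circuits and lets the update proceed.
    print("update target, skip duplicate error")
```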
@@ -464,8 +547,11 @@ class DuplicateDetector:
                     matched.append((num, proj))
 
         if not matched:
-            # No exact subject match; proceed with similarity scoring across candidates
-            log.debug("No exact-subject matches found; entering similarity scoring")
+            # No exact subject match; proceed with similarity scoring across
+            # candidates
+            log.debug(
+                "No exact-subject matches found; entering similarity scoring"
+            )
             from .similarity import ScoringConfig
             from .similarity import aggregate_scores
             from .similarity import remove_commit_trailers
@@ -488,7 +574,8 @@ class DuplicateDetector:
                         if fname:
                             src_files.append(str(fname))
             except Exception as exc:
-                # Best-effort; if files cannot be retrieved, proceed without them
+                # Best-effort; if files cannot be retrieved, proceed without
+                # them
                 log.debug("Failed to retrieve PR files for scoring: %s", exc)
 
             best_score = 0.0
@@ -499,7 +586,8 @@ class DuplicateDetector:
                 subj = str(c.get("subject") or "").strip()
                 if not subj:
                     continue
-                # Extract commit message and files from revisions (CURRENT_COMMIT, CURRENT_FILES)
+                # Extract commit message and files from revisions
+                # (CURRENT_COMMIT, CURRENT_FILES)
                 rev = str(c.get("current_revision") or "")
                 revs_obj = c.get("revisions")
                 revs = revs_obj if isinstance(revs_obj, dict) else {}
@@ -512,7 +600,11 @@ class DuplicateDetector:
                 cand_body_raw = msg.split("\n", 1)[1]
                 cand_body = remove_commit_trailers(cand_body_raw)
                 files_dict = cur.get("files") or {}
-                cand_files = [p for p in files_dict if isinstance(p, str) and not p.startswith("/")]
+                cand_files = [
+                    p
+                    for p in files_dict
+                    if isinstance(p, str) and not p.startswith("/")
+                ]
 
                 # Compute component scores
                 s_res = score_subjects(src_subjects, subj)
@@ -524,7 +616,9 @@ class DuplicateDetector:
                 b_res = score_bodies(src_body, cand_body)
 
                 # Aggregate
-                agg = aggregate_scores(s_res.score, f_res.score, b_res.score, config=config)
+                agg = aggregate_scores(
+                    s_res.score, f_res.score, b_res.score, config=config
+                )
                 log.debug(
                     "Aggregate score computed: %.2f (s=%.2f f=%.2f b=%.2f)",
                     agg,
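`aggregate_scores` folds the subject, file, and body scores into one value that is later compared against `config.similarity_threshold`. The weights live in `similarity.py` and are not shown in this diff, so the sketch below assumes a simple weighted mean:

```python
from dataclasses import dataclass

@dataclass
class ScoringConfig:
    # Assumed shape; the real similarity.ScoringConfig at least carries
    # similarity_threshold, which the diff uses as the duplicate cutoff.
    subject_weight: float = 0.5
    files_weight: float = 0.3
    body_weight: float = 0.2
    similarity_threshold: float = 0.8

def aggregate_scores(
    s: float, f: float, b: float, config: ScoringConfig
) -> float:
    # Hypothetical weighted mean standing in for similarity.aggregate_scores.
    return (
        s * config.subject_weight
        + f * config.files_weight
        + b * config.body_weight
    )

config = ScoringConfig()
agg = aggregate_scores(0.95, 0.8, 0.6, config)
print(agg >= config.similarity_threshold)  # candidate flagged if True
```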
@@ -533,20 +627,24 @@ class DuplicateDetector:
                     b_res.score,
                 )
 
-                # Build candidate reference and number using successful base path
+                # Build candidate reference and number using successful base
+                # path
                 num_obj = c.get("_number")
                 num = int(num_obj) if isinstance(num_obj, int) else None
                 proj = str(c.get("project") or gerrit_project or "")
 
                 # Use the base path that actually worked for API calls
-                display_url_builder = create_gerrit_url_builder(gerrit_host, successful_base_path)
+                display_url_builder = create_gerrit_url_builder(
+                    gerrit_host, successful_base_path
+                )
                 ref = (
                     display_url_builder.change_url(proj, num)
                     if proj and isinstance(num, int)
                     else (f"change {num}" if isinstance(num, int) else "")
                 )
                 log.debug(
-                    "Scoring candidate: ref=%s agg=%.2f (s=%.2f f=%.2f b=%.2f) subj='%s'",
+                    "Scoring candidate: ref=%s agg=%.2f (s=%.2f f=%.2f b=%.2f) "
+                    "subj='%s'",
                     ref or "(none)",
                     agg,
                     s_res.score,
@@ -559,7 +657,11 @@ class DuplicateDetector:
                 if agg > best_score:
                     best_score = agg
                     # Deduplicate reasons preserving order
-                    best_reasons = list(dict.fromkeys(s_res.reasons + f_res.reasons + b_res.reasons))
+                    best_reasons = list(
+                        dict.fromkeys(
+                            s_res.reasons + f_res.reasons + b_res.reasons
+                        )
+                    )
 
                 # Collect all candidates above threshold
                 if agg >= config.similarity_threshold and ref:
@@ -579,7 +681,10 @@ class DuplicateDetector:
             for s, u, _ in hits_sorted:
                 if u:
                     log.info("Score: %.2f URL: %s", s, u)
-            msg = f"Similar Gerrit change(s) detected [≥ {config.similarity_threshold:.2f}]"
+            msg = (
+                f"Similar Gerrit change(s) detected "
+                f"[≥ {config.similarity_threshold:.2f}]"
+            )
             if best_reasons:
                 msg += f" (Reasons: {', '.join(best_reasons)})"
             if allow_duplicates:
@@ -590,34 +695,45 @@ class DuplicateDetector:
         # Construct human-friendly references for logs
         matching_numbers: list[int] = []
         match_lines: list[str] = []
+        duplicate_urls: list[str] = []
         for n, proj in matched:
             if proj:
                 # Use the base path that actually worked for API calls
-                display_url_builder = create_gerrit_url_builder(gerrit_host, successful_base_path)
+                display_url_builder = create_gerrit_url_builder(
+                    gerrit_host, successful_base_path
+                )
                 url = display_url_builder.change_url(proj, n)
                 match_lines.append(f"Score: 1.0 URL: {url}")
-
+                duplicate_urls.append(url)
+                log.debug("Score: 1.0 URL: %s", url)
             else:
                 match_lines.append(f"Score: 1.0 URL: change {n}")
-
+                duplicate_urls.append(f"change {n}")
+                log.debug("Score: 1.0 URL: change %s", n)
             matching_numbers.append(n)
 
         if not matched:
-            log.debug("No exact subject matches and no similarity matches; duplicate check passes")
+            log.debug(
+                "No exact subject matches and no similarity matches; "
+                "duplicate check passes"
+            )
             return
 
         # Remove PR number from message since cli.py already includes it
         full_message = "subject matches existing Gerrit change(s)"
         if allow_duplicates:
-            log.
+            log.debug("GERRIT DUPLICATE DETECTED (allowed): %s", full_message)
             return
-        raise DuplicateChangeError(full_message, matching_numbers)
+        raise DuplicateChangeError(
+            full_message, matching_numbers, duplicate_urls
+        )
 
 
 def check_for_duplicates(
     gh: GitHubContext,
     allow_duplicates: bool = False,
     lookback_days: int = 7,
+    expected_github_hash: str | None = None,
 ) -> None:
     """Convenience function to check for duplicates.
 
@@ -625,6 +741,7 @@ def check_for_duplicates(
         gh: GitHub context containing PR information
         allow_duplicates: If True, only log warnings; if False, raise exception
         lookback_days: Number of days to look back for similar PRs
+        expected_github_hash: The GitHub-Hash trailer value expected for this PR
 
     Raises:
         DuplicateChangeError: If duplicates found and allow_duplicates=False
@@ -646,13 +763,20 @@ def check_for_duplicates(
             lookback_days=lookback_days,
             duplicates_filter=os.getenv("DUPLICATE_TYPES", "open"),
         )
-        detector.check_for_duplicates(target_pr, allow_duplicates=allow_duplicates, gh=gh)
+        detector.check_for_duplicates(
+            target_pr,
+            allow_duplicates=allow_duplicates,
+            gh=gh,
+            expected_github_hash=expected_github_hash,
+        )
 
-        log.
+        log.debug("Duplicate check completed for PR #%d", gh.pr_number)
 
     except DuplicateChangeError:
         # Re-raise duplicate errors
         raise
     except Exception as exc:
-        log.warning("Duplicate detection failed for PR #%d: %s", gh.pr_number, exc)
+        log.warning(
+            "Duplicate detection failed for PR #%d: %s", gh.pr_number, exc
+        )
        # Don't fail the entire process if duplicate detection has issues