github2gerrit 0.1.10__py3-none-any.whl → 0.1.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- github2gerrit/cli.py +793 -198
- github2gerrit/commit_normalization.py +44 -15
- github2gerrit/config.py +76 -30
- github2gerrit/core.py +1571 -267
- github2gerrit/duplicate_detection.py +227 -113
- github2gerrit/external_api.py +76 -25
- github2gerrit/gerrit_query.py +286 -0
- github2gerrit/gerrit_rest.py +53 -18
- github2gerrit/gerrit_urls.py +90 -33
- github2gerrit/github_api.py +19 -6
- github2gerrit/gitutils.py +43 -14
- github2gerrit/mapping_comment.py +345 -0
- github2gerrit/models.py +15 -1
- github2gerrit/orchestrator/__init__.py +25 -0
- github2gerrit/orchestrator/reconciliation.py +589 -0
- github2gerrit/pr_content_filter.py +65 -17
- github2gerrit/reconcile_matcher.py +595 -0
- github2gerrit/rich_display.py +502 -0
- github2gerrit/rich_logging.py +316 -0
- github2gerrit/similarity.py +65 -19
- github2gerrit/ssh_agent_setup.py +59 -22
- github2gerrit/ssh_common.py +30 -11
- github2gerrit/ssh_discovery.py +67 -20
- github2gerrit/trailers.py +340 -0
- github2gerrit/utils.py +6 -2
- {github2gerrit-0.1.10.dist-info → github2gerrit-0.1.12.dist-info}/METADATA +76 -24
- github2gerrit-0.1.12.dist-info/RECORD +31 -0
- {github2gerrit-0.1.10.dist-info → github2gerrit-0.1.12.dist-info}/WHEEL +1 -2
- github2gerrit-0.1.10.dist-info/RECORD +0 -24
- github2gerrit-0.1.10.dist-info/top_level.txt +0 -1
- {github2gerrit-0.1.10.dist-info → github2gerrit-0.1.12.dist-info}/entry_points.txt +0 -0
- {github2gerrit-0.1.10.dist-info → github2gerrit-0.1.12.dist-info}/licenses/LICENSE +0 -0
The diff excerpt below is from `github2gerrit/duplicate_detection.py` (+227 −113).

```diff
@@ -10,7 +10,6 @@ submissions from automated tools like Dependabot.
 """
 
 import hashlib
-import json
 import logging
 import os
 import re
```
```diff
@@ -21,7 +20,7 @@ from collections.abc import Iterable
 from datetime import UTC
 from datetime import datetime
 from datetime import timedelta
-from
+from typing import Any
 
 from .gerrit_urls import create_gerrit_url_builder
 from .github_api import GhPullRequest
```
```diff
@@ -29,6 +28,7 @@ from .github_api import GhRepository
 from .github_api import build_client
 from .github_api import get_repo_from_env
 from .models import GitHubContext
+from .trailers import extract_github_metadata
 
 
 # Optional Gerrit REST API support
```
```diff
@@ -53,15 +53,23 @@ __all__ = [
 class DuplicateChangeError(Exception):
     """Raised when a duplicate change is detected."""
 
-    def __init__(
+    def __init__(
+        self,
+        message: str,
+        existing_prs: list[int],
+        urls: list[str] | None = None,
+    ) -> None:
         super().__init__(message)
         self.existing_prs = existing_prs
+        self.urls = urls or []
 
 
 class ChangeFingerprint:
     """Represents a fingerprint of a change for duplicate detection."""
 
-    def __init__(
+    def __init__(
+        self, title: str, body: str = "", files_changed: list[str] | None = None
+    ):
         self.title = title.strip()
         self.body = (body or "").strip()
         self.files_changed = sorted(files_changed or [])
```
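The widened constructor means the exception now carries display URLs alongside change numbers, so callers can surface actionable links. A minimal consumer sketch (the handler itself is illustrative, not part of the package):

```python
from github2gerrit.duplicate_detection import DuplicateChangeError

def report_duplicate(exc: DuplicateChangeError) -> None:
    # existing_prs holds the matched Gerrit change numbers;
    # urls is the new 0.1.12 field and defaults to [] when omitted.
    print(f"duplicate change detected: {exc}")
    for number, url in zip(exc.existing_prs, exc.urls):
        print(f"  change {number}: {url}")
```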
```diff
@@ -102,10 +110,15 @@ class ChangeFingerprint:
 
     def _compute_content_hash(self) -> str:
         """Compute a hash of the change content."""
-        content =
+        content = (
+            f"{self._normalized_title}\n{self.body}\n"
+            f"{','.join(self.files_changed)}"
+        )
         return hashlib.sha256(content.encode("utf-8")).hexdigest()[:16]
 
-    def is_similar_to(
+    def is_similar_to(
+        self, other: "ChangeFingerprint", similarity_threshold: float = 0.8
+    ) -> bool:
         """Check if this fingerprint is similar to another."""
         # Exact normalized title match
         if self._normalized_title == other._normalized_title:
```
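The reconstructed `content` string is the normalized title, the body, and the comma-joined sorted file list, hashed with SHA-256 and truncated to 16 hex characters. The same computation in isolation (sample values; the real method hashes the *normalized* title):

```python
import hashlib

title = "fix: handle empty commit body"
body = "Guard against PRs with no description."
files = sorted(["src/app.py", "tests/test_app.py"])

# Title, body, and file list joined exactly as in _compute_content_hash
content = f"{title}\n{body}\n{','.join(files)}"
print(hashlib.sha256(content.encode("utf-8")).hexdigest()[:16])
```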
```diff
@@ -128,7 +141,9 @@ class ChangeFingerprint:
         # Check title similarity even without file changes
         return self._titles_similar(other, similarity_threshold)
 
-    def _titles_similar(
+    def _titles_similar(
+        self, other: "ChangeFingerprint", threshold: float
+    ) -> bool:
         """Check if titles are similar using simple string similarity."""
         title1 = self._normalized_title
         title2 = other._normalized_title
```
```diff
@@ -149,7 +164,10 @@ class ChangeFingerprint:
         return (intersection / union) >= threshold
 
     def __str__(self) -> str:
-        return
+        return (
+            f"ChangeFingerprint(title='{self.title[:50]}...', "
+            f"hash={self._content_hash})"
+        )
 
 
 class DuplicateDetector:
```
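The `(intersection / union) >= threshold` context line above is a token-set Jaccard comparison. A self-contained sketch of the idea (the real method operates on already-normalized titles):

```python
def titles_similar(title1: str, title2: str, threshold: float = 0.8) -> bool:
    """Token-set Jaccard similarity: |A ∩ B| / |A ∪ B| >= threshold."""
    tokens1 = set(title1.lower().split())
    tokens2 = set(title2.lower().split())
    if not tokens1 or not tokens2:
        return False
    intersection = len(tokens1 & tokens2)
    union = len(tokens1 | tokens2)
    return (intersection / union) >= threshold

# Same words, different order: Jaccard score is 1.0
print(titles_similar("fix memory leak in parser", "fix parser memory leak"))
```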
```diff
@@ -171,7 +189,9 @@ class DuplicateDetector:
         match = re.search(pattern, text)
         return match.group(1) if match else ""
 
-    def _resolve_gerrit_info_from_env_or_gitreview(
+    def _resolve_gerrit_info_from_env_or_gitreview(
+        self, gh: GitHubContext
+    ) -> tuple[str, str] | None:
         """Resolve Gerrit host and project from environment or .gitreview file.
 
         Returns:
```
```diff
@@ -184,20 +204,11 @@ class DuplicateDetector:
         if gerrit_host and gerrit_project:
            return (gerrit_host, gerrit_project)
 
-        #
-
-
-
-
-            host = self._match_first_group(r"(?m)^host=(.+)$", text)
-            proj = self._match_first_group(r"(?m)^project=(.+)$", text)
-            if host and proj:
-                project = proj.removesuffix(".git")
-                return (host.strip(), project.strip())
-            if host and not proj:
-                return (host.strip(), "")
-        except Exception as exc:
-            log.debug("Failed to read local .gitreview: %s", exc)
+        # Skip local .gitreview check in composite action context
+        # The duplicate detection runs before workspace setup, so there's no
+        # reliable local .gitreview file to check. Instead, rely on environment
+        # variables or remote fetching.
+        log.debug("Skipping local .gitreview check (composite action context)")
 
         # Try to fetch .gitreview remotely (simplified version of core logic)
         try:
```
```diff
@@ -220,7 +231,10 @@ class DuplicateDetector:
             url = f"https://raw.githubusercontent.com/{repo_full}/{branch}/.gitreview"
 
             parsed = urllib.parse.urlparse(url)
-            if
+            if (
+                parsed.scheme != "https"
+                or parsed.netloc != "raw.githubusercontent.com"
+            ):
                 continue
 
             try:
```
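The reflowed condition keeps the same allowlist: only `https` URLs pointing at `raw.githubusercontent.com` are fetched. The check in isolation (the repository path is a placeholder; the real code interpolates `repo_full` and `branch`):

```python
import urllib.parse

url = "https://raw.githubusercontent.com/example-org/example-repo/master/.gitreview"
parsed = urllib.parse.urlparse(url)
# Fetch is allowed only for this exact scheme and host
ok = parsed.scheme == "https" and parsed.netloc == "raw.githubusercontent.com"
print(ok)  # True
```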
```diff
@@ -228,8 +242,12 @@ class DuplicateDetector:
                 with urllib.request.urlopen(url, timeout=5) as resp:
                     text_remote = resp.read().decode("utf-8")
 
-                host = self._match_first_group(
-
+                host = self._match_first_group(
+                    r"(?m)^host=(.+)$", text_remote
+                )
+                proj = self._match_first_group(
+                    r"(?m)^project=(.+)$", text_remote
+                )
 
                 if host and proj:
                     project = proj.removesuffix(".git")
```
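`.gitreview` files are small INI-style documents with `host=`, `port=`, and `project=` lines. The same regex extraction against sample content (hostname and project below are made up):

```python
import re

text_remote = """\
[gerrit]
host=gerrit.example.org
port=29418
project=releng/builder.git
"""

def match_first_group(pattern: str, text: str) -> str:
    m = re.search(pattern, text)
    return m.group(1) if m else ""

host = match_first_group(r"(?m)^host=(.+)$", text_remote).strip()
proj = match_first_group(r"(?m)^project=(.+)$", text_remote).strip()
print(host, proj.removesuffix(".git"))  # gerrit.example.org releng/builder
```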
```diff
@@ -238,7 +256,9 @@ class DuplicateDetector:
                     return (host.strip(), "")
 
             except Exception as exc:
-                log.debug(
+                log.debug(
+                    "Failed to fetch .gitreview from %s: %s", url, exc
+                )
                 continue
 
         except Exception as exc:
```
```diff
@@ -246,31 +266,24 @@ class DuplicateDetector:
 
         return None
 
-    def _build_gerrit_rest_client(self, gerrit_host: str) ->
-        """Build a Gerrit REST API client
-
-            log.debug("pygerrit2 not available, skipping Gerrit duplicate check")
-            return None
-
-        # Create centralized URL builder
-        url_builder = create_gerrit_url_builder(gerrit_host)
-        base_url = url_builder.api_url()
+    def _build_gerrit_rest_client(self, gerrit_host: str) -> Any | None:
+        """Build a Gerrit REST API client using centralized framework."""
+        from .gerrit_rest import build_client_for_host
 
-        http_user =
+        http_user = (
+            os.getenv("GERRIT_HTTP_USER", "").strip()
+            or os.getenv("GERRIT_SSH_USER_G2G", "").strip()
+        )
         http_pass = os.getenv("GERRIT_HTTP_PASSWORD", "").strip()
 
         try:
-
-
-
-
-
-
-
-            )
-            else:
-                # Type ignore needed for dynamic import returning Any
-                return GerritRestAPI(url=base_url)  # type: ignore[no-any-return]
+            return build_client_for_host(
+                gerrit_host,
+                timeout=8.0,
+                max_attempts=3,
+                http_user=http_user or None,
+                http_password=http_pass or None,
+            )
         except Exception as exc:
             log.debug("Failed to create Gerrit REST client: %s", exc)
             return None
```
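The refactor drops the inlined pygerrit2/urllib plumbing in favor of the shared `gerrit_rest` helper. A caller sketch; the call shape is lifted from the diff itself, the host is a placeholder, and the `client.get()` usage mirrors `_load_gerrit_json` later in this file:

```python
from github2gerrit.gerrit_rest import build_client_for_host

try:
    client = build_client_for_host(
        "gerrit.example.org",  # placeholder host
        timeout=8.0,
        max_attempts=3,
        http_user=None,        # None falls back to anonymous read-only queries
        http_password=None,
    )
    changes = client.get("/changes/?q=status:open&n=50")
except Exception:
    changes = []  # the detector degrades gracefully, as in the diff
```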
```diff
@@ -299,7 +312,9 @@ class DuplicateDetector:
         hash_bytes = hashlib.sha256(hash_input.encode("utf-8")).digest()
         hash_hex = hash_bytes.hex()[:16]
 
-        log.debug(
+        log.debug(
+            "Generated GitHub change hash for %s: %s", hash_input, hash_hex
+        )
         return hash_hex
 
     def check_for_duplicates(
```
```diff
@@ -307,25 +322,41 @@ class DuplicateDetector:
         target_pr: GhPullRequest,
         allow_duplicates: bool = False,
         gh: GitHubContext | None = None,
+        expected_github_hash: str | None = None,
     ) -> None:
-        """Check if the target PR is a duplicate via
+        """Check if the target PR is a duplicate via trailer-aware and subject
+        equality against Gerrit.
 
-        Implements a robust, dependency-free
+        Implements a robust, dependency-free duplicate check with trailer
+        awareness:
+        - First check for existing changes with matching GitHub-Hash trailer
+          (short-circuit)
         - Resolve Gerrit host/project from env or .gitreview
-        - Query Gerrit changes updated within the lookback window (excluding
+        - Query Gerrit changes updated within the lookback window (excluding
+          abandoned)
         - Compare normalized subjects (first line) for exact equality
         - If any match, treat as duplicate and either warn or raise
+
+        Args:
+            target_pr: The GitHub PR to check for duplicates
+            allow_duplicates: If True, log warnings instead of raising errors
+            gh: GitHub context for resolving Gerrit configuration
+            expected_github_hash: The GitHub-Hash trailer value expected for
+                this PR
         """
         pr_number = getattr(target_pr, "number", 0)
         pr_title = (getattr(target_pr, "title", "") or "").strip()
 
         log.debug(
-            "Checking PR #%d for duplicates via subject equality against
+            "Checking PR #%d for duplicates via subject equality against "
+            "Gerrit",
             pr_number,
         )
 
         if not pr_title:
-            log.debug(
+            log.debug(
+                "PR #%d has empty title; skipping duplicate check", pr_number
+            )
             return
         if gh is None:
             log.debug("No GitHub context provided; skipping duplicate check")
```
```diff
@@ -334,7 +365,10 @@ class DuplicateDetector:
         # Resolve Gerrit target (host/project)
         gerrit_info = self._resolve_gerrit_info_from_env_or_gitreview(gh)
         if not gerrit_info:
-            log.debug(
+            log.debug(
+                "Unable to resolve Gerrit host/project; skipping duplicate "
+                "check"
+            )
             return
         gerrit_host, gerrit_project = gerrit_info
 
```
```diff
@@ -342,16 +376,21 @@ class DuplicateDetector:
         def _normalize_subject(title: str) -> str:
             normalized = title.strip()
             normalized = re.sub(
-                r"^(feat|fix|docs|style|refactor|test|chore|ci|build|perf)
+                r"^(feat|fix|docs|style|refactor|test|chore|ci|build|perf)"
+                r"(\(.+?\))?: ",
                 "",
                 normalized,
                 flags=re.IGNORECASE,
             )
             normalized = re.sub(r"[*_`]", "", normalized)
-            normalized = re.sub(
+            normalized = re.sub(
+                r"\bv\d+(\.\d+)*(-\w+)?\b", "vx.y.z", normalized
+            )
             normalized = re.sub(r"\b\d+(\.\d+)+(-\w+)?\b", "x.y.z", normalized)
             normalized = re.sub(r"\b\d+\.\d+\b", "x.y.z", normalized)
-            normalized = re.sub(
+            normalized = re.sub(
+                r"\b[a-f0-9]{7,40}\b", "commit_hash", normalized
+            )
             normalized = re.sub(r"\s+", " ", normalized).strip()
             return normalized.lower()
 
```
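The normalization pipeline (regexes copied from the diff) strips conventional-commit prefixes and markdown markers, then replaces versions and commit hashes with stable tokens. That is what lets successive Dependabot bumps collide on the same normalized subject, as this sketch shows:

```python
import re

def normalize_subject(title: str) -> str:
    normalized = title.strip()
    # Drop a conventional-commit prefix such as "chore(deps): "
    normalized = re.sub(
        r"^(feat|fix|docs|style|refactor|test|chore|ci|build|perf)"
        r"(\(.+?\))?: ",
        "",
        normalized,
        flags=re.IGNORECASE,
    )
    normalized = re.sub(r"[*_`]", "", normalized)  # markdown markers
    normalized = re.sub(r"\bv\d+(\.\d+)*(-\w+)?\b", "vx.y.z", normalized)
    normalized = re.sub(r"\b\d+(\.\d+)+(-\w+)?\b", "x.y.z", normalized)
    normalized = re.sub(r"\b\d+\.\d+\b", "x.y.z", normalized)
    normalized = re.sub(r"\b[a-f0-9]{7,40}\b", "commit_hash", normalized)
    return re.sub(r"\s+", " ", normalized).strip().lower()

a = normalize_subject("chore(deps): bump foo from 1.2.3 to 1.2.4")
b = normalize_subject("chore(deps): bump foo from 1.2.4 to 1.2.5")
assert a == b == "bump foo from x.y.z to x.y.z"
```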
```diff
@@ -363,7 +402,6 @@ class DuplicateDetector:
 
         # Build Gerrit REST URL using centralized URL builder
         url_builder = create_gerrit_url_builder(gerrit_host)
-        api_base = url_builder.api_url().rstrip("/")
 
         # Track which base path actually works for constructing display URLs
         successful_base_path = url_builder.base_path
```
```diff
@@ -393,64 +431,99 @@ class DuplicateDetector:
         query = " ".join(q_parts)
         encoded_q = urllib.parse.quote(query, safe="")
 
-
-        # Use a modest page size
-        url = f"{api_base}/changes/?q={encoded_q}&n=50&o=CURRENT_COMMIT&o=CURRENT_FILES"
-
-        def _load_gerrit_json(url_: str) -> list[dict[str, object]]:
+        def _load_gerrit_json(query_path: str) -> list[dict[str, object]]:
             try:
-
-
-
-
-
+                # Use centralized client that handles base path and auth
+                client = self._build_gerrit_rest_client(gerrit_host)
+                if client is None:
+                    log.debug(
+                        "Gerrit client not available; skipping duplicate check"
+                    )
                     return []
-
-
-
-                if http_user and http_pass:
-                    import base64 as _b64  # localized import to avoid global import edit
-
-                    basic = _b64.b64encode(f"{http_user}:{http_pass}".encode()).decode("ascii")
-                    headers["Authorization"] = f"Basic {basic}"
-                req = urllib.request.Request(url_, headers=headers)
-                with urllib.request.urlopen(req, timeout=8) as resp:
-                    raw = resp.read().decode("utf-8", errors="replace")
-                    # Strip Gerrit's XSSI prefix if present
-                    if raw.startswith(")]}'"):
-                        raw = raw.split("\n", 1)[1] if "\n" in raw else ""
-                data = json.loads(raw or "[]")
+
+                log.debug("Querying Gerrit for duplicates: %s", query_path)
+                data = client.get(query_path)
                 if isinstance(data, list):
                     return data
                 else:
                     return []
-            except urllib.error.HTTPError as exc:
-                log.debug("Gerrit query failed for %s: %s", url_, exc)
-                return []
             except Exception as exc:
-                log.debug("Gerrit query failed for %s: %s",
+                log.debug("Gerrit query failed for %s: %s", query_path, exc)
                 return []
 
+        # Build query path for centralized client
+        query_path = (
+            f"/changes/?q={encoded_q}&n=50&o=CURRENT_COMMIT&o=CURRENT_FILES"
+        )
+
         log.debug(
-            "Gerrit duplicate query: host=%s project=%s filter=%s cutoff=%s
+            "Gerrit duplicate query: host=%s project=%s filter=%s cutoff=%s "
+            "path=%s",
             gerrit_host,
             gerrit_project or "(any)",
             dup_filter,
             cutoff_date,
-
+            query_path,
         )
-        changes = _load_gerrit_json(
+        changes = _load_gerrit_json(query_path)
         log.debug(
-            "Gerrit query returned %d change(s) for project=%s filter=%s
+            "Gerrit query returned %d change(s) for project=%s filter=%s "
+            "after=%s",
             len(changes),
             gerrit_project or "(any)",
             dup_filter,
             cutoff_date,
         )
         if changes:
-            sample_subjects = ", ".join(
+            sample_subjects = ", ".join(
+                str(c.get("subject") or "")[:60] for c in changes[:5]
+            )
             log.debug("Sample subjects: %s", sample_subjects)
 
+        # First pass: Check for trailer-based matches (GitHub-Hash)
+        if expected_github_hash:
+            log.debug(
+                "Checking for GitHub-Hash trailer matches: %s",
+                expected_github_hash,
+            )
+            trailer_matches: list[tuple[int, str]] = []
+
+            for c in changes:
+                # Extract commit message and check for GitHub trailers
+                rev = str(c.get("current_revision") or "")
+                revs_obj = c.get("revisions")
+                revs = revs_obj if isinstance(revs_obj, dict) else {}
+                cur_obj = revs.get(rev)
+                cur = cur_obj if isinstance(cur_obj, dict) else {}
+                commit = cur.get("commit") or {}
+                msg = str(commit.get("message") or "")
+
+                if msg:
+                    github_metadata = extract_github_metadata(msg)
+                    change_github_hash = github_metadata.get("GitHub-Hash", "")
+
+                    if change_github_hash == expected_github_hash:
+                        num = c.get("_number")
+                        proj = str(c.get("project") or gerrit_project or "")
+                        if isinstance(num, int):
+                            trailer_matches.append((num, proj))
+                            log.debug(
+                                "Found GitHub-Hash trailer match: change %d, "
+                                "hash %s",
+                                num,
+                                change_github_hash,
+                            )
+
+            if trailer_matches:
+                log.debug(
+                    "Found %d change(s) with matching GitHub-Hash trailer - "
+                    "treating as update targets",
+                    len(trailer_matches),
+                )
+                # These are update targets, not duplicates - allow them to
+                # proceed
+                return
+
         # Compare normalized subjects for exact equality
         matched: list[tuple[int, str]] = []
         for c in changes:
```
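`extract_github_metadata` lives in the new `trailers.py`, which this excerpt does not show; from its use here it evidently maps Git trailer keys to values. A hedged approximation, assuming conventional `Key: value` trailer lines in the final paragraph of the commit message:

```python
import re

def extract_trailers(message: str) -> dict[str, str]:
    # Approximation only: the real trailers.py may follow the stricter
    # `git interpret-trailers` rules.
    trailers: dict[str, str] = {}
    last_block = message.strip().split("\n\n")[-1]
    for line in last_block.splitlines():
        m = re.match(r"^([A-Za-z][A-Za-z0-9-]*):\s*(.+)$", line)
        if m:
            trailers[m.group(1)] = m.group(2).strip()
    return trailers

msg = (
    "Fix parser crash\n\n"
    "Longer body text.\n\n"
    "GitHub-PR: https://github.com/example-org/example-repo/pull/42\n"
    "GitHub-Hash: 1a2b3c4d5e6f7a8b\n"  # placeholder hash value
)
assert extract_trailers(msg)["GitHub-Hash"] == "1a2b3c4d5e6f7a8b"
```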
```diff
@@ -464,8 +537,11 @@ class DuplicateDetector:
                     matched.append((num, proj))
 
         if not matched:
-            # No exact subject match; proceed with similarity scoring across
-
+            # No exact subject match; proceed with similarity scoring across
+            # candidates
+            log.debug(
+                "No exact-subject matches found; entering similarity scoring"
+            )
             from .similarity import ScoringConfig
             from .similarity import aggregate_scores
             from .similarity import remove_commit_trailers
```
```diff
@@ -488,7 +564,8 @@ class DuplicateDetector:
                     if fname:
                         src_files.append(str(fname))
             except Exception as exc:
-                # Best-effort; if files cannot be retrieved, proceed without
+                # Best-effort; if files cannot be retrieved, proceed without
+                # them
                 log.debug("Failed to retrieve PR files for scoring: %s", exc)
 
             best_score = 0.0
```
```diff
@@ -499,7 +576,8 @@ class DuplicateDetector:
                 subj = str(c.get("subject") or "").strip()
                 if not subj:
                     continue
-                # Extract commit message and files from revisions
+                # Extract commit message and files from revisions
+                # (CURRENT_COMMIT, CURRENT_FILES)
                 rev = str(c.get("current_revision") or "")
                 revs_obj = c.get("revisions")
                 revs = revs_obj if isinstance(revs_obj, dict) else {}
```
```diff
@@ -512,7 +590,11 @@ class DuplicateDetector:
                 cand_body_raw = msg.split("\n", 1)[1]
                 cand_body = remove_commit_trailers(cand_body_raw)
                 files_dict = cur.get("files") or {}
-                cand_files = [
+                cand_files = [
+                    p
+                    for p in files_dict
+                    if isinstance(p, str) and not p.startswith("/")
+                ]
 
                 # Compute component scores
                 s_res = score_subjects(src_subjects, subj)
```
```diff
@@ -524,7 +606,9 @@ class DuplicateDetector:
                 b_res = score_bodies(src_body, cand_body)
 
                 # Aggregate
-                agg = aggregate_scores(
+                agg = aggregate_scores(
+                    s_res.score, f_res.score, b_res.score, config=config
+                )
                 log.debug(
                     "Aggregate score computed: %.2f (s=%.2f f=%.2f b=%.2f)",
                     agg,
```
```diff
@@ -533,20 +617,24 @@ class DuplicateDetector:
                     b_res.score,
                 )
 
-                # Build candidate reference and number using successful base
+                # Build candidate reference and number using successful base
+                # path
                 num_obj = c.get("_number")
                 num = int(num_obj) if isinstance(num_obj, int) else None
                 proj = str(c.get("project") or gerrit_project or "")
 
                 # Use the base path that actually worked for API calls
-                display_url_builder = create_gerrit_url_builder(
+                display_url_builder = create_gerrit_url_builder(
+                    gerrit_host, successful_base_path
+                )
                 ref = (
                     display_url_builder.change_url(proj, num)
                     if proj and isinstance(num, int)
                     else (f"change {num}" if isinstance(num, int) else "")
                 )
                 log.debug(
-                    "Scoring candidate: ref=%s agg=%.2f (s=%.2f f=%.2f b=%.2f)
+                    "Scoring candidate: ref=%s agg=%.2f (s=%.2f f=%.2f b=%.2f) "
+                    "subj='%s'",
                     ref or "(none)",
                     agg,
                     s_res.score,
```
```diff
@@ -559,7 +647,11 @@ class DuplicateDetector:
                 if agg > best_score:
                     best_score = agg
                     # Deduplicate reasons preserving order
-                    best_reasons = list(
+                    best_reasons = list(
+                        dict.fromkeys(
+                            s_res.reasons + f_res.reasons + b_res.reasons
+                        )
+                    )
 
                 # Collect all candidates above threshold
                 if agg >= config.similarity_threshold and ref:
```
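`list(dict.fromkeys(...))` is the standard order-preserving dedup idiom, since dict keys keep insertion order on Python 3.7+:

```python
reasons = ["subject match", "file overlap", "subject match", "body similarity"]
assert list(dict.fromkeys(reasons)) == [
    "subject match", "file overlap", "body similarity"
]
```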
```diff
@@ -579,7 +671,10 @@ class DuplicateDetector:
                 for s, u, _ in hits_sorted:
                     if u:
                         log.info("Score: %.2f URL: %s", s, u)
-                msg =
+                msg = (
+                    f"Similar Gerrit change(s) detected "
+                    f"[≥ {config.similarity_threshold:.2f}]"
+                )
                 if best_reasons:
                     msg += f" (Reasons: {', '.join(best_reasons)})"
                 if allow_duplicates:
```
```diff
@@ -590,34 +685,45 @@ class DuplicateDetector:
         # Construct human-friendly references for logs
         matching_numbers: list[int] = []
         match_lines: list[str] = []
+        duplicate_urls: list[str] = []
         for n, proj in matched:
             if proj:
                 # Use the base path that actually worked for API calls
-                display_url_builder = create_gerrit_url_builder(
+                display_url_builder = create_gerrit_url_builder(
+                    gerrit_host, successful_base_path
+                )
                 url = display_url_builder.change_url(proj, n)
                 match_lines.append(f"Score: 1.0 URL: {url}")
-
+                duplicate_urls.append(url)
+                log.debug("Score: 1.0 URL: %s", url)
             else:
                 match_lines.append(f"Score: 1.0 URL: change {n}")
-
+                duplicate_urls.append(f"change {n}")
+                log.debug("Score: 1.0 URL: change %s", n)
             matching_numbers.append(n)
 
         if not matched:
-            log.debug(
+            log.debug(
+                "No exact subject matches and no similarity matches; "
+                "duplicate check passes"
+            )
             return
 
         # Remove PR number from message since cli.py already includes it
         full_message = "subject matches existing Gerrit change(s)"
         if allow_duplicates:
-            log.
+            log.debug("GERRIT DUPLICATE DETECTED (allowed): %s", full_message)
             return
-        raise DuplicateChangeError(
+        raise DuplicateChangeError(
+            full_message, matching_numbers, duplicate_urls
+        )
 
 
 def check_for_duplicates(
     gh: GitHubContext,
     allow_duplicates: bool = False,
     lookback_days: int = 7,
+    expected_github_hash: str | None = None,
 ) -> None:
     """Convenience function to check for duplicates.
 
```
```diff
@@ -625,6 +731,7 @@ def check_for_duplicates(
         gh: GitHub context containing PR information
         allow_duplicates: If True, only log warnings; if False, raise exception
         lookback_days: Number of days to look back for similar PRs
+        expected_github_hash: The GitHub-Hash trailer value expected for this PR
 
     Raises:
         DuplicateChangeError: If duplicates found and allow_duplicates=False
```
```diff
@@ -646,13 +753,20 @@ def check_for_duplicates(
             lookback_days=lookback_days,
             duplicates_filter=os.getenv("DUPLICATE_TYPES", "open"),
         )
-        detector.check_for_duplicates(
+        detector.check_for_duplicates(
+            target_pr,
+            allow_duplicates=allow_duplicates,
+            gh=gh,
+            expected_github_hash=expected_github_hash,
+        )
 
-        log.
+        log.debug("Duplicate check completed for PR #%d", gh.pr_number)
 
     except DuplicateChangeError:
         # Re-raise duplicate errors
         raise
     except Exception as exc:
-        log.warning(
+        log.warning(
+            "Duplicate detection failed for PR #%d: %s", gh.pr_number, exc
+        )
         # Don't fail the entire process if duplicate detection has issues
```
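End to end, the module-level helper is now called with the extra keyword. A caller sketch; the keyword set matches the new signature, while the context object and hash value are placeholders:

```python
from github2gerrit.duplicate_detection import (
    DuplicateChangeError,
    check_for_duplicates,
)

gh = ...  # a populated GitHubContext, built elsewhere from the Actions environment

try:
    check_for_duplicates(
        gh,
        allow_duplicates=False,
        lookback_days=7,
        expected_github_hash="1a2b3c4d5e6f7a8b",  # placeholder trailer value
    )
except DuplicateChangeError as exc:
    print(f"blocked: {exc} -> {exc.urls}")
```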