github2gerrit 0.1.5__py3-none-any.whl → 0.1.7__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -10,16 +10,20 @@ submissions from automated tools like Dependabot.
10
10
  """
11
11
 
12
12
  import hashlib
13
+ import json
13
14
  import logging
14
15
  import os
15
16
  import re
17
+ import urllib.error
16
18
  import urllib.parse
17
19
  import urllib.request
20
+ from collections.abc import Iterable
18
21
  from datetime import UTC
19
22
  from datetime import datetime
20
23
  from datetime import timedelta
21
24
  from pathlib import Path
22
25
 
26
+ from .gerrit_urls import create_gerrit_url_builder
23
27
  from .github_api import GhPullRequest
24
28
  from .github_api import GhRepository
25
29
  from .github_api import build_client
@@ -57,9 +61,7 @@ class DuplicateChangeError(Exception):
57
61
  class ChangeFingerprint:
58
62
  """Represents a fingerprint of a change for duplicate detection."""
59
63
 
60
- def __init__(
61
- self, title: str, body: str = "", files_changed: list[str] | None = None
62
- ):
64
+ def __init__(self, title: str, body: str = "", files_changed: list[str] | None = None):
63
65
  self.title = title.strip()
64
66
  self.body = (body or "").strip()
65
67
  self.files_changed = sorted(files_changed or [])
@@ -100,15 +102,10 @@ class ChangeFingerprint:
100
102
 
101
103
  def _compute_content_hash(self) -> str:
102
104
  """Compute a hash of the change content."""
103
- content = (
104
- f"{self._normalized_title}\n{self.body}\n"
105
- f"{','.join(self.files_changed)}"
106
- )
105
+ content = f"{self._normalized_title}\n{self.body}\n{','.join(self.files_changed)}"
107
106
  return hashlib.sha256(content.encode("utf-8")).hexdigest()[:16]
108
107
 
109
- def is_similar_to(
110
- self, other: "ChangeFingerprint", similarity_threshold: float = 0.8
111
- ) -> bool:
108
+ def is_similar_to(self, other: "ChangeFingerprint", similarity_threshold: float = 0.8) -> bool:
112
109
  """Check if this fingerprint is similar to another."""
113
110
  # Exact normalized title match
114
111
  if self._normalized_title == other._normalized_title:
@@ -131,9 +128,7 @@ class ChangeFingerprint:
131
128
  # Check title similarity even without file changes
132
129
  return self._titles_similar(other, similarity_threshold)
133
130
 
134
- def _titles_similar(
135
- self, other: "ChangeFingerprint", threshold: float
136
- ) -> bool:
131
+ def _titles_similar(self, other: "ChangeFingerprint", threshold: float) -> bool:
137
132
  """Check if titles are similar using simple string similarity."""
138
133
  title1 = self._normalized_title
139
134
  title2 = other._normalized_title
@@ -154,28 +149,29 @@ class ChangeFingerprint:
154
149
  return (intersection / union) >= threshold
155
150
 
156
151
  def __str__(self) -> str:
157
- return (
158
- f"ChangeFingerprint(title='{self.title[:50]}...', "
159
- f"hash={self._content_hash})"
160
- )
152
+ return f"ChangeFingerprint(title='{self.title[:50]}...', hash={self._content_hash})"
161
153
 
162
154
 
163
155
  class DuplicateDetector:
164
156
  """Detects duplicate Gerrit changes for GitHub pull requests."""
165
157
 
166
- def __init__(self, repo: GhRepository, lookback_days: int = 7):
158
+ def __init__(
159
+ self,
160
+ repo: GhRepository,
161
+ lookback_days: int = 7,
162
+ duplicates_filter: str = "open",
163
+ ):
167
164
  self.repo = repo
168
165
  self.lookback_days = lookback_days
169
166
  self._cutoff_date = datetime.now(UTC) - timedelta(days=lookback_days)
167
+ self.duplicates_filter = duplicates_filter
170
168
 
171
169
  def _match_first_group(self, pattern: str, text: str) -> str:
172
170
  """Extract first regex group match from text."""
173
171
  match = re.search(pattern, text)
174
172
  return match.group(1) if match else ""
175
173
 
176
- def _resolve_gerrit_info_from_env_or_gitreview(
177
- self, gh: GitHubContext
178
- ) -> tuple[str, str] | None:
174
+ def _resolve_gerrit_info_from_env_or_gitreview(self, gh: GitHubContext) -> tuple[str, str] | None:
179
175
  """Resolve Gerrit host and project from environment or .gitreview file.
180
176
 
181
177
  Returns:
@@ -198,6 +194,8 @@ class DuplicateDetector:
198
194
  if host and proj:
199
195
  project = proj.removesuffix(".git")
200
196
  return (host.strip(), project.strip())
197
+ if host and not proj:
198
+ return (host.strip(), "")
201
199
  except Exception as exc:
202
200
  log.debug("Failed to read local .gitreview: %s", exc)
203
201
 
@@ -219,38 +217,28 @@ class DuplicateDetector:
219
217
  if not branch:
220
218
  continue
221
219
 
222
- url = (
223
- f"https://raw.githubusercontent.com/"
224
- f"{repo_full}/refs/heads/{branch}/.gitreview"
225
- )
220
+ url = f"https://raw.githubusercontent.com/{repo_full}/{branch}/.gitreview"
226
221
 
227
222
  parsed = urllib.parse.urlparse(url)
228
- if (
229
- parsed.scheme != "https"
230
- or parsed.netloc != "raw.githubusercontent.com"
231
- ):
223
+ if parsed.scheme != "https" or parsed.netloc != "raw.githubusercontent.com":
232
224
  continue
233
225
 
234
226
  try:
235
227
  log.debug("Fetching .gitreview from: %s", url)
236
- with urllib.request.urlopen(url, timeout=5) as resp: # noqa: S310
228
+ with urllib.request.urlopen(url, timeout=5) as resp:
237
229
  text_remote = resp.read().decode("utf-8")
238
230
 
239
- host = self._match_first_group(
240
- r"(?m)^host=(.+)$", text_remote
241
- )
242
- proj = self._match_first_group(
243
- r"(?m)^project=(.+)$", text_remote
244
- )
231
+ host = self._match_first_group(r"(?m)^host=(.+)$", text_remote)
232
+ proj = self._match_first_group(r"(?m)^project=(.+)$", text_remote)
245
233
 
246
234
  if host and proj:
247
235
  project = proj.removesuffix(".git")
248
236
  return (host.strip(), project.strip())
237
+ if host and not proj:
238
+ return (host.strip(), "")
249
239
 
250
240
  except Exception as exc:
251
- log.debug(
252
- "Failed to fetch .gitreview from %s: %s", url, exc
253
- )
241
+ log.debug("Failed to fetch .gitreview from %s: %s", url, exc)
254
242
  continue
255
243
 
256
244
  except Exception as exc:
@@ -261,22 +249,14 @@ class DuplicateDetector:
261
249
  def _build_gerrit_rest_client(self, gerrit_host: str) -> object | None:
262
250
  """Build a Gerrit REST API client if pygerrit2 is available."""
263
251
  if GerritRestAPI is None:
264
- log.debug(
265
- "pygerrit2 not available, skipping Gerrit duplicate check"
266
- )
252
+ log.debug("pygerrit2 not available, skipping Gerrit duplicate check")
267
253
  return None
268
254
 
269
- base_path = os.getenv("GERRIT_HTTP_BASE_PATH", "").strip().strip("/")
270
- base_url = (
271
- f"https://{gerrit_host}/"
272
- if not base_path
273
- else f"https://{gerrit_host}/{base_path}/"
274
- )
255
+ # Create centralized URL builder
256
+ url_builder = create_gerrit_url_builder(gerrit_host)
257
+ base_url = url_builder.api_url()
275
258
 
276
- http_user = (
277
- os.getenv("GERRIT_HTTP_USER", "").strip()
278
- or os.getenv("GERRIT_SSH_USER_G2G", "").strip()
279
- )
259
+ http_user = os.getenv("GERRIT_HTTP_USER", "").strip() or os.getenv("GERRIT_SSH_USER_G2G", "").strip()
280
260
  http_pass = os.getenv("GERRIT_HTTP_PASSWORD", "").strip()
281
261
 
282
262
  try:
@@ -295,141 +275,6 @@ class DuplicateDetector:
295
275
  log.debug("Failed to create Gerrit REST client: %s", exc)
296
276
  return None
297
277
 
298
- def _build_gerrit_rest_client_with_r_path(
299
- self, gerrit_host: str
300
- ) -> object | None:
301
- """Build a Gerrit REST API client with /r/ base path for fallback."""
302
- if GerritRestAPI is None:
303
- return None
304
-
305
- fallback_url = f"https://{gerrit_host}/r/"
306
- http_user = (
307
- os.getenv("GERRIT_HTTP_USER", "").strip()
308
- or os.getenv("GERRIT_SSH_USER_G2G", "").strip()
309
- )
310
- http_pass = os.getenv("GERRIT_HTTP_PASSWORD", "").strip()
311
-
312
- try:
313
- if http_user and http_pass:
314
- if HTTPBasicAuth is None:
315
- return None
316
- # Type ignore needed for dynamic import returning Any
317
- return GerritRestAPI( # type: ignore[no-any-return]
318
- url=fallback_url, auth=HTTPBasicAuth(http_user, http_pass)
319
- )
320
- else:
321
- # Type ignore needed for dynamic import returning Any
322
- return GerritRestAPI(url=fallback_url) # type: ignore[no-any-return]
323
- except Exception as exc:
324
- log.debug(
325
- "Failed to create Gerrit REST client with /r/ path: %s", exc
326
- )
327
- return None
328
-
329
- def check_gerrit_for_existing_change(self, gh: GitHubContext) -> bool:
330
- """Check if a Gerrit change already exists for the given GitHub PR.
331
-
332
- Args:
333
- gh: GitHub context containing PR and repository information
334
-
335
- Returns:
336
- True if a Gerrit change already exists for this PR, False otherwise
337
- """
338
- if not gh.pr_number:
339
- return False
340
-
341
- # Resolve Gerrit host and project
342
- gerrit_info = self._resolve_gerrit_info_from_env_or_gitreview(gh)
343
- if not gerrit_info:
344
- log.debug(
345
- "Cannot resolve Gerrit host/project, "
346
- "skipping Gerrit duplicate check"
347
- )
348
- return False
349
-
350
- gerrit_host, gerrit_project = gerrit_info
351
-
352
- rest = self._build_gerrit_rest_client(gerrit_host)
353
- if rest is None:
354
- log.debug(
355
- "Cannot check Gerrit for duplicates, REST client unavailable"
356
- )
357
- return False
358
-
359
- # Generate the GitHub change hash for this PR
360
- github_hash = DuplicateDetector._generate_github_change_hash(gh)
361
-
362
- try:
363
- # Search for changes that contain the GitHub hash in commit messages
364
- # This is more reliable than comment-based searches
365
- query = (
366
- f'project:{gerrit_project} message:"GitHub-Hash: {github_hash}"'
367
- )
368
- path = f"/changes/?q={query}&n=10"
369
-
370
- log.debug(
371
- "Searching Gerrit for existing changes with GitHub hash %s, "
372
- "query: %s",
373
- github_hash,
374
- query,
375
- )
376
- # Use getattr for dynamic method access to avoid type checking
377
- changes = rest.get(path) # type: ignore[attr-defined]
378
-
379
- if changes:
380
- log.info(
381
- "Found %d existing Gerrit change(s) for GitHub PR #%d: %s",
382
- len(changes),
383
- gh.pr_number,
384
- [f"{c.get('_number', '?')}" for c in changes],
385
- )
386
- return True
387
- else:
388
- log.debug(
389
- "No existing Gerrit changes found for GitHub PR #%d",
390
- gh.pr_number,
391
- )
392
- return False
393
-
394
- except Exception as exc:
395
- # Check if this is a 404 error and try /r/ fallback
396
- status = getattr(
397
- getattr(exc, "response", None), "status_code", None
398
- )
399
- if status == 404:
400
- try:
401
- log.debug("Trying /r/ fallback for Gerrit API")
402
- fallback_rest = self._build_gerrit_rest_client_with_r_path(
403
- gerrit_host
404
- )
405
- if fallback_rest:
406
- changes = fallback_rest.get(path) # type: ignore[attr-defined]
407
- if changes:
408
- log.info(
409
- "Found %d existing Gerrit change(s) for PR #%d "
410
- "via /r/ fallback: %s",
411
- len(changes),
412
- gh.pr_number,
413
- [f"{c.get('_number', '?')}" for c in changes],
414
- )
415
- return True
416
- else:
417
- log.debug(
418
- "No existing Gerrit changes found for PR #%d "
419
- "via /r/ fallback",
420
- gh.pr_number,
421
- )
422
- return False
423
- except Exception as exc2:
424
- log.warning(
425
- "Failed to query Gerrit via /r/ fallback: %s", exc2
426
- )
427
- return False
428
-
429
- log.warning("Failed to query Gerrit for existing changes: %s", exc)
430
- # If we can't check Gerrit, err on the side of caution
431
- return False
432
-
433
278
  @staticmethod
434
279
  def _generate_github_change_hash(gh: GitHubContext) -> str:
435
280
  """Generate a deterministic hash for a GitHub PR to identify duplicates.
@@ -454,9 +299,7 @@ class DuplicateDetector:
454
299
  hash_bytes = hashlib.sha256(hash_input.encode("utf-8")).digest()
455
300
  hash_hex = hash_bytes.hex()[:16]
456
301
 
457
- log.debug(
458
- "Generated GitHub change hash for %s: %s", hash_input, hash_hex
459
- )
302
+ log.debug("Generated GitHub change hash for %s: %s", hash_input, hash_hex)
460
303
  return hash_hex
461
304
 
462
305
  def check_for_duplicates(
@@ -465,37 +308,310 @@ class DuplicateDetector:
465
308
  allow_duplicates: bool = False,
466
309
  gh: GitHubContext | None = None,
467
310
  ) -> None:
468
- """Check if the target PR is a duplicate in Gerrit.
311
+ """Check if the target PR is a duplicate via subject equality against Gerrit.
469
312
 
470
- Args:
471
- target_pr: The PR to check for duplicates
472
- allow_duplicates: If True, only log warnings; if False, raise error
473
- gh: GitHub context for Gerrit duplicate checking
474
-
475
- Raises:
476
- DuplicateChangeError: If duplicates found and allow_duplicates=False
313
+ Implements a robust, dependency-free subject-first duplicate check:
314
+ - Resolve Gerrit host/project from env or .gitreview
315
+ - Query Gerrit changes updated within the lookback window (excluding abandoned)
316
+ - Compare normalized subjects (first line) for exact equality
317
+ - If any match, treat as duplicate and either warn or raise
477
318
  """
478
319
  pr_number = getattr(target_pr, "number", 0)
320
+ pr_title = (getattr(target_pr, "title", "") or "").strip()
479
321
 
480
- log.debug("Checking PR #%d for Gerrit duplicates", pr_number)
322
+ log.debug(
323
+ "Checking PR #%d for duplicates via subject equality against Gerrit",
324
+ pr_number,
325
+ )
326
+
327
+ if not pr_title:
328
+ log.debug("PR #%d has empty title; skipping duplicate check", pr_number)
329
+ return
330
+ if gh is None:
331
+ log.debug("No GitHub context provided; skipping duplicate check")
332
+ return
333
+
334
+ # Resolve Gerrit target (host/project)
335
+ gerrit_info = self._resolve_gerrit_info_from_env_or_gitreview(gh)
336
+ if not gerrit_info:
337
+ log.debug("Unable to resolve Gerrit host/project; skipping duplicate check")
338
+ return
339
+ gerrit_host, gerrit_project = gerrit_info
481
340
 
482
- # Check if this PR already has a corresponding Gerrit change
483
- if gh and self.check_gerrit_for_existing_change(gh):
484
- full_message = (
485
- f"PR #{pr_number} already has an existing Gerrit change. "
486
- f"Skipping duplicate submission. "
487
- f"Target PR title: '{getattr(target_pr, 'title', '')[:100]}'"
341
+ # Helper: normalize subject like our existing title normalization
342
+ def _normalize_subject(title: str) -> str:
343
+ normalized = title.strip()
344
+ normalized = re.sub(
345
+ r"^(feat|fix|docs|style|refactor|test|chore|ci|build|perf)(\(.+?\))?: ",
346
+ "",
347
+ normalized,
348
+ flags=re.IGNORECASE,
488
349
  )
350
+ normalized = re.sub(r"[*_`]", "", normalized)
351
+ normalized = re.sub(r"\bv\d+(\.\d+)*(-\w+)?\b", "vx.y.z", normalized)
352
+ normalized = re.sub(r"\b\d+(\.\d+)+(-\w+)?\b", "x.y.z", normalized)
353
+ normalized = re.sub(r"\b\d+\.\d+\b", "x.y.z", normalized)
354
+ normalized = re.sub(r"\b[a-f0-9]{7,40}\b", "commit_hash", normalized)
355
+ normalized = re.sub(r"\s+", " ", normalized).strip()
356
+ return normalized.lower()
357
+
358
+ normalized_pr_subject = _normalize_subject(pr_title)
359
+ log.debug(
360
+ "Normalized PR subject for duplicate check: %s",
361
+ normalized_pr_subject,
362
+ )
363
+
364
+ # Build Gerrit REST URL using centralized URL builder
365
+ url_builder = create_gerrit_url_builder(gerrit_host)
366
+ api_base = url_builder.api_url().rstrip("/")
367
+
368
+ # Track which base path actually works for constructing display URLs
369
+ successful_base_path = url_builder.base_path
370
+
371
+ # Build query: limit to recent changes, exclude abandoned; prefer open
372
+ cutoff_date = self._cutoff_date.date().isoformat()
373
+ q_parts = []
374
+ if gerrit_project:
375
+ q_parts.append(f"project:{gerrit_project}")
376
+ # Build status clause from DUPLICATES filter (default: open)
377
+ dup_filter = (self.duplicates_filter or "open").strip().lower()
378
+ selected = [s.strip() for s in dup_filter.split(",") if s.strip()]
379
+ valid = {
380
+ "open": "status:open",
381
+ "merged": "status:merged",
382
+ "abandoned": "status:abandoned",
383
+ }
384
+ status_terms = [valid[s] for s in selected if s in valid]
385
+ if not status_terms:
386
+ status_clause = "status:open"
387
+ elif len(status_terms) == 1:
388
+ status_clause = status_terms[0]
389
+ else:
390
+ status_clause = "(" + " OR ".join(status_terms) + ")"
391
+ q_parts.append(status_clause)
392
+ q_parts.append(f"after:{cutoff_date}")
393
+ query = " ".join(q_parts)
394
+ encoded_q = urllib.parse.quote(query, safe="")
395
+
396
+ # Request current commit metadata so we get 'subject'
397
+ # Use a modest page size
398
+ url = f"{api_base}/changes/?q={encoded_q}&n=50&o=CURRENT_COMMIT&o=CURRENT_FILES"
399
+
400
+ def _load_gerrit_json(url_: str) -> list[dict[str, object]]:
401
+ try:
402
+ log.debug("Querying Gerrit for duplicates: %s", url_)
403
+ # Ensure we only fetch over HTTPS to avoid unsafe schemes
404
+ parsed = urllib.parse.urlparse(url_)
405
+ if parsed.scheme != "https":
406
+ log.debug("Skipping non-HTTPS URL for Gerrit query: %s", url_)
407
+ return []
408
+ headers: dict[str, str] = {}
409
+ http_user = os.getenv("GERRIT_HTTP_USER", "").strip()
410
+ http_pass = os.getenv("GERRIT_HTTP_PASSWORD", "").strip()
411
+ if http_user and http_pass:
412
+ import base64 as _b64 # localized import to avoid global import edit
413
+
414
+ basic = _b64.b64encode(f"{http_user}:{http_pass}".encode()).decode("ascii")
415
+ headers["Authorization"] = f"Basic {basic}"
416
+ req = urllib.request.Request(url_, headers=headers)
417
+ with urllib.request.urlopen(req, timeout=8) as resp:
418
+ raw = resp.read().decode("utf-8", errors="replace")
419
+ # Strip Gerrit's XSSI prefix if present
420
+ if raw.startswith(")]}'"):
421
+ raw = raw.split("\n", 1)[1] if "\n" in raw else ""
422
+ data = json.loads(raw or "[]")
423
+ if isinstance(data, list):
424
+ return data
425
+ else:
426
+ return []
427
+ except urllib.error.HTTPError as exc:
428
+ log.debug("Gerrit query failed for %s: %s", url_, exc)
429
+ return []
430
+ except Exception as exc:
431
+ log.debug("Gerrit query failed for %s: %s", url_, exc)
432
+ return []
433
+
434
+ log.debug(
435
+ "Gerrit duplicate query: host=%s project=%s filter=%s cutoff=%s url=%s",
436
+ gerrit_host,
437
+ gerrit_project or "(any)",
438
+ dup_filter,
439
+ cutoff_date,
440
+ url,
441
+ )
442
+ changes = _load_gerrit_json(url)
443
+ log.debug(
444
+ "Gerrit query returned %d change(s) for project=%s filter=%s after=%s",
445
+ len(changes),
446
+ gerrit_project or "(any)",
447
+ dup_filter,
448
+ cutoff_date,
449
+ )
450
+ if changes:
451
+ sample_subjects = ", ".join(str(c.get("subject") or "")[:60] for c in changes[:5])
452
+ log.debug("Sample subjects: %s", sample_subjects)
453
+
454
+ # Compare normalized subjects for exact equality
455
+ matched: list[tuple[int, str]] = []
456
+ for c in changes:
457
+ subj = str(c.get("subject") or "").strip()
458
+ if not subj:
459
+ continue
460
+ if _normalize_subject(subj) == normalized_pr_subject:
461
+ num = c.get("_number")
462
+ proj = str(c.get("project") or gerrit_project or "")
463
+ if isinstance(num, int):
464
+ matched.append((num, proj))
465
+
466
+ if not matched:
467
+ # No exact subject match; proceed with similarity scoring across candidates
468
+ log.debug("No exact-subject matches found; entering similarity scoring")
469
+ from .similarity import ScoringConfig
470
+ from .similarity import aggregate_scores
471
+ from .similarity import remove_commit_trailers
472
+ from .similarity import score_bodies
473
+ from .similarity import score_files
474
+ from .similarity import score_subjects
475
+
476
+ config = ScoringConfig()
477
+ # Source features from the PR
478
+ src_subjects = [pr_title]
479
+ src_body = str(getattr(target_pr, "body", "") or "")
480
+ src_files: list[str] = []
481
+ try:
482
+ get_files = getattr(target_pr, "get_files", None)
483
+ if callable(get_files):
484
+ files_obj = get_files()
485
+ if isinstance(files_obj, Iterable):
486
+ for f in files_obj:
487
+ fname = getattr(f, "filename", None)
488
+ if fname:
489
+ src_files.append(str(fname))
490
+ except Exception as exc:
491
+ # Best-effort; if files cannot be retrieved, proceed without them
492
+ log.debug("Failed to retrieve PR files for scoring: %s", exc)
493
+
494
+ best_score = 0.0
495
+ best_reasons: list[str] = []
496
+ hits: list[tuple[float, str, int | None]] = []
497
+ all_nums: list[int] = []
498
+ for c in changes:
499
+ subj = str(c.get("subject") or "").strip()
500
+ if not subj:
501
+ continue
502
+ # Extract commit message and files from revisions (CURRENT_COMMIT, CURRENT_FILES)
503
+ rev = str(c.get("current_revision") or "")
504
+ revs_obj = c.get("revisions")
505
+ revs = revs_obj if isinstance(revs_obj, dict) else {}
506
+ cur_obj = revs.get(rev)
507
+ cur = cur_obj if isinstance(cur_obj, dict) else {}
508
+ commit = cur.get("commit") or {}
509
+ msg = str(commit.get("message") or "")
510
+ cand_body_raw = ""
511
+ if "\n" in msg:
512
+ cand_body_raw = msg.split("\n", 1)[1]
513
+ cand_body = remove_commit_trailers(cand_body_raw)
514
+ files_dict = cur.get("files") or {}
515
+ cand_files = [p for p in files_dict if isinstance(p, str) and not p.startswith("/")]
516
+
517
+ # Compute component scores
518
+ s_res = score_subjects(src_subjects, subj)
519
+ f_res = score_files(
520
+ src_files,
521
+ cand_files,
522
+ workflow_min_floor=config.workflow_min_floor,
523
+ )
524
+ b_res = score_bodies(src_body, cand_body)
525
+
526
+ # Aggregate
527
+ agg = aggregate_scores(s_res.score, f_res.score, b_res.score, config=config)
528
+ log.debug(
529
+ "Aggregate score computed: %.2f (s=%.2f f=%.2f b=%.2f)",
530
+ agg,
531
+ s_res.score,
532
+ f_res.score,
533
+ b_res.score,
534
+ )
489
535
 
490
- if allow_duplicates:
491
- log.warning(
492
- "GERRIT DUPLICATE DETECTED (allowed): %s", full_message
536
+ # Build candidate reference and number using successful base path
537
+ num_obj = c.get("_number")
538
+ num = int(num_obj) if isinstance(num_obj, int) else None
539
+ proj = str(c.get("project") or gerrit_project or "")
540
+
541
+ # Use the base path that actually worked for API calls
542
+ display_url_builder = create_gerrit_url_builder(gerrit_host, successful_base_path)
543
+ ref = (
544
+ display_url_builder.change_url(proj, num)
545
+ if proj and isinstance(num, int)
546
+ else (f"change {num}" if isinstance(num, int) else "")
493
547
  )
494
- return
548
+ log.debug(
549
+ "Scoring candidate: ref=%s agg=%.2f (s=%.2f f=%.2f b=%.2f) subj='%s'",
550
+ ref or "(none)",
551
+ agg,
552
+ s_res.score,
553
+ f_res.score,
554
+ b_res.score,
555
+ subj[:200],
556
+ )
557
+
558
+ # Track best (for reasons)
559
+ if agg > best_score:
560
+ best_score = agg
561
+ # Deduplicate reasons preserving order
562
+ best_reasons = list(dict.fromkeys(s_res.reasons + f_res.reasons + b_res.reasons))
563
+
564
+ # Collect all candidates above threshold
565
+ if agg >= config.similarity_threshold and ref:
566
+ hits.append((agg, ref, num))
567
+ if isinstance(num, int):
568
+ all_nums.append(num)
569
+
570
+ log.debug(
571
+ "Similarity scoring found %d hit(s) (threshold=%.2f)",
572
+ len(hits),
573
+ config.similarity_threshold,
574
+ )
575
+ if hits:
576
+ hits_sorted = sorted(hits, key=lambda t: t[0], reverse=True)
577
+
578
+ # Log each matching change individually
579
+ for s, u, _ in hits_sorted:
580
+ if u:
581
+ log.info("Score: %.2f URL: %s", s, u)
582
+ msg = f"Similar Gerrit change(s) detected [≥ {config.similarity_threshold:.2f}]"
583
+ if best_reasons:
584
+ msg += f" (Reasons: {', '.join(best_reasons)})"
585
+ if allow_duplicates:
586
+ log.warning("GERRIT DUPLICATE DETECTED (allowed): %s", msg)
587
+ return
588
+ raise DuplicateChangeError(msg, all_nums)
589
+
590
+ # Construct human-friendly references for logs
591
+ matching_numbers: list[int] = []
592
+ match_lines: list[str] = []
593
+ for n, proj in matched:
594
+ if proj:
595
+ # Use the base path that actually worked for API calls
596
+ display_url_builder = create_gerrit_url_builder(gerrit_host, successful_base_path)
597
+ url = display_url_builder.change_url(proj, n)
598
+ match_lines.append(f"Score: 1.0 URL: {url}")
599
+ log.info("Score: 1.0 URL: %s", url)
495
600
  else:
496
- raise DuplicateChangeError(full_message, [])
601
+ match_lines.append(f"Score: 1.0 URL: change {n}")
602
+ log.info("Score: 1.0 URL: change %s", n)
603
+ matching_numbers.append(n)
604
+
605
+ if not matched:
606
+ log.debug("No exact subject matches and no similarity matches; duplicate check passes")
607
+ return
497
608
 
498
- log.debug("No existing Gerrit change found for PR #%d", pr_number)
609
+ # Remove PR number from message since cli.py already includes it
610
+ full_message = "subject matches existing Gerrit change(s)"
611
+ if allow_duplicates:
612
+ log.warning("GERRIT DUPLICATE DETECTED (allowed): %s", full_message)
613
+ return
614
+ raise DuplicateChangeError(full_message, matching_numbers)
499
615
 
500
616
 
501
617
  def check_for_duplicates(
@@ -525,10 +641,12 @@ def check_for_duplicates(
525
641
  target_pr = repo.get_pull(gh.pr_number)
526
642
 
527
643
  # Create detector and check
528
- detector = DuplicateDetector(repo, lookback_days=lookback_days)
529
- detector.check_for_duplicates(
530
- target_pr, allow_duplicates=allow_duplicates, gh=gh
644
+ detector = DuplicateDetector(
645
+ repo,
646
+ lookback_days=lookback_days,
647
+ duplicates_filter=os.getenv("DUPLICATES", "open"),
531
648
  )
649
+ detector.check_for_duplicates(target_pr, allow_duplicates=allow_duplicates, gh=gh)
532
650
 
533
651
  log.info("Duplicate check completed for PR #%d", gh.pr_number)
534
652
 
@@ -536,7 +654,5 @@ def check_for_duplicates(
536
654
  # Re-raise duplicate errors
537
655
  raise
538
656
  except Exception as exc:
539
- log.warning(
540
- "Duplicate detection failed for PR #%d: %s", gh.pr_number, exc
541
- )
657
+ log.warning("Duplicate detection failed for PR #%d: %s", gh.pr_number, exc)
542
658
  # Don't fail the entire process if duplicate detection has issues