github2gerrit 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -10,16 +10,19 @@ submissions from automated tools like Dependabot.
10
10
  """
11
11
 
12
12
  import hashlib
13
+ import json
13
14
  import logging
14
15
  import os
15
16
  import re
16
17
  import urllib.parse
17
18
  import urllib.request
19
+ from collections.abc import Iterable
18
20
  from datetime import UTC
19
21
  from datetime import datetime
20
22
  from datetime import timedelta
21
23
  from pathlib import Path
22
24
 
25
+ from .gerrit_urls import create_gerrit_url_builder
23
26
  from .github_api import GhPullRequest
24
27
  from .github_api import GhRepository
25
28
  from .github_api import build_client
@@ -57,9 +60,7 @@ class DuplicateChangeError(Exception):
57
60
  class ChangeFingerprint:
58
61
  """Represents a fingerprint of a change for duplicate detection."""
59
62
 
60
- def __init__(
61
- self, title: str, body: str = "", files_changed: list[str] | None = None
62
- ):
63
+ def __init__(self, title: str, body: str = "", files_changed: list[str] | None = None):
63
64
  self.title = title.strip()
64
65
  self.body = (body or "").strip()
65
66
  self.files_changed = sorted(files_changed or [])
@@ -100,15 +101,10 @@ class ChangeFingerprint:
100
101
 
101
102
  def _compute_content_hash(self) -> str:
102
103
  """Compute a hash of the change content."""
103
- content = (
104
- f"{self._normalized_title}\n{self.body}\n"
105
- f"{','.join(self.files_changed)}"
106
- )
104
+ content = f"{self._normalized_title}\n{self.body}\n{','.join(self.files_changed)}"
107
105
  return hashlib.sha256(content.encode("utf-8")).hexdigest()[:16]
108
106
 
109
- def is_similar_to(
110
- self, other: "ChangeFingerprint", similarity_threshold: float = 0.8
111
- ) -> bool:
107
+ def is_similar_to(self, other: "ChangeFingerprint", similarity_threshold: float = 0.8) -> bool:
112
108
  """Check if this fingerprint is similar to another."""
113
109
  # Exact normalized title match
114
110
  if self._normalized_title == other._normalized_title:
@@ -131,9 +127,7 @@ class ChangeFingerprint:
131
127
  # Check title similarity even without file changes
132
128
  return self._titles_similar(other, similarity_threshold)
133
129
 
134
- def _titles_similar(
135
- self, other: "ChangeFingerprint", threshold: float
136
- ) -> bool:
130
+ def _titles_similar(self, other: "ChangeFingerprint", threshold: float) -> bool:
137
131
  """Check if titles are similar using simple string similarity."""
138
132
  title1 = self._normalized_title
139
133
  title2 = other._normalized_title
@@ -154,28 +148,29 @@ class ChangeFingerprint:
154
148
  return (intersection / union) >= threshold
155
149
 
156
150
  def __str__(self) -> str:
157
- return (
158
- f"ChangeFingerprint(title='{self.title[:50]}...', "
159
- f"hash={self._content_hash})"
160
- )
151
+ return f"ChangeFingerprint(title='{self.title[:50]}...', hash={self._content_hash})"
161
152
 
162
153
 
163
154
  class DuplicateDetector:
164
155
  """Detects duplicate Gerrit changes for GitHub pull requests."""
165
156
 
166
- def __init__(self, repo: GhRepository, lookback_days: int = 7):
157
+ def __init__(
158
+ self,
159
+ repo: GhRepository,
160
+ lookback_days: int = 7,
161
+ duplicates_filter: str = "open",
162
+ ):
167
163
  self.repo = repo
168
164
  self.lookback_days = lookback_days
169
165
  self._cutoff_date = datetime.now(UTC) - timedelta(days=lookback_days)
166
+ self.duplicates_filter = duplicates_filter
170
167
 
171
168
  def _match_first_group(self, pattern: str, text: str) -> str:
172
169
  """Extract first regex group match from text."""
173
170
  match = re.search(pattern, text)
174
171
  return match.group(1) if match else ""
175
172
 
176
- def _resolve_gerrit_info_from_env_or_gitreview(
177
- self, gh: GitHubContext
178
- ) -> tuple[str, str] | None:
173
+ def _resolve_gerrit_info_from_env_or_gitreview(self, gh: GitHubContext) -> tuple[str, str] | None:
179
174
  """Resolve Gerrit host and project from environment or .gitreview file.
180
175
 
181
176
  Returns:
@@ -198,6 +193,8 @@ class DuplicateDetector:
198
193
  if host and proj:
199
194
  project = proj.removesuffix(".git")
200
195
  return (host.strip(), project.strip())
196
+ if host and not proj:
197
+ return (host.strip(), "")
201
198
  except Exception as exc:
202
199
  log.debug("Failed to read local .gitreview: %s", exc)
203
200
 
@@ -219,38 +216,28 @@ class DuplicateDetector:
219
216
  if not branch:
220
217
  continue
221
218
 
222
- url = (
223
- f"https://raw.githubusercontent.com/"
224
- f"{repo_full}/refs/heads/{branch}/.gitreview"
225
- )
219
+ url = f"https://raw.githubusercontent.com/{repo_full}/{branch}/.gitreview"
226
220
 
227
221
  parsed = urllib.parse.urlparse(url)
228
- if (
229
- parsed.scheme != "https"
230
- or parsed.netloc != "raw.githubusercontent.com"
231
- ):
222
+ if parsed.scheme != "https" or parsed.netloc != "raw.githubusercontent.com":
232
223
  continue
233
224
 
234
225
  try:
235
226
  log.debug("Fetching .gitreview from: %s", url)
236
- with urllib.request.urlopen(url, timeout=5) as resp: # noqa: S310
227
+ with urllib.request.urlopen(url, timeout=5) as resp:
237
228
  text_remote = resp.read().decode("utf-8")
238
229
 
239
- host = self._match_first_group(
240
- r"(?m)^host=(.+)$", text_remote
241
- )
242
- proj = self._match_first_group(
243
- r"(?m)^project=(.+)$", text_remote
244
- )
230
+ host = self._match_first_group(r"(?m)^host=(.+)$", text_remote)
231
+ proj = self._match_first_group(r"(?m)^project=(.+)$", text_remote)
245
232
 
246
233
  if host and proj:
247
234
  project = proj.removesuffix(".git")
248
235
  return (host.strip(), project.strip())
236
+ if host and not proj:
237
+ return (host.strip(), "")
249
238
 
250
239
  except Exception as exc:
251
- log.debug(
252
- "Failed to fetch .gitreview from %s: %s", url, exc
253
- )
240
+ log.debug("Failed to fetch .gitreview from %s: %s", url, exc)
254
241
  continue
255
242
 
256
243
  except Exception as exc:
@@ -261,22 +248,14 @@ class DuplicateDetector:
261
248
  def _build_gerrit_rest_client(self, gerrit_host: str) -> object | None:
262
249
  """Build a Gerrit REST API client if pygerrit2 is available."""
263
250
  if GerritRestAPI is None:
264
- log.debug(
265
- "pygerrit2 not available, skipping Gerrit duplicate check"
266
- )
251
+ log.debug("pygerrit2 not available, skipping Gerrit duplicate check")
267
252
  return None
268
253
 
269
- base_path = os.getenv("GERRIT_HTTP_BASE_PATH", "").strip().strip("/")
270
- base_url = (
271
- f"https://{gerrit_host}/"
272
- if not base_path
273
- else f"https://{gerrit_host}/{base_path}/"
274
- )
254
+ # Create centralized URL builder
255
+ url_builder = create_gerrit_url_builder(gerrit_host)
256
+ base_url = url_builder.api_url()
275
257
 
276
- http_user = (
277
- os.getenv("GERRIT_HTTP_USER", "").strip()
278
- or os.getenv("GERRIT_SSH_USER_G2G", "").strip()
279
- )
258
+ http_user = os.getenv("GERRIT_HTTP_USER", "").strip() or os.getenv("GERRIT_SSH_USER_G2G", "").strip()
280
259
  http_pass = os.getenv("GERRIT_HTTP_PASSWORD", "").strip()
281
260
 
282
261
  try:
@@ -295,18 +274,16 @@ class DuplicateDetector:
295
274
  log.debug("Failed to create Gerrit REST client: %s", exc)
296
275
  return None
297
276
 
298
- def _build_gerrit_rest_client_with_r_path(
299
- self, gerrit_host: str
300
- ) -> object | None:
277
+ def _build_gerrit_rest_client_with_r_path(self, gerrit_host: str) -> object | None:
301
278
  """Build a Gerrit REST API client with /r/ base path for fallback."""
302
279
  if GerritRestAPI is None:
303
280
  return None
304
281
 
305
- fallback_url = f"https://{gerrit_host}/r/"
306
- http_user = (
307
- os.getenv("GERRIT_HTTP_USER", "").strip()
308
- or os.getenv("GERRIT_SSH_USER_G2G", "").strip()
309
- )
282
+ # Create centralized URL builder with /r/ base path override
283
+ url_builder = create_gerrit_url_builder(gerrit_host, "r")
284
+ fallback_url = url_builder.api_url()
285
+
286
+ http_user = os.getenv("GERRIT_HTTP_USER", "").strip() or os.getenv("GERRIT_SSH_USER_G2G", "").strip()
310
287
  http_pass = os.getenv("GERRIT_HTTP_PASSWORD", "").strip()
311
288
 
312
289
  try:
@@ -321,114 +298,17 @@ class DuplicateDetector:
321
298
  # Type ignore needed for dynamic import returning Any
322
299
  return GerritRestAPI(url=fallback_url) # type: ignore[no-any-return]
323
300
  except Exception as exc:
324
- log.debug(
325
- "Failed to create Gerrit REST client with /r/ path: %s", exc
326
- )
301
+ log.debug("Failed to create Gerrit REST client with /r/ path: %s", exc)
327
302
  return None
328
303
 
329
304
  def check_gerrit_for_existing_change(self, gh: GitHubContext) -> bool:
330
- """Check if a Gerrit change already exists for the given GitHub PR.
331
-
332
- Args:
333
- gh: GitHub context containing PR and repository information
305
+ """Deprecated: GitHub-Hash/Gerrit REST based duplicate detection disabled.
334
306
 
335
- Returns:
336
- True if a Gerrit change already exists for this PR, False otherwise
307
+ Always returns False. Scoring-based duplicate detection will be implemented
308
+ in check_for_duplicates.
337
309
  """
338
- if not gh.pr_number:
339
- return False
340
-
341
- # Resolve Gerrit host and project
342
- gerrit_info = self._resolve_gerrit_info_from_env_or_gitreview(gh)
343
- if not gerrit_info:
344
- log.debug(
345
- "Cannot resolve Gerrit host/project, "
346
- "skipping Gerrit duplicate check"
347
- )
348
- return False
349
-
350
- gerrit_host, gerrit_project = gerrit_info
351
-
352
- rest = self._build_gerrit_rest_client(gerrit_host)
353
- if rest is None:
354
- log.debug(
355
- "Cannot check Gerrit for duplicates, REST client unavailable"
356
- )
357
- return False
358
-
359
- # Generate the GitHub change hash for this PR
360
- github_hash = DuplicateDetector._generate_github_change_hash(gh)
361
-
362
- try:
363
- # Search for changes that contain the GitHub hash in commit messages
364
- # This is more reliable than comment-based searches
365
- query = (
366
- f'project:{gerrit_project} message:"GitHub-Hash: {github_hash}"'
367
- )
368
- path = f"/changes/?q={query}&n=10"
369
-
370
- log.debug(
371
- "Searching Gerrit for existing changes with GitHub hash %s, "
372
- "query: %s",
373
- github_hash,
374
- query,
375
- )
376
- # Use getattr for dynamic method access to avoid type checking
377
- changes = rest.get(path) # type: ignore[attr-defined]
378
-
379
- if changes:
380
- log.info(
381
- "Found %d existing Gerrit change(s) for GitHub PR #%d: %s",
382
- len(changes),
383
- gh.pr_number,
384
- [f"{c.get('_number', '?')}" for c in changes],
385
- )
386
- return True
387
- else:
388
- log.debug(
389
- "No existing Gerrit changes found for GitHub PR #%d",
390
- gh.pr_number,
391
- )
392
- return False
393
-
394
- except Exception as exc:
395
- # Check if this is a 404 error and try /r/ fallback
396
- status = getattr(
397
- getattr(exc, "response", None), "status_code", None
398
- )
399
- if status == 404:
400
- try:
401
- log.debug("Trying /r/ fallback for Gerrit API")
402
- fallback_rest = self._build_gerrit_rest_client_with_r_path(
403
- gerrit_host
404
- )
405
- if fallback_rest:
406
- changes = fallback_rest.get(path) # type: ignore[attr-defined]
407
- if changes:
408
- log.info(
409
- "Found %d existing Gerrit change(s) for PR #%d "
410
- "via /r/ fallback: %s",
411
- len(changes),
412
- gh.pr_number,
413
- [f"{c.get('_number', '?')}" for c in changes],
414
- )
415
- return True
416
- else:
417
- log.debug(
418
- "No existing Gerrit changes found for PR #%d "
419
- "via /r/ fallback",
420
- gh.pr_number,
421
- )
422
- return False
423
- except Exception as exc2:
424
- log.warning(
425
- "Failed to query Gerrit via /r/ fallback: %s", exc2
426
- )
427
- return False
428
-
429
- log.warning("Failed to query Gerrit for existing changes: %s", exc)
430
- # If we can't check Gerrit, err on the side of caution
431
- return False
310
+ log.debug("Gerrit REST duplicate check disabled")
311
+ return False
432
312
 
433
313
  @staticmethod
434
314
  def _generate_github_change_hash(gh: GitHubContext) -> str:
@@ -454,9 +334,7 @@ class DuplicateDetector:
454
334
  hash_bytes = hashlib.sha256(hash_input.encode("utf-8")).digest()
455
335
  hash_hex = hash_bytes.hex()[:16]
456
336
 
457
- log.debug(
458
- "Generated GitHub change hash for %s: %s", hash_input, hash_hex
459
- )
337
+ log.debug("Generated GitHub change hash for %s: %s", hash_input, hash_hex)
460
338
  return hash_hex
461
339
 
462
340
  def check_for_duplicates(
@@ -465,37 +343,341 @@ class DuplicateDetector:
465
343
  allow_duplicates: bool = False,
466
344
  gh: GitHubContext | None = None,
467
345
  ) -> None:
468
- """Check if the target PR is a duplicate in Gerrit.
469
-
470
- Args:
471
- target_pr: The PR to check for duplicates
472
- allow_duplicates: If True, only log warnings; if False, raise error
473
- gh: GitHub context for Gerrit duplicate checking
346
+ """Check if the target PR is a duplicate via subject equality against Gerrit.
474
347
 
475
- Raises:
476
- DuplicateChangeError: If duplicates found and allow_duplicates=False
348
+ Implements a robust, dependency-free subject-first duplicate check:
349
+ - Resolve Gerrit host/project from env or .gitreview
350
+ - Query Gerrit changes updated within the lookback window (excluding abandoned)
351
+ - Compare normalized subjects (first line) for exact equality
352
+ - If any match, treat as duplicate and either warn or raise
477
353
  """
478
354
  pr_number = getattr(target_pr, "number", 0)
355
+ pr_title = (getattr(target_pr, "title", "") or "").strip()
479
356
 
480
- log.debug("Checking PR #%d for Gerrit duplicates", pr_number)
357
+ log.debug(
358
+ "Checking PR #%d for duplicates via subject equality against Gerrit",
359
+ pr_number,
360
+ )
481
361
 
482
- # Check if this PR already has a corresponding Gerrit change
483
- if gh and self.check_gerrit_for_existing_change(gh):
484
- full_message = (
485
- f"PR #{pr_number} already has an existing Gerrit change. "
486
- f"Skipping duplicate submission. "
487
- f"Target PR title: '{getattr(target_pr, 'title', '')[:100]}'"
362
+ if not pr_title:
363
+ log.debug("PR #%d has empty title; skipping duplicate check", pr_number)
364
+ return
365
+ if gh is None:
366
+ log.debug("No GitHub context provided; skipping duplicate check")
367
+ return
368
+
369
+ # Resolve Gerrit target (host/project)
370
+ gerrit_info = self._resolve_gerrit_info_from_env_or_gitreview(gh)
371
+ if not gerrit_info:
372
+ log.debug("Unable to resolve Gerrit host/project; skipping duplicate check")
373
+ return
374
+ gerrit_host, gerrit_project = gerrit_info
375
+
376
+ # Helper: normalize subject like our existing title normalization
377
+ def _normalize_subject(title: str) -> str:
378
+ normalized = title.strip()
379
+ normalized = re.sub(
380
+ r"^(feat|fix|docs|style|refactor|test|chore|ci|build|perf)(\(.+?\))?: ",
381
+ "",
382
+ normalized,
383
+ flags=re.IGNORECASE,
488
384
  )
385
+ normalized = re.sub(r"[*_`]", "", normalized)
386
+ normalized = re.sub(r"\bv\d+(\.\d+)*(-\w+)?\b", "vx.y.z", normalized)
387
+ normalized = re.sub(r"\b\d+(\.\d+)+(-\w+)?\b", "x.y.z", normalized)
388
+ normalized = re.sub(r"\b\d+\.\d+\b", "x.y.z", normalized)
389
+ normalized = re.sub(r"\b[a-f0-9]{7,40}\b", "commit_hash", normalized)
390
+ normalized = re.sub(r"\s+", " ", normalized).strip()
391
+ return normalized.lower()
392
+
393
+ normalized_pr_subject = _normalize_subject(pr_title)
394
+ log.debug(
395
+ "Normalized PR subject for duplicate check: %s",
396
+ normalized_pr_subject,
397
+ )
489
398
 
490
- if allow_duplicates:
491
- log.warning(
492
- "GERRIT DUPLICATE DETECTED (allowed): %s", full_message
399
+ # Build Gerrit REST URL using centralized URL builder
400
+ url_builder = create_gerrit_url_builder(gerrit_host)
401
+ api_base = url_builder.api_url().rstrip("/")
402
+
403
+ # Track which base path actually works for constructing display URLs
404
+ successful_base_path = url_builder.base_path
405
+
406
+ # Build query: limit to recent changes, exclude abandoned; prefer open
407
+ cutoff_date = self._cutoff_date.date().isoformat()
408
+ q_parts = []
409
+ if gerrit_project:
410
+ q_parts.append(f"project:{gerrit_project}")
411
+ # Build status clause from DUPLICATES filter (default: open)
412
+ dup_filter = (self.duplicates_filter or "open").strip().lower()
413
+ selected = [s.strip() for s in dup_filter.split(",") if s.strip()]
414
+ valid = {
415
+ "open": "status:open",
416
+ "merged": "status:merged",
417
+ "abandoned": "status:abandoned",
418
+ }
419
+ status_terms = [valid[s] for s in selected if s in valid]
420
+ if not status_terms:
421
+ status_clause = "status:open"
422
+ elif len(status_terms) == 1:
423
+ status_clause = status_terms[0]
424
+ else:
425
+ status_clause = "(" + " OR ".join(status_terms) + ")"
426
+ q_parts.append(status_clause)
427
+ q_parts.append(f"after:{cutoff_date}")
428
+ query = " ".join(q_parts)
429
+ encoded_q = urllib.parse.quote(query, safe="")
430
+
431
+ # Request current commit metadata so we get 'subject'
432
+ # Use a modest page size
433
+ url = f"{api_base}/changes/?q={encoded_q}&n=50&o=CURRENT_COMMIT&o=CURRENT_FILES"
434
+
435
+ def _load_gerrit_json(url_: str) -> list[dict[str, object]]:
436
+ try:
437
+ log.debug("Querying Gerrit for duplicates: %s", url_)
438
+ # Ensure we only fetch over HTTPS to avoid unsafe schemes
439
+ parsed = urllib.parse.urlparse(url_)
440
+ if parsed.scheme != "https":
441
+ log.debug("Skipping non-HTTPS URL for Gerrit query: %s", url_)
442
+ return []
443
+ headers: dict[str, str] = {}
444
+ http_user = os.getenv("GERRIT_HTTP_USER", "").strip()
445
+ http_pass = os.getenv("GERRIT_HTTP_PASSWORD", "").strip()
446
+ if http_user and http_pass:
447
+ import base64 as _b64 # localized import to avoid global import edit
448
+
449
+ basic = _b64.b64encode(f"{http_user}:{http_pass}".encode()).decode("ascii")
450
+ headers["Authorization"] = f"Basic {basic}"
451
+ req = urllib.request.Request(url_, headers=headers)
452
+ with urllib.request.urlopen(req, timeout=8) as resp:
453
+ raw = resp.read().decode("utf-8", errors="replace")
454
+ # Strip Gerrit's XSSI prefix if present
455
+ if raw.startswith(")]}'"):
456
+ raw = raw.split("\n", 1)[1] if "\n" in raw else ""
457
+ data = json.loads(raw or "[]")
458
+ if isinstance(data, list):
459
+ return data
460
+ else:
461
+ return []
462
+ except urllib.error.HTTPError as exc:
463
+ if exc.code == 404:
464
+ # Try with /r/ base path fallback using centralized URL builder
465
+ fallback_builder = create_gerrit_url_builder(gerrit_host, "r")
466
+ fallback_api_base = fallback_builder.api_url().rstrip("/")
467
+ fallback_url = url_.replace(api_base, fallback_api_base)
468
+ if fallback_url != url_:
469
+ log.debug(
470
+ "Trying Gerrit query with /r/ base path: %s",
471
+ fallback_url,
472
+ )
473
+ try:
474
+ req_fallback = urllib.request.Request(fallback_url, headers=headers)
475
+ with urllib.request.urlopen(req_fallback, timeout=8) as resp:
476
+ raw = resp.read().decode("utf-8", errors="replace")
477
+ # Strip Gerrit's XSSI prefix if present
478
+ if raw.startswith(")]}'"):
479
+ raw = raw.split("\n", 1)[1] if "\n" in raw else ""
480
+ data = json.loads(raw or "[]")
481
+ if isinstance(data, list):
482
+ # Update successful base path for display URL construction
483
+ nonlocal successful_base_path
484
+ successful_base_path = "r"
485
+ return data
486
+ else:
487
+ return []
488
+ except Exception as fallback_exc:
489
+ log.debug(
490
+ "Gerrit fallback query also failed for %s: %s",
491
+ fallback_url,
492
+ fallback_exc,
493
+ )
494
+ log.debug("Gerrit query failed for %s: %s", url_, exc)
495
+ return []
496
+ except Exception as exc:
497
+ log.debug("Gerrit query failed for %s: %s", url_, exc)
498
+ return []
499
+
500
+ log.debug(
501
+ "Gerrit duplicate query: host=%s project=%s filter=%s cutoff=%s url=%s",
502
+ gerrit_host,
503
+ gerrit_project or "(any)",
504
+ dup_filter,
505
+ cutoff_date,
506
+ url,
507
+ )
508
+ changes = _load_gerrit_json(url)
509
+ log.debug(
510
+ "Gerrit query returned %d change(s) for project=%s filter=%s after=%s",
511
+ len(changes),
512
+ gerrit_project or "(any)",
513
+ dup_filter,
514
+ cutoff_date,
515
+ )
516
+ if changes:
517
+ sample_subjects = ", ".join(str(c.get("subject") or "")[:60] for c in changes[:5])
518
+ log.debug("Sample subjects: %s", sample_subjects)
519
+
520
+ # Compare normalized subjects for exact equality
521
+ matched: list[tuple[int, str]] = []
522
+ for c in changes:
523
+ subj = str(c.get("subject") or "").strip()
524
+ if not subj:
525
+ continue
526
+ if _normalize_subject(subj) == normalized_pr_subject:
527
+ num = c.get("_number")
528
+ proj = str(c.get("project") or gerrit_project or "")
529
+ if isinstance(num, int):
530
+ matched.append((num, proj))
531
+
532
+ if not matched:
533
+ # No exact subject match; proceed with similarity scoring across candidates
534
+ log.debug("No exact-subject matches found; entering similarity scoring")
535
+ from .similarity import ScoringConfig
536
+ from .similarity import aggregate_scores
537
+ from .similarity import remove_commit_trailers
538
+ from .similarity import score_bodies
539
+ from .similarity import score_files
540
+ from .similarity import score_subjects
541
+
542
+ config = ScoringConfig()
543
+ # Source features from the PR
544
+ src_subjects = [pr_title]
545
+ src_body = str(getattr(target_pr, "body", "") or "")
546
+ src_files: list[str] = []
547
+ try:
548
+ get_files = getattr(target_pr, "get_files", None)
549
+ if callable(get_files):
550
+ files_obj = get_files()
551
+ if isinstance(files_obj, Iterable):
552
+ for f in files_obj:
553
+ fname = getattr(f, "filename", None)
554
+ if fname:
555
+ src_files.append(str(fname))
556
+ except Exception as exc:
557
+ # Best-effort; if files cannot be retrieved, proceed without them
558
+ log.debug("Failed to retrieve PR files for scoring: %s", exc)
559
+
560
+ best_score = 0.0
561
+ best_reasons: list[str] = []
562
+ hits: list[tuple[float, str, int | None]] = []
563
+ all_nums: list[int] = []
564
+ for c in changes:
565
+ subj = str(c.get("subject") or "").strip()
566
+ if not subj:
567
+ continue
568
+ # Extract commit message and files from revisions (CURRENT_COMMIT, CURRENT_FILES)
569
+ rev = str(c.get("current_revision") or "")
570
+ revs_obj = c.get("revisions")
571
+ revs = revs_obj if isinstance(revs_obj, dict) else {}
572
+ cur_obj = revs.get(rev)
573
+ cur = cur_obj if isinstance(cur_obj, dict) else {}
574
+ commit = cur.get("commit") or {}
575
+ msg = str(commit.get("message") or "")
576
+ cand_body_raw = ""
577
+ if "\n" in msg:
578
+ cand_body_raw = msg.split("\n", 1)[1]
579
+ cand_body = remove_commit_trailers(cand_body_raw)
580
+ files_dict = cur.get("files") or {}
581
+ cand_files = [p for p in files_dict if isinstance(p, str) and not p.startswith("/")]
582
+
583
+ # Compute component scores
584
+ s_res = score_subjects(src_subjects, subj)
585
+ f_res = score_files(
586
+ src_files,
587
+ cand_files,
588
+ workflow_min_floor=config.workflow_min_floor,
589
+ )
590
+ b_res = score_bodies(src_body, cand_body)
591
+
592
+ # Aggregate
593
+ agg = aggregate_scores(s_res.score, f_res.score, b_res.score, config=config)
594
+ log.debug(
595
+ "Aggregate score computed: %.2f (s=%.2f f=%.2f b=%.2f)",
596
+ agg,
597
+ s_res.score,
598
+ f_res.score,
599
+ b_res.score,
493
600
  )
494
- return
601
+
602
+ # Build candidate reference and number using successful base path
603
+ num_obj = c.get("_number")
604
+ num = int(num_obj) if isinstance(num_obj, int) else None
605
+ proj = str(c.get("project") or gerrit_project or "")
606
+
607
+ # Use the base path that actually worked for API calls
608
+ display_url_builder = create_gerrit_url_builder(gerrit_host, successful_base_path)
609
+ ref = (
610
+ display_url_builder.change_url(proj, num)
611
+ if proj and isinstance(num, int)
612
+ else (f"change {num}" if isinstance(num, int) else "")
613
+ )
614
+ log.debug(
615
+ "Scoring candidate: ref=%s agg=%.2f (s=%.2f f=%.2f b=%.2f) subj='%s'",
616
+ ref or "(none)",
617
+ agg,
618
+ s_res.score,
619
+ f_res.score,
620
+ b_res.score,
621
+ subj[:200],
622
+ )
623
+
624
+ # Track best (for reasons)
625
+ if agg > best_score:
626
+ best_score = agg
627
+ # Deduplicate reasons preserving order
628
+ best_reasons = list(dict.fromkeys(s_res.reasons + f_res.reasons + b_res.reasons))
629
+
630
+ # Collect all candidates above threshold
631
+ if agg >= config.similarity_threshold and ref:
632
+ hits.append((agg, ref, num))
633
+ if isinstance(num, int):
634
+ all_nums.append(num)
635
+
636
+ log.debug(
637
+ "Similarity scoring found %d hit(s) (threshold=%.2f)",
638
+ len(hits),
639
+ config.similarity_threshold,
640
+ )
641
+ if hits:
642
+ hits_sorted = sorted(hits, key=lambda t: t[0], reverse=True)
643
+
644
+ # Log each matching change individually
645
+ for s, u, _ in hits_sorted:
646
+ if u:
647
+ log.info("Score: %.2f URL: %s", s, u)
648
+ msg = f"Similar Gerrit change(s) detected [≥ {config.similarity_threshold:.2f}]"
649
+ if best_reasons:
650
+ msg += f" (Reasons: {', '.join(best_reasons)})"
651
+ if allow_duplicates:
652
+ log.warning("GERRIT DUPLICATE DETECTED (allowed): %s", msg)
653
+ return
654
+ raise DuplicateChangeError(msg, all_nums)
655
+
656
+ # Construct human-friendly references for logs
657
+ matching_numbers: list[int] = []
658
+ match_lines: list[str] = []
659
+ for n, proj in matched:
660
+ if proj:
661
+ # Use the base path that actually worked for API calls
662
+ display_url_builder = create_gerrit_url_builder(gerrit_host, successful_base_path)
663
+ url = display_url_builder.change_url(proj, n)
664
+ match_lines.append(f"Score: 1.0 URL: {url}")
665
+ log.info("Score: 1.0 URL: %s", url)
495
666
  else:
496
- raise DuplicateChangeError(full_message, [])
667
+ match_lines.append(f"Score: 1.0 URL: change {n}")
668
+ log.info("Score: 1.0 URL: change %s", n)
669
+ matching_numbers.append(n)
497
670
 
498
- log.debug("No existing Gerrit change found for PR #%d", pr_number)
671
+ if not matched:
672
+ log.debug("No exact subject matches and no similarity matches; duplicate check passes")
673
+ return
674
+
675
+ # Remove PR number from message since cli.py already includes it
676
+ full_message = "subject matches existing Gerrit change(s)"
677
+ if allow_duplicates:
678
+ log.warning("GERRIT DUPLICATE DETECTED (allowed): %s", full_message)
679
+ return
680
+ raise DuplicateChangeError(full_message, matching_numbers)
499
681
 
500
682
 
501
683
  def check_for_duplicates(
@@ -525,10 +707,12 @@ def check_for_duplicates(
525
707
  target_pr = repo.get_pull(gh.pr_number)
526
708
 
527
709
  # Create detector and check
528
- detector = DuplicateDetector(repo, lookback_days=lookback_days)
529
- detector.check_for_duplicates(
530
- target_pr, allow_duplicates=allow_duplicates, gh=gh
710
+ detector = DuplicateDetector(
711
+ repo,
712
+ lookback_days=lookback_days,
713
+ duplicates_filter=os.getenv("DUPLICATES", "open"),
531
714
  )
715
+ detector.check_for_duplicates(target_pr, allow_duplicates=allow_duplicates, gh=gh)
532
716
 
533
717
  log.info("Duplicate check completed for PR #%d", gh.pr_number)
534
718
 
@@ -536,7 +720,5 @@ def check_for_duplicates(
536
720
  # Re-raise duplicate errors
537
721
  raise
538
722
  except Exception as exc:
539
- log.warning(
540
- "Duplicate detection failed for PR #%d: %s", gh.pr_number, exc
541
- )
723
+ log.warning("Duplicate detection failed for PR #%d: %s", gh.pr_number, exc)
542
724
  # Don't fail the entire process if duplicate detection has issues