github2gerrit 0.1.5__py3-none-any.whl → 0.1.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- github2gerrit/cli.py +511 -271
- github2gerrit/commit_normalization.py +471 -0
- github2gerrit/config.py +32 -24
- github2gerrit/core.py +1092 -507
- github2gerrit/duplicate_detection.py +333 -217
- github2gerrit/external_api.py +518 -0
- github2gerrit/gerrit_rest.py +298 -0
- github2gerrit/gerrit_urls.py +353 -0
- github2gerrit/github_api.py +17 -95
- github2gerrit/gitutils.py +225 -41
- github2gerrit/models.py +3 -0
- github2gerrit/pr_content_filter.py +476 -0
- github2gerrit/similarity.py +458 -0
- github2gerrit/ssh_agent_setup.py +351 -0
- github2gerrit/ssh_common.py +244 -0
- github2gerrit/ssh_discovery.py +24 -67
- github2gerrit/utils.py +113 -0
- github2gerrit-0.1.7.dist-info/METADATA +798 -0
- github2gerrit-0.1.7.dist-info/RECORD +24 -0
- github2gerrit-0.1.5.dist-info/METADATA +0 -555
- github2gerrit-0.1.5.dist-info/RECORD +0 -15
- {github2gerrit-0.1.5.dist-info → github2gerrit-0.1.7.dist-info}/WHEEL +0 -0
- {github2gerrit-0.1.5.dist-info → github2gerrit-0.1.7.dist-info}/entry_points.txt +0 -0
- {github2gerrit-0.1.5.dist-info → github2gerrit-0.1.7.dist-info}/licenses/LICENSE +0 -0
- {github2gerrit-0.1.5.dist-info → github2gerrit-0.1.7.dist-info}/top_level.txt +0 -0
@@ -10,16 +10,20 @@ submissions from automated tools like Dependabot.
|
|
10
10
|
"""
|
11
11
|
|
12
12
|
import hashlib
|
13
|
+
import json
|
13
14
|
import logging
|
14
15
|
import os
|
15
16
|
import re
|
17
|
+
import urllib.error
|
16
18
|
import urllib.parse
|
17
19
|
import urllib.request
|
20
|
+
from collections.abc import Iterable
|
18
21
|
from datetime import UTC
|
19
22
|
from datetime import datetime
|
20
23
|
from datetime import timedelta
|
21
24
|
from pathlib import Path
|
22
25
|
|
26
|
+
from .gerrit_urls import create_gerrit_url_builder
|
23
27
|
from .github_api import GhPullRequest
|
24
28
|
from .github_api import GhRepository
|
25
29
|
from .github_api import build_client
|
@@ -57,9 +61,7 @@ class DuplicateChangeError(Exception):
|
|
57
61
|
class ChangeFingerprint:
|
58
62
|
"""Represents a fingerprint of a change for duplicate detection."""
|
59
63
|
|
60
|
-
def __init__(
|
61
|
-
self, title: str, body: str = "", files_changed: list[str] | None = None
|
62
|
-
):
|
64
|
+
def __init__(self, title: str, body: str = "", files_changed: list[str] | None = None):
|
63
65
|
self.title = title.strip()
|
64
66
|
self.body = (body or "").strip()
|
65
67
|
self.files_changed = sorted(files_changed or [])
|
@@ -100,15 +102,10 @@ class ChangeFingerprint:
|
|
100
102
|
|
101
103
|
def _compute_content_hash(self) -> str:
|
102
104
|
"""Compute a hash of the change content."""
|
103
|
-
content = (
|
104
|
-
f"{self._normalized_title}\n{self.body}\n"
|
105
|
-
f"{','.join(self.files_changed)}"
|
106
|
-
)
|
105
|
+
content = f"{self._normalized_title}\n{self.body}\n{','.join(self.files_changed)}"
|
107
106
|
return hashlib.sha256(content.encode("utf-8")).hexdigest()[:16]
|
108
107
|
|
109
|
-
def is_similar_to(
|
110
|
-
self, other: "ChangeFingerprint", similarity_threshold: float = 0.8
|
111
|
-
) -> bool:
|
108
|
+
def is_similar_to(self, other: "ChangeFingerprint", similarity_threshold: float = 0.8) -> bool:
|
112
109
|
"""Check if this fingerprint is similar to another."""
|
113
110
|
# Exact normalized title match
|
114
111
|
if self._normalized_title == other._normalized_title:
|
@@ -131,9 +128,7 @@ class ChangeFingerprint:
|
|
131
128
|
# Check title similarity even without file changes
|
132
129
|
return self._titles_similar(other, similarity_threshold)
|
133
130
|
|
134
|
-
def _titles_similar(
|
135
|
-
self, other: "ChangeFingerprint", threshold: float
|
136
|
-
) -> bool:
|
131
|
+
def _titles_similar(self, other: "ChangeFingerprint", threshold: float) -> bool:
|
137
132
|
"""Check if titles are similar using simple string similarity."""
|
138
133
|
title1 = self._normalized_title
|
139
134
|
title2 = other._normalized_title
|
@@ -154,28 +149,29 @@ class ChangeFingerprint:
|
|
154
149
|
return (intersection / union) >= threshold
|
155
150
|
|
156
151
|
def __str__(self) -> str:
|
157
|
-
return (
|
158
|
-
f"ChangeFingerprint(title='{self.title[:50]}...', "
|
159
|
-
f"hash={self._content_hash})"
|
160
|
-
)
|
152
|
+
return f"ChangeFingerprint(title='{self.title[:50]}...', hash={self._content_hash})"
|
161
153
|
|
162
154
|
|
163
155
|
class DuplicateDetector:
|
164
156
|
"""Detects duplicate Gerrit changes for GitHub pull requests."""
|
165
157
|
|
166
|
-
def __init__(
|
158
|
+
def __init__(
|
159
|
+
self,
|
160
|
+
repo: GhRepository,
|
161
|
+
lookback_days: int = 7,
|
162
|
+
duplicates_filter: str = "open",
|
163
|
+
):
|
167
164
|
self.repo = repo
|
168
165
|
self.lookback_days = lookback_days
|
169
166
|
self._cutoff_date = datetime.now(UTC) - timedelta(days=lookback_days)
|
167
|
+
self.duplicates_filter = duplicates_filter
|
170
168
|
|
171
169
|
def _match_first_group(self, pattern: str, text: str) -> str:
|
172
170
|
"""Extract first regex group match from text."""
|
173
171
|
match = re.search(pattern, text)
|
174
172
|
return match.group(1) if match else ""
|
175
173
|
|
176
|
-
def _resolve_gerrit_info_from_env_or_gitreview(
|
177
|
-
self, gh: GitHubContext
|
178
|
-
) -> tuple[str, str] | None:
|
174
|
+
def _resolve_gerrit_info_from_env_or_gitreview(self, gh: GitHubContext) -> tuple[str, str] | None:
|
179
175
|
"""Resolve Gerrit host and project from environment or .gitreview file.
|
180
176
|
|
181
177
|
Returns:
|
@@ -198,6 +194,8 @@ class DuplicateDetector:
|
|
198
194
|
if host and proj:
|
199
195
|
project = proj.removesuffix(".git")
|
200
196
|
return (host.strip(), project.strip())
|
197
|
+
if host and not proj:
|
198
|
+
return (host.strip(), "")
|
201
199
|
except Exception as exc:
|
202
200
|
log.debug("Failed to read local .gitreview: %s", exc)
|
203
201
|
|
@@ -219,38 +217,28 @@ class DuplicateDetector:
|
|
219
217
|
if not branch:
|
220
218
|
continue
|
221
219
|
|
222
|
-
url = (
|
223
|
-
f"https://raw.githubusercontent.com/"
|
224
|
-
f"{repo_full}/refs/heads/{branch}/.gitreview"
|
225
|
-
)
|
220
|
+
url = f"https://raw.githubusercontent.com/{repo_full}/{branch}/.gitreview"
|
226
221
|
|
227
222
|
parsed = urllib.parse.urlparse(url)
|
228
|
-
if (
|
229
|
-
parsed.scheme != "https"
|
230
|
-
or parsed.netloc != "raw.githubusercontent.com"
|
231
|
-
):
|
223
|
+
if parsed.scheme != "https" or parsed.netloc != "raw.githubusercontent.com":
|
232
224
|
continue
|
233
225
|
|
234
226
|
try:
|
235
227
|
log.debug("Fetching .gitreview from: %s", url)
|
236
|
-
with urllib.request.urlopen(url, timeout=5) as resp:
|
228
|
+
with urllib.request.urlopen(url, timeout=5) as resp:
|
237
229
|
text_remote = resp.read().decode("utf-8")
|
238
230
|
|
239
|
-
host = self._match_first_group(
|
240
|
-
r"(?m)^host=(.+)$", text_remote
|
241
|
-
)
|
242
|
-
proj = self._match_first_group(
|
243
|
-
r"(?m)^project=(.+)$", text_remote
|
244
|
-
)
|
231
|
+
host = self._match_first_group(r"(?m)^host=(.+)$", text_remote)
|
232
|
+
proj = self._match_first_group(r"(?m)^project=(.+)$", text_remote)
|
245
233
|
|
246
234
|
if host and proj:
|
247
235
|
project = proj.removesuffix(".git")
|
248
236
|
return (host.strip(), project.strip())
|
237
|
+
if host and not proj:
|
238
|
+
return (host.strip(), "")
|
249
239
|
|
250
240
|
except Exception as exc:
|
251
|
-
log.debug(
|
252
|
-
"Failed to fetch .gitreview from %s: %s", url, exc
|
253
|
-
)
|
241
|
+
log.debug("Failed to fetch .gitreview from %s: %s", url, exc)
|
254
242
|
continue
|
255
243
|
|
256
244
|
except Exception as exc:
|
@@ -261,22 +249,14 @@ class DuplicateDetector:
|
|
261
249
|
def _build_gerrit_rest_client(self, gerrit_host: str) -> object | None:
|
262
250
|
"""Build a Gerrit REST API client if pygerrit2 is available."""
|
263
251
|
if GerritRestAPI is None:
|
264
|
-
log.debug(
|
265
|
-
"pygerrit2 not available, skipping Gerrit duplicate check"
|
266
|
-
)
|
252
|
+
log.debug("pygerrit2 not available, skipping Gerrit duplicate check")
|
267
253
|
return None
|
268
254
|
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
if not base_path
|
273
|
-
else f"https://{gerrit_host}/{base_path}/"
|
274
|
-
)
|
255
|
+
# Create centralized URL builder
|
256
|
+
url_builder = create_gerrit_url_builder(gerrit_host)
|
257
|
+
base_url = url_builder.api_url()
|
275
258
|
|
276
|
-
http_user = (
|
277
|
-
os.getenv("GERRIT_HTTP_USER", "").strip()
|
278
|
-
or os.getenv("GERRIT_SSH_USER_G2G", "").strip()
|
279
|
-
)
|
259
|
+
http_user = os.getenv("GERRIT_HTTP_USER", "").strip() or os.getenv("GERRIT_SSH_USER_G2G", "").strip()
|
280
260
|
http_pass = os.getenv("GERRIT_HTTP_PASSWORD", "").strip()
|
281
261
|
|
282
262
|
try:
|
@@ -295,141 +275,6 @@ class DuplicateDetector:
|
|
295
275
|
log.debug("Failed to create Gerrit REST client: %s", exc)
|
296
276
|
return None
|
297
277
|
|
298
|
-
def _build_gerrit_rest_client_with_r_path(
|
299
|
-
self, gerrit_host: str
|
300
|
-
) -> object | None:
|
301
|
-
"""Build a Gerrit REST API client with /r/ base path for fallback."""
|
302
|
-
if GerritRestAPI is None:
|
303
|
-
return None
|
304
|
-
|
305
|
-
fallback_url = f"https://{gerrit_host}/r/"
|
306
|
-
http_user = (
|
307
|
-
os.getenv("GERRIT_HTTP_USER", "").strip()
|
308
|
-
or os.getenv("GERRIT_SSH_USER_G2G", "").strip()
|
309
|
-
)
|
310
|
-
http_pass = os.getenv("GERRIT_HTTP_PASSWORD", "").strip()
|
311
|
-
|
312
|
-
try:
|
313
|
-
if http_user and http_pass:
|
314
|
-
if HTTPBasicAuth is None:
|
315
|
-
return None
|
316
|
-
# Type ignore needed for dynamic import returning Any
|
317
|
-
return GerritRestAPI( # type: ignore[no-any-return]
|
318
|
-
url=fallback_url, auth=HTTPBasicAuth(http_user, http_pass)
|
319
|
-
)
|
320
|
-
else:
|
321
|
-
# Type ignore needed for dynamic import returning Any
|
322
|
-
return GerritRestAPI(url=fallback_url) # type: ignore[no-any-return]
|
323
|
-
except Exception as exc:
|
324
|
-
log.debug(
|
325
|
-
"Failed to create Gerrit REST client with /r/ path: %s", exc
|
326
|
-
)
|
327
|
-
return None
|
328
|
-
|
329
|
-
def check_gerrit_for_existing_change(self, gh: GitHubContext) -> bool:
|
330
|
-
"""Check if a Gerrit change already exists for the given GitHub PR.
|
331
|
-
|
332
|
-
Args:
|
333
|
-
gh: GitHub context containing PR and repository information
|
334
|
-
|
335
|
-
Returns:
|
336
|
-
True if a Gerrit change already exists for this PR, False otherwise
|
337
|
-
"""
|
338
|
-
if not gh.pr_number:
|
339
|
-
return False
|
340
|
-
|
341
|
-
# Resolve Gerrit host and project
|
342
|
-
gerrit_info = self._resolve_gerrit_info_from_env_or_gitreview(gh)
|
343
|
-
if not gerrit_info:
|
344
|
-
log.debug(
|
345
|
-
"Cannot resolve Gerrit host/project, "
|
346
|
-
"skipping Gerrit duplicate check"
|
347
|
-
)
|
348
|
-
return False
|
349
|
-
|
350
|
-
gerrit_host, gerrit_project = gerrit_info
|
351
|
-
|
352
|
-
rest = self._build_gerrit_rest_client(gerrit_host)
|
353
|
-
if rest is None:
|
354
|
-
log.debug(
|
355
|
-
"Cannot check Gerrit for duplicates, REST client unavailable"
|
356
|
-
)
|
357
|
-
return False
|
358
|
-
|
359
|
-
# Generate the GitHub change hash for this PR
|
360
|
-
github_hash = DuplicateDetector._generate_github_change_hash(gh)
|
361
|
-
|
362
|
-
try:
|
363
|
-
# Search for changes that contain the GitHub hash in commit messages
|
364
|
-
# This is more reliable than comment-based searches
|
365
|
-
query = (
|
366
|
-
f'project:{gerrit_project} message:"GitHub-Hash: {github_hash}"'
|
367
|
-
)
|
368
|
-
path = f"/changes/?q={query}&n=10"
|
369
|
-
|
370
|
-
log.debug(
|
371
|
-
"Searching Gerrit for existing changes with GitHub hash %s, "
|
372
|
-
"query: %s",
|
373
|
-
github_hash,
|
374
|
-
query,
|
375
|
-
)
|
376
|
-
# Use getattr for dynamic method access to avoid type checking
|
377
|
-
changes = rest.get(path) # type: ignore[attr-defined]
|
378
|
-
|
379
|
-
if changes:
|
380
|
-
log.info(
|
381
|
-
"Found %d existing Gerrit change(s) for GitHub PR #%d: %s",
|
382
|
-
len(changes),
|
383
|
-
gh.pr_number,
|
384
|
-
[f"{c.get('_number', '?')}" for c in changes],
|
385
|
-
)
|
386
|
-
return True
|
387
|
-
else:
|
388
|
-
log.debug(
|
389
|
-
"No existing Gerrit changes found for GitHub PR #%d",
|
390
|
-
gh.pr_number,
|
391
|
-
)
|
392
|
-
return False
|
393
|
-
|
394
|
-
except Exception as exc:
|
395
|
-
# Check if this is a 404 error and try /r/ fallback
|
396
|
-
status = getattr(
|
397
|
-
getattr(exc, "response", None), "status_code", None
|
398
|
-
)
|
399
|
-
if status == 404:
|
400
|
-
try:
|
401
|
-
log.debug("Trying /r/ fallback for Gerrit API")
|
402
|
-
fallback_rest = self._build_gerrit_rest_client_with_r_path(
|
403
|
-
gerrit_host
|
404
|
-
)
|
405
|
-
if fallback_rest:
|
406
|
-
changes = fallback_rest.get(path) # type: ignore[attr-defined]
|
407
|
-
if changes:
|
408
|
-
log.info(
|
409
|
-
"Found %d existing Gerrit change(s) for PR #%d "
|
410
|
-
"via /r/ fallback: %s",
|
411
|
-
len(changes),
|
412
|
-
gh.pr_number,
|
413
|
-
[f"{c.get('_number', '?')}" for c in changes],
|
414
|
-
)
|
415
|
-
return True
|
416
|
-
else:
|
417
|
-
log.debug(
|
418
|
-
"No existing Gerrit changes found for PR #%d "
|
419
|
-
"via /r/ fallback",
|
420
|
-
gh.pr_number,
|
421
|
-
)
|
422
|
-
return False
|
423
|
-
except Exception as exc2:
|
424
|
-
log.warning(
|
425
|
-
"Failed to query Gerrit via /r/ fallback: %s", exc2
|
426
|
-
)
|
427
|
-
return False
|
428
|
-
|
429
|
-
log.warning("Failed to query Gerrit for existing changes: %s", exc)
|
430
|
-
# If we can't check Gerrit, err on the side of caution
|
431
|
-
return False
|
432
|
-
|
433
278
|
@staticmethod
|
434
279
|
def _generate_github_change_hash(gh: GitHubContext) -> str:
|
435
280
|
"""Generate a deterministic hash for a GitHub PR to identify duplicates.
|
@@ -454,9 +299,7 @@ class DuplicateDetector:
|
|
454
299
|
hash_bytes = hashlib.sha256(hash_input.encode("utf-8")).digest()
|
455
300
|
hash_hex = hash_bytes.hex()[:16]
|
456
301
|
|
457
|
-
log.debug(
|
458
|
-
"Generated GitHub change hash for %s: %s", hash_input, hash_hex
|
459
|
-
)
|
302
|
+
log.debug("Generated GitHub change hash for %s: %s", hash_input, hash_hex)
|
460
303
|
return hash_hex
|
461
304
|
|
462
305
|
def check_for_duplicates(
|
@@ -465,37 +308,310 @@ class DuplicateDetector:
|
|
465
308
|
allow_duplicates: bool = False,
|
466
309
|
gh: GitHubContext | None = None,
|
467
310
|
) -> None:
|
468
|
-
"""Check if the target PR is a duplicate
|
311
|
+
"""Check if the target PR is a duplicate via subject equality against Gerrit.
|
469
312
|
|
470
|
-
|
471
|
-
|
472
|
-
|
473
|
-
|
474
|
-
|
475
|
-
Raises:
|
476
|
-
DuplicateChangeError: If duplicates found and allow_duplicates=False
|
313
|
+
Implements a robust, dependency-free subject-first duplicate check:
|
314
|
+
- Resolve Gerrit host/project from env or .gitreview
|
315
|
+
- Query Gerrit changes updated within the lookback window (excluding abandoned)
|
316
|
+
- Compare normalized subjects (first line) for exact equality
|
317
|
+
- If any match, treat as duplicate and either warn or raise
|
477
318
|
"""
|
478
319
|
pr_number = getattr(target_pr, "number", 0)
|
320
|
+
pr_title = (getattr(target_pr, "title", "") or "").strip()
|
479
321
|
|
480
|
-
log.debug(
|
322
|
+
log.debug(
|
323
|
+
"Checking PR #%d for duplicates via subject equality against Gerrit",
|
324
|
+
pr_number,
|
325
|
+
)
|
326
|
+
|
327
|
+
if not pr_title:
|
328
|
+
log.debug("PR #%d has empty title; skipping duplicate check", pr_number)
|
329
|
+
return
|
330
|
+
if gh is None:
|
331
|
+
log.debug("No GitHub context provided; skipping duplicate check")
|
332
|
+
return
|
333
|
+
|
334
|
+
# Resolve Gerrit target (host/project)
|
335
|
+
gerrit_info = self._resolve_gerrit_info_from_env_or_gitreview(gh)
|
336
|
+
if not gerrit_info:
|
337
|
+
log.debug("Unable to resolve Gerrit host/project; skipping duplicate check")
|
338
|
+
return
|
339
|
+
gerrit_host, gerrit_project = gerrit_info
|
481
340
|
|
482
|
-
#
|
483
|
-
|
484
|
-
|
485
|
-
|
486
|
-
|
487
|
-
|
341
|
+
# Helper: normalize subject like our existing title normalization
|
342
|
+
def _normalize_subject(title: str) -> str:
|
343
|
+
normalized = title.strip()
|
344
|
+
normalized = re.sub(
|
345
|
+
r"^(feat|fix|docs|style|refactor|test|chore|ci|build|perf)(\(.+?\))?: ",
|
346
|
+
"",
|
347
|
+
normalized,
|
348
|
+
flags=re.IGNORECASE,
|
488
349
|
)
|
350
|
+
normalized = re.sub(r"[*_`]", "", normalized)
|
351
|
+
normalized = re.sub(r"\bv\d+(\.\d+)*(-\w+)?\b", "vx.y.z", normalized)
|
352
|
+
normalized = re.sub(r"\b\d+(\.\d+)+(-\w+)?\b", "x.y.z", normalized)
|
353
|
+
normalized = re.sub(r"\b\d+\.\d+\b", "x.y.z", normalized)
|
354
|
+
normalized = re.sub(r"\b[a-f0-9]{7,40}\b", "commit_hash", normalized)
|
355
|
+
normalized = re.sub(r"\s+", " ", normalized).strip()
|
356
|
+
return normalized.lower()
|
357
|
+
|
358
|
+
normalized_pr_subject = _normalize_subject(pr_title)
|
359
|
+
log.debug(
|
360
|
+
"Normalized PR subject for duplicate check: %s",
|
361
|
+
normalized_pr_subject,
|
362
|
+
)
|
363
|
+
|
364
|
+
# Build Gerrit REST URL using centralized URL builder
|
365
|
+
url_builder = create_gerrit_url_builder(gerrit_host)
|
366
|
+
api_base = url_builder.api_url().rstrip("/")
|
367
|
+
|
368
|
+
# Track which base path actually works for constructing display URLs
|
369
|
+
successful_base_path = url_builder.base_path
|
370
|
+
|
371
|
+
# Build query: limit to recent changes, exclude abandoned; prefer open
|
372
|
+
cutoff_date = self._cutoff_date.date().isoformat()
|
373
|
+
q_parts = []
|
374
|
+
if gerrit_project:
|
375
|
+
q_parts.append(f"project:{gerrit_project}")
|
376
|
+
# Build status clause from DUPLICATES filter (default: open)
|
377
|
+
dup_filter = (self.duplicates_filter or "open").strip().lower()
|
378
|
+
selected = [s.strip() for s in dup_filter.split(",") if s.strip()]
|
379
|
+
valid = {
|
380
|
+
"open": "status:open",
|
381
|
+
"merged": "status:merged",
|
382
|
+
"abandoned": "status:abandoned",
|
383
|
+
}
|
384
|
+
status_terms = [valid[s] for s in selected if s in valid]
|
385
|
+
if not status_terms:
|
386
|
+
status_clause = "status:open"
|
387
|
+
elif len(status_terms) == 1:
|
388
|
+
status_clause = status_terms[0]
|
389
|
+
else:
|
390
|
+
status_clause = "(" + " OR ".join(status_terms) + ")"
|
391
|
+
q_parts.append(status_clause)
|
392
|
+
q_parts.append(f"after:{cutoff_date}")
|
393
|
+
query = " ".join(q_parts)
|
394
|
+
encoded_q = urllib.parse.quote(query, safe="")
|
395
|
+
|
396
|
+
# Request current commit metadata so we get 'subject'
|
397
|
+
# Use a modest page size
|
398
|
+
url = f"{api_base}/changes/?q={encoded_q}&n=50&o=CURRENT_COMMIT&o=CURRENT_FILES"
|
399
|
+
|
400
|
+
def _load_gerrit_json(url_: str) -> list[dict[str, object]]:
|
401
|
+
try:
|
402
|
+
log.debug("Querying Gerrit for duplicates: %s", url_)
|
403
|
+
# Ensure we only fetch over HTTPS to avoid unsafe schemes
|
404
|
+
parsed = urllib.parse.urlparse(url_)
|
405
|
+
if parsed.scheme != "https":
|
406
|
+
log.debug("Skipping non-HTTPS URL for Gerrit query: %s", url_)
|
407
|
+
return []
|
408
|
+
headers: dict[str, str] = {}
|
409
|
+
http_user = os.getenv("GERRIT_HTTP_USER", "").strip()
|
410
|
+
http_pass = os.getenv("GERRIT_HTTP_PASSWORD", "").strip()
|
411
|
+
if http_user and http_pass:
|
412
|
+
import base64 as _b64 # localized import to avoid global import edit
|
413
|
+
|
414
|
+
basic = _b64.b64encode(f"{http_user}:{http_pass}".encode()).decode("ascii")
|
415
|
+
headers["Authorization"] = f"Basic {basic}"
|
416
|
+
req = urllib.request.Request(url_, headers=headers)
|
417
|
+
with urllib.request.urlopen(req, timeout=8) as resp:
|
418
|
+
raw = resp.read().decode("utf-8", errors="replace")
|
419
|
+
# Strip Gerrit's XSSI prefix if present
|
420
|
+
if raw.startswith(")]}'"):
|
421
|
+
raw = raw.split("\n", 1)[1] if "\n" in raw else ""
|
422
|
+
data = json.loads(raw or "[]")
|
423
|
+
if isinstance(data, list):
|
424
|
+
return data
|
425
|
+
else:
|
426
|
+
return []
|
427
|
+
except urllib.error.HTTPError as exc:
|
428
|
+
log.debug("Gerrit query failed for %s: %s", url_, exc)
|
429
|
+
return []
|
430
|
+
except Exception as exc:
|
431
|
+
log.debug("Gerrit query failed for %s: %s", url_, exc)
|
432
|
+
return []
|
433
|
+
|
434
|
+
log.debug(
|
435
|
+
"Gerrit duplicate query: host=%s project=%s filter=%s cutoff=%s url=%s",
|
436
|
+
gerrit_host,
|
437
|
+
gerrit_project or "(any)",
|
438
|
+
dup_filter,
|
439
|
+
cutoff_date,
|
440
|
+
url,
|
441
|
+
)
|
442
|
+
changes = _load_gerrit_json(url)
|
443
|
+
log.debug(
|
444
|
+
"Gerrit query returned %d change(s) for project=%s filter=%s after=%s",
|
445
|
+
len(changes),
|
446
|
+
gerrit_project or "(any)",
|
447
|
+
dup_filter,
|
448
|
+
cutoff_date,
|
449
|
+
)
|
450
|
+
if changes:
|
451
|
+
sample_subjects = ", ".join(str(c.get("subject") or "")[:60] for c in changes[:5])
|
452
|
+
log.debug("Sample subjects: %s", sample_subjects)
|
453
|
+
|
454
|
+
# Compare normalized subjects for exact equality
|
455
|
+
matched: list[tuple[int, str]] = []
|
456
|
+
for c in changes:
|
457
|
+
subj = str(c.get("subject") or "").strip()
|
458
|
+
if not subj:
|
459
|
+
continue
|
460
|
+
if _normalize_subject(subj) == normalized_pr_subject:
|
461
|
+
num = c.get("_number")
|
462
|
+
proj = str(c.get("project") or gerrit_project or "")
|
463
|
+
if isinstance(num, int):
|
464
|
+
matched.append((num, proj))
|
465
|
+
|
466
|
+
if not matched:
|
467
|
+
# No exact subject match; proceed with similarity scoring across candidates
|
468
|
+
log.debug("No exact-subject matches found; entering similarity scoring")
|
469
|
+
from .similarity import ScoringConfig
|
470
|
+
from .similarity import aggregate_scores
|
471
|
+
from .similarity import remove_commit_trailers
|
472
|
+
from .similarity import score_bodies
|
473
|
+
from .similarity import score_files
|
474
|
+
from .similarity import score_subjects
|
475
|
+
|
476
|
+
config = ScoringConfig()
|
477
|
+
# Source features from the PR
|
478
|
+
src_subjects = [pr_title]
|
479
|
+
src_body = str(getattr(target_pr, "body", "") or "")
|
480
|
+
src_files: list[str] = []
|
481
|
+
try:
|
482
|
+
get_files = getattr(target_pr, "get_files", None)
|
483
|
+
if callable(get_files):
|
484
|
+
files_obj = get_files()
|
485
|
+
if isinstance(files_obj, Iterable):
|
486
|
+
for f in files_obj:
|
487
|
+
fname = getattr(f, "filename", None)
|
488
|
+
if fname:
|
489
|
+
src_files.append(str(fname))
|
490
|
+
except Exception as exc:
|
491
|
+
# Best-effort; if files cannot be retrieved, proceed without them
|
492
|
+
log.debug("Failed to retrieve PR files for scoring: %s", exc)
|
493
|
+
|
494
|
+
best_score = 0.0
|
495
|
+
best_reasons: list[str] = []
|
496
|
+
hits: list[tuple[float, str, int | None]] = []
|
497
|
+
all_nums: list[int] = []
|
498
|
+
for c in changes:
|
499
|
+
subj = str(c.get("subject") or "").strip()
|
500
|
+
if not subj:
|
501
|
+
continue
|
502
|
+
# Extract commit message and files from revisions (CURRENT_COMMIT, CURRENT_FILES)
|
503
|
+
rev = str(c.get("current_revision") or "")
|
504
|
+
revs_obj = c.get("revisions")
|
505
|
+
revs = revs_obj if isinstance(revs_obj, dict) else {}
|
506
|
+
cur_obj = revs.get(rev)
|
507
|
+
cur = cur_obj if isinstance(cur_obj, dict) else {}
|
508
|
+
commit = cur.get("commit") or {}
|
509
|
+
msg = str(commit.get("message") or "")
|
510
|
+
cand_body_raw = ""
|
511
|
+
if "\n" in msg:
|
512
|
+
cand_body_raw = msg.split("\n", 1)[1]
|
513
|
+
cand_body = remove_commit_trailers(cand_body_raw)
|
514
|
+
files_dict = cur.get("files") or {}
|
515
|
+
cand_files = [p for p in files_dict if isinstance(p, str) and not p.startswith("/")]
|
516
|
+
|
517
|
+
# Compute component scores
|
518
|
+
s_res = score_subjects(src_subjects, subj)
|
519
|
+
f_res = score_files(
|
520
|
+
src_files,
|
521
|
+
cand_files,
|
522
|
+
workflow_min_floor=config.workflow_min_floor,
|
523
|
+
)
|
524
|
+
b_res = score_bodies(src_body, cand_body)
|
525
|
+
|
526
|
+
# Aggregate
|
527
|
+
agg = aggregate_scores(s_res.score, f_res.score, b_res.score, config=config)
|
528
|
+
log.debug(
|
529
|
+
"Aggregate score computed: %.2f (s=%.2f f=%.2f b=%.2f)",
|
530
|
+
agg,
|
531
|
+
s_res.score,
|
532
|
+
f_res.score,
|
533
|
+
b_res.score,
|
534
|
+
)
|
489
535
|
|
490
|
-
|
491
|
-
|
492
|
-
|
536
|
+
# Build candidate reference and number using successful base path
|
537
|
+
num_obj = c.get("_number")
|
538
|
+
num = int(num_obj) if isinstance(num_obj, int) else None
|
539
|
+
proj = str(c.get("project") or gerrit_project or "")
|
540
|
+
|
541
|
+
# Use the base path that actually worked for API calls
|
542
|
+
display_url_builder = create_gerrit_url_builder(gerrit_host, successful_base_path)
|
543
|
+
ref = (
|
544
|
+
display_url_builder.change_url(proj, num)
|
545
|
+
if proj and isinstance(num, int)
|
546
|
+
else (f"change {num}" if isinstance(num, int) else "")
|
493
547
|
)
|
494
|
-
|
548
|
+
log.debug(
|
549
|
+
"Scoring candidate: ref=%s agg=%.2f (s=%.2f f=%.2f b=%.2f) subj='%s'",
|
550
|
+
ref or "(none)",
|
551
|
+
agg,
|
552
|
+
s_res.score,
|
553
|
+
f_res.score,
|
554
|
+
b_res.score,
|
555
|
+
subj[:200],
|
556
|
+
)
|
557
|
+
|
558
|
+
# Track best (for reasons)
|
559
|
+
if agg > best_score:
|
560
|
+
best_score = agg
|
561
|
+
# Deduplicate reasons preserving order
|
562
|
+
best_reasons = list(dict.fromkeys(s_res.reasons + f_res.reasons + b_res.reasons))
|
563
|
+
|
564
|
+
# Collect all candidates above threshold
|
565
|
+
if agg >= config.similarity_threshold and ref:
|
566
|
+
hits.append((agg, ref, num))
|
567
|
+
if isinstance(num, int):
|
568
|
+
all_nums.append(num)
|
569
|
+
|
570
|
+
log.debug(
|
571
|
+
"Similarity scoring found %d hit(s) (threshold=%.2f)",
|
572
|
+
len(hits),
|
573
|
+
config.similarity_threshold,
|
574
|
+
)
|
575
|
+
if hits:
|
576
|
+
hits_sorted = sorted(hits, key=lambda t: t[0], reverse=True)
|
577
|
+
|
578
|
+
# Log each matching change individually
|
579
|
+
for s, u, _ in hits_sorted:
|
580
|
+
if u:
|
581
|
+
log.info("Score: %.2f URL: %s", s, u)
|
582
|
+
msg = f"Similar Gerrit change(s) detected [≥ {config.similarity_threshold:.2f}]"
|
583
|
+
if best_reasons:
|
584
|
+
msg += f" (Reasons: {', '.join(best_reasons)})"
|
585
|
+
if allow_duplicates:
|
586
|
+
log.warning("GERRIT DUPLICATE DETECTED (allowed): %s", msg)
|
587
|
+
return
|
588
|
+
raise DuplicateChangeError(msg, all_nums)
|
589
|
+
|
590
|
+
# Construct human-friendly references for logs
|
591
|
+
matching_numbers: list[int] = []
|
592
|
+
match_lines: list[str] = []
|
593
|
+
for n, proj in matched:
|
594
|
+
if proj:
|
595
|
+
# Use the base path that actually worked for API calls
|
596
|
+
display_url_builder = create_gerrit_url_builder(gerrit_host, successful_base_path)
|
597
|
+
url = display_url_builder.change_url(proj, n)
|
598
|
+
match_lines.append(f"Score: 1.0 URL: {url}")
|
599
|
+
log.info("Score: 1.0 URL: %s", url)
|
495
600
|
else:
|
496
|
-
|
601
|
+
match_lines.append(f"Score: 1.0 URL: change {n}")
|
602
|
+
log.info("Score: 1.0 URL: change %s", n)
|
603
|
+
matching_numbers.append(n)
|
604
|
+
|
605
|
+
if not matched:
|
606
|
+
log.debug("No exact subject matches and no similarity matches; duplicate check passes")
|
607
|
+
return
|
497
608
|
|
498
|
-
|
609
|
+
# Remove PR number from message since cli.py already includes it
|
610
|
+
full_message = "subject matches existing Gerrit change(s)"
|
611
|
+
if allow_duplicates:
|
612
|
+
log.warning("GERRIT DUPLICATE DETECTED (allowed): %s", full_message)
|
613
|
+
return
|
614
|
+
raise DuplicateChangeError(full_message, matching_numbers)
|
499
615
|
|
500
616
|
|
501
617
|
def check_for_duplicates(
|
@@ -525,10 +641,12 @@ def check_for_duplicates(
|
|
525
641
|
target_pr = repo.get_pull(gh.pr_number)
|
526
642
|
|
527
643
|
# Create detector and check
|
528
|
-
detector = DuplicateDetector(
|
529
|
-
|
530
|
-
|
644
|
+
detector = DuplicateDetector(
|
645
|
+
repo,
|
646
|
+
lookback_days=lookback_days,
|
647
|
+
duplicates_filter=os.getenv("DUPLICATES", "open"),
|
531
648
|
)
|
649
|
+
detector.check_for_duplicates(target_pr, allow_duplicates=allow_duplicates, gh=gh)
|
532
650
|
|
533
651
|
log.info("Duplicate check completed for PR #%d", gh.pr_number)
|
534
652
|
|
@@ -536,7 +654,5 @@ def check_for_duplicates(
|
|
536
654
|
# Re-raise duplicate errors
|
537
655
|
raise
|
538
656
|
except Exception as exc:
|
539
|
-
log.warning(
|
540
|
-
"Duplicate detection failed for PR #%d: %s", gh.pr_number, exc
|
541
|
-
)
|
657
|
+
log.warning("Duplicate detection failed for PR #%d: %s", gh.pr_number, exc)
|
542
658
|
# Don't fail the entire process if duplicate detection has issues
|