github2gerrit 0.1.5__py3-none-any.whl → 0.1.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- github2gerrit/cli.py +86 -117
- github2gerrit/config.py +32 -24
- github2gerrit/core.py +425 -417
- github2gerrit/duplicate_detection.py +375 -193
- github2gerrit/gerrit_urls.py +256 -0
- github2gerrit/github_api.py +6 -17
- github2gerrit/gitutils.py +30 -13
- github2gerrit/models.py +1 -0
- github2gerrit/similarity.py +458 -0
- github2gerrit/ssh_discovery.py +20 -67
- {github2gerrit-0.1.5.dist-info → github2gerrit-0.1.6.dist-info}/METADATA +22 -25
- github2gerrit-0.1.6.dist-info/RECORD +17 -0
- github2gerrit-0.1.5.dist-info/RECORD +0 -15
- {github2gerrit-0.1.5.dist-info → github2gerrit-0.1.6.dist-info}/WHEEL +0 -0
- {github2gerrit-0.1.5.dist-info → github2gerrit-0.1.6.dist-info}/entry_points.txt +0 -0
- {github2gerrit-0.1.5.dist-info → github2gerrit-0.1.6.dist-info}/licenses/LICENSE +0 -0
- {github2gerrit-0.1.5.dist-info → github2gerrit-0.1.6.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,458 @@
|
|
1
|
+
# SPDX-License-Identifier: Apache-2.0
|
2
|
+
# SPDX-FileCopyrightText: 2025 The Linux Foundation
|
3
|
+
|
4
|
+
"""
|
5
|
+
Utilities for subject/body/files similarity scoring.
|
6
|
+
|
7
|
+
This module provides normalization helpers and scoring interfaces that
|
8
|
+
will be used by the duplicate detection pipeline to compute similarity
|
9
|
+
between:
|
10
|
+
- A source pull-request (or a squashed commit to be submitted), and
|
11
|
+
- Candidate existing changes (e.g., in Gerrit).
|
12
|
+
|
13
|
+
Design goals:
|
14
|
+
- Deterministic, testable helpers with explicit inputs/outputs.
|
15
|
+
- Clear separation between normalization, feature extraction, and scoring.
|
16
|
+
- Explainability: each scorer returns both a score and human-readable reasons.
|
17
|
+
|
18
|
+
Implementation notes:
- All helpers are implemented with deterministic, standard-library-only
  logic (re, difflib); no network or repository access is required.
|
21
|
+
"""
|
22
|
+
|
23
|
+
from __future__ import annotations
|
24
|
+
|
25
|
+
import re
|
26
|
+
from collections.abc import Iterable
|
27
|
+
from collections.abc import Sequence
|
28
|
+
from dataclasses import dataclass
|
29
|
+
from difflib import SequenceMatcher
|
30
|
+
|
31
|
+
|
32
|
+
# Public API surface
|
33
|
+
__all__ = [
|
34
|
+
"ScoreResult",
|
35
|
+
"ScoringConfig",
|
36
|
+
"aggregate_scores",
|
37
|
+
"classify_automation_context",
|
38
|
+
"extract_dependency_package_from_subject",
|
39
|
+
"jaccard",
|
40
|
+
"normalize_body",
|
41
|
+
"normalize_subject",
|
42
|
+
"remove_commit_trailers",
|
43
|
+
"score_bodies",
|
44
|
+
"score_files",
|
45
|
+
"score_subjects",
|
46
|
+
"sequence_ratio",
|
47
|
+
]
|
48
|
+
|
49
|
+
|
50
|
+
@dataclass(frozen=True)
class ScoringConfig:
    """Tunable weights and thresholds for similarity aggregation.

    Attributes:
        subject_weight: Weight applied to the subject similarity score.
        files_weight: Weight applied to the files similarity score.
        body_weight: Weight applied to the body similarity score.
        similarity_threshold: Minimum aggregated score to consider a match.
        workflow_min_floor: Minimum score floor applied when both sides
            touch files under ``.github/workflows/``.
    """

    subject_weight: float = 0.45
    files_weight: float = 0.35
    body_weight: float = 0.20
    similarity_threshold: float = 0.80
    workflow_min_floor: float = 0.50
|
69
|
+
|
70
|
+
|
71
|
+
@dataclass(frozen=True)
class ScoreResult:
    """Outcome of a single similarity check.

    Attributes:
        score: Similarity confidence in [0.0, 1.0].
        reasons: Short, human-readable explanations for the score.
    """

    # score is the numeric confidence; reasons make it explainable.
    score: float
    reasons: list[str]
|
83
|
+
|
84
|
+
|
85
|
+
def normalize_subject(subject: str) -> str:
    """
    Normalize a subject/first-line string for robust comparison.

    Cleanups applied:
    - Lowercase (done first, so case-insensitive tokens such as
      "DEADBEEF12" commit hashes or "V1.2.3" versions normalize too).
    - Strip conventional-commit prefixes (feat:, fix:, chore:, ...).
    - Remove lightweight markdown punctuation (``*``, ``_``, backtick).
    - Replace semantic versions and hex commit hashes with placeholders.
    - Collapse whitespace.

    Args:
        subject: Raw subject line; falsy values are treated as empty.

    Returns:
        Normalized subject string.
    """
    # Lowercase up front: the version/hash patterns below are
    # case-sensitive, so uppercase tokens would otherwise slip through.
    s = (subject or "").strip().lower()
    # Remove conventional commit prefixes such as "feat(scope): ".
    s = re.sub(
        r"^(feat|fix|docs|style|refactor|test|chore|ci|build|perf)(\(.+?\))?:\s*",
        "",
        s,
    )
    # Remove lightweight markdown punctuation
    s = re.sub(r"[*_`]", "", s)
    # Normalize versions and commit hashes to stable placeholders.
    s = re.sub(r"\bv\d+(\.\d+)*(-[\w.]+)?\b", "vx.y.z", s)
    s = re.sub(r"\b\d+(\.\d+)+(-[\w.]+)?\b", "x.y.z", s)
    s = re.sub(r"\b[a-f0-9]{7,40}\b", "commit_hash", s)
    # Normalize whitespace
    return re.sub(r"\s+", " ", s).strip()
|
117
|
+
|
118
|
+
|
119
|
+
def normalize_body(body: str | None) -> str:
    """
    Normalize a PR/commit body string for robust comparison.

    Cleanups applied:
    - Lowercase.
    - Strip URLs entirely; they rarely carry comparable signal.
    - Replace versions, hex commit hashes, ISO dates, and issue/PR
      references (#123) with stable placeholders.
    - Collapse whitespace.

    Args:
        body: Raw body text; may be None.

    Returns:
        Normalized body string (possibly empty).
    """
    if not body:
        return ""
    text = body.lower()
    # Drop URLs first so their path segments are not mistaken for tokens.
    text = re.sub(r"https?://\S+", "", text)
    # Replace volatile tokens with placeholders.
    text = re.sub(
        r"v?\d+\.\d+\.\d+(?:\.\d+)?(?:-[a-z0-9.-]+)?",
        "VERSION",
        text,
        flags=re.IGNORECASE,
    )
    text = re.sub(r"\b[a-f0-9]{7,40}\b", "COMMIT", text)
    text = re.sub(r"\d{4}-\d{2}-\d{2}", "DATE", text)
    text = re.sub(r"#\d+", "#NUMBER", text)
    # Collapse runs of whitespace into single spaces.
    return re.sub(r"\s+", " ", text).strip()
|
155
|
+
|
156
|
+
|
157
|
+
def remove_commit_trailers(message: str) -> str:
    """
    Strip well-known commit trailers from a commit message.

    Removed trailer keys (matched case-insensitively at line start):
    Change-Id, Signed-off-by, Issue-ID, GitHub-Hash, Co-authored-by.

    Args:
        message: Full commit message including subject/body/trailers.

    Returns:
        The message with trailer lines removed and surrounding
        whitespace stripped.
    """
    trailer_re = re.compile(r"(?i)^(change-id|signed-off-by|issue-id|github-hash|co-authored-by):")
    kept = [line for line in (message or "").splitlines() if not trailer_re.match(line.strip())]
    return "\n".join(kept).strip()
|
182
|
+
|
183
|
+
|
184
|
+
def extract_dependency_package_from_subject(subject: str) -> str:
    """
    Extract a likely package name from a dependency-update subject.

    Examples:
    - "Bump requests from 2.31.0 to 2.32.0" -> "requests"
    - "chore: update org/tool from v1.2.3 to v1.2.4" -> "org/tool"

    Args:
        subject: The (possibly unnormalized) subject line.

    Returns:
        Package identifier, or empty string if none could be extracted.
    """
    lowered = (subject or "").lower()
    # Try each update verb in order; first match wins.
    for verb in ("bump", "update", "upgrade"):
        found = re.search(rf"(?:chore:\s*)?{verb}\s+([^\s]+)\s+from\s+", lowered)
        if found:
            return found.group(1).strip().strip("'\"")
    return ""
|
210
|
+
|
211
|
+
|
212
|
+
def jaccard(a: Iterable[str], b: Iterable[str]) -> float:
    """
    Compute the Jaccard index of two item collections.

    Two empty collections are defined as identical (1.0).

    Args:
        a: Iterable of items.
        b: Iterable of items.

    Returns:
        Jaccard index in [0.0, 1.0].
    """
    set_a, set_b = set(a), set(b)
    if not (set_a or set_b):
        return 1.0
    union_size = len(set_a | set_b)
    # Defensive: union_size cannot be zero here, but guard anyway.
    return len(set_a & set_b) / union_size if union_size else 0.0
|
231
|
+
|
232
|
+
|
233
|
+
def sequence_ratio(a: str, b: str) -> float:
    """
    Similarity ratio of two strings via difflib.SequenceMatcher.

    Falsy arguments (None, "") are treated as the empty string, and two
    empty strings compare as identical.

    Returns:
        Ratio in [0.0, 1.0].
    """
    left = a or ""
    right = b or ""
    return float(SequenceMatcher(None, left, right).ratio())
|
245
|
+
|
246
|
+
|
247
|
+
def classify_automation_context(
    title: str,
    body: str | None,
    author: str | None,
) -> list[str]:
    """
    Identify automation signals present in the change context.

    Detected signals:
    - "dependabot": Dependabot mentions or a dependency-name frontmatter.
    - "pre-commit": pre-commit mentions or its config file path.
    - "github-actions": Actions mentions, workflow paths, or "uses:".

    Returns:
        Ordered list of detected signals, without duplicates.
    """
    haystack = f"{title or ''} {body or ''} {author or ''}".lower()
    detected: list[str] = []
    if "dependabot" in haystack or "dependency-name:" in haystack:
        detected.append("dependabot")
    if "pre-commit" in haystack or ".pre-commit-config.yaml" in haystack:
        detected.append("pre-commit")
    if "github actions" in haystack or ".github/workflows" in haystack or "uses:" in haystack:
        detected.append("github-actions")
    # Each branch appends at most once, but keep an order-preserving
    # dedupe to stay robust against future edits.
    return list(dict.fromkeys(detected))
|
279
|
+
|
280
|
+
|
281
|
+
def score_subjects(
    source_subjects: Sequence[str],
    candidate_subject: str,
    *,
    strong_match_threshold: float = 0.95,
) -> ScoreResult:
    """
    Score subject similarity between source subjects and a candidate.

    Behavior:
    - Subjects are compared after normalization.
    - An exact normalized match, or two dependency-update subjects
      naming the same package, short-circuits with score 1.0.
    - Otherwise the score is the maximum SequenceMatcher ratio across
      all source subjects versus the candidate.
    """
    candidate_norm = normalize_subject(candidate_subject)
    # Loop-invariant: extract the candidate's package once up front.
    candidate_pkg = extract_dependency_package_from_subject(candidate_subject)
    best = 0.0
    for raw in source_subjects:
        source_norm = normalize_subject(raw)
        if source_norm and source_norm == candidate_norm:
            return ScoreResult(score=1.0, reasons=["Exact subject match"])
        source_pkg = extract_dependency_package_from_subject(raw)
        if source_pkg and candidate_pkg and source_pkg == candidate_pkg:
            return ScoreResult(score=1.0, reasons=[f"Same dependency package: {source_pkg}"])
        best = max(best, sequence_ratio(source_norm, candidate_norm))
    notes: list[str] = []
    if best >= strong_match_threshold:
        notes.append(f"Strongly similar subjects (ratio: {best:.2f})")
    elif best > 0:
        notes.append(f"Similar subjects (ratio: {best:.2f})")
    return ScoreResult(score=best, reasons=notes)
|
319
|
+
|
320
|
+
|
321
|
+
def score_files(
    source_files: Sequence[str],
    candidate_files: Sequence[str],
    *,
    workflow_min_floor: float = ScoringConfig.workflow_min_floor,
) -> ScoreResult:
    """
    Score similarity based on changed file paths.

    Paths are lowercased and stripped of version-like fragments, then
    compared via the Jaccard index of the two path sets. When both
    sides include files under ``.github/workflows/``, the score is
    floored at ``workflow_min_floor``.
    """

    def _normalize_path(path: str) -> str:
        # Lowercase and drop embedded version-like fragments so renamed
        # versioned artifacts still compare equal.
        cleaned = (path or "").strip().lower()
        cleaned = re.sub(r"v?\d+\.\d+\.\d+(?:\.\d+)?(?:-[\w.-]+)?", "", cleaned)
        return re.sub(r"\s+", " ", cleaned).strip()

    left = {_normalize_path(name) for name in source_files if name}
    right = {_normalize_path(name) for name in candidate_files if name}
    score = jaccard(left, right)
    notes: list[str] = []
    touches_workflows = lambda paths: any(p.startswith(".github/workflows/") for p in paths)
    if touches_workflows(left) and touches_workflows(right) and score < workflow_min_floor:
        score = max(score, float(workflow_min_floor))
        notes.append("Both modify workflow files (.github/workflows/*)")
    if left or right:
        notes.append(
            f"File overlap Jaccard: {score:.2f} (|n|={len(left & right)}, |U|={len(left | right)})"
        )
    return ScoreResult(score=score, reasons=notes)
|
359
|
+
|
360
|
+
|
361
|
+
def score_bodies(
    source_body: str | None,
    candidate_body: str | None,
) -> ScoreResult:
    """
    Score similarity based on normalized body text and automation patterns.

    Automation-aware: Dependabot bodies naming the same package (0.95),
    two pre-commit updates (0.9), or two updates of the same GitHub
    Action (0.9) short-circuit before the fallback SequenceMatcher
    ratio over normalized text.
    """
    if not source_body or not candidate_body:
        return ScoreResult(score=0.0, reasons=[])
    # Very short bodies carry little signal: all-or-nothing comparison.
    if min(len(source_body.strip()), len(candidate_body.strip())) < 50:
        if normalize_body(source_body) == normalize_body(candidate_body):
            return ScoreResult(score=1.0, reasons=["Short bodies exactly match"])
        return ScoreResult(score=0.0, reasons=[])

    src = source_body
    cand = candidate_body
    src_low = src.lower()
    cand_low = cand.lower()
    notes: list[str] = []

    def _dependency_name(text: str) -> str:
        # Dependabot embeds "dependency-name: <pkg>" in its frontmatter.
        found = re.search(r"dependency-name:\s*([^\s\n]+)", text, flags=re.IGNORECASE)
        return found.group(1).strip() if found else ""

    def _is_dependabot(lowered: str) -> bool:
        return "dependabot" in lowered or "dependency-name:" in lowered

    if _is_dependabot(src_low) and _is_dependabot(cand_low):
        pkg_a = _dependency_name(src)
        pkg_b = _dependency_name(cand)
        if pkg_a and pkg_b and pkg_a == pkg_b:
            return ScoreResult(score=0.95, reasons=[f"Dependabot package match: {pkg_a}"])
        # Different packages: note the shared tooling, keep scoring.
        notes.append("Both look like Dependabot bodies")

    def _is_precommit(lowered: str) -> bool:
        return "pre-commit" in lowered or ".pre-commit-config.yaml" in lowered

    if _is_precommit(src_low) and _is_precommit(cand_low):
        return ScoreResult(score=0.9, reasons=["Both look like pre-commit updates"])

    def _is_actions(lowered: str) -> bool:
        return "github actions" in lowered or ".github/workflows" in lowered or "uses:" in lowered

    if _is_actions(src_low) and _is_actions(cand_low):
        # Compare the referenced action targets ("uses: owner/action").
        use_a = re.search(r"uses:\s*([^@\s]+)", src, flags=re.IGNORECASE)
        use_b = re.search(r"uses:\s*([^@\s]+)", cand, flags=re.IGNORECASE)
        if use_a and use_b and use_a.group(1).strip() and use_a.group(1).strip() == use_b.group(1).strip():
            return ScoreResult(
                score=0.9,
                reasons=[f"Same GitHub Action: {use_a.group(1).strip()}"],
            )
        notes.append("Both look like GitHub Actions updates")

    # Fallback to normalized sequence ratio
    ratio = sequence_ratio(normalize_body(src), normalize_body(cand))
    if ratio >= 0.6:
        notes.append(f"Similar bodies (ratio: {ratio:.2f})")
    return ScoreResult(score=ratio, reasons=notes)
|
427
|
+
|
428
|
+
|
429
|
+
def aggregate_scores(
    subject_score: float,
    files_score: float,
    body_score: float,
    *,
    config: ScoringConfig | None = None,
) -> float:
    """
    Aggregate component scores into a single confidence value.

    Args:
        subject_score: Score in [0,1] for subject similarity.
        files_score: Score in [0,1] for files similarity.
        body_score: Score in [0,1] for body similarity.
        config: Weighting and threshold configuration; defaults to
            ``ScoringConfig()``.

    Returns:
        Weighted average clamped to [0,1]; 0.0 if the weight sum is
        non-positive.
    """
    cfg = ScoringConfig() if config is None else config
    weight_total = float(cfg.subject_weight + cfg.files_weight + cfg.body_weight)
    if weight_total <= 0:
        return 0.0
    weighted_sum = (
        cfg.subject_weight * float(subject_score)
        + cfg.files_weight * float(files_score)
        + cfg.body_weight * float(body_score)
    )
    # Clamp to guard against float drift pushing past the bounds.
    return min(1.0, max(0.0, weighted_sum / weight_total))
|
github2gerrit/ssh_discovery.py
CHANGED
@@ -28,31 +28,17 @@ class SSHDiscoveryError(Exception):
|
|
28
28
|
|
29
29
|
|
30
30
|
# Error message constants to comply with TRY003
|
31
|
-
_MSG_HOST_UNREACHABLE =
|
32
|
-
"Host {hostname}:{port} is not reachable. "
|
33
|
-
"Check network connectivity and server availability."
|
34
|
-
)
|
31
|
+
_MSG_HOST_UNREACHABLE = "Host {hostname}:{port} is not reachable. Check network connectivity and server availability."
|
35
32
|
_MSG_NO_KEYS_FOUND = (
|
36
|
-
"No SSH host keys found for {hostname}:{port}. "
|
37
|
-
"The server may not be running SSH or may be blocking connections."
|
33
|
+
"No SSH host keys found for {hostname}:{port}. The server may not be running SSH or may be blocking connections."
|
38
34
|
)
|
39
35
|
_MSG_NO_VALID_KEYS = (
|
40
|
-
"No valid SSH host keys found for {hostname}:{port}. "
|
41
|
-
"The ssh-keyscan output was empty or malformed."
|
42
|
-
)
|
43
|
-
_MSG_CONNECTION_FAILED = (
|
44
|
-
"Failed to connect to {hostname}:{port} for SSH key discovery. "
|
45
|
-
"Error: {error}"
|
46
|
-
)
|
47
|
-
_MSG_KEYSCAN_FAILED = (
|
48
|
-
"ssh-keyscan failed with return code {returncode}: {error}"
|
49
|
-
)
|
50
|
-
_MSG_UNEXPECTED_ERROR = (
|
51
|
-
"Unexpected error during SSH key discovery for {hostname}:{port}: {error}"
|
52
|
-
)
|
53
|
-
_MSG_SAVE_FAILED = (
|
54
|
-
"Failed to save host keys to configuration file {config_file}: {error}"
|
36
|
+
"No valid SSH host keys found for {hostname}:{port}. The ssh-keyscan output was empty or malformed."
|
55
37
|
)
|
38
|
+
_MSG_CONNECTION_FAILED = "Failed to connect to {hostname}:{port} for SSH key discovery. Error: {error}"
|
39
|
+
_MSG_KEYSCAN_FAILED = "ssh-keyscan failed with return code {returncode}: {error}"
|
40
|
+
_MSG_UNEXPECTED_ERROR = "Unexpected error during SSH key discovery for {hostname}:{port}: {error}"
|
41
|
+
_MSG_SAVE_FAILED = "Failed to save host keys to configuration file {config_file}: {error}"
|
56
42
|
|
57
43
|
|
58
44
|
def is_host_reachable(hostname: str, port: int, timeout: int = 5) -> bool:
|
@@ -64,9 +50,7 @@ def is_host_reachable(hostname: str, port: int, timeout: int = 5) -> bool:
|
|
64
50
|
return False
|
65
51
|
|
66
52
|
|
67
|
-
def fetch_ssh_host_keys(
|
68
|
-
hostname: str, port: int = 22, timeout: int = 10
|
69
|
-
) -> str:
|
53
|
+
def fetch_ssh_host_keys(hostname: str, port: int = 22, timeout: int = 10) -> str:
|
70
54
|
"""
|
71
55
|
Fetch SSH host keys for a given hostname and port using ssh-keyscan.
|
72
56
|
|
@@ -85,9 +69,7 @@ def fetch_ssh_host_keys(
|
|
85
69
|
|
86
70
|
# First check if the host is reachable
|
87
71
|
if not is_host_reachable(hostname, port, timeout=5):
|
88
|
-
raise SSHDiscoveryError(
|
89
|
-
_MSG_HOST_UNREACHABLE.format(hostname=hostname, port=port)
|
90
|
-
)
|
72
|
+
raise SSHDiscoveryError(_MSG_HOST_UNREACHABLE.format(hostname=hostname, port=port))
|
91
73
|
|
92
74
|
try:
|
93
75
|
# Use ssh-keyscan to fetch all available key types
|
@@ -142,23 +124,13 @@ def fetch_ssh_host_keys(
|
|
142
124
|
# ssh-keyscan returns 1 when it can't connect
|
143
125
|
error_msg = exc.stderr or exc.stdout or "Connection failed"
|
144
126
|
raise SSHDiscoveryError(
|
145
|
-
_MSG_CONNECTION_FAILED.format(
|
146
|
-
hostname=hostname, port=port, error=error_msg
|
147
|
-
)
|
127
|
+
_MSG_CONNECTION_FAILED.format(hostname=hostname, port=port, error=error_msg)
|
148
128
|
) from exc
|
149
129
|
else:
|
150
130
|
error_msg = exc.stderr or exc.stdout or "Unknown error"
|
151
|
-
raise SSHDiscoveryError(
|
152
|
-
_MSG_KEYSCAN_FAILED.format(
|
153
|
-
returncode=exc.returncode, error=error_msg
|
154
|
-
)
|
155
|
-
) from exc
|
131
|
+
raise SSHDiscoveryError(_MSG_KEYSCAN_FAILED.format(returncode=exc.returncode, error=error_msg)) from exc
|
156
132
|
except Exception as exc:
|
157
|
-
raise SSHDiscoveryError(
|
158
|
-
_MSG_UNEXPECTED_ERROR.format(
|
159
|
-
hostname=hostname, port=port, error=exc
|
160
|
-
)
|
161
|
-
) from exc
|
133
|
+
raise SSHDiscoveryError(_MSG_UNEXPECTED_ERROR.format(hostname=hostname, port=port, error=exc)) from exc
|
162
134
|
else:
|
163
135
|
return discovered_keys
|
164
136
|
|
@@ -196,9 +168,7 @@ def extract_gerrit_info_from_gitreview(content: str) -> tuple[str, int] | None:
|
|
196
168
|
return (hostname, port) if hostname else None
|
197
169
|
|
198
170
|
|
199
|
-
def discover_and_save_host_keys(
|
200
|
-
hostname: str, port: int, organization: str, config_path: str | None = None
|
201
|
-
) -> str:
|
171
|
+
def discover_and_save_host_keys(hostname: str, port: int, organization: str, config_path: str | None = None) -> str:
|
202
172
|
"""
|
203
173
|
Discover SSH host keys and save them to the organization's configuration.
|
204
174
|
|
@@ -224,9 +194,7 @@ def discover_and_save_host_keys(
|
|
224
194
|
return host_keys
|
225
195
|
|
226
196
|
|
227
|
-
def save_host_keys_to_config(
|
228
|
-
host_keys: str, organization: str, config_path: str | None = None
|
229
|
-
) -> None:
|
197
|
+
def save_host_keys_to_config(host_keys: str, organization: str, config_path: str | None = None) -> None:
|
230
198
|
"""
|
231
199
|
Save SSH host keys to the organization's configuration file.
|
232
200
|
|
@@ -242,9 +210,7 @@ def save_host_keys_to_config(
|
|
242
210
|
from .config import DEFAULT_CONFIG_PATH
|
243
211
|
|
244
212
|
if config_path is None:
|
245
|
-
config_path = (
|
246
|
-
os.getenv("G2G_CONFIG_PATH", "").strip() or DEFAULT_CONFIG_PATH
|
247
|
-
)
|
213
|
+
config_path = os.getenv("G2G_CONFIG_PATH", "").strip() or DEFAULT_CONFIG_PATH
|
248
214
|
|
249
215
|
config_file = Path(config_path).expanduser()
|
250
216
|
|
@@ -313,9 +279,7 @@ def save_host_keys_to_config(
|
|
313
279
|
|
314
280
|
# Insert the GERRIT_KNOWN_HOSTS entry
|
315
281
|
escaped_keys = host_keys.replace("\n", "\\n")
|
316
|
-
new_lines.insert(
|
317
|
-
section_end, f'GERRIT_KNOWN_HOSTS = "{escaped_keys}"'
|
318
|
-
)
|
282
|
+
new_lines.insert(section_end, f'GERRIT_KNOWN_HOSTS = "{escaped_keys}"')
|
319
283
|
|
320
284
|
# Write the updated configuration
|
321
285
|
config_file.write_text("\n".join(new_lines), encoding="utf-8")
|
@@ -327,9 +291,7 @@ def save_host_keys_to_config(
|
|
327
291
|
)
|
328
292
|
|
329
293
|
except Exception as exc:
|
330
|
-
raise SSHDiscoveryError(
|
331
|
-
_MSG_SAVE_FAILED.format(config_file=config_file, error=exc)
|
332
|
-
) from exc
|
294
|
+
raise SSHDiscoveryError(_MSG_SAVE_FAILED.format(config_file=config_file, error=exc)) from exc
|
333
295
|
|
334
296
|
|
335
297
|
def auto_discover_gerrit_host_keys(
|
@@ -360,21 +322,14 @@ def auto_discover_gerrit_host_keys(
|
|
360
322
|
gerrit_port = 29418
|
361
323
|
|
362
324
|
if organization is None:
|
363
|
-
organization = (
|
364
|
-
os.getenv("ORGANIZATION")
|
365
|
-
or os.getenv("GITHUB_REPOSITORY_OWNER")
|
366
|
-
or ""
|
367
|
-
).strip()
|
325
|
+
organization = (os.getenv("ORGANIZATION") or os.getenv("GITHUB_REPOSITORY_OWNER") or "").strip()
|
368
326
|
|
369
327
|
if not gerrit_hostname:
|
370
328
|
log.debug("No Gerrit hostname provided for auto-discovery")
|
371
329
|
return None
|
372
330
|
|
373
331
|
if not organization:
|
374
|
-
log.warning(
|
375
|
-
"No organization specified for SSH host key auto-discovery. "
|
376
|
-
"Cannot save to configuration file."
|
377
|
-
)
|
332
|
+
log.warning("No organization specified for SSH host key auto-discovery. Cannot save to configuration file.")
|
378
333
|
save_to_config = False
|
379
334
|
|
380
335
|
log.info(
|
@@ -404,9 +359,7 @@ def auto_discover_gerrit_host_keys(
|
|
404
359
|
log.warning("SSH host key auto-discovery failed: %s", exc)
|
405
360
|
return None
|
406
361
|
except Exception as exc:
|
407
|
-
log.warning(
|
408
|
-
"Unexpected error during SSH host key auto-discovery: %s", exc
|
409
|
-
)
|
362
|
+
log.warning("Unexpected error during SSH host key auto-discovery: %s", exc)
|
410
363
|
return None
|
411
364
|
else:
|
412
365
|
return host_keys
|