github2gerrit 0.1.9__py3-none-any.whl → 0.1.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- github2gerrit/cli.py +796 -200
- github2gerrit/commit_normalization.py +44 -15
- github2gerrit/config.py +77 -30
- github2gerrit/core.py +1576 -260
- github2gerrit/duplicate_detection.py +224 -100
- github2gerrit/external_api.py +76 -25
- github2gerrit/gerrit_query.py +286 -0
- github2gerrit/gerrit_rest.py +53 -18
- github2gerrit/gerrit_urls.py +90 -33
- github2gerrit/github_api.py +19 -6
- github2gerrit/gitutils.py +43 -14
- github2gerrit/mapping_comment.py +345 -0
- github2gerrit/models.py +15 -1
- github2gerrit/orchestrator/__init__.py +25 -0
- github2gerrit/orchestrator/reconciliation.py +589 -0
- github2gerrit/pr_content_filter.py +65 -17
- github2gerrit/reconcile_matcher.py +595 -0
- github2gerrit/rich_display.py +502 -0
- github2gerrit/rich_logging.py +316 -0
- github2gerrit/similarity.py +66 -19
- github2gerrit/ssh_agent_setup.py +59 -22
- github2gerrit/ssh_common.py +30 -11
- github2gerrit/ssh_discovery.py +67 -20
- github2gerrit/trailers.py +340 -0
- github2gerrit/utils.py +6 -2
- {github2gerrit-0.1.9.dist-info → github2gerrit-0.1.11.dist-info}/METADATA +99 -25
- github2gerrit-0.1.11.dist-info/RECORD +31 -0
- {github2gerrit-0.1.9.dist-info → github2gerrit-0.1.11.dist-info}/WHEEL +1 -2
- github2gerrit-0.1.9.dist-info/RECORD +0 -24
- github2gerrit-0.1.9.dist-info/top_level.txt +0 -1
- {github2gerrit-0.1.9.dist-info → github2gerrit-0.1.11.dist-info}/entry_points.txt +0 -0
- {github2gerrit-0.1.9.dist-info → github2gerrit-0.1.11.dist-info}/licenses/LICENSE +0 -0
github2gerrit/ssh_common.py
CHANGED
@@ -35,14 +35,17 @@ def build_ssh_options(
|
|
35
35
|
batch_mode: Enable batch mode (non-interactive)
|
36
36
|
connect_timeout: Connection timeout in seconds
|
37
37
|
additional_options: Additional SSH options to include
|
38
|
-
respect_user_ssh_config: If True, respect user SSH config; if None,
|
38
|
+
respect_user_ssh_config: If True, respect user SSH config; if None,
|
39
|
+
check G2G_RESPECT_USER_SSH env var
|
39
40
|
|
40
41
|
Returns:
|
41
42
|
List of SSH option strings suitable for ssh command line
|
42
43
|
"""
|
43
44
|
# Check if we should respect user SSH config
|
44
45
|
if respect_user_ssh_config is None:
|
45
|
-
respect_user_ssh_config = os.getenv(
|
46
|
+
respect_user_ssh_config = os.getenv(
|
47
|
+
"G2G_RESPECT_USER_SSH", "false"
|
48
|
+
).lower() in ("true", "1", "yes")
|
46
49
|
|
47
50
|
options = []
|
48
51
|
if not respect_user_ssh_config:
|
@@ -57,7 +60,8 @@ def build_ssh_options(
|
|
57
60
|
if identities_only and not respect_user_ssh_config:
|
58
61
|
options.extend(
|
59
62
|
[
|
60
|
-
"-o IdentitiesOnly=yes", # Critical: prevents SSH agent
|
63
|
+
"-o IdentitiesOnly=yes", # Critical: prevents SSH agent
|
64
|
+
# scanning
|
61
65
|
"-o IdentityAgent=none",
|
62
66
|
]
|
63
67
|
)
|
@@ -107,7 +111,8 @@ def build_git_ssh_command(
|
|
107
111
|
batch_mode: Enable batch mode (non-interactive)
|
108
112
|
connect_timeout: Connection timeout in seconds
|
109
113
|
additional_options: Additional SSH options to include
|
110
|
-
respect_user_ssh_config: If True, respect user SSH config; if None,
|
114
|
+
respect_user_ssh_config: If True, respect user SSH config; if None,
|
115
|
+
check G2G_RESPECT_USER_SSH env var
|
111
116
|
|
112
117
|
Returns:
|
113
118
|
Complete SSH command string suitable for GIT_SSH_COMMAND
|
@@ -158,9 +163,11 @@ def augment_known_hosts_with_bracketed_entries(
|
|
158
163
|
hostname: str,
|
159
164
|
port: int = 22,
|
160
165
|
) -> str:
|
161
|
-
"""Augment known_hosts content with bracketed [host]:port entries for
|
166
|
+
"""Augment known_hosts content with bracketed [host]:port entries for
|
167
|
+
non-standard ports.
|
162
168
|
|
163
|
-
This function adds bracketed [host]:port variants for existing plain host
|
169
|
+
This function adds bracketed [host]:port variants for existing plain host
|
170
|
+
entries
|
164
171
|
to satisfy StrictHostKeyChecking with non-standard SSH ports.
|
165
172
|
|
166
173
|
Args:
|
@@ -169,15 +176,21 @@ def augment_known_hosts_with_bracketed_entries(
|
|
169
176
|
port: SSH port (default 22)
|
170
177
|
|
171
178
|
Returns:
|
172
|
-
Augmented known_hosts content (always normalized to end with single
|
179
|
+
Augmented known_hosts content (always normalized to end with single
|
180
|
+
newline)
|
173
181
|
"""
|
174
182
|
if not known_hosts_content.strip():
|
175
183
|
return known_hosts_content
|
176
184
|
|
177
|
-
original_lines = [
|
185
|
+
original_lines = [
|
186
|
+
ln.rstrip()
|
187
|
+
for ln in known_hosts_content.strip().splitlines()
|
188
|
+
if ln.strip()
|
189
|
+
]
|
178
190
|
augmented = list(original_lines)
|
179
191
|
|
180
|
-
# Add bracketed [host]:port variants for non-standard ports if hostname
|
192
|
+
# Add bracketed [host]:port variants for non-standard ports if hostname
|
193
|
+
# provided
|
181
194
|
if hostname and original_lines:
|
182
195
|
bracket_prefix = f"[{hostname}]:{port} "
|
183
196
|
plain_prefix = f"{hostname} "
|
@@ -199,7 +212,8 @@ def merge_known_hosts_content(
|
|
199
212
|
base_content: str,
|
200
213
|
additional_content: str,
|
201
214
|
) -> str:
|
202
|
-
"""Merge additional known_hosts content into base content without
|
215
|
+
"""Merge additional known_hosts content into base content without
|
216
|
+
duplicates.
|
203
217
|
|
204
218
|
Args:
|
205
219
|
base_content: Original known_hosts content
|
@@ -241,4 +255,9 @@ def augment_known_hosts(
|
|
241
255
|
# This is a placeholder for known hosts augmentation logic
|
242
256
|
# The actual implementation would use ssh-keyscan or similar
|
243
257
|
# to fetch and add host keys to the known_hosts file
|
244
|
-
log.debug(
|
258
|
+
log.debug(
|
259
|
+
"Would augment known_hosts at %s with %s:%d",
|
260
|
+
known_hosts_path,
|
261
|
+
hostname,
|
262
|
+
port,
|
263
|
+
)
|
github2gerrit/ssh_discovery.py
CHANGED
@@ -30,17 +30,31 @@ class SSHDiscoveryError(Exception):
|
|
30
30
|
|
31
31
|
|
32
32
|
# Error message constants to comply with TRY003
|
33
|
-
_MSG_HOST_UNREACHABLE =
|
33
|
+
_MSG_HOST_UNREACHABLE = (
|
34
|
+
"Host {hostname}:{port} is not reachable. Check network connectivity "
|
35
|
+
"and server availability."
|
36
|
+
)
|
34
37
|
_MSG_NO_KEYS_FOUND = (
|
35
|
-
"No SSH host keys found for {hostname}:{port}. The server may not be
|
38
|
+
"No SSH host keys found for {hostname}:{port}. The server may not be "
|
39
|
+
"running SSH or may be blocking connections."
|
36
40
|
)
|
37
41
|
_MSG_NO_VALID_KEYS = (
|
38
|
-
"No valid SSH host keys found for {hostname}:{port}. The ssh-keyscan
|
42
|
+
"No valid SSH host keys found for {hostname}:{port}. The ssh-keyscan "
|
43
|
+
"output was empty or malformed."
|
44
|
+
)
|
45
|
+
_MSG_CONNECTION_FAILED = (
|
46
|
+
"Failed to connect to {hostname}:{port} for SSH key discovery. "
|
47
|
+
"Error: {error}"
|
48
|
+
)
|
49
|
+
_MSG_KEYSCAN_FAILED = (
|
50
|
+
"ssh-keyscan failed with return code {returncode}: {error}"
|
51
|
+
)
|
52
|
+
_MSG_UNEXPECTED_ERROR = (
|
53
|
+
"Unexpected error during SSH key discovery for {hostname}:{port}: {error}"
|
54
|
+
)
|
55
|
+
_MSG_SAVE_FAILED = (
|
56
|
+
"Failed to save host keys to configuration file {config_file}: {error}"
|
39
57
|
)
|
40
|
-
_MSG_CONNECTION_FAILED = "Failed to connect to {hostname}:{port} for SSH key discovery. Error: {error}"
|
41
|
-
_MSG_KEYSCAN_FAILED = "ssh-keyscan failed with return code {returncode}: {error}"
|
42
|
-
_MSG_UNEXPECTED_ERROR = "Unexpected error during SSH key discovery for {hostname}:{port}: {error}"
|
43
|
-
_MSG_SAVE_FAILED = "Failed to save host keys to configuration file {config_file}: {error}"
|
44
58
|
|
45
59
|
|
46
60
|
def is_host_reachable(hostname: str, port: int, timeout: int = 5) -> bool:
|
@@ -53,7 +67,9 @@ def is_host_reachable(hostname: str, port: int, timeout: int = 5) -> bool:
|
|
53
67
|
|
54
68
|
|
55
69
|
@external_api_call(ApiType.SSH, "fetch_ssh_host_keys")
|
56
|
-
def fetch_ssh_host_keys(
|
70
|
+
def fetch_ssh_host_keys(
|
71
|
+
hostname: str, port: int = 22, timeout: int = 10
|
72
|
+
) -> str:
|
57
73
|
"""
|
58
74
|
Fetch SSH host keys for a given hostname and port using ssh-keyscan.
|
59
75
|
|
@@ -72,7 +88,9 @@ def fetch_ssh_host_keys(hostname: str, port: int = 22, timeout: int = 10) -> str
|
|
72
88
|
|
73
89
|
# First check if the host is reachable
|
74
90
|
if not is_host_reachable(hostname, port, timeout=5):
|
75
|
-
raise SSHDiscoveryError(
|
91
|
+
raise SSHDiscoveryError(
|
92
|
+
_MSG_HOST_UNREACHABLE.format(hostname=hostname, port=port)
|
93
|
+
)
|
76
94
|
|
77
95
|
try:
|
78
96
|
# Use ssh-keyscan to fetch all available key types
|
@@ -127,13 +145,23 @@ def fetch_ssh_host_keys(hostname: str, port: int = 22, timeout: int = 10) -> str
|
|
127
145
|
# ssh-keyscan returns 1 when it can't connect
|
128
146
|
error_msg = exc.stderr or exc.stdout or "Connection failed"
|
129
147
|
raise SSHDiscoveryError(
|
130
|
-
_MSG_CONNECTION_FAILED.format(
|
148
|
+
_MSG_CONNECTION_FAILED.format(
|
149
|
+
hostname=hostname, port=port, error=error_msg
|
150
|
+
)
|
131
151
|
) from exc
|
132
152
|
else:
|
133
153
|
error_msg = exc.stderr or exc.stdout or "Unknown error"
|
134
|
-
raise SSHDiscoveryError(
|
154
|
+
raise SSHDiscoveryError(
|
155
|
+
_MSG_KEYSCAN_FAILED.format(
|
156
|
+
returncode=exc.returncode, error=error_msg
|
157
|
+
)
|
158
|
+
) from exc
|
135
159
|
except Exception as exc:
|
136
|
-
raise SSHDiscoveryError(
|
160
|
+
raise SSHDiscoveryError(
|
161
|
+
_MSG_UNEXPECTED_ERROR.format(
|
162
|
+
hostname=hostname, port=port, error=exc
|
163
|
+
)
|
164
|
+
) from exc
|
137
165
|
else:
|
138
166
|
return discovered_keys
|
139
167
|
|
@@ -172,7 +200,9 @@ def extract_gerrit_info_from_gitreview(content: str) -> tuple[str, int] | None:
|
|
172
200
|
|
173
201
|
|
174
202
|
@external_api_call(ApiType.SSH, "discover_and_save_host_keys")
|
175
|
-
def discover_and_save_host_keys(
|
203
|
+
def discover_and_save_host_keys(
|
204
|
+
hostname: str, port: int, organization: str, config_path: str | None = None
|
205
|
+
) -> str:
|
176
206
|
"""
|
177
207
|
Discover SSH host keys and save them to the organization's configuration.
|
178
208
|
|
@@ -198,7 +228,9 @@ def discover_and_save_host_keys(hostname: str, port: int, organization: str, con
|
|
198
228
|
return host_keys
|
199
229
|
|
200
230
|
|
201
|
-
def save_host_keys_to_config(
|
231
|
+
def save_host_keys_to_config(
|
232
|
+
host_keys: str, organization: str, config_path: str | None = None
|
233
|
+
) -> None:
|
202
234
|
"""
|
203
235
|
Save SSH host keys to the organization's configuration file.
|
204
236
|
|
@@ -214,7 +246,9 @@ def save_host_keys_to_config(host_keys: str, organization: str, config_path: str
|
|
214
246
|
from .config import DEFAULT_CONFIG_PATH
|
215
247
|
|
216
248
|
if config_path is None:
|
217
|
-
config_path =
|
249
|
+
config_path = (
|
250
|
+
os.getenv("G2G_CONFIG_PATH", "").strip() or DEFAULT_CONFIG_PATH
|
251
|
+
)
|
218
252
|
|
219
253
|
config_file = Path(config_path).expanduser()
|
220
254
|
|
@@ -283,7 +317,9 @@ def save_host_keys_to_config(host_keys: str, organization: str, config_path: str
|
|
283
317
|
|
284
318
|
# Insert the GERRIT_KNOWN_HOSTS entry
|
285
319
|
escaped_keys = host_keys.replace("\n", "\\n")
|
286
|
-
new_lines.insert(
|
320
|
+
new_lines.insert(
|
321
|
+
section_end, f'GERRIT_KNOWN_HOSTS = "{escaped_keys}"'
|
322
|
+
)
|
287
323
|
|
288
324
|
# Write the updated configuration
|
289
325
|
config_file.write_text("\n".join(new_lines), encoding="utf-8")
|
@@ -295,7 +331,9 @@ def save_host_keys_to_config(host_keys: str, organization: str, config_path: str
|
|
295
331
|
)
|
296
332
|
|
297
333
|
except Exception as exc:
|
298
|
-
raise SSHDiscoveryError(
|
334
|
+
raise SSHDiscoveryError(
|
335
|
+
_MSG_SAVE_FAILED.format(config_file=config_file, error=exc)
|
336
|
+
) from exc
|
299
337
|
|
300
338
|
|
301
339
|
def auto_discover_gerrit_host_keys(
|
@@ -326,14 +364,21 @@ def auto_discover_gerrit_host_keys(
|
|
326
364
|
gerrit_port = 29418
|
327
365
|
|
328
366
|
if organization is None:
|
329
|
-
organization = (
|
367
|
+
organization = (
|
368
|
+
os.getenv("ORGANIZATION")
|
369
|
+
or os.getenv("GITHUB_REPOSITORY_OWNER")
|
370
|
+
or ""
|
371
|
+
).strip()
|
330
372
|
|
331
373
|
if not gerrit_hostname:
|
332
374
|
log.debug("No Gerrit hostname provided for auto-discovery")
|
333
375
|
return None
|
334
376
|
|
335
377
|
if not organization:
|
336
|
-
log.warning(
|
378
|
+
log.warning(
|
379
|
+
"No organization specified for SSH host key auto-discovery. "
|
380
|
+
"Cannot save to configuration file."
|
381
|
+
)
|
337
382
|
save_to_config = False
|
338
383
|
|
339
384
|
log.info(
|
@@ -363,7 +408,9 @@ def auto_discover_gerrit_host_keys(
|
|
363
408
|
log.warning("SSH host key auto-discovery failed: %s", exc)
|
364
409
|
return None
|
365
410
|
except Exception as exc:
|
366
|
-
log.warning(
|
411
|
+
log.warning(
|
412
|
+
"Unexpected error during SSH host key auto-discovery: %s", exc
|
413
|
+
)
|
367
414
|
return None
|
368
415
|
else:
|
369
416
|
return host_keys
|
@@ -0,0 +1,340 @@
|
|
1
|
+
# SPDX-License-Identifier: Apache-2.0
|
2
|
+
# SPDX-FileCopyrightText: 2025 The Linux Foundation
|
3
|
+
"""
|
4
|
+
Common trailer constants and parsing utilities.
|
5
|
+
|
6
|
+
This module provides shared functionality for working with Git commit
|
7
|
+
trailers, particularly GitHub PR metadata trailers used for reconciliation.
|
8
|
+
"""
|
9
|
+
|
10
|
+
import logging
|
11
|
+
import re
|
12
|
+
|
13
|
+
|
14
|
+
log = logging.getLogger(__name__)
|
15
|
+
|
16
|
+
|
17
|
+
# Trailer key constants
#
# Each constant is the exact key text that appears before the ":" on a
# trailer line, e.g. "Change-Id: I0123...".
GITHUB_PR_TRAILER = "GitHub-PR"
GITHUB_HASH_TRAILER = "GitHub-Hash"
CHANGE_ID_TRAILER = "Change-Id"
SIGNED_OFF_BY_TRAILER = "Signed-off-by"

# Standard trailer keys we recognize
# (a set, so membership checks are exact, case-sensitive string matches)
KNOWN_TRAILER_KEYS = {
    GITHUB_PR_TRAILER,
    GITHUB_HASH_TRAILER,
    CHANGE_ID_TRAILER,
    SIGNED_OFF_BY_TRAILER,
}
|
30
|
+
|
31
|
+
|
32
|
+
def parse_trailers(commit_message: str) -> dict[str, list[str]]:
    """
    Parse Git-style trailers from a commit message.

    Trailers are "Key: value" lines in the final paragraph of the message,
    i.e. the lines after the last blank line. Following Git's own rules:

    * the first paragraph (the subject) is never a trailer block, so a
      single-paragraph message such as "feat: add thing" has no trailers;
    * a trailer key consists only of ASCII letters, digits and hyphens, so
      prose such as "See https://example.com" is not mistaken for a trailer.

    Args:
        commit_message: Full commit message text

    Returns:
        Dictionary mapping trailer keys to lists of values, in order of
        appearance. Empty when the message has no trailer block.
    """
    trailers: dict[str, list[str]] = {}

    if not commit_message.strip():
        return trailers

    lines = commit_message.strip().split("\n")

    # Find the last blank line; the trailer block is everything after it.
    last_blank = next(
        (i for i in range(len(lines) - 1, -1, -1) if not lines[i].strip()),
        None,
    )
    if last_blank is None:
        # Only one paragraph: that is the subject, not a trailer block.
        return trailers

    for raw_line in lines[last_blank + 1 :]:
        key, sep, value = raw_line.strip().partition(":")
        if not sep:
            continue
        key = key.strip()
        # Reject keys with spaces or punctuation (e.g. "See https").
        if not re.fullmatch(r"[A-Za-z0-9][A-Za-z0-9-]*", key):
            continue
        trailers.setdefault(key, []).append(value.strip())

    return trailers
|
73
|
+
|
74
|
+
|
75
|
+
def extract_github_metadata(commit_message: str) -> dict[str, str]:
    """
    Collect the GitHub PR metadata trailers found in a commit message.

    Args:
        commit_message: Full commit message text

    Returns:
        Mapping of each GitHub-* trailer key present in the message to a
        single value. Keys that do not occur are omitted.
    """
    parsed = parse_trailers(commit_message)
    wanted = (GITHUB_PR_TRAILER, GITHUB_HASH_TRAILER)

    # When a trailer is repeated, the final occurrence wins.
    return {name: parsed[name][-1] for name in wanted if parsed.get(name)}
|
95
|
+
|
96
|
+
|
97
|
+
def extract_change_ids(commit_message: str) -> list[str]:
    """
    Return every Change-Id trailer value present in a commit message.

    Args:
        commit_message: Full commit message text

    Returns:
        The Change-Id values in order of appearance, or an empty list when
        the message carries no Change-Id trailer.
    """
    parsed = parse_trailers(commit_message)
    if CHANGE_ID_TRAILER in parsed:
        return parsed[CHANGE_ID_TRAILER]
    return []
|
109
|
+
|
110
|
+
|
111
|
+
def has_trailer(
    commit_message: str, key: str, value: str | None = None
) -> bool:
    """
    Report whether a commit message carries a given trailer.

    Args:
        commit_message: Full commit message text
        key: Trailer key to look for
        value: Exact value required; when None, any value is accepted

    Returns:
        True when the key is present and, if a value was requested, one of
        its occurrences equals that value. False otherwise.
    """
    found = parse_trailers(commit_message).get(key)
    if found is None:
        return False
    return value is None or value in found
|
134
|
+
|
135
|
+
|
136
|
+
def add_trailers(commit_message: str, new_trailers: dict[str, str]) -> str:
    """
    Add trailers to a commit message, avoiding duplicates.

    A trailer is skipped when the message already has the same key with the
    same value. New trailers are appended to an existing trailer block when
    the message ends with one, so that all trailers stay together in the
    final paragraph. Otherwise a blank line is inserted to start a new
    trailer block.

    Args:
        commit_message: Original commit message
        new_trailers: Dictionary of trailers to add (key -> value)

    Returns:
        Commit message with trailers added. When ``new_trailers`` is empty
        the message is returned unchanged; otherwise trailing whitespace is
        stripped and, if anything was added, the result ends with a single
        newline.
    """
    if not new_trailers:
        return commit_message

    existing_trailers = parse_trailers(commit_message)
    msg = commit_message.rstrip()

    # Collect trailers that need to be added
    to_add = [
        f"{key}: {value}"
        for key, value in new_trailers.items()
        if value not in existing_trailers.get(key, [])
    ]
    if not to_add:
        return msg

    # Decide whether the message already ends with a trailer block: the
    # last paragraph must not be the first one (the subject), and every
    # line in it must look like "Key: value" or be an indented continuation.
    paragraphs = re.split(r"\n\s*\n", msg.strip())
    last_lines = paragraphs[-1].splitlines()
    trailer_line = re.compile(r"[A-Za-z0-9][A-Za-z0-9-]*[ \t]*:(\s|$)")
    ends_with_trailer_block = (
        len(paragraphs) > 1
        and bool(last_lines)
        and trailer_line.match(last_lines[0]) is not None
        and all(
            trailer_line.match(ln) is not None or ln[:1] in (" ", "\t")
            for ln in last_lines[1:]
        )
    )

    # Always inserting a blank line would split an existing footer in two
    # and push earlier trailers (e.g. Change-Id) out of the last paragraph.
    separator = "\n" if (not msg or ends_with_trailer_block) else "\n\n"
    return msg + separator + "\n".join(to_add) + "\n"
|
167
|
+
|
168
|
+
|
169
|
+
def normalize_subject_for_matching(subject: str) -> str:
    """
    Reduce a commit subject to a canonical form for similarity matching.

    Noise that commonly differs between otherwise identical subjects is
    removed so that minor edits do not prevent a match.

    Args:
        subject: Original commit subject line

    Returns:
        Lowercased subject with bracketed version tags, work-in-progress
        style prefixes, trailing "." / "!" and redundant whitespace removed.
        Empty string for empty input.
    """
    if not subject:
        return ""

    # Ordered (pattern, replacement, flags) rewrites, applied in sequence.
    rewrites = (
        # Version numbers and tags in brackets/parentheses
        (r"\s*[\[\(][vV]?\d+[\.\d]*[\]\)]\s*", " ", 0),
        # "WIP:", "DRAFT:", etc. prefixes
        (r"^\s*(WIP|DRAFT|TODO|FIXME|HACK):\s*", "", re.IGNORECASE),
        # Trailing punctuation and whitespace
        (r"[.!]+\s*$", "", 0),
        # Collapse runs of whitespace
        (r"\s+", " ", 0),
    )

    text = subject.strip()
    for pattern, replacement, flags in rewrites:
        text = re.sub(pattern, replacement, text, flags=flags)

    # Lowercase for case-insensitive comparison
    return text.strip().lower()
|
207
|
+
|
208
|
+
|
209
|
+
def compute_file_signature(file_paths: list[str]) -> str:
    """
    Derive a short, order-independent fingerprint for a set of file paths.

    Paths are trimmed, lowercased and stripped of leading/trailing slashes,
    then sorted, so the same files always give the same signature
    regardless of input order or letter case.

    Args:
        file_paths: List of file paths

    Returns:
        First 12 hex characters (48 bits) of the SHA-256 digest of the
        newline-joined normalized paths, or "" when the list is empty.
    """
    import hashlib

    if not file_paths:
        return ""

    # Normalize, drop entries that become empty, and sort for determinism.
    cleaned = sorted(
        candidate
        for candidate in (raw.strip().lower().strip("/") for raw in file_paths)
        if candidate
    )

    digest = hashlib.sha256("\n".join(cleaned).encode("utf-8")).hexdigest()
    return digest[:12]
|
242
|
+
|
243
|
+
|
244
|
+
def extract_subject_tokens(subject: str) -> set[str]:
    """
    Break a subject line into the distinctive words used for matching.

    The subject is normalized first, then split on whitespace and common
    punctuation delimiters. Tokens shorter than three characters and
    common stop words are discarded.

    Args:
        subject: Commit subject line

    Returns:
        Set of normalized tokens; empty for an empty subject.
    """
    if not subject:
        return set()

    # Three-letter filler words (plus very common commit verbs) that carry
    # little signal for similarity.
    ignored = frozenset(
        (
            "the", "and", "for", "are", "but", "not", "you", "all", "can",
            "had", "has", "was", "one", "our", "out", "day", "get", "use",
            "man", "new", "now", "old", "see", "two", "way", "who", "its",
            "did", "yes", "his", "her", "him", "how", "may", "say", "she",
            "add", "fix", "set", "put", "run", "try", "let", "end",
        )
    )

    canonical = normalize_subject_for_matching(subject)
    pieces = (
        part.strip() for part in re.split(r"[\s\-_\.,:;/\\]+", canonical)
    )

    # Keep only tokens of at least 3 characters that are not stop words.
    return {word for word in pieces if len(word) >= 3 and word not in ignored}
|
318
|
+
|
319
|
+
|
320
|
+
def compute_jaccard_similarity(set1: set[str], set2: set[str]) -> float:
    """
    Return the Jaccard index (|A ∩ B| / |A ∪ B|) of two token sets.

    Args:
        set1: First set of tokens
        set2: Second set of tokens

    Returns:
        Similarity in the range 0.0 to 1.0. Two empty sets count as
        identical (1.0); exactly one empty set gives 0.0.
    """
    if not (set1 or set2):
        return 1.0

    if not (set1 and set2):
        return 0.0

    shared = len(set1 & set2)
    combined = len(set1 | set2)

    if combined > 0:
        return shared / combined
    return 0.0
|
github2gerrit/utils.py
CHANGED
@@ -68,7 +68,9 @@ def is_verbose_mode() -> bool:
|
|
68
68
|
return os.getenv("G2G_VERBOSE", "").lower() in ("true", "1", "yes")
|
69
69
|
|
70
70
|
|
71
|
-
def log_exception_conditionally(
|
71
|
+
def log_exception_conditionally(
|
72
|
+
logger: logging.Logger, message: str, *args: Any
|
73
|
+
) -> None:
|
72
74
|
"""Log exception with traceback only if verbose mode is enabled.
|
73
75
|
|
74
76
|
Args:
|
@@ -110,4 +112,6 @@ def append_github_output(outputs: dict[str, str]) -> None:
|
|
110
112
|
except Exception as exc:
|
111
113
|
# Use a basic logger since we can't import from other modules
|
112
114
|
# without creating circular dependencies
|
113
|
-
logging.getLogger(__name__).debug(
|
115
|
+
logging.getLogger(__name__).debug(
|
116
|
+
"Failed to write GITHUB_OUTPUT: %s", exc
|
117
|
+
)
|