github2gerrit 0.1.10__py3-none-any.whl → 0.1.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,595 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ # SPDX-FileCopyrightText: 2025 The Linux Foundation
3
+ """
4
+ Multi-pass reconciliation matcher for GitHub commits to Gerrit changes.
5
+
6
+ This module implements the core matching algorithm that pairs local GitHub
7
+ commits with existing Gerrit changes using multiple strategies, from most
8
+ reliable to least reliable.
9
+ """
10
+
11
+ import logging
12
+ from dataclasses import dataclass
13
+ from enum import Enum
14
+
15
+ from .gerrit_query import GerritChange
16
+ from .trailers import compute_file_signature
17
+ from .trailers import compute_jaccard_similarity
18
+ from .trailers import extract_change_ids
19
+ from .trailers import extract_subject_tokens
20
+ from .trailers import normalize_subject_for_matching
21
+
22
+
23
+ log = logging.getLogger(__name__)
24
+
25
+
26
class MatchStrategy(Enum):
    """Identifies which matching pass paired a commit with a change."""

    # Change-ID read directly from the commit's trailer
    TRAILER = "trailer"
    # Normalized subjects are identical
    SUBJECT_EXACT = "subject_exact"
    # Both sides produce the same file-set hash
    FILE_SIGNATURE = "file_signature"
    # Jaccard similarity of the subjects is >= the configured threshold
    SUBJECT_SIMILARITY = "subject_similarity"
33
+
34
+
35
@dataclass
class LocalCommit:
    """A local commit that the matcher tries to pair with a Gerrit change.

    Attributes:
        index: Position in the commit list; used to restore ordering.
        sha: Commit SHA.
        subject: Commit subject line.
        files: Paths of the files the commit touches.
        commit_message: Full commit message.
        existing_change_id: Change-ID taken from the commit trailer, or
            None when the commit carries none.
    """

    index: int
    sha: str
    subject: str
    files: list[str]
    commit_message: str
    existing_change_id: str | None = None
45
+
46
+
47
@dataclass
class MatchResult:
    """Outcome of trying to pair one local commit with a Gerrit change.

    Attributes:
        local_commit: The commit this result describes.
        gerrit_change: The paired Gerrit change, or None if nothing matched.
        strategy: The pass that produced the pairing, or None if unmatched.
        confidence: Match confidence in the range 0.0 to 1.0.
        change_id: The Change-ID to use: reused from the paired change, or
            newly generated.
    """

    local_commit: LocalCommit
    gerrit_change: GerritChange | None
    strategy: MatchStrategy | None
    confidence: float
    change_id: str
56
+
57
+
58
@dataclass
class ReconciliationResult:
    """Full outcome of a reconciliation run, with summary statistics.

    Attributes:
        matches: One MatchResult per local commit.
        reused_count: Number of commits that reuse an existing Change-ID.
        new_count: Number of commits given a newly generated Change-ID.
        orphaned_changes: Gerrit changes not paired with any local commit.
        strategy_counts: Number of matches produced by each strategy.
    """

    matches: list[MatchResult]
    reused_count: int
    new_count: int
    orphaned_changes: list[GerritChange]
    strategy_counts: dict[MatchStrategy, int]

    @property
    def change_ids(self) -> list[str]:
        """Return the Change-IDs of ``matches``, in the same order."""
        ordered_ids: list[str] = []
        for entry in self.matches:
            ordered_ids.append(entry.change_id)
        return ordered_ids
72
+
73
+
74
class ReconciliationMatcher:
    """Multi-pass matcher for reconciling commits with Gerrit changes.

    Passes run in order: Change-ID trailer, exact normalized subject, file
    signature, then Jaccard subject similarity. Change-IDs that have been
    claimed are tracked so later passes do not hand them out again, and
    commits still unmatched after the last pass get a new Change-ID.
    """

    def __init__(
        self,
        *,
        similarity_threshold: float = 0.7,
        allow_duplicate_subjects: bool = True,
    ):
        """
        Initialize the matcher with configuration.

        Args:
            similarity_threshold: Minimum Jaccard similarity for subject
                matching
            allow_duplicate_subjects: Allow multiple commits with same
                subject. NOTE: the value is stored on the instance, but
                none of the passes in this class currently reads it.
        """
        self.similarity_threshold = similarity_threshold
        self.allow_duplicate_subjects = allow_duplicate_subjects

    def reconcile(
        self,
        local_commits: list[LocalCommit],
        gerrit_changes: list[GerritChange],
    ) -> ReconciliationResult:
        """
        Perform multi-pass reconciliation of local commits with Gerrit changes.

        Args:
            local_commits: Ordered list of local commits to match
            gerrit_changes: List of existing Gerrit changes from topic query

        Returns:
            Complete reconciliation result with matches (sorted by commit
            index) and statistics

        Raises:
            ValueError: If the same Change-Id trailer is present on more
                than one local commit.
        """
        if not local_commits:
            # Nothing to match: every Gerrit change is orphaned.
            return ReconciliationResult(
                matches=[],
                reused_count=0,
                new_count=0,
                orphaned_changes=gerrit_changes.copy(),
                strategy_counts={},
            )

        # Conflict detection runs before any pass so that an ambiguous
        # input never produces a partial result.
        self._reject_duplicate_trailers(local_commits)

        log.info(
            "Starting reconciliation: %d local commits, %d Gerrit changes",
            len(local_commits),
            len(gerrit_changes),
        )

        # Shared state mutated by every pass: the Change-IDs already
        # claimed, the accumulated matches and the per-strategy tallies.
        used_changes: set[str] = set()
        matches: list[MatchResult] = []
        strategy_counts: dict[MatchStrategy, int] = {}

        # Passes A-D; each pass returns the commits it could not match,
        # which become the input of the next pass.
        remaining_commits = local_commits
        for run_pass in (
            self._match_by_trailer,
            self._match_by_subject_exact,
            self._match_by_file_signature,
            self._match_by_subject_similarity,
        ):
            remaining_commits = run_pass(
                remaining_commits,
                gerrit_changes,
                used_changes,
                matches,
                strategy_counts,
            )

        # Generate new Change-IDs for unmatched commits
        for commit in remaining_commits:
            matches.append(
                MatchResult(
                    local_commit=commit,
                    gerrit_change=None,
                    strategy=None,
                    confidence=0.0,
                    change_id=self._generate_change_id(),
                )
            )

        # Sort matches by original commit index to maintain order
        matches.sort(key=lambda m: m.local_commit.index)

        orphaned = [
            change
            for change in gerrit_changes
            if change.change_id not in used_changes
        ]

        # Count from the matches themselves rather than from the size of
        # ``used_changes``: the set collapses repeated Change-IDs, which
        # would make the totals disagree with the per-match ID lists.
        reused_count = sum(1 for m in matches if m.strategy is not None)
        new_count = len(matches) - reused_count

        result = ReconciliationResult(
            matches=matches,
            reused_count=reused_count,
            new_count=new_count,
            orphaned_changes=orphaned,
            strategy_counts=strategy_counts,
        )

        self._log_reconciliation_summary(result)
        return result

    @staticmethod
    def _reject_duplicate_trailers(local_commits: list[LocalCommit]) -> None:
        """Raise ValueError if a Change-Id trailer is on several commits."""
        positions_by_id: dict[str, list[int]] = {}
        for lc in local_commits:
            if lc.existing_change_id:
                positions_by_id.setdefault(lc.existing_change_id, []).append(
                    lc.index
                )
        duplicate_trailers = {
            cid: idxs for cid, idxs in positions_by_id.items() if len(idxs) > 1
        }
        if not duplicate_trailers:
            return

        details = ", ".join(
            f"{cid} -> positions {idxs}"
            for cid, idxs in duplicate_trailers.items()
        )
        msg = (
            f"Duplicate Change-Id trailer reuse detected across multiple "
            f"local commits: {details}. "
            "Amend commits to ensure each uses a distinct Change-Id or "
            "drop conflicting trailers."
        )
        raise ValueError(msg)

    @staticmethod
    def _record_match(
        commit: LocalCommit,
        gerrit_change: GerritChange,
        strategy: MatchStrategy,
        confidence: float,
        used_changes: set[str],
        matches: list[MatchResult],
        strategy_counts: dict[MatchStrategy, int],
    ) -> None:
        """Append a match and update the shared bookkeeping in place."""
        matches.append(
            MatchResult(
                local_commit=commit,
                gerrit_change=gerrit_change,
                strategy=strategy,
                confidence=confidence,
                change_id=gerrit_change.change_id,
            )
        )
        used_changes.add(gerrit_change.change_id)
        strategy_counts[strategy] = strategy_counts.get(strategy, 0) + 1

    def _match_by_trailer(
        self,
        commits: list[LocalCommit],
        gerrit_changes: list[GerritChange],
        used_changes: set[str],
        matches: list[MatchResult],
        strategy_counts: dict[MatchStrategy, int],
    ) -> list[LocalCommit]:
        """Pass A: Match commits that already have Change-ID trailers.

        Returns the commits that were not matched by this pass.
        """
        remaining: list[LocalCommit] = []

        # Build lookup map for Gerrit changes
        gerrit_by_id = {change.change_id: change for change in gerrit_changes}

        for commit in commits:
            cid = commit.existing_change_id
            if not cid:
                remaining.append(commit)
                continue

            if cid in used_changes:
                # Defensive: reconcile() rejects duplicate trailers up
                # front, but this pass can also be invoked directly.
                remaining.append(commit)
                log.warning(
                    "Duplicate Change-ID %s in commit %s (already used)",
                    cid,
                    commit.sha[:8],
                )
                continue

            if cid not in gerrit_by_id:
                # Trailer refers to a change the query did not return.
                remaining.append(commit)
                continue

            self._record_match(
                commit,
                gerrit_by_id[cid],
                MatchStrategy.TRAILER,
                1.0,
                used_changes,
                matches,
                strategy_counts,
            )
            log.debug("Trailer match: %s -> %s", commit.sha[:8], cid)

        return remaining

    def _match_by_subject_exact(
        self,
        commits: list[LocalCommit],
        gerrit_changes: list[GerritChange],
        used_changes: set[str],
        matches: list[MatchResult],
        strategy_counts: dict[MatchStrategy, int],
    ) -> list[LocalCommit]:
        """Pass B: Match by exact normalized subject.

        Returns the commits that were not matched by this pass.
        """
        remaining: list[LocalCommit] = []

        # Build lookup map by normalized subject (unclaimed changes only)
        gerrit_by_subject: dict[str, list[GerritChange]] = {}
        for change in gerrit_changes:
            if change.change_id in used_changes:
                continue
            norm_subject = normalize_subject_for_matching(change.subject)
            if not norm_subject:
                # An empty key carries no information; skip it, as the
                # file-signature and similarity passes do for empty input.
                continue
            gerrit_by_subject.setdefault(norm_subject, []).append(change)

        for commit in commits:
            norm_subject = normalize_subject_for_matching(commit.subject)
            candidates = gerrit_by_subject.get(norm_subject)
            if not norm_subject or not candidates:
                remaining.append(commit)
                continue

            # Use first available candidate (could enhance with additional
            # criteria); popping it prevents reuse by a later commit.
            gerrit_change = candidates.pop(0)
            if not candidates:
                del gerrit_by_subject[norm_subject]

            self._record_match(
                commit,
                gerrit_change,
                MatchStrategy.SUBJECT_EXACT,
                0.9,
                used_changes,
                matches,
                strategy_counts,
            )
            log.debug(
                "Subject exact match: %s -> %s",
                commit.sha[:8],
                gerrit_change.change_id,
            )

        return remaining

    def _match_by_file_signature(
        self,
        commits: list[LocalCommit],
        gerrit_changes: list[GerritChange],
        used_changes: set[str],
        matches: list[MatchResult],
        strategy_counts: dict[MatchStrategy, int],
    ) -> list[LocalCommit]:
        """Pass C: Match by file signature (same set of files).

        Returns the commits that were not matched by this pass.
        """
        remaining: list[LocalCommit] = []

        # Build lookup map by file signature (unclaimed changes only)
        gerrit_by_files: dict[str, list[GerritChange]] = {}
        for change in gerrit_changes:
            if change.change_id in used_changes:
                continue
            file_sig = compute_file_signature(change.files)
            if not file_sig:
                continue  # Skip changes with no files
            gerrit_by_files.setdefault(file_sig, []).append(change)

        for commit in commits:
            file_sig = compute_file_signature(commit.files)
            candidates = gerrit_by_files.get(file_sig) if file_sig else None
            if not candidates:
                remaining.append(commit)
                continue

            # Use first available candidate and drop it from the pool
            gerrit_change = candidates.pop(0)
            if not candidates:
                del gerrit_by_files[file_sig]

            self._record_match(
                commit,
                gerrit_change,
                MatchStrategy.FILE_SIGNATURE,
                0.8,
                used_changes,
                matches,
                strategy_counts,
            )
            log.debug(
                "File signature match: %s -> %s (sig=%s)",
                commit.sha[:8],
                gerrit_change.change_id,
                file_sig,
            )

        return remaining

    def _match_by_subject_similarity(
        self,
        commits: list[LocalCommit],
        gerrit_changes: list[GerritChange],
        used_changes: set[str],
        matches: list[MatchResult],
        strategy_counts: dict[MatchStrategy, int],
    ) -> list[LocalCommit]:
        """Pass D: Match by subject token similarity (Jaccard).

        Each commit takes the unclaimed change with the highest similarity
        that reaches ``self.similarity_threshold``; on a tie the change
        listed first wins. Returns the commits that were not matched.
        """
        remaining: list[LocalCommit] = []

        # Tokenize every unclaimed change once instead of once per commit.
        # Changes whose subject yields no tokens can never match.
        candidates: list[tuple] = []
        for change in gerrit_changes:
            if change.change_id in used_changes:
                continue
            change_tokens = extract_subject_tokens(change.subject)
            if change_tokens:
                candidates.append((change, change_tokens))

        for commit in commits:
            commit_tokens = extract_subject_tokens(commit.subject)
            if not commit_tokens:
                remaining.append(commit)
                continue

            best_pos = -1
            best_score = 0.0
            for pos, (_, change_tokens) in enumerate(candidates):
                score = compute_jaccard_similarity(
                    commit_tokens, change_tokens
                )
                if score >= self.similarity_threshold and (
                    best_pos < 0 or score > best_score
                ):
                    best_pos, best_score = pos, score

            if best_pos < 0:
                remaining.append(commit)
                continue

            # Remove the winner so no later commit can claim it
            gerrit_change, _ = candidates.pop(best_pos)
            self._record_match(
                commit,
                gerrit_change,
                MatchStrategy.SUBJECT_SIMILARITY,
                best_score,
                used_changes,
                matches,
                strategy_counts,
            )
            log.debug(
                "Similarity match: %s -> %s (confidence=%.2f)",
                commit.sha[:8],
                gerrit_change.change_id,
                best_score,
            )

        return remaining

    def _generate_change_id(self) -> str:
        """Generate a new Change-ID: ``I`` followed by 40 hex digits.

        The digits come from a random source, not from the clock, so
        back-to-back calls do not return the same ID.
        """
        import secrets

        return "I" + secrets.token_hex(20)

    def _log_reconciliation_summary(self, result: ReconciliationResult) -> None:
        """Log a structured summary of reconciliation results."""
        import json

        total = len(result.matches)
        counts = result.strategy_counts
        trailer_n = counts.get(MatchStrategy.TRAILER, 0)
        subject_exact_n = counts.get(MatchStrategy.SUBJECT_EXACT, 0)
        file_signature_n = counts.get(MatchStrategy.FILE_SIGNATURE, 0)
        subject_similarity_n = counts.get(MatchStrategy.SUBJECT_SIMILARITY, 0)
        orphaned_ids = [change.change_id for change in result.orphaned_changes]

        # Human-readable summary
        log.info(
            "Reconciliation complete: total=%d reused=%d new=%d orphaned=%d "
            "(passes: trailer=%d subject_exact=%d file_signature=%d "
            "subject_similarity=%d)",
            total,
            result.reused_count,
            result.new_count,
            len(result.orphaned_changes),
            trailer_n,
            subject_exact_n,
            file_signature_n,
            subject_similarity_n,
        )

        # Strategy breakdown (only strategies that produced a match)
        if counts:
            log.info(
                "Matching strategies (non-zero): %s",
                " ".join(
                    f"{strategy.value}={count}"
                    for strategy, count in counts.items()
                ),
            )

        # Orphaned changes warning
        if orphaned_ids:
            log.warning(
                "Found %d orphaned Gerrit changes (no local counterpart): %s",
                len(orphaned_ids),
                ", ".join(orphaned_ids),
            )

        # Emit structured summary including per-pass counts and explicit lists
        # to aid downstream tooling or debugging (names kept stable for
        # automation).
        debug_data = {
            "total_local": total,
            "reused": result.reused_count,
            "new": result.new_count,
            "orphaned": len(result.orphaned_changes),
            "strategies": {
                strategy.value: count for strategy, count in counts.items()
            },
            "reused_ids": [
                m.change_id for m in result.matches if m.strategy is not None
            ],
            "new_ids": [
                m.change_id for m in result.matches if m.strategy is None
            ],
            "orphaned_ids": orphaned_ids,
            "passes": {
                "A_trailer": trailer_n,
                "B_subject_exact": subject_exact_n,
                "C_file_signature": file_signature_n,
                "D_subject_similarity": subject_similarity_n,
            },
        }
        # Keep INFO level for machine consumption while still concise
        log.info("RECONCILE_SUMMARY json=%s", json.dumps(debug_data))
561
+
562
+
563
def create_local_commit(
    index: int,
    sha: str,
    subject: str,
    files: list[str],
    commit_message: str,
) -> LocalCommit:
    """
    Build a LocalCommit, taking its Change-ID from the commit message.

    Args:
        index: Position in commit list
        sha: Commit SHA
        subject: Commit subject line
        files: List of modified file paths
        commit_message: Full commit message

    Returns:
        LocalCommit whose existing_change_id is the last Change-ID that
        extract_change_ids finds in the message, or None if it finds none
    """
    trailer_id: str | None = None
    found_ids = extract_change_ids(commit_message)
    if found_ids:
        # When several Change-IDs are present, the last one is used.
        trailer_id = found_ids[-1]

    return LocalCommit(
        index=index,
        sha=sha,
        subject=subject,
        files=files,
        commit_message=commit_message,
        existing_change_id=trailer_id,
    )