weaverx 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
weaverx/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ """WeaveRx — supportive GitHub triage for the medical AI community."""
2
+
3
+ __version__ = "0.1.1"
weaverx/categories.py ADDED
@@ -0,0 +1,198 @@
1
+ """Medical AI issue categories with restorative, community-oriented framing."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+ from enum import StrEnum
7
+
8
+
9
class Priority(StrEnum):
    """Triage priority levels, declared from LOW through CRITICAL.

    Members are ``StrEnum`` values, so each one compares equal to its
    lowercase string value (for example ``Priority.HIGH == "high"``).
    """

    LOW = "low"
    MEDIUM = "medium"
    HIGH = "high"
    CRITICAL = "critical"
14
+
15
+
16
@dataclass(frozen=True, slots=True)
class MedAICategory:
    """Immutable record describing one issue-triage category.

    The dataclass is frozen (attributes cannot be reassigned) and slotted
    (no per-instance ``__dict__``).
    """

    # Identifier of the category; used as the key of CATEGORY_BY_SLUG.
    slug: str
    # Human-readable name of the category.
    display_name: str
    # Short explanation of which issues belong in the category.
    description: str
    # Label names suggested for issues in this category.
    suggested_labels: tuple[str, ...]
    # Tone/approach guidance text associated with the category.
    healer_framing: str
23
+
24
+
25
# Canonical, ordered list of triage categories.
# NOTE: order matters. validate_category_slug() walks this tuple in order and
# returns the first entry whose slug overlaps the input, so earlier entries win
# when a fuzzy match is ambiguous.
MED_AI_CATEGORIES: tuple[MedAICategory, ...] = (
    MedAICategory(
        slug="dataset-access-licensing",
        display_name="Dataset Access & Licensing",
        description=(
            "Questions about obtaining datasets, usage terms, attribution, "
            "or redistribution constraints for medical imaging or clinical data."
        ),
        suggested_labels=("dataset", "data-access", "licensing"),
        healer_framing=(
            "Acknowledge how dataset friction slows research; offer clear paths "
            "to documentation, access forms, or community-maintained mirrors."
        ),
    ),
    MedAICategory(
        slug="model-performance-pathology",
        display_name="Model Performance (Pathology/Subgroup)",
        description=(
            "Reports or questions about model accuracy on specific pathologies, "
            "anatomical sites, patient subgroups, or edge cases."
        ),
        suggested_labels=("performance", "pathology", "subgroup"),
        healer_framing=(
            "Validate the concern—subgroup performance matters clinically. "
            "Invite reproducible eval details and point to benchmark scripts."
        ),
    ),
    MedAICategory(
        slug="reproducibility-environment",
        display_name="Reproducibility & Environment",
        description=(
            "Setup issues, dependency conflicts, MONAI/nnU-Net version mismatches, "
            "CUDA/PyTorch environments, or inability to reproduce published results."
        ),
        suggested_labels=("reproducibility", "environment", "monai", "nnunet"),
        healer_framing=(
            "Reproducibility is the bedrock of medical AI trust. Offer a gentle "
            "checklist: versions, seeds, config files, and minimal repro steps."
        ),
    ),
    MedAICategory(
        slug="clinical-validation",
        display_name="Clinical Validation Request",
        description=(
            "Requests for external validation, reader studies, prospective evaluation, "
            "or guidance on moving from research code to clinical assessment."
        ),
        suggested_labels=("clinical-validation", "evaluation"),
        healer_framing=(
            "Honor the clinical lens. Clarify scope (research vs. regulated use) "
            "and suggest validation frameworks without overpromising."
        ),
    ),
    MedAICategory(
        slug="privacy-compliance-dicom",
        display_name="Privacy/Compliance/DICOM Considerations",
        description=(
            "PHI handling, de-identification, HIPAA/GDPR questions, DICOM metadata, "
            "or concerns about sensitive data in issues or workflows."
        ),
        suggested_labels=("privacy", "compliance", "dicom", "phi"),
        healer_framing=(
            "Respond with care and containment. Never ask for patient identifiers; "
            "redirect to safe, de-identified sharing practices."
        ),
    ),
    MedAICategory(
        slug="bug",
        display_name="Bug",
        description="Unexpected crashes, incorrect outputs, or broken functionality.",
        suggested_labels=("bug",),
        healer_framing=(
            "Thank them for the report. Ask for minimal repro steps and environment "
            "details in a supportive, non-blaming tone."
        ),
    ),
    MedAICategory(
        slug="feature-integration",
        display_name="Feature/Integration Request",
        description=(
            "New capabilities, framework integrations (MONAI, nnU-Net, ITK), "
            "or workflow improvements."
        ),
        suggested_labels=("enhancement", "integration"),
        healer_framing=(
            "Welcome the idea; explain fit with project scope and invite a design "
            "sketch or use case from their clinical/research context."
        ),
    ),
    MedAICategory(
        slug="documentation",
        display_name="Documentation",
        description="Missing, unclear, or outdated docs, tutorials, or API references.",
        suggested_labels=("documentation",),
        healer_framing=(
            "Documentation gaps are community gifts—each question helps the next "
            "researcher. Point to existing docs and offer to clarify."
        ),
    ),
)

# Slug -> category index built once at import time for O(1) lookups.
CATEGORY_BY_SLUG: dict[str, MedAICategory] = {c.slug: c for c in MED_AI_CATEGORIES}
127
+
128
# Keywords used to boost duplicate detection and privacy scanning.
# All entries are lowercase single tokens (some hyphenated), so callers must
# lowercase and tokenize text before testing membership.
MED_AI_KEYWORDS: frozenset[str] = frozenset(
    {
        "monai",
        "nnunet",
        "nn-u-net",
        "dicom",
        "nifti",
        "chestxray",
        "chexpert",
        "mimic",
        "brats",
        "isic",
        "pathology",
        "segmentation",
        "radiology",
        "hipaa",
        "gdpr",
        "phi",
        "de-identification",
        "deidentification",
        "clinical",
        "validation",
        "reproducibility",
        "cuda",
        "pytorch",
    }
)

# Lowercase terms hinting that text may touch on protected health information.
# NOTE(review): several entries are multi-word phrases ("patient id") or word
# prefixes ("de-ident", "deident"), so these are presumably matched as
# substrings rather than whole tokens; confirm against the consumer, which is
# not visible in this module.
PRIVACY_KEYWORDS: frozenset[str] = frozenset(
    {
        "phi",
        "hipaa",
        "gdpr",
        "patient id",
        "patient name",
        "mrn",
        "medical record",
        "dicom",
        "identifiable",
        "de-ident",
        "deident",
        "ssn",
        "date of birth",
        "dob",
    }
)
175
+
176
+
177
def category_prompt_block() -> str:
    """Render every category as a Markdown bullet for an LLM system prompt.

    Returns:
        One line per entry of ``MED_AI_CATEGORIES`` (in declaration order),
        joined with newlines. Each line shows the display name in bold, the
        slug in backticks, the description, and the comma-separated labels.
    """
    return "\n".join(
        f"- **{entry.display_name}** (`{entry.slug}`): {entry.description} "
        f"[labels: {', '.join(entry.suggested_labels)}]"
        for entry in MED_AI_CATEGORIES
    )
186
+
187
+
188
def validate_category_slug(slug: str) -> str:
    """Map an arbitrary category string onto a known category slug.

    Resolution order:

    1. ``slug`` exactly as given, if it is a known slug.
    2. The normalized form (trimmed, lowercased, underscores and spaces
       turned into hyphens, surrounding hyphens removed), if known.
    3. The first category, in ``MED_AI_CATEGORIES`` order, whose slug
       contains the normalized form or is contained in it.
    4. ``"bug"`` as the final fallback.

    Args:
        slug: Category identifier to validate, typically produced by an LLM.

    Returns:
        A key of ``CATEGORY_BY_SLUG``.
    """
    if slug in CATEGORY_BY_SLUG:
        return slug

    normalized = slug.strip().lower().replace("_", "-").replace(" ", "-").strip("-")
    # An empty string is a substring of every slug, so without this guard a
    # blank or hyphen-only input would resolve to the first category rather
    # than the documented "bug" fallback.
    if not normalized:
        return "bug"
    if normalized in CATEGORY_BY_SLUG:
        return normalized

    # Permissive substring fallback: the first overlapping category wins.
    for cat in MED_AI_CATEGORIES:
        if cat.slug in normalized or normalized in cat.slug:
            return cat.slug
    return "bug"
weaverx/cli.py ADDED
@@ -0,0 +1,322 @@
1
+ """WeaveRx CLI — medical AI GitHub triage from the terminal."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ from typing import Annotated, Any
7
+
8
+ import typer
9
+ from rich import box
10
+ from rich.console import Console
11
+ from rich.panel import Panel
12
+ from rich.table import Table
13
+ from rich.text import Text
14
+
15
+ from weaverx import __version__
16
+ from weaverx.categories import CATEGORY_BY_SLUG, Priority
17
+ from weaverx.triage import TriageOptions, TriageResult, build_orchestrator
18
+ from weaverx.utils import setup_logging
19
+
20
# Root Typer application; invoking it with no arguments prints the help text.
app = typer.Typer(
    name="weaverx",
    help="WeaveRx — supportive AI triage for medical AI GitHub issues.",
    no_args_is_help=True,
)
# Regular output is written to stdout; error messages are written to stderr.
console = Console()
err_console = Console(stderr=True)

# Rich style applied to each priority, keyed by the Priority string value.
PRIORITY_STYLE = {
    Priority.LOW.value: "dim",
    Priority.MEDIUM.value: "cyan",
    Priority.HIGH.value: "yellow",
    Priority.CRITICAL.value: "bold red",
}

# Rich style applied to each safeguard status string.
SAFEGUARD_STATUS_STYLE = {
    "clean": "green",
    "review_recommended": "yellow",
    "high_risk": "bold red",
}

# Short safeguard status codes shown in the batch summary table.
SAFEGUARD_STATUS_ABBR = {
    "clean": "OK",
    "review_recommended": "REV",
    "high_risk": "RISK",
}
46
+
47
+
48
def _version_callback(value: bool) -> None:
    """Print the package version and stop when ``--version`` was supplied.

    Args:
        value: Parsed value of the ``--version`` option; falsy means the
            flag was not given and nothing happens.

    Raises:
        typer.Exit: After printing the version, to end the CLI run.
    """
    if not value:
        return
    console.print(f"weaverx {__version__}")
    raise typer.Exit()
52
+
53
+
54
# Root callback for the Typer app. It exists only to host the eager --version
# option: when the flag is given, _version_callback prints the version and
# exits. The function body itself is intentionally empty.
@app.callback()
def main(
    version: Annotated[
        bool | None,
        typer.Option("--version", callback=_version_callback, is_eager=True, help="Show version."),
    ] = None,
) -> None:
    """WeaveRx entrypoint."""
62
+
63
+
64
def _render_duplicate_bar(score: float) -> Text:
    """Render a duplicate-likelihood score as a fixed-width text gauge.

    Args:
        score: Likelihood, nominally in the range 0.0 to 1.0.

    Returns:
        A ``Text`` such as ``[####------] 40%``, styled green below 0.4,
        yellow below 0.7 and red otherwise.
    """
    # Clamp only the gauge fill: an out-of-range score (e.g. a model that
    # emits 1.2 or -0.1) would otherwise produce a bar that is not 10 cells
    # wide. The printed percentage still reflects the raw value.
    clamped = min(max(score, 0.0), 1.0)
    filled = int(clamped * 10)
    bar = "#" * filled + "-" * (10 - filled)
    style = "green" if score < 0.4 else "yellow" if score < 0.7 else "red"
    return Text(f"[{bar}] {score:.0%}", style=style)
69
+
70
+
71
def render_triage_result(
    result: TriageResult,
    *,
    verbose: bool = False,
    output_console: Console | None = None,
) -> None:
    """Print a single triage result as a summary table plus a draft panel.

    Args:
        result: The triage outcome to display.
        verbose: When true, show full source excerpts and the complete draft
            instead of shortened previews.
        output_console: Console to print to; defaults to the module-level
            stdout console.
    """
    # Imported locally so this function stays self-contained. Free text that
    # originates from GitHub or the LLM must be escaped, otherwise Rich parses
    # bracketed fragments such as "[bug]" or "[link](url)" as markup and
    # either drops them or raises MarkupError.
    from rich.markup import escape

    out = output_console or console
    analysis = result.analysis
    cat = CATEGORY_BY_SLUG.get(analysis.category)
    category_name = cat.display_name if cat else analysis.category

    title = escape(f"Issue #{result.issue.number} - {result.issue.title[:60]}")
    table = Table(title=title, box=box.ROUNDED)
    table.add_column("Field", style="bold")
    table.add_column("Value")

    table.add_row("Category", category_name)
    table.add_row(
        "Priority",
        Text(analysis.priority.upper(), style=PRIORITY_STYLE.get(analysis.priority, "white")),
    )
    table.add_row("Impact", escape(analysis.impact_summary))
    table.add_row("Duplicate likelihood", _render_duplicate_bar(analysis.duplicate_likelihood))
    table.add_row("Suggested labels", ", ".join(analysis.suggested_labels) or "-")
    table.add_row("Status", result.result_status().replace("_", " "))

    if analysis.sources:
        if verbose:
            # Each line starts with a literal "[type]" tag, which is exactly
            # the shape Rich would otherwise treat as a markup tag.
            source_lines = [
                escape(f"[{s.type}] {_truncate(s.snippet, 80)} — {s.reason}")
                for s in analysis.sources
            ]
            table.add_row("Sources", "\n".join(source_lines))
        else:
            table.add_row("Sources", f"{len(analysis.sources)} excerpt(s) (use --verbose)")

    if result.llm is not None:
        table.add_row("LLM", f"{result.llm.provider} / {result.llm.model}")

    if analysis.privacy_flags:
        table.add_row("Privacy flags", ", ".join(analysis.privacy_flags))

    if result.duplicate_matches:
        dupes = "; ".join(
            f"#{m.issue_number} ({m.score:.0%})" for m in result.duplicate_matches[:3]
        )
        table.add_row("Similar issues", dupes)

    if result.safeguard is not None:
        sg = result.safeguard
        table.add_row("Safeguard score", f"{sg.score:.1f} / 10")
        table.add_row(
            "Safeguard status",
            Text(
                sg.status.replace("_", " ").upper(),
                style=SAFEGUARD_STATUS_STYLE.get(sg.status, "white"),
            ),
        )
        if sg.triggered:
            flags = ", ".join(finding.id for finding in sg.triggered)
            table.add_row("Safeguard flags", flags)

    out.print(table)

    # The panel border mirrors the safeguard status colour when one exists.
    draft_border = "green"
    if result.safeguard is not None:
        draft_border = SAFEGUARD_STATUS_STYLE.get(result.safeguard.status, "green")

    full_draft = result.draft_response
    draft_preview = full_draft if verbose else _truncate(full_draft, 400)
    # Only advertise --verbose when the preview really hides part of the draft.
    was_shortened = draft_preview != full_draft
    out.print(
        Panel(
            escape(draft_preview),
            title="Draft response",
            subtitle="(use --verbose for full text)" if was_shortened else None,
            border_style=draft_border,
        )
    )

    if result.safeguard is not None and result.safeguard.status == "high_risk":
        out.print("[yellow]Safeguard: high risk — review draft carefully before posting.[/yellow]")

    if result.dry_run:
        out.print("[dim]Dry-run mode - no changes were made to GitHub.[/dim]")
    if result.posted_comment:
        out.print("[green]Posted triage comment to GitHub.[/green]")
    if result.applied_labels:
        out.print(f"[green]Applied labels:[/green] {', '.join(result.applied_labels)}")

    out.print("[dim]WeaveRx drafts require human review - not for clinical use.[/dim]")
159
+
160
+
161
+ def _truncate(text: str, max_len: int) -> str:
162
+ if len(text) <= max_len:
163
+ return text
164
+ return text[: max_len - 3].rstrip() + "..."
165
+
166
+
167
def _render_batch_summary(results: list[TriageResult]) -> None:
    """Print a compact one-row-per-issue table for a batch triage run.

    Args:
        results: Triage results to summarize, printed in the given order.
    """
    # Issue titles are free text from GitHub. Escape them so bracketed
    # prefixes such as "[bug]" are shown literally instead of being parsed
    # as Rich markup.
    from rich.markup import escape

    table = Table(title=f"Batch triage - {len(results)} issues", box=box.SIMPLE_HEAVY)
    table.add_column("#", justify="right")
    table.add_column("Title")
    table.add_column("Category")
    table.add_column("Priority")
    table.add_column("Dup")
    table.add_column("Safeguard")

    for r in results:
        cat = CATEGORY_BY_SLUG.get(r.analysis.category)
        # "-" means safeguards did not run; "?" marks an unrecognized status.
        sg_abbr = "-"
        if r.safeguard is not None:
            sg_abbr = SAFEGUARD_STATUS_ABBR.get(r.safeguard.status, "?")
        table.add_row(
            str(r.issue.number),
            escape(_truncate(r.issue.title, 40)),
            cat.display_name if cat else r.analysis.category,
            r.analysis.priority,
            f"{r.analysis.duplicate_likelihood:.0%}",
            sg_abbr,
        )
    console.print(table)
190
+
191
+
192
def _output_json(payload: Any) -> None:
    """Serialize *payload* to JSON and print it on the stdout console.

    Args:
        payload: Any value accepted by ``json.dumps``.
    """
    serialized = json.dumps(payload, indent=2)
    console.print_json(serialized)
194
+
195
+
196
# NOTE: the docstring below is kept short on purpose because Typer uses it as
# the command's --help text.
#
# Exit codes: 1 for invalid option combinations or a ValueError raised while
# triaging (reported as a configuration error), 2 for any other failure.
@app.command("triage")
def triage_command(
    repo: Annotated[str, typer.Option("--repo", help="GitHub repository (owner/name).")],
    issue: Annotated[int | None, typer.Option("--issue", help="Issue number to triage.")] = None,
    recent: Annotated[
        int | None,
        typer.Option("--recent", help="Triage N most recent open issues."),
    ] = None,
    json_output: Annotated[bool, typer.Option("--json", help="Output JSON only.")] = False,
    mock: Annotated[bool, typer.Option("--mock", help="Offline mock mode (no API calls).")] = False,
    mock_llm: Annotated[
        bool,
        typer.Option(
            "--mock-llm",
            help="Fetch real GitHub issues but use offline mock LLM (no XAI_API_KEY needed).",
        ),
    ] = False,
    dry_run: Annotated[
        bool,
        typer.Option("--dry-run", help="Analyze only; never write to GitHub."),
    ] = False,
    confirm: Annotated[
        bool,
        typer.Option("--confirm", help="Confirm GitHub write actions."),
    ] = False,
    privacy_insight: Annotated[
        bool,
        typer.Option(
            "--privacy-insight/--no-privacy-insight",
            help="Flag possible PHI/DICOM concerns.",
        ),
    ] = True,
    safeguards: Annotated[
        bool,
        typer.Option(
            "--safeguards/--no-safeguards",
            help="Run local draft safeguard heuristics (default: on).",
        ),
    ] = True,
    llm_provider: Annotated[
        str | None,
        typer.Option(
            "--llm-provider",
            help=(
                "LLM provider: grok, anthropic, or openai (default: grok or WEAVERX_LLM_PROVIDER)."
            ),
        ),
    ] = None,
    post_comment: Annotated[
        bool,
        typer.Option("--post-comment", help="Post draft response (requires --confirm)."),
    ] = False,
    apply_labels: Annotated[
        bool,
        typer.Option("--apply-labels", help="Apply suggested labels (requires --confirm)."),
    ] = False,
    verbose: Annotated[bool, typer.Option("--verbose", "-v", help="Verbose output.")] = False,
) -> None:
    """Triage one issue or a batch of recent issues."""
    setup_logging(verbose)

    # Exactly one of --issue / --recent selects what to triage.
    if issue is None and recent is None:
        err_console.print("[red]Provide --issue N or --recent N.[/red]")
        raise typer.Exit(code=1)

    if issue is not None and recent is not None:
        err_console.print("[red]Use either --issue or --recent, not both.[/red]")
        raise typer.Exit(code=1)

    # A batch size of zero or less cannot name "the N most recent issues";
    # reject it here instead of passing it on to the orchestrator.
    if recent is not None and recent < 1:
        err_console.print("[red]--recent must be a positive integer.[/red]")
        raise typer.Exit(code=1)

    if mock and mock_llm:
        err_console.print("[red]Use either --mock or --mock-llm, not both.[/red]")
        raise typer.Exit(code=1)

    options = TriageOptions(
        repo=repo,
        issue_number=issue,
        recent=recent,
        mock=mock,
        mock_llm=mock_llm,
        # Dry-run is implied unless a write action was explicitly requested.
        dry_run=dry_run or not (post_comment or apply_labels),
        confirm=confirm,
        privacy_insight=privacy_insight,
        safeguards=safeguards,
        llm_provider=llm_provider,
        post_comment=post_comment,
        apply_labels=apply_labels,
    )

    try:
        orchestrator = build_orchestrator(
            mock=mock,
            mock_llm=mock_llm,
            llm_provider=llm_provider,
        )

        if recent is not None:
            results = orchestrator.triage_recent(options)
            if json_output:
                _output_json([r.to_dict() for r in results])
            else:
                _render_batch_summary(results)
                # In verbose mode the summary is followed by full detail.
                if verbose:
                    for r in results:
                        render_triage_result(r, verbose=True)
        else:
            result = orchestrator.triage_one(options)
            if json_output:
                _output_json(result.to_dict())
            else:
                render_triage_result(result, verbose=verbose)

    except ValueError as exc:
        err_console.print(f"[red]Configuration error:[/red] {exc}")
        raise typer.Exit(code=1) from exc
    except Exception as exc:
        # Top-level boundary: report the failure, optionally with traceback.
        err_console.print(f"[red]Triage failed:[/red] {exc}")
        if verbose:
            err_console.print_exception()
        raise typer.Exit(code=2) from exc
315
+
316
+
317
def run() -> None:
    """Invoke the Typer application object ``app``."""
    app()
319
+
320
+
321
+ if __name__ == "__main__":
322
+ run()
@@ -0,0 +1,66 @@
1
+ """Supportive draft response refinement for medical AI issues."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from weaverx.categories import CATEGORY_BY_SLUG
6
+ from weaverx.github import GitHubIssue
7
+ from weaverx.llm import TriageAnalysis
8
+
9
+
10
+ def _opening_line(issue: GitHubIssue) -> str:
11
+ author = issue.user or "there"
12
+ return f"Hi @{author} - thank you for sharing this with the community."
13
+
14
+
15
def _category_guidance(slug: str) -> str:
    """Look up the framing text for a category.

    Args:
        slug: Category slug to look up in ``CATEGORY_BY_SLUG``.

    Returns:
        The category's ``healer_framing``, or an empty string when the slug
        is unknown.
    """
    matched = CATEGORY_BY_SLUG.get(slug)
    return matched.healer_framing if matched else ""
20
+
21
+
22
+ def _privacy_addendum(flags: list[str]) -> str:
23
+ if not flags:
24
+ return ""
25
+ return (
26
+ "\n\n---\n"
27
+ "**Privacy note:** Please avoid sharing patient identifiers, DICOM UIDs tied to "
28
+ "individuals, or other PHI in public issues. De-identified logs, synthetic data, "
29
+ "or private maintainer channels are safer ways to share sensitive details."
30
+ )
31
+
32
+
33
+ def _repro_checklist() -> str:
34
+ return (
35
+ "\n\n**Repro checklist (helps us help you faster):**\n"
36
+ "- Package versions (PyTorch, MONAI, nnU-Net, CUDA)\n"
37
+ "- Config files or trainer/plan identifiers\n"
38
+ "- Random seed and data split/fold\n"
39
+ "- Minimal command or notebook snippet"
40
+ )
41
+
42
+
43
def refine_draft(issue: GitHubIssue, analysis: TriageAnalysis) -> str:
    """Polish the LLM draft so it is supportive and has practical next steps.

    A strong draft is only lightly augmented, never replaced:

    * Drafts shorter than 80 characters are rebuilt from the greeting, the
      impact summary and the category guidance.
    * Reproducibility issues get a repro checklist unless the draft already
      mentions "version".
    * Privacy-related issues get a privacy note unless the draft already
      mentions "phi" or "patient".
    * A greeting is prepended when the draft does not start with "Hi".

    Args:
        issue: The issue being answered (used for the greeting).
        analysis: Triage analysis carrying the draft, category, impact
            summary and privacy flags.

    Returns:
        The refined draft with surrounding whitespace stripped.
    """
    draft = analysis.draft_response.strip()
    if len(draft) < 80:
        draft = (
            f"{_opening_line(issue)}\n\n"
            f"{analysis.impact_summary}\n\n"
            f"{_category_guidance(analysis.category)}"
        )

    if analysis.category == "reproducibility-environment" and "version" not in draft.lower():
        draft += _repro_checklist()

    is_privacy_category = analysis.category == "privacy-compliance-dicom"
    needs_privacy = is_privacy_category or bool(analysis.privacy_flags)
    if needs_privacy and "phi" not in draft.lower() and "patient" not in draft.lower():
        # _privacy_addendum returns "" for an empty flag list, which used to
        # drop the note when only the category (not a flag) triggered it.
        # Pass a non-empty marker so the note is always appended here.
        draft += _privacy_addendum(analysis.privacy_flags or ["privacy-compliance-dicom"])

    if not draft.startswith("Hi"):
        draft = f"{_opening_line(issue)}\n\n{draft}"

    return draft.strip()
@@ -0,0 +1,78 @@
1
+ """Lightweight duplicate issue detection without heavy ML dependencies."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import re
6
+ from difflib import SequenceMatcher
7
+
8
+ from weaverx.categories import MED_AI_KEYWORDS
9
+ from weaverx.github import GitHubIssue
10
+ from weaverx.llm import DuplicateMatch
11
+
12
+
13
def normalize_text(text: str) -> str:
    """Reduce Markdown-flavoured text to a lowercase, comparison-friendly form.

    The steps run in order: lowercase, drop fenced code blocks, drop inline
    code spans, keep only the label of Markdown links, replace every
    character outside ``a-z 0-9 - _ /`` and whitespace with a space, then
    collapse whitespace runs and trim.

    Args:
        text: Raw issue title or body.

    Returns:
        The normalized string (possibly empty).
    """
    # (pattern, replacement, flags), applied in sequence.
    substitutions = (
        (r"```.*?```", " ", re.DOTALL),
        (r"`[^`]+`", " ", 0),
        (r"\[([^\]]+)\]\([^)]+\)", r"\1", 0),
        (r"[^a-z0-9\s\-_/]", " ", 0),
        (r"\s+", " ", 0),
    )
    cleaned = text.lower()
    for pattern, replacement, flags in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned, flags=flags)
    return cleaned.strip()
21
+
22
+
23
def _keyword_overlap(a: str, b: str) -> float:
    """Return the Jaccard overlap of the domain keywords found in two texts.

    Each text is normalized and split on whitespace; only tokens present in
    ``MED_AI_KEYWORDS`` are kept.

    Args:
        a: First text.
        b: Second text.

    Returns:
        ``|A & B| / |A | B|`` over the keyword sets, or ``0.0`` when either
        text contains no known keyword.
    """

    def keywords_in(raw: str) -> frozenset[str]:
        return MED_AI_KEYWORDS.intersection(normalize_text(raw).split())

    found_a = keywords_in(a)
    found_b = keywords_in(b)
    if not (found_a and found_b):
        return 0.0
    shared = found_a & found_b
    combined = found_a | found_b
    return len(shared) / max(len(combined), 1)
30
+
31
+
32
def similarity_score(source: GitHubIssue, candidate: GitHubIssue) -> float:
    """Score how likely *candidate* duplicates *source*.

    The score is a weighted blend: 55% title similarity, 30% similarity of
    the first 500 normalized body characters, and 15% domain-keyword
    overlap across title and body.

    Args:
        source: Issue being triaged.
        candidate: Issue to compare against.

    Returns:
        A value capped at 1.0; ``0.0`` when both issues share a number
        (an issue is never its own duplicate).
    """
    if source.number == candidate.number:
        return 0.0

    def ratio(left: str, right: str) -> float:
        return SequenceMatcher(None, left, right).ratio()

    src_body = normalize_text(source.body)[:500]
    cand_body = normalize_text(candidate.body)[:500]

    title_ratio = ratio(normalize_text(source.title), normalize_text(candidate.title))
    # An empty body on either side contributes nothing.
    body_ratio = ratio(src_body, cand_body) if (src_body and cand_body) else 0.0
    keyword_boost = _keyword_overlap(
        f"{source.title} {source.body}",
        f"{candidate.title} {candidate.body}",
    )

    combined = (0.55 * title_ratio) + (0.30 * body_ratio) + (0.15 * keyword_boost)
    return min(1.0, combined)
50
+
51
+
52
def find_duplicates(
    issue: GitHubIssue,
    candidates: list[GitHubIssue],
    *,
    top_k: int = 3,
    min_score: float = 0.25,
) -> list[DuplicateMatch]:
    """Return the most similar candidate issues, best match first.

    Args:
        issue: Issue being triaged.
        candidates: Issues to compare against.
        top_k: Maximum number of matches to return.
        min_score: Minimum similarity a candidate needs to be included.

    Returns:
        Up to ``top_k`` matches sorted by descending score; candidates with
        equal scores keep their input order.
    """
    scored_pairs = ((other, similarity_score(issue, other)) for other in candidates)
    matches = [
        DuplicateMatch(
            issue_number=other.number,
            title=other.title,
            score=score,
            url=other.html_url,
        )
        for other, score in scored_pairs
        if score >= min_score
    ]
    ranked = sorted(matches, key=lambda m: m.score, reverse=True)
    return ranked[:top_k]
73
+
74
+
75
def best_duplicate_score(matches: list[DuplicateMatch]) -> float:
    """Return the score of the first match, or ``0.0`` for an empty list.

    The first element is taken as-is; the list is not re-sorted here.

    Args:
        matches: Duplicate matches.

    Returns:
        ``matches[0].score`` when *matches* is non-empty, else ``0.0``.
    """
    return matches[0].score if matches else 0.0