clean-code-tools 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77) hide show
  1. package/README.md +66 -0
  2. package/configs/eslint.clean-code.recommended.mjs +211 -0
  3. package/configs/python.clean-code.pyproject.toml +143 -0
  4. package/data/clean-code-patterns.jsonl +264 -0
  5. package/data/vector-record.schema.json +77 -0
  6. package/docs/README.md +29 -0
  7. package/docs/eslint-custom-rules.md +74 -0
  8. package/docs/eslint-recommended-config.md +87 -0
  9. package/docs/fastmcp-local-server.md +104 -0
  10. package/docs/publishing.md +125 -0
  11. package/docs/python-lint-recommended-config.md +57 -0
  12. package/docs/python-pylint-custom-rules.md +77 -0
  13. package/docs/semantic-weaviate.md +80 -0
  14. package/docs/static-trigger-semantic-review.md +97 -0
  15. package/evals/clean-code-retrieval.jsonl +13 -0
  16. package/ops/dev/weaviate/README.md +34 -0
  17. package/ops/dev/weaviate/compose.yaml +34 -0
  18. package/ops/dev/weaviate/smoke.sh +28 -0
  19. package/package.json +96 -0
  20. package/pyproject.toml +303 -0
  21. package/sample-apps/README.md +40 -0
  22. package/sample-apps/python-app/pyproject.toml +113 -0
  23. package/sample-apps/python-app/src/clean_pricing.py +10 -0
  24. package/sample-apps/python-app/src/smelly_pricing.py +8 -0
  25. package/sample-apps/ts-backend/eslint.config.mjs +3 -0
  26. package/sample-apps/ts-backend/package.json +18 -0
  27. package/sample-apps/ts-backend/src/clean-handler.ts +19 -0
  28. package/sample-apps/ts-backend/src/smelly-handler.ts +29 -0
  29. package/sample-apps/ts-backend/tsconfig.json +9 -0
  30. package/sample-apps/ts-frontend/eslint.config.mjs +3 -0
  31. package/sample-apps/ts-frontend/package.json +18 -0
  32. package/sample-apps/ts-frontend/src/CleanWidget.tsx +18 -0
  33. package/sample-apps/ts-frontend/src/SmellyWidget.tsx +27 -0
  34. package/sample-apps/ts-frontend/tsconfig.json +10 -0
  35. package/scripts/_mcp_app.py +21 -0
  36. package/scripts/check_clean_code_review_candidates.py +302 -0
  37. package/scripts/check_fastmcp_server.py +106 -0
  38. package/scripts/check_packages.py +137 -0
  39. package/scripts/check_python_config.py +130 -0
  40. package/scripts/check_repo_python_lint.py +46 -0
  41. package/scripts/check_retrieval_evals.py +132 -0
  42. package/scripts/check_sample_apps.py +169 -0
  43. package/scripts/check_semantic_search_tooling.py +102 -0
  44. package/scripts/clean_code_eslint_triggers.py +272 -0
  45. package/scripts/clean_code_mcp_server.py +7 -0
  46. package/scripts/clean_code_python_triggers.py +318 -0
  47. package/scripts/clean_code_review_candidates.py +291 -0
  48. package/scripts/clean_code_review_io.py +36 -0
  49. package/scripts/clean_code_review_models.py +43 -0
  50. package/scripts/clean_code_semantic.py +27 -0
  51. package/scripts/set_package_versions.py +82 -0
  52. package/scripts/weaviate_ingest_clean_code.py +44 -0
  53. package/scripts/weaviate_search_clean_code.py +51 -0
  54. package/skills/clean-code-mcp-reviewer/SKILL.md +209 -0
  55. package/skills/clean-code-mcp-reviewer/evals/evals.json +30 -0
  56. package/src/js/eslint-plugin-clean-code.mjs +758 -0
  57. package/src/python/clean_code_tools_pylint/__init__.py +14 -0
  58. package/src/python/clean_code_tools_pylint/ast_checker.py +122 -0
  59. package/src/python/clean_code_tools_pylint/comments.py +83 -0
  60. package/src/python/clean_code_tools_pylint/helpers.py +196 -0
  61. package/src/python/mcp_server/__init__.py +1 -0
  62. package/src/python/mcp_server/corpus.py +160 -0
  63. package/src/python/mcp_server/markdown.py +126 -0
  64. package/src/python/mcp_server/models.py +73 -0
  65. package/src/python/mcp_server/ranking.py +125 -0
  66. package/src/python/mcp_server/ranking_scoring.py +232 -0
  67. package/src/python/mcp_server/semantic.py +192 -0
  68. package/src/python/mcp_server/server.py +235 -0
  69. package/src/python/mcp_server/server_payloads.py +83 -0
  70. package/src/python/mcp_server/text.py +104 -0
  71. package/src/python/mcp_server/utils/__init__.py +1 -0
  72. package/src/python/mcp_server/utils/httpx_loader.py +14 -0
  73. package/src/python/mcp_server/utils/increment.py +7 -0
  74. package/src/python/mcp_server/utils/sha256_text.py +8 -0
  75. package/src/python/mcp_server/utils/unique_strings.py +15 -0
  76. package/src/python/mcp_server/weaviate.py +182 -0
  77. package/uv.lock +2012 -0
@@ -0,0 +1,126 @@
1
+ #!/usr/bin/env python3
2
+ from __future__ import annotations
3
+
4
+ import re
5
+ from pathlib import Path
6
+
7
+ from mcp_server.models import MarkdownSection
8
+ from mcp_server.text import approximate_tokens, detected_record_id, slugless
9
+ from mcp_server.utils.unique_strings import unique_strings
10
+
11
# ATX-style markdown heading: one to six "#" marks, whitespace, then the title
# (captured without trailing whitespace).
HEADING_RE = re.compile(r"^(#{1,6})\s+(.+?)\s*$")
# A line that opens or closes a fenced code block (leading whitespace allowed).
CODE_FENCE_RE = re.compile(r"^\s*```")
# Bodies whose token estimate is at or below this value are never split.
MAX_SECTION_TOKENS = 1_200
# Soft budget per chunk once an oversized body is being split.
TARGET_SECTION_TOKENS = 850
15
+
16
+
17
+ def markdown_sections(path: Path, *, root: Path) -> list[MarkdownSection]:
18
+ # pylint: disable=too-many-locals
19
+ relative_path = path.relative_to(root).as_posix()
20
+ lines = path.read_text().splitlines()
21
+ sections: list[MarkdownSection] = []
22
+ stack: list[str] = []
23
+ current_heading = path.stem
24
+ current_path = (path.stem,)
25
+ current_start = 1
26
+ current_body: list[str] = []
27
+ in_code = False
28
+
29
+ def flush(end_line: int) -> None:
30
+ body = "\n".join(current_body).strip()
31
+ if body:
32
+ sections.append(
33
+ MarkdownSection(
34
+ source_file=relative_path,
35
+ section_path=tuple(current_path),
36
+ heading=current_heading,
37
+ body=body,
38
+ start_line=current_start,
39
+ end_line=end_line,
40
+ )
41
+ )
42
+
43
+ for line_number, line in enumerate(lines, start=1):
44
+ if CODE_FENCE_RE.match(line):
45
+ in_code = not in_code
46
+ current_body.append(line)
47
+ continue
48
+ heading = HEADING_RE.match(line) if not in_code else None
49
+ if heading:
50
+ flush(line_number - 1)
51
+ level = len(heading.group(1))
52
+ text = heading.group(2).strip()
53
+ stack = stack[: level - 1]
54
+ stack.append(text)
55
+ current_heading = text
56
+ current_path = tuple(stack)
57
+ current_start = line_number
58
+ current_body = [line]
59
+ continue
60
+ current_body.append(line)
61
+
62
+ flush(len(lines))
63
+ return sections
64
+
65
+
66
def split_section_body(body: str) -> list[str]:
    """Break an oversized section body into smaller chunks.

    A body whose token estimate is within ``MAX_SECTION_TOKENS`` is returned
    unchanged as a single-element list. Otherwise its semantic blocks are
    packed greedily: a new chunk starts whenever adding the next block would
    push a non-empty chunk past ``TARGET_SECTION_TOKENS``. A single block that
    is larger than the target still forms a chunk on its own.
    """
    if approximate_tokens(body) <= MAX_SECTION_TOKENS:
        return [body]

    groups: list[list[str]] = [[]]
    running_total = 0
    for piece in semantic_blocks(body):
        cost = approximate_tokens(piece)
        if groups[-1] and running_total + cost > TARGET_SECTION_TOKENS:
            groups.append([])
            running_total = 0
        groups[-1].append(piece)
        running_total += cost

    # Blocks inside a chunk are separated by a blank line; empty chunks are dropped.
    joined = ("\n\n".join(group).strip() for group in groups if group)
    return [text for text in joined if text]
84
+
85
+
86
def semantic_blocks(body: str) -> list[str]:
    """Split text into blank-line-separated blocks, keeping fenced code intact.

    Blank lines outside a fenced code block end the current block. Blank lines
    inside a fence are kept, so a fenced block is never split.
    """
    result: list[str] = []
    pending: list[str] = []
    inside_fence = False

    def close_block() -> None:
        # Emit the lines gathered so far (if any) as one block.
        if pending:
            result.append("\n".join(pending).strip())
            pending.clear()

    for raw in body.splitlines():
        if CODE_FENCE_RE.match(raw):
            inside_fence = not inside_fence
        elif not inside_fence and not raw.strip():
            close_block()
            continue
        pending.append(raw)

    close_block()
    return result
104
+
105
+
106
def infer_markdown_rule_family(section: MarkdownSection) -> str:
    """Guess a rule family from the words in a section's heading path.

    The heading path is joined, lower-cased, and scanned for keyword
    substrings. Families are checked in a fixed order and the first match
    wins. ``"documentation"`` is returned when nothing matches.
    """
    heading_trail = " ".join(section.section_path).lower()
    # Ordered by priority. "naming" is listed explicitly because the substring
    # "name" does not occur in the word "naming".
    keyword_families = (
        (("eslint", "ruff", "lint"), "linting"),
        (("comment",), "comments"),
        (("test",), "tests"),
        (("function", "argument"), "functions"),
        (("name", "naming"), "naming"),
    )
    for keywords, family in keyword_families:
        if any(keyword in heading_trail for keyword in keywords):
            return family
    return "documentation"
119
+
120
+
121
def markdown_aliases(section: MarkdownSection) -> tuple[str, ...]:
    """Collect de-duplicated search aliases for a markdown section.

    Candidates are the heading, every entry of the heading path, and the
    record id detected in the heading when there is one. Blank candidates are
    skipped; the rest are passed through ``slugless`` before de-duplication.
    """
    candidates = [section.heading]
    candidates.extend(section.section_path)
    detected = detected_record_id(section.heading)
    if detected:
        candidates.append(detected)
    normalized = [slugless(candidate) for candidate in candidates if candidate.strip()]
    return tuple(unique_strings(normalized))
@@ -0,0 +1,73 @@
1
+ #!/usr/bin/env python3
2
+ from __future__ import annotations
3
+
4
+ from dataclasses import dataclass
5
+ from datetime import UTC, datetime
6
+ from typing import Any, Literal
7
+
8
# Provenance values stamped on every serialized chunk.
CHUNKER_VERSION = "clean-code-semantic-v1"
DEFAULT_EMBEDDING_MODEL = "BAAI/bge-small-en-v1.5"
DEFAULT_EMBEDDING_PROVIDER = "fastembed/cpu"

# Loose alias for JSON-shaped payloads passed between modules.
JsonDict = dict[str, Any]
# The three confidence labels attached to ranked results.
Confidence = Literal["high", "medium", "low"]
14
+
15
+
16
+ @dataclass(frozen=True, slots=True)
17
+ class CleanCodeChunk: # pylint: disable=too-many-instance-attributes
18
+ chunk_id: str
19
+ object_id: str
20
+ source_file: str
21
+ source_kind: str
22
+ record_id: str
23
+ title: str
24
+ topic: str
25
+ section_path: tuple[str, ...]
26
+ chunk_kind: str
27
+ chunk_index: int
28
+ rule_family: str
29
+ lintability: str
30
+ aliases: tuple[str, ...]
31
+ languages: tuple[str, ...]
32
+ lint_candidates: tuple[str, ...]
33
+ content_text: str
34
+ embedding_text: str
35
+ display_text: str
36
+ text_hash: str
37
+
38
+ @property
39
+ def properties(self) -> JsonDict:
40
+ return {
41
+ "chunkId": self.chunk_id,
42
+ "sourceFile": self.source_file,
43
+ "sourceKind": self.source_kind,
44
+ "recordId": self.record_id,
45
+ "title": self.title,
46
+ "topic": self.topic,
47
+ "sectionPath": list(self.section_path),
48
+ "chunkKind": self.chunk_kind,
49
+ "chunkIndex": self.chunk_index,
50
+ "ruleFamily": self.rule_family,
51
+ "lintability": self.lintability,
52
+ "aliases": list(self.aliases),
53
+ "languages": list(self.languages),
54
+ "lintCandidates": list(self.lint_candidates),
55
+ "contentText": self.content_text,
56
+ "embeddingText": self.embedding_text,
57
+ "displayText": self.display_text,
58
+ "textHash": self.text_hash,
59
+ "chunkerVersion": CHUNKER_VERSION,
60
+ "embeddingModel": DEFAULT_EMBEDDING_MODEL,
61
+ "embeddingProvider": DEFAULT_EMBEDDING_PROVIDER,
62
+ "createdAt": datetime.now(UTC).isoformat(),
63
+ }
64
+
65
+
66
+ @dataclass(frozen=True, slots=True)
67
+ class MarkdownSection:
68
+ source_file: str
69
+ section_path: tuple[str, ...]
70
+ heading: str
71
+ body: str
72
+ start_line: int
73
+ end_line: int
@@ -0,0 +1,125 @@
1
+ #!/usr/bin/env python3
2
+ from __future__ import annotations
3
+
4
+ from mcp_server.corpus import build_chunks
5
+ from mcp_server.models import DEFAULT_EMBEDDING_MODEL, JsonDict
6
+ from mcp_server.ranking_scoring import (
7
+ dedupe_pattern_results,
8
+ distance_for,
9
+ row_matches_filters,
10
+ score_pattern_row,
11
+ )
12
+ from mcp_server.text import (
13
+ lexical_score,
14
+ query_tokens,
15
+ )
16
+ from mcp_server.weaviate import COLLECTION_NAME, search_chunks
17
+
18
+
19
def search_pattern_records(  # noqa: PLR0913 # pylint: disable=too-many-arguments,too-many-locals
    *,
    query: str,
    url: str,
    collection_name: str = COLLECTION_NAME,
    model_name: str = DEFAULT_EMBEDDING_MODEL,
    limit: int = 8,
    language: str = "any",
    rule_families: tuple[str, ...] = (),
    topics: tuple[str, ...] = (),
    lintability: tuple[str, ...] = (),
    source_kinds: tuple[str, ...] = ("clean_code_pattern",),
) -> JsonDict:
    """Run a vector search for ``query`` and rank the hits.

    The vector search is asked for ``max(limit * 4, 25)`` rows so that
    filtering and de-duplication in ``rank_pattern_rows`` still have enough
    candidates to fill ``limit`` results. All filter arguments are forwarded
    to ``rank_pattern_rows`` unchanged, and its payload is returned as is.
    """
    candidates = search_chunks(
        query=query,
        url=url,
        collection_name=collection_name,
        model_name=model_name,
        limit=max(limit * 4, 25),
    )
    ranking_options = {
        "limit": limit,
        "language": language,
        "rule_families": rule_families,
        "topics": topics,
        "lintability": lintability,
        "source_kinds": source_kinds,
    }
    return rank_pattern_rows(query=query, vector_rows=candidates, **ranking_options)
50
+
51
+
52
def rank_pattern_rows(  # noqa: PLR0913 # pylint: disable=too-many-arguments,too-many-locals
    *,
    query: str,
    vector_rows: list[JsonDict],
    limit: int = 8,
    language: str = "any",
    rule_families: tuple[str, ...] = (),
    topics: tuple[str, ...] = (),
    lintability: tuple[str, ...] = (),
    source_kinds: tuple[str, ...] = ("clean_code_pattern",),
) -> JsonDict:
    """Merge, filter, score and de-duplicate candidate rows for ``query``.

    Vector hits are combined with local keyword hits and keyed by ``chunkId``
    (rows without one are ignored). Rows that pass the filters are scored,
    sorted by descending score, collapsed per pattern, and cut to ``limit``.

    Returns:
        A mapping with ``query``, ``filters_applied``, ``results`` and
        ``no_strong_match``. The last is true when there are no results or the
        top result's confidence is not ``"high"``.
    """
    merged: dict[str, JsonDict] = {}
    for batch in (vector_rows, local_keyword_rows(query)):
        for candidate in batch:
            key = str(candidate.get("chunkId", ""))
            if not key:
                continue
            merged[key] = merge_search_rows(merged.get(key), candidate)

    filters = {
        "language": language,
        "rule_families": rule_families,
        "topics": topics,
        "lintability": lintability,
        "source_kinds": source_kinds,
    }
    scored: list[JsonDict] = [
        score_pattern_row(candidate, query=query, filters=filters)
        for candidate in merged.values()
        if row_matches_filters(candidate, filters)
    ]
    scored.sort(key=lambda item: float(item["score"]), reverse=True)

    top = dedupe_pattern_results(scored)[:limit]
    has_strong_match = bool(top) and top[0]["confidence"] == "high"
    return {
        "query": query,
        "filters_applied": filters,
        "results": top,
        "no_strong_match": not has_strong_match,
    }
91
+
92
+
93
def local_keyword_rows(query: str) -> list[JsonDict]:
    """Return locally built chunks that share at least one term with ``query``.

    Each chunk is matched against its identifying metadata plus the first 600
    characters of its content. Matches are returned as property mappings with
    the chunk's object id attached under ``_additional.id``. A query that
    yields no tokens produces an empty list.
    """
    terms = set(query_tokens(query))
    if not terms:
        return []

    def searchable_text(chunk) -> str:
        # Metadata first, then a bounded prefix of the content.
        parts = (
            chunk.record_id,
            chunk.title,
            chunk.topic,
            chunk.rule_family,
            chunk.lintability,
            " ".join(chunk.aliases),
            " ".join(chunk.lint_candidates),
            chunk.content_text[:600],
        )
        return " ".join(parts)

    return [
        chunk.properties | {"_additional": {"id": chunk.object_id}}
        for chunk in build_chunks()
        if lexical_score(terms, searchable_text(chunk)) > 0
    ]
114
+
115
+
116
def merge_search_rows(existing: JsonDict | None, row: JsonDict) -> JsonDict:
    """Choose which of two rows for the same chunk to keep.

    The new ``row`` wins when there is no existing row, when the existing row
    has no distance, or when ``row`` has a strictly smaller distance.
    Otherwise the existing row is kept.
    """
    if existing is None:
        return row
    kept_distance = distance_for(existing)
    new_distance = distance_for(row)
    if kept_distance is None:
        return row
    if new_distance is not None and new_distance < kept_distance:
        return row
    return existing
@@ -0,0 +1,232 @@
1
+ #!/usr/bin/env python3
2
+ from __future__ import annotations
3
+
4
+ from mcp_server.models import Confidence, JsonDict
5
+ from mcp_server.text import (
6
+ lexical_score,
7
+ query_tokens,
8
+ searchable_row_text,
9
+ semantic_similarity,
10
+ )
11
+ from mcp_server.utils.unique_strings import unique_strings
12
+
13
# Score cut-offs used by confidence_for(); scores are clamped to [0, 1].
# With a conservative-context reason, reaching this score yields "medium".
CONSERVATIVE_CONTEXT_THRESHOLD = 0.80
HIGH_CONFIDENCE_THRESHOLD = 0.72
# Lower bar for "high" when exact terms matched and the semantic score is positive.
EXACT_HIGH_CONFIDENCE_THRESHOLD = 0.40
MEDIUM_CONFIDENCE_THRESHOLD = 0.45

# Tokens ignored when deciding whether a value or query carries meaningful terms.
# NOTE(review): "thi" looks like a stemmed form of "this" - confirm against query_tokens.
EXACT_MATCH_STOPWORDS = {
    "a", "an", "and", "be", "by", "code", "do", "for", "in", "is",
    "it", "of", "on", "or", "the", "thi", "this", "to", "when", "with",
}

# Normalized values that never count as an exact match.
NON_SPECIFIC_EXACT_ALIASES = {
    "clean code", "code smell", "comments",
    "error handling", "function smell", "functions",
    "function design", "planning guidance", "refactoring rule",
}

# A query made up solely of these terms is treated as a broad category query.
BROAD_CATEGORY_QUERY_TERMS = {
    "clean", "code", "comment", "comments", "error", "function", "handling",
    "guidance", "planning", "python", "refactoring", "rule", "smell", "typescript",
}

# A query made up solely of these terms is treated as vague.
VAGUE_QUERY_TERMS = {"stuff", "thing", "things"}
67
+
68
+
69
def row_matches_filters(row: JsonDict, filters: JsonDict) -> bool:
    """Tell whether a raw search row satisfies every active filter.

    An empty filter collection places no constraint. The language filter
    ignores ``""`` and ``"any"``. ``"both"`` requires the row to list both
    ``"typescript"`` and ``"python"``. Any other value must be one of the
    row's languages.
    """

    def allowed(filter_key: str, row_key: str) -> bool:
        # An empty filter accepts everything; otherwise the row value must be listed.
        accepted = {str(value) for value in filters[filter_key]}
        return not accepted or str(row.get(row_key, "")) in accepted

    if not allowed("source_kinds", "sourceKind"):
        return False

    wanted = str(filters["language"])
    available = {str(value) for value in row.get("languages", [])}
    if wanted == "both":
        if not {"typescript", "python"} <= available:
            return False
    elif wanted not in {"", "any"} and wanted not in available:
        return False

    return (
        allowed("rule_families", "ruleFamily")
        and allowed("topics", "topic")
        and allowed("lintability", "lintability")
    )
91
+
92
+
93
def score_pattern_row(row: JsonDict, *, query: str, filters: JsonDict) -> JsonDict:
    """Score one raw search row and shape it into a result payload.

    The score blends semantic similarity (weight 0.60) and keyword overlap
    (weight 0.25), adds the metadata boost, subtracts the context penalty, and
    is clamped to ``[0, 1]``. The payload carries the row's identifying
    fields, a whitespace-collapsed snippet of at most 500 characters, the
    confidence label, the match reasons and a rounded score breakdown.
    """
    terms = set(query_tokens(query))
    distance = distance_for(row)
    semantic = semantic_similarity(distance)
    keyword = lexical_score(terms, searchable_row_text(row))
    boost, boost_reasons = metadata_boost_and_reasons(row, filters, terms)
    penalty, penalty_reasons = context_penalty_and_reasons(terms)
    reasons = [*boost_reasons, *penalty_reasons]

    blended = semantic * 0.60 + keyword * 0.25 + boost - penalty
    score = max(0.0, min(1.0, blended))
    confidence = confidence_for(score, reasons, semantic_score=semantic)

    source_kind = row.get("sourceKind", "")
    collapsed_text = " ".join(str(row.get("contentText", "")).split())
    provenance = {
        "source_file": row.get("sourceFile", ""),
        "source_kind": row.get("sourceKind", ""),
        "text_hash": row.get("textHash", ""),
    }
    breakdown = {
        "semantic_score": round(semantic, 4),
        "keyword_score": round(keyword, 4),
        "metadata_boost": round(boost, 4),
        "context_penalty": round(penalty, 4),
    }
    return {
        "pattern_id": row.get("recordId", ""),
        "chunk_id": row.get("chunkId", ""),
        "title": row.get("title", ""),
        "topic": row.get("topic", ""),
        "rule_family": row.get("ruleFamily", ""),
        "lintability": row.get("lintability", ""),
        "languages": row.get("languages", []),
        "aliases": row.get("aliases", []),
        "lint_candidates": row.get("lintCandidates", []),
        "source_kind": source_kind,
        "chunk_kind": row.get("chunkKind", ""),
        "section_path": row.get("sectionPath", []),
        "score": round(score, 4),
        "confidence": confidence,
        "distance": distance,
        "match_reasons": reasons,
        "snippet": collapsed_text[:500],
        "source": provenance,
        "score_breakdown": breakdown,
    }
133
+
134
+
135
def dedupe_pattern_results(rows: list[JsonDict]) -> list[JsonDict]:
    """Keep the best-scoring result per pattern, ordered by descending score.

    Results are grouped by ``pattern_id``, falling back to ``chunk_id`` when
    the pattern id is missing or empty. On a score tie within a group the
    earliest row is kept.
    """

    def score_of(item: JsonDict) -> float:
        return float(item["score"])

    best: dict[str, JsonDict] = {}
    for candidate in rows:
        key = str(candidate.get("pattern_id") or candidate.get("chunk_id"))
        incumbent = best.get(key)
        if incumbent is None or score_of(candidate) > score_of(incumbent):
            best[key] = candidate
    return sorted(best.values(), key=score_of, reverse=True)
142
+
143
+
144
def metadata_boost_and_reasons(
    row: JsonDict,
    filters: JsonDict,
    query_terms: set[str],
) -> tuple[float, list[str]]:
    """Compute the metadata-based score boost for a row and explain it.

    Boosts: 0.05 for a canonical clean-code pattern, 0.05 when a specific
    language filter matches, 0.05 when the row's lintability is among the
    requested values, and 0.15 for an exact term match. The total is capped
    at 0.30. When nothing applies the single reason is "semantic similarity".
    """
    total = 0.0
    notes: list[str] = []

    def award(amount: float, note: str) -> None:
        nonlocal total
        total += amount
        notes.append(note)

    if str(row.get("sourceKind", "")) == "clean_code_pattern":
        award(0.05, "canonical clean-code pattern")

    wanted_language = str(filters["language"])
    row_languages = {str(value) for value in row.get("languages", [])}
    language_is_specific = wanted_language not in {"", "any"}
    if language_is_specific and (wanted_language == "both" or wanted_language in row_languages):
        award(0.05, f"language matched: {wanted_language}")

    row_lintability = str(row.get("lintability", ""))
    if row_lintability and row_lintability in set(filters["lintability"]):
        award(0.05, f"lintability matched: {row_lintability}")

    exact_hits = exact_match_terms(row, query_terms)
    if exact_hits:
        # Only the first four matches are listed in the reason text.
        award(0.15, f"matched exact terms: {', '.join(exact_hits[:4])}")

    if not notes:
        notes.append("semantic similarity")
    return min(total, 0.30), notes
175
+
176
+
177
def context_penalty_and_reasons(query_terms: set[str]) -> tuple[float, list[str]]:
    """Return the penalty for queries that call for conservative matching.

    A flat 0.35 penalty applies when at least one of these holds: the query
    mentions generated/fixture/test terms, its non-stopword terms are all
    broad category words, they are all vague words, or it contains "todo",
    "tracked" and "issue" together. Otherwise the result is ``(0.0, [])``.
    """
    notes: list[str] = []

    cautious_hits = sorted(query_terms & {"generated", "fixture", "fixtures", "test", "tests"})
    if cautious_hits:
        notes.append(f"conservative context: {', '.join(cautious_hits)}")

    substantive = query_terms - EXACT_MATCH_STOPWORDS
    if substantive:
        if substantive <= BROAD_CATEGORY_QUERY_TERMS:
            notes.append("conservative context: broad category query")
        if substantive <= VAGUE_QUERY_TERMS:
            notes.append("conservative context: vague query")

    if {"todo", "tracked", "issue"}.issubset(query_terms):
        notes.append("conservative context: compliant tracked TODO")

    penalty = 0.35 if notes else 0.0
    return penalty, notes
193
+
194
+
195
def exact_match_terms(row: JsonDict, query_terms: set[str]) -> list[str]:
    """List the row's identifying values whose tokens all occur in the query.

    Candidates are the record id, the title, every alias and every lint
    candidate. A candidate is skipped when its normalized form is a
    non-specific alias, or when it has no tokens left after removing
    stopwords.

    Returns:
        The matching values, as written in the row, de-duplicated by
        ``unique_strings``.
    """
    candidates = [
        str(row.get("recordId", "")),
        str(row.get("title", "")),
        *[str(value) for value in row.get("aliases", [])],
        *[str(value) for value in row.get("lintCandidates", [])],
    ]
    matched: list[str] = []
    for value in candidates:
        # Tokenize once and reuse the result for both checks below.
        tokens = list(query_tokens(value))
        if " ".join(tokens) in NON_SPECIFIC_EXACT_ALIASES:
            continue
        terms = set(tokens)
        # A non-empty remainder after stopword removal implies `terms` is non-empty.
        if terms - EXACT_MATCH_STOPWORDS and terms <= query_terms:
            matched.append(value)
    return unique_strings(matched)
212
+
213
+
214
def distance_for(row: JsonDict) -> float | None:
    """Read the vector distance stored under ``row["_additional"]["distance"]``.

    Returns ``None`` when ``_additional`` is missing or not a dict, or when
    the distance is missing or not an int/float.
    """
    extras = row.get("_additional")
    if isinstance(extras, dict):
        raw = extras.get("distance")
        if isinstance(raw, (int, float)):
            return float(raw)
    return None
220
+
221
+
222
def confidence_for(score: float, match_reasons: list[str], *, semantic_score: float) -> Confidence:
    """Map a score and its match reasons to a confidence label.

    Any "conservative context" reason caps the label at "medium", and only
    when the score reaches ``CONSERVATIVE_CONTEXT_THRESHOLD``. Otherwise
    "high" needs either the regular high threshold, or an exact-term match
    with a positive semantic score and the lower exact-match threshold.
    """
    is_conservative = any(reason.startswith("conservative context") for reason in match_reasons)
    if is_conservative:
        return "low" if score < CONSERVATIVE_CONTEXT_THRESHOLD else "medium"

    if score >= HIGH_CONFIDENCE_THRESHOLD:
        return "high"

    matched_exactly = any(reason.startswith("matched exact terms") for reason in match_reasons)
    if matched_exactly and semantic_score > 0 and score >= EXACT_HIGH_CONFIDENCE_THRESHOLD:
        return "high"

    return "medium" if score >= MEDIUM_CONFIDENCE_THRESHOLD else "low"