clean-code-tools 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +66 -0
- package/configs/eslint.clean-code.recommended.mjs +211 -0
- package/configs/python.clean-code.pyproject.toml +143 -0
- package/data/clean-code-patterns.jsonl +264 -0
- package/data/vector-record.schema.json +77 -0
- package/docs/README.md +29 -0
- package/docs/eslint-custom-rules.md +74 -0
- package/docs/eslint-recommended-config.md +87 -0
- package/docs/fastmcp-local-server.md +104 -0
- package/docs/publishing.md +125 -0
- package/docs/python-lint-recommended-config.md +57 -0
- package/docs/python-pylint-custom-rules.md +77 -0
- package/docs/semantic-weaviate.md +80 -0
- package/docs/static-trigger-semantic-review.md +97 -0
- package/evals/clean-code-retrieval.jsonl +13 -0
- package/ops/dev/weaviate/README.md +34 -0
- package/ops/dev/weaviate/compose.yaml +34 -0
- package/ops/dev/weaviate/smoke.sh +28 -0
- package/package.json +96 -0
- package/pyproject.toml +303 -0
- package/sample-apps/README.md +40 -0
- package/sample-apps/python-app/pyproject.toml +113 -0
- package/sample-apps/python-app/src/clean_pricing.py +10 -0
- package/sample-apps/python-app/src/smelly_pricing.py +8 -0
- package/sample-apps/ts-backend/eslint.config.mjs +3 -0
- package/sample-apps/ts-backend/package.json +18 -0
- package/sample-apps/ts-backend/src/clean-handler.ts +19 -0
- package/sample-apps/ts-backend/src/smelly-handler.ts +29 -0
- package/sample-apps/ts-backend/tsconfig.json +9 -0
- package/sample-apps/ts-frontend/eslint.config.mjs +3 -0
- package/sample-apps/ts-frontend/package.json +18 -0
- package/sample-apps/ts-frontend/src/CleanWidget.tsx +18 -0
- package/sample-apps/ts-frontend/src/SmellyWidget.tsx +27 -0
- package/sample-apps/ts-frontend/tsconfig.json +10 -0
- package/scripts/_mcp_app.py +21 -0
- package/scripts/check_clean_code_review_candidates.py +302 -0
- package/scripts/check_fastmcp_server.py +106 -0
- package/scripts/check_packages.py +137 -0
- package/scripts/check_python_config.py +130 -0
- package/scripts/check_repo_python_lint.py +46 -0
- package/scripts/check_retrieval_evals.py +132 -0
- package/scripts/check_sample_apps.py +169 -0
- package/scripts/check_semantic_search_tooling.py +102 -0
- package/scripts/clean_code_eslint_triggers.py +272 -0
- package/scripts/clean_code_mcp_server.py +7 -0
- package/scripts/clean_code_python_triggers.py +318 -0
- package/scripts/clean_code_review_candidates.py +291 -0
- package/scripts/clean_code_review_io.py +36 -0
- package/scripts/clean_code_review_models.py +43 -0
- package/scripts/clean_code_semantic.py +27 -0
- package/scripts/set_package_versions.py +82 -0
- package/scripts/weaviate_ingest_clean_code.py +44 -0
- package/scripts/weaviate_search_clean_code.py +51 -0
- package/skills/clean-code-mcp-reviewer/SKILL.md +209 -0
- package/skills/clean-code-mcp-reviewer/evals/evals.json +30 -0
- package/src/js/eslint-plugin-clean-code.mjs +758 -0
- package/src/python/clean_code_tools_pylint/__init__.py +14 -0
- package/src/python/clean_code_tools_pylint/ast_checker.py +122 -0
- package/src/python/clean_code_tools_pylint/comments.py +83 -0
- package/src/python/clean_code_tools_pylint/helpers.py +196 -0
- package/src/python/mcp_server/__init__.py +1 -0
- package/src/python/mcp_server/corpus.py +160 -0
- package/src/python/mcp_server/markdown.py +126 -0
- package/src/python/mcp_server/models.py +73 -0
- package/src/python/mcp_server/ranking.py +125 -0
- package/src/python/mcp_server/ranking_scoring.py +232 -0
- package/src/python/mcp_server/semantic.py +192 -0
- package/src/python/mcp_server/server.py +235 -0
- package/src/python/mcp_server/server_payloads.py +83 -0
- package/src/python/mcp_server/text.py +104 -0
- package/src/python/mcp_server/utils/__init__.py +1 -0
- package/src/python/mcp_server/utils/httpx_loader.py +14 -0
- package/src/python/mcp_server/utils/increment.py +7 -0
- package/src/python/mcp_server/utils/sha256_text.py +8 -0
- package/src/python/mcp_server/utils/unique_strings.py +15 -0
- package/src/python/mcp_server/weaviate.py +182 -0
- package/uv.lock +2012 -0
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import re
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
from mcp_server.models import MarkdownSection
|
|
8
|
+
from mcp_server.text import approximate_tokens, detected_record_id, slugless
|
|
9
|
+
from mcp_server.utils.unique_strings import unique_strings
|
|
10
|
+
|
|
11
|
+
# ATX heading line: group 1 is the run of one to six "#" marks, group 2 is the
# heading text with surrounding whitespace excluded.
HEADING_RE = re.compile(r"^(#{1,6})\s+(.+?)\s*$")

# A line that starts (after optional indentation) with a backtick fence marker.
CODE_FENCE_RE = re.compile(r"^\s*```")

# Section bodies at or below this approximate token count are kept whole.
MAX_SECTION_TOKENS = 1_200

# Soft budget used when packing the blocks of an oversized body into chunks.
TARGET_SECTION_TOKENS = 850
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def markdown_sections(path: Path, *, root: Path) -> list[MarkdownSection]:
    """Split the Markdown file at ``path`` into heading-delimited sections.

    Lines before the first heading are collected under a section named after
    ``path.stem``. A heading line starts a new section and is kept as the first
    line of that section's body. Lines inside backtick code fences are never
    treated as headings. Sections whose body is empty after stripping are
    dropped.

    Args:
        path: Markdown file to read; must be located under ``root``.
        root: Directory that ``source_file`` is reported relative to.

    Returns:
        Sections in document order, with 1-based inclusive line ranges.

    Raises:
        ValueError: If ``path`` is not relative to ``root``.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # pylint: disable=too-many-locals
    relative_path = path.relative_to(root).as_posix()
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale encoding.
    lines = path.read_text(encoding="utf-8").splitlines()
    sections: list[MarkdownSection] = []
    stack: list[str] = []
    current_heading = path.stem
    current_path: tuple[str, ...] = (path.stem,)
    current_start = 1
    current_body: list[str] = []
    in_code = False

    def flush(end_line: int) -> None:
        """Emit the section accumulated so far, unless its body is blank."""
        body = "\n".join(current_body).strip()
        if body:
            sections.append(
                MarkdownSection(
                    source_file=relative_path,
                    section_path=tuple(current_path),
                    heading=current_heading,
                    body=body,
                    start_line=current_start,
                    end_line=end_line,
                )
            )

    for line_number, line in enumerate(lines, start=1):
        if CODE_FENCE_RE.match(line):
            # Fence lines toggle code mode and always stay in the body.
            in_code = not in_code
            current_body.append(line)
            continue
        heading = HEADING_RE.match(line) if not in_code else None
        if heading:
            # Close the previous section on the line just before this heading.
            flush(line_number - 1)
            level = len(heading.group(1))
            text = heading.group(2).strip()
            # Drop headings at the same or a deeper level, then push this one.
            stack = stack[: level - 1]
            stack.append(text)
            current_heading = text
            current_path = tuple(stack)
            current_start = line_number
            current_body = [line]
            continue
        current_body.append(line)

    flush(len(lines))
    return sections
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def split_section_body(body: str) -> list[str]:
    """Break an oversized section body into smaller chunks.

    A body whose approximate token count is within ``MAX_SECTION_TOKENS`` is
    returned unchanged as a single-element list. Otherwise its semantic blocks
    are packed greedily: a new chunk is started whenever adding the next block
    would push a non-empty chunk past ``TARGET_SECTION_TOKENS``. A single block
    is never split, so one chunk may still exceed the target.
    """
    if approximate_tokens(body) <= MAX_SECTION_TOKENS:
        return [body]

    groups: list[list[str]] = [[]]
    running_total = 0
    for piece in semantic_blocks(body):
        cost = approximate_tokens(piece)
        if groups[-1] and running_total + cost > TARGET_SECTION_TOKENS:
            groups.append([])
            running_total = 0
        groups[-1].append(piece)
        running_total += cost

    # Blocks inside a chunk are separated by one blank line.
    joined = ("\n\n".join(group).strip() for group in groups if group)
    return [text for text in joined if text]
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def semantic_blocks(body: str) -> list[str]:
    """Split ``body`` into blank-line separated blocks, keeping fences intact.

    Blank lines outside a backtick code fence end the current block and are
    discarded. Blank lines inside a fence stay part of the block, so a fenced
    snippet is never divided. Each returned block is stripped of surrounding
    whitespace.
    """
    finished: list[str] = []
    pending: list[str] = []
    inside_fence = False

    for raw in body.splitlines():
        is_fence = CODE_FENCE_RE.match(raw) is not None
        if is_fence:
            inside_fence = not inside_fence
        is_separator = not is_fence and not inside_fence and not raw.strip()
        if not is_separator:
            pending.append(raw)
        elif pending:
            finished.append("\n".join(pending).strip())
            pending = []

    if pending:
        finished.append("\n".join(pending).strip())
    return finished
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def infer_markdown_rule_family(section: MarkdownSection) -> str:
    """Guess a rule family from the words in a section's heading path.

    The heading path is joined, lower-cased, and checked for keyword
    substrings in priority order; the first family with a hit wins. Sections
    with no hit are classified as ``"documentation"``.

    Args:
        section: Section whose ``section_path`` is inspected.

    Returns:
        One of ``"linting"``, ``"comments"``, ``"tests"``, ``"functions"``,
        ``"naming"`` or ``"documentation"``.
    """
    haystack = " ".join(section.section_path).lower()
    # Order matters: earlier rows take precedence over later ones.
    keyword_families = (
        (("eslint", "ruff", "lint"), "linting"),
        (("comment",), "comments"),
        (("test",), "tests"),
        (("function", "argument"), "functions"),
        # "naming" does not contain the substring "name", so it is listed
        # explicitly; otherwise a "Naming" heading would be missed.
        (("name", "naming"), "naming"),
    )
    for needles, family in keyword_families:
        if any(needle in haystack for needle in needles):
            return family
    return "documentation"
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def markdown_aliases(section: MarkdownSection) -> tuple[str, ...]:
    """Collect de-duplicated search aliases for a Markdown section.

    Candidates are the section heading, every entry of its heading path, and
    the record id detected in the heading (when there is one). Candidates that
    are blank are skipped; the rest are passed through ``slugless`` and then
    de-duplicated with ``unique_strings``.
    """
    candidates = [section.heading]
    candidates.extend(section.section_path)
    if detected := detected_record_id(section.heading):
        candidates.append(detected)

    normalized = []
    for candidate in candidates:
        if candidate.strip():
            normalized.append(slugless(candidate))
    return tuple(unique_strings(normalized))
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
from datetime import UTC, datetime
|
|
6
|
+
from typing import Any, Literal
|
|
7
|
+
|
|
8
|
+
# Identifier written into every chunk payload as "chunkerVersion".
CHUNKER_VERSION = "clean-code-semantic-v1"

# Embedding defaults written into every chunk payload.
DEFAULT_EMBEDDING_MODEL = "BAAI/bge-small-en-v1.5"
DEFAULT_EMBEDDING_PROVIDER = "fastembed/cpu"

# Loosely typed JSON-style mapping used for payloads and search rows.
JsonDict = dict[str, Any]

# The closed set of confidence labels.
Confidence = Literal["high", "medium", "low"]
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@dataclass(frozen=True, slots=True)
|
|
17
|
+
class CleanCodeChunk: # pylint: disable=too-many-instance-attributes
|
|
18
|
+
chunk_id: str
|
|
19
|
+
object_id: str
|
|
20
|
+
source_file: str
|
|
21
|
+
source_kind: str
|
|
22
|
+
record_id: str
|
|
23
|
+
title: str
|
|
24
|
+
topic: str
|
|
25
|
+
section_path: tuple[str, ...]
|
|
26
|
+
chunk_kind: str
|
|
27
|
+
chunk_index: int
|
|
28
|
+
rule_family: str
|
|
29
|
+
lintability: str
|
|
30
|
+
aliases: tuple[str, ...]
|
|
31
|
+
languages: tuple[str, ...]
|
|
32
|
+
lint_candidates: tuple[str, ...]
|
|
33
|
+
content_text: str
|
|
34
|
+
embedding_text: str
|
|
35
|
+
display_text: str
|
|
36
|
+
text_hash: str
|
|
37
|
+
|
|
38
|
+
@property
|
|
39
|
+
def properties(self) -> JsonDict:
|
|
40
|
+
return {
|
|
41
|
+
"chunkId": self.chunk_id,
|
|
42
|
+
"sourceFile": self.source_file,
|
|
43
|
+
"sourceKind": self.source_kind,
|
|
44
|
+
"recordId": self.record_id,
|
|
45
|
+
"title": self.title,
|
|
46
|
+
"topic": self.topic,
|
|
47
|
+
"sectionPath": list(self.section_path),
|
|
48
|
+
"chunkKind": self.chunk_kind,
|
|
49
|
+
"chunkIndex": self.chunk_index,
|
|
50
|
+
"ruleFamily": self.rule_family,
|
|
51
|
+
"lintability": self.lintability,
|
|
52
|
+
"aliases": list(self.aliases),
|
|
53
|
+
"languages": list(self.languages),
|
|
54
|
+
"lintCandidates": list(self.lint_candidates),
|
|
55
|
+
"contentText": self.content_text,
|
|
56
|
+
"embeddingText": self.embedding_text,
|
|
57
|
+
"displayText": self.display_text,
|
|
58
|
+
"textHash": self.text_hash,
|
|
59
|
+
"chunkerVersion": CHUNKER_VERSION,
|
|
60
|
+
"embeddingModel": DEFAULT_EMBEDDING_MODEL,
|
|
61
|
+
"embeddingProvider": DEFAULT_EMBEDDING_PROVIDER,
|
|
62
|
+
"createdAt": datetime.now(UTC).isoformat(),
|
|
63
|
+
}
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
@dataclass(frozen=True, slots=True)
|
|
67
|
+
class MarkdownSection:
|
|
68
|
+
source_file: str
|
|
69
|
+
section_path: tuple[str, ...]
|
|
70
|
+
heading: str
|
|
71
|
+
body: str
|
|
72
|
+
start_line: int
|
|
73
|
+
end_line: int
|
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from mcp_server.corpus import build_chunks
|
|
5
|
+
from mcp_server.models import DEFAULT_EMBEDDING_MODEL, JsonDict
|
|
6
|
+
from mcp_server.ranking_scoring import (
|
|
7
|
+
dedupe_pattern_results,
|
|
8
|
+
distance_for,
|
|
9
|
+
row_matches_filters,
|
|
10
|
+
score_pattern_row,
|
|
11
|
+
)
|
|
12
|
+
from mcp_server.text import (
|
|
13
|
+
lexical_score,
|
|
14
|
+
query_tokens,
|
|
15
|
+
)
|
|
16
|
+
from mcp_server.weaviate import COLLECTION_NAME, search_chunks
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def search_pattern_records(  # noqa: PLR0913  # pylint: disable=too-many-arguments,too-many-locals
    *,
    query: str,
    url: str,
    collection_name: str = COLLECTION_NAME,
    model_name: str = DEFAULT_EMBEDDING_MODEL,
    limit: int = 8,
    language: str = "any",
    rule_families: tuple[str, ...] = (),
    topics: tuple[str, ...] = (),
    lintability: tuple[str, ...] = (),
    source_kinds: tuple[str, ...] = ("clean_code_pattern",),
) -> JsonDict:
    """Run a vector search and rank the hits with ``rank_pattern_rows``.

    The vector search is asked for more rows than ``limit`` (four times as
    many, and never fewer than 25). The filter arguments are forwarded
    unchanged to the ranking step, whose result is returned as-is.
    """
    candidates = search_chunks(
        query=query,
        url=url,
        collection_name=collection_name,
        model_name=model_name,
        limit=max(limit * 4, 25),
    )
    ranking_options = {
        "limit": limit,
        "language": language,
        "rule_families": rule_families,
        "topics": topics,
        "lintability": lintability,
        "source_kinds": source_kinds,
    }
    return rank_pattern_rows(query=query, vector_rows=candidates, **ranking_options)
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def rank_pattern_rows(  # noqa: PLR0913  # pylint: disable=too-many-arguments,too-many-locals
    *,
    query: str,
    vector_rows: list[JsonDict],
    limit: int = 8,
    language: str = "any",
    rule_families: tuple[str, ...] = (),
    topics: tuple[str, ...] = (),
    lintability: tuple[str, ...] = (),
    source_kinds: tuple[str, ...] = ("clean_code_pattern",),
) -> JsonDict:
    """Merge vector hits with local keyword hits, then filter, score and rank.

    Rows are merged per ``chunkId`` (rows without one are ignored), filtered,
    scored, sorted by descending score, de-duplicated, and cut to ``limit``.

    Returns:
        A mapping with the ``query``, the ``filters_applied``, the ranked
        ``results`` and ``no_strong_match``, which is true when there are no
        results or the top result's confidence is not ``"high"``.
    """
    merged: dict[str, JsonDict] = {}
    for candidate in (*vector_rows, *local_keyword_rows(query)):
        key = str(candidate.get("chunkId", ""))
        if not key:
            continue
        merged[key] = merge_search_rows(merged.get(key), candidate)

    filters: JsonDict = {
        "language": language,
        "rule_families": rule_families,
        "topics": topics,
        "lintability": lintability,
        "source_kinds": source_kinds,
    }
    scored = [
        score_pattern_row(candidate, query=query, filters=filters)
        for candidate in merged.values()
        if row_matches_filters(candidate, filters)
    ]
    scored.sort(key=lambda item: float(item["score"]), reverse=True)

    top = dedupe_pattern_results(scored)[:limit]
    has_strong_match = bool(top) and top[0]["confidence"] == "high"
    return {
        "query": query,
        "filters_applied": filters,
        "results": top,
        "no_strong_match": not has_strong_match,
    }
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def local_keyword_rows(query: str) -> list[JsonDict]:
    """Return payload rows for locally built chunks that match ``query``.

    A chunk matches when ``lexical_score`` of the query terms against its
    metadata plus the first 600 characters of its content is positive. Each
    row is the chunk's ``properties`` extended with ``_additional.id``; no
    distance is attached. An empty list is returned when the query has no
    tokens.
    """
    terms = set(query_tokens(query))
    if not terms:
        return []

    def searchable_text(chunk) -> str:
        """Join the fields that keyword matching looks at."""
        fields = (
            chunk.record_id,
            chunk.title,
            chunk.topic,
            chunk.rule_family,
            chunk.lintability,
            " ".join(chunk.aliases),
            " ".join(chunk.lint_candidates),
            chunk.content_text[:600],
        )
        return " ".join(fields)

    return [
        chunk.properties | {"_additional": {"id": chunk.object_id}}
        for chunk in build_chunks()
        if lexical_score(terms, searchable_text(chunk)) > 0
    ]
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def merge_search_rows(existing: JsonDict | None, row: JsonDict) -> JsonDict:
    """Choose which of two rows for the same chunk to keep.

    With no existing row, or an existing row that has no distance, the new row
    wins. Otherwise the new row wins only when it carries a strictly smaller
    distance; in every other case the existing row is kept.
    """
    if existing is None:
        return row

    kept_distance, new_distance = distance_for(existing), distance_for(row)
    if kept_distance is None:
        return row

    row_is_closer = new_distance is not None and new_distance < kept_distance
    return row if row_is_closer else existing
|
|
@@ -0,0 +1,232 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from mcp_server.models import Confidence, JsonDict
|
|
5
|
+
from mcp_server.text import (
|
|
6
|
+
lexical_score,
|
|
7
|
+
query_tokens,
|
|
8
|
+
searchable_row_text,
|
|
9
|
+
semantic_similarity,
|
|
10
|
+
)
|
|
11
|
+
from mcp_server.utils.unique_strings import unique_strings
|
|
12
|
+
|
|
13
|
+
# Score needed for "medium" (instead of "low") when a conservative-context
# reason is attached to a result.
CONSERVATIVE_CONTEXT_THRESHOLD = 0.80
# Score at or above which a result is "high" confidence.
HIGH_CONFIDENCE_THRESHOLD = 0.72
# Lower "high" bar used when exact terms matched and the semantic score is positive.
EXACT_HIGH_CONFIDENCE_THRESHOLD = 0.40
# Score at or above which a result is "medium" confidence.
MEDIUM_CONFIDENCE_THRESHOLD = 0.45

# Terms that do not count as meaningful in exact-match checks.
# NOTE(review): "thi" presumably covers a tokenizer that strips a trailing "s"
# from "this" -- confirm against query_tokens.
EXACT_MATCH_STOPWORDS = {
    "a", "an", "and", "be", "by", "code", "do", "for", "in", "is",
    "it", "of", "on", "or", "the", "thi", "this", "to", "when", "with",
}

# Normalized titles/aliases too generic to count as an exact match.
NON_SPECIFIC_EXACT_ALIASES = {
    "clean code",
    "code smell",
    "comments",
    "error handling",
    "function smell",
    "functions",
    "function design",
    "planning guidance",
    "refactoring rule",
}

# A query made only of these terms is treated as a broad category query.
BROAD_CATEGORY_QUERY_TERMS = {
    "clean", "code", "comment", "comments", "error", "function", "handling",
    "guidance", "planning", "python", "refactoring", "rule", "smell", "typescript",
}

# A query made only of these terms is treated as vague.
VAGUE_QUERY_TERMS = {"stuff", "thing", "things"}
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def row_matches_filters(row: JsonDict, filters: JsonDict) -> bool:
    """Tell whether a search row passes every active filter.

    Args:
        row: Search row keyed by camelCase property names.
        filters: Mapping with the keys ``source_kinds``, ``language``,
            ``rule_families``, ``topics`` and ``lintability``. Empty
            collections, and the languages ``""`` and ``"any"``, disable the
            corresponding check. The language ``"both"`` requires the row to
            list both ``"typescript"`` and ``"python"``.

    Returns:
        True when the row satisfies all active filters.
    """
    source_kinds = tuple(str(value) for value in filters["source_kinds"])
    if source_kinds and str(row.get("sourceKind", "")) not in source_kinds:
        return False

    language = str(filters["language"])
    # `or ()` also covers a key that is present with a null value, which a
    # plain `.get(key, [])` default would not and which would fail to iterate.
    languages = {str(value) for value in row.get("languages") or ()}
    if language == "both":
        if not {"typescript", "python"} <= languages:
            return False
    elif language not in {"", "any"} and language not in languages:
        return False

    rule_families = {str(value) for value in filters["rule_families"]}
    if rule_families and str(row.get("ruleFamily", "")) not in rule_families:
        return False

    topics = {str(value) for value in filters["topics"]}
    if topics and str(row.get("topic", "")) not in topics:
        return False

    lintability = {str(value) for value in filters["lintability"]}
    return not lintability or str(row.get("lintability", "")) in lintability
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def score_pattern_row(row: JsonDict, *, query: str, filters: JsonDict) -> JsonDict:
    """Score one search row against ``query`` and build its result entry.

    The score is ``0.60 * semantic + 0.25 * keyword + metadata boost - context
    penalty``, clamped to ``[0.0, 1.0]``. The returned mapping carries the
    row's metadata under snake_case keys, the rounded score, a confidence
    label, the match reasons, a whitespace-collapsed snippet of at most 500
    characters, source details and the score breakdown.
    """
    terms = set(query_tokens(query))
    distance = distance_for(row)
    semantic_score = semantic_similarity(distance)
    keyword_score = lexical_score(terms, searchable_row_text(row))
    metadata_boost, boost_reasons = metadata_boost_and_reasons(row, filters, terms)
    context_penalty, penalty_reasons = context_penalty_and_reasons(terms)
    reasons = [*boost_reasons, *penalty_reasons]

    blended = semantic_score * 0.60 + keyword_score * 0.25 + metadata_boost - context_penalty
    score = max(0.0, min(1.0, blended))

    collapsed_content = " ".join(str(row.get("contentText", "")).split())
    source = {
        "source_file": row.get("sourceFile", ""),
        "source_kind": row.get("sourceKind", ""),
        "text_hash": row.get("textHash", ""),
    }
    breakdown = {
        "semantic_score": round(semantic_score, 4),
        "keyword_score": round(keyword_score, 4),
        "metadata_boost": round(metadata_boost, 4),
        "context_penalty": round(context_penalty, 4),
    }
    return {
        "pattern_id": row.get("recordId", ""),
        "chunk_id": row.get("chunkId", ""),
        "title": row.get("title", ""),
        "topic": row.get("topic", ""),
        "rule_family": row.get("ruleFamily", ""),
        "lintability": row.get("lintability", ""),
        "languages": row.get("languages", []),
        "aliases": row.get("aliases", []),
        "lint_candidates": row.get("lintCandidates", []),
        "source_kind": row.get("sourceKind", ""),
        "chunk_kind": row.get("chunkKind", ""),
        "section_path": row.get("sectionPath", []),
        "score": round(score, 4),
        "confidence": confidence_for(score, reasons, semantic_score=semantic_score),
        "distance": distance,
        "match_reasons": reasons,
        "snippet": collapsed_content[:500],
        "source": source,
        "score_breakdown": breakdown,
    }
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def dedupe_pattern_results(rows: list[JsonDict]) -> list[JsonDict]:
    """Keep the best-scoring row per pattern, ordered by descending score.

    Rows are grouped by ``pattern_id``, falling back to ``chunk_id`` when the
    pattern id is missing or empty. Within a group the earliest row wins a tie,
    because only a strictly higher score replaces it.
    """

    def score_of(item: JsonDict) -> float:
        return float(item["score"])

    best: dict[str, JsonDict] = {}
    for candidate in rows:
        key = str(candidate.get("pattern_id") or candidate.get("chunk_id"))
        incumbent = best.get(key)
        if incumbent is None or score_of(candidate) > score_of(incumbent):
            best[key] = candidate
    return sorted(best.values(), key=score_of, reverse=True)
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def metadata_boost_and_reasons(
    row: JsonDict,
    filters: JsonDict,
    query_terms: set[str],
) -> tuple[float, list[str]]:
    """Compute the metadata boost for a row and explain where it came from.

    The boost grows by 0.05 each for a canonical clean-code pattern, a matched
    language filter and a matched lintability filter, and by 0.15 when exact
    terms matched. The total is capped at 0.30.

    Args:
        row: Search row keyed by camelCase property names.
        filters: Active filters; ``language`` and ``lintability`` are read.
        query_terms: Tokenised query terms.

    Returns:
        The capped boost and the list of reasons. When nothing matched the
        only reason is ``"semantic similarity"``.
    """
    boost = 0.0
    reasons: list[str] = []

    if str(row.get("sourceKind", "")) == "clean_code_pattern":
        boost += 0.05
        reasons.append("canonical clean-code pattern")

    language = str(filters["language"])
    # `or ()` also covers a key that is present with a null value, which a
    # plain `.get(key, [])` default would not and which would fail to iterate.
    languages = {str(value) for value in row.get("languages") or ()}
    if language not in {"", "any"} and (language == "both" or language in languages):
        boost += 0.05
        reasons.append(f"language matched: {language}")

    lintability = str(row.get("lintability", ""))
    if lintability and lintability in set(filters["lintability"]):
        boost += 0.05
        reasons.append(f"lintability matched: {lintability}")

    exact_terms = exact_match_terms(row, query_terms)
    if exact_terms:
        boost += 0.15
        # Only the first four matched values are listed.
        reasons.append(f"matched exact terms: {', '.join(exact_terms[:4])}")

    if not reasons:
        reasons.append("semantic similarity")
    return min(boost, 0.30), reasons
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
def context_penalty_and_reasons(query_terms: set[str]) -> tuple[float, list[str]]:
    """Return the conservative-context penalty for a query, with reasons.

    A flat penalty of 0.35 applies when at least one of these holds: the query
    mentions generated/fixture/test terms; its non-stopword terms are all
    broad category terms; they are all vague terms; or the query contains
    "todo", "tracked" and "issue" together. Otherwise the penalty is 0.0 and
    the reason list is empty.
    """
    notes: list[str] = []

    hits = sorted(query_terms & {"generated", "fixture", "fixtures", "test", "tests"})
    if hits:
        notes.append(f"conservative context: {', '.join(hits)}")

    specific_terms = query_terms - EXACT_MATCH_STOPWORDS
    if specific_terms:
        if specific_terms <= BROAD_CATEGORY_QUERY_TERMS:
            notes.append("conservative context: broad category query")
        if specific_terms <= VAGUE_QUERY_TERMS:
            notes.append("conservative context: vague query")

    if query_terms >= {"todo", "tracked", "issue"}:
        notes.append("conservative context: compliant tracked TODO")

    penalty = 0.35 if notes else 0.0
    return penalty, notes
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
def exact_match_terms(row: JsonDict, query_terms: set[str]) -> list[str]:
    """List the row's identifying strings that the query fully contains.

    The record id, title, aliases and lint candidates are checked. A value
    matches when every one of its tokens appears in ``query_terms`` and at
    least one token is not a stopword. Values whose normalized form is a
    non-specific alias are skipped.

    Args:
        row: Search row keyed by camelCase property names.
        query_terms: Tokenised query terms.

    Returns:
        The matched values in their original form, de-duplicated with
        ``unique_strings``.
    """
    # `or ()` also covers list properties that are present with a null value,
    # which a plain `.get(key, [])` default would not.
    candidates = [
        str(row.get("recordId", "")),
        str(row.get("title", "")),
        *(str(value) for value in row.get("aliases") or ()),
        *(str(value) for value in row.get("lintCandidates") or ()),
    ]
    matched: list[str] = []
    for value in candidates:
        # Tokenise once and reuse the result for both checks below.
        tokens = list(query_tokens(value))
        if " ".join(tokens) in NON_SPECIFIC_EXACT_ALIASES:
            continue
        terms = set(tokens)
        if terms - EXACT_MATCH_STOPWORDS and terms <= query_terms:
            matched.append(value)
    return unique_strings(matched)
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
def distance_for(row: JsonDict) -> float | None:
    """Read the numeric distance stored under ``row["_additional"]``.

    Returns the distance as a float, or None when ``_additional`` is not a
    dict or its ``distance`` entry is not an int or float.
    """
    extras = row.get("_additional")
    raw = extras.get("distance") if isinstance(extras, dict) else None
    if isinstance(raw, (int, float)):
        return float(raw)
    return None
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
def confidence_for(score: float, match_reasons: list[str], *, semantic_score: float) -> Confidence:
    """Map a score and its match reasons to a confidence label.

    Any "conservative context" reason caps the label at "medium", and only
    when the score reaches ``CONSERVATIVE_CONTEXT_THRESHOLD``. Otherwise the
    label is "high" for a score at or above ``HIGH_CONFIDENCE_THRESHOLD``, or
    at or above ``EXACT_HIGH_CONFIDENCE_THRESHOLD`` when exact terms matched
    and the semantic score is positive. It is "medium" from
    ``MEDIUM_CONFIDENCE_THRESHOLD`` upwards and "low" below that.
    """

    def has_reason(prefix: str) -> bool:
        return any(reason.startswith(prefix) for reason in match_reasons)

    if has_reason("conservative context"):
        return "low" if score < CONSERVATIVE_CONTEXT_THRESHOLD else "medium"

    if score >= HIGH_CONFIDENCE_THRESHOLD:
        return "high"
    if has_reason("matched exact terms") and semantic_score > 0 and score >= EXACT_HIGH_CONFIDENCE_THRESHOLD:
        return "high"
    if score >= MEDIUM_CONFIDENCE_THRESHOLD:
        return "medium"
    return "low"
|