clean-code-tools 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77) hide show
  1. package/README.md +66 -0
  2. package/configs/eslint.clean-code.recommended.mjs +211 -0
  3. package/configs/python.clean-code.pyproject.toml +143 -0
  4. package/data/clean-code-patterns.jsonl +264 -0
  5. package/data/vector-record.schema.json +77 -0
  6. package/docs/README.md +29 -0
  7. package/docs/eslint-custom-rules.md +74 -0
  8. package/docs/eslint-recommended-config.md +87 -0
  9. package/docs/fastmcp-local-server.md +104 -0
  10. package/docs/publishing.md +125 -0
  11. package/docs/python-lint-recommended-config.md +57 -0
  12. package/docs/python-pylint-custom-rules.md +77 -0
  13. package/docs/semantic-weaviate.md +80 -0
  14. package/docs/static-trigger-semantic-review.md +97 -0
  15. package/evals/clean-code-retrieval.jsonl +13 -0
  16. package/ops/dev/weaviate/README.md +34 -0
  17. package/ops/dev/weaviate/compose.yaml +34 -0
  18. package/ops/dev/weaviate/smoke.sh +28 -0
  19. package/package.json +96 -0
  20. package/pyproject.toml +303 -0
  21. package/sample-apps/README.md +40 -0
  22. package/sample-apps/python-app/pyproject.toml +113 -0
  23. package/sample-apps/python-app/src/clean_pricing.py +10 -0
  24. package/sample-apps/python-app/src/smelly_pricing.py +8 -0
  25. package/sample-apps/ts-backend/eslint.config.mjs +3 -0
  26. package/sample-apps/ts-backend/package.json +18 -0
  27. package/sample-apps/ts-backend/src/clean-handler.ts +19 -0
  28. package/sample-apps/ts-backend/src/smelly-handler.ts +29 -0
  29. package/sample-apps/ts-backend/tsconfig.json +9 -0
  30. package/sample-apps/ts-frontend/eslint.config.mjs +3 -0
  31. package/sample-apps/ts-frontend/package.json +18 -0
  32. package/sample-apps/ts-frontend/src/CleanWidget.tsx +18 -0
  33. package/sample-apps/ts-frontend/src/SmellyWidget.tsx +27 -0
  34. package/sample-apps/ts-frontend/tsconfig.json +10 -0
  35. package/scripts/_mcp_app.py +21 -0
  36. package/scripts/check_clean_code_review_candidates.py +302 -0
  37. package/scripts/check_fastmcp_server.py +106 -0
  38. package/scripts/check_packages.py +137 -0
  39. package/scripts/check_python_config.py +130 -0
  40. package/scripts/check_repo_python_lint.py +46 -0
  41. package/scripts/check_retrieval_evals.py +132 -0
  42. package/scripts/check_sample_apps.py +169 -0
  43. package/scripts/check_semantic_search_tooling.py +102 -0
  44. package/scripts/clean_code_eslint_triggers.py +272 -0
  45. package/scripts/clean_code_mcp_server.py +7 -0
  46. package/scripts/clean_code_python_triggers.py +318 -0
  47. package/scripts/clean_code_review_candidates.py +291 -0
  48. package/scripts/clean_code_review_io.py +36 -0
  49. package/scripts/clean_code_review_models.py +43 -0
  50. package/scripts/clean_code_semantic.py +27 -0
  51. package/scripts/set_package_versions.py +82 -0
  52. package/scripts/weaviate_ingest_clean_code.py +44 -0
  53. package/scripts/weaviate_search_clean_code.py +51 -0
  54. package/skills/clean-code-mcp-reviewer/SKILL.md +209 -0
  55. package/skills/clean-code-mcp-reviewer/evals/evals.json +30 -0
  56. package/src/js/eslint-plugin-clean-code.mjs +758 -0
  57. package/src/python/clean_code_tools_pylint/__init__.py +14 -0
  58. package/src/python/clean_code_tools_pylint/ast_checker.py +122 -0
  59. package/src/python/clean_code_tools_pylint/comments.py +83 -0
  60. package/src/python/clean_code_tools_pylint/helpers.py +196 -0
  61. package/src/python/mcp_server/__init__.py +1 -0
  62. package/src/python/mcp_server/corpus.py +160 -0
  63. package/src/python/mcp_server/markdown.py +126 -0
  64. package/src/python/mcp_server/models.py +73 -0
  65. package/src/python/mcp_server/ranking.py +125 -0
  66. package/src/python/mcp_server/ranking_scoring.py +232 -0
  67. package/src/python/mcp_server/semantic.py +192 -0
  68. package/src/python/mcp_server/server.py +235 -0
  69. package/src/python/mcp_server/server_payloads.py +83 -0
  70. package/src/python/mcp_server/text.py +104 -0
  71. package/src/python/mcp_server/utils/__init__.py +1 -0
  72. package/src/python/mcp_server/utils/httpx_loader.py +14 -0
  73. package/src/python/mcp_server/utils/increment.py +7 -0
  74. package/src/python/mcp_server/utils/sha256_text.py +8 -0
  75. package/src/python/mcp_server/utils/unique_strings.py +15 -0
  76. package/src/python/mcp_server/weaviate.py +182 -0
  77. package/uv.lock +2012 -0
@@ -0,0 +1,192 @@
1
+ #!/usr/bin/env python3
2
+ from __future__ import annotations
3
+
4
+ from mcp_server.corpus import (
5
+ CHUNK_ID_NAMESPACE,
6
+ MARKDOWN_SOURCES,
7
+ PATTERN_RECORDS,
8
+ ROOT,
9
+ build_chunks,
10
+ get_pattern_record,
11
+ load_pattern_records,
12
+ markdown_chunks,
13
+ object_id_for,
14
+ pattern_record_chunks,
15
+ )
16
+ from mcp_server.markdown import (
17
+ CODE_FENCE_RE,
18
+ HEADING_RE,
19
+ MAX_SECTION_TOKENS,
20
+ TARGET_SECTION_TOKENS,
21
+ MarkdownSection,
22
+ infer_markdown_rule_family,
23
+ markdown_aliases,
24
+ markdown_sections,
25
+ semantic_blocks,
26
+ split_section_body,
27
+ )
28
+ from mcp_server.models import (
29
+ CHUNKER_VERSION,
30
+ DEFAULT_EMBEDDING_MODEL,
31
+ DEFAULT_EMBEDDING_PROVIDER,
32
+ CleanCodeChunk,
33
+ Confidence,
34
+ JsonDict,
35
+ )
36
+ from mcp_server.ranking import (
37
+ local_keyword_rows,
38
+ merge_search_rows,
39
+ rank_pattern_rows,
40
+ search_pattern_records,
41
+ )
42
+ from mcp_server.ranking_scoring import (
43
+ BROAD_CATEGORY_QUERY_TERMS,
44
+ CONSERVATIVE_CONTEXT_THRESHOLD,
45
+ EXACT_HIGH_CONFIDENCE_THRESHOLD,
46
+ EXACT_MATCH_STOPWORDS,
47
+ HIGH_CONFIDENCE_THRESHOLD,
48
+ MEDIUM_CONFIDENCE_THRESHOLD,
49
+ NON_SPECIFIC_EXACT_ALIASES,
50
+ VAGUE_QUERY_TERMS,
51
+ confidence_for,
52
+ context_penalty_and_reasons,
53
+ dedupe_pattern_results,
54
+ distance_for,
55
+ exact_match_terms,
56
+ metadata_boost_and_reasons,
57
+ row_matches_filters,
58
+ score_pattern_row,
59
+ )
60
+ from mcp_server.text import (
61
+ CC_ID_RE,
62
+ PHRASE_BONUS_MIN_OVERLAP,
63
+ PLURAL_NORMALIZATION_MIN_LENGTH,
64
+ SLUG_RE,
65
+ WORD_RE,
66
+ approximate_tokens,
67
+ clean_alias,
68
+ clean_topic,
69
+ clean_topic_text,
70
+ detected_record_id,
71
+ languages_in_text,
72
+ lexical_score,
73
+ lint_candidates_in_text,
74
+ normalize_token,
75
+ query_tokens,
76
+ searchable_row_text,
77
+ semantic_similarity,
78
+ slug,
79
+ slugless,
80
+ )
81
+ from mcp_server.utils.httpx_loader import require_httpx
82
+ from mcp_server.utils.sha256_text import sha256_text
83
+ from mcp_server.utils.unique_strings import unique_strings
84
+ from mcp_server.weaviate import (
85
+ COLLECTION_NAME,
86
+ DEFAULT_BATCH_SIZE,
87
+ DEFAULT_WEAVIATE_URL,
88
+ FASTEMBED_INSTALL_MESSAGE,
89
+ GRAPHQL_NAME_RE,
90
+ HTTP_NOT_FOUND,
91
+ VECTOR_NAME,
92
+ batch_failures,
93
+ build_search_graphql_query,
94
+ create_schema_payload,
95
+ embed_query,
96
+ embed_texts,
97
+ execute_graphql_search,
98
+ ingest_chunks,
99
+ is_successful_batch_row,
100
+ reset_collection,
101
+ search_chunks,
102
+ search_rows_from_payload,
103
+ )
104
+
105
# Public surface of this facade module: every name imported above is
# re-exported here, so `from mcp_server.semantic import *` and attribute
# access such as `semantic.build_chunks` resolve through a single module.
__all__ = [
    "BROAD_CATEGORY_QUERY_TERMS",
    "CC_ID_RE",
    "CHUNKER_VERSION",
    "CHUNK_ID_NAMESPACE",
    "CODE_FENCE_RE",
    "COLLECTION_NAME",
    "CONSERVATIVE_CONTEXT_THRESHOLD",
    "DEFAULT_BATCH_SIZE",
    "DEFAULT_EMBEDDING_MODEL",
    "DEFAULT_EMBEDDING_PROVIDER",
    "DEFAULT_WEAVIATE_URL",
    "EXACT_HIGH_CONFIDENCE_THRESHOLD",
    "EXACT_MATCH_STOPWORDS",
    "FASTEMBED_INSTALL_MESSAGE",
    "GRAPHQL_NAME_RE",
    "HEADING_RE",
    "HIGH_CONFIDENCE_THRESHOLD",
    "HTTP_NOT_FOUND",
    "MARKDOWN_SOURCES",
    "MAX_SECTION_TOKENS",
    "MEDIUM_CONFIDENCE_THRESHOLD",
    "NON_SPECIFIC_EXACT_ALIASES",
    "PATTERN_RECORDS",
    "PHRASE_BONUS_MIN_OVERLAP",
    "PLURAL_NORMALIZATION_MIN_LENGTH",
    "ROOT",
    "SLUG_RE",
    "TARGET_SECTION_TOKENS",
    "VAGUE_QUERY_TERMS",
    "VECTOR_NAME",
    "WORD_RE",
    "CleanCodeChunk",
    "Confidence",
    "JsonDict",
    "MarkdownSection",
    "approximate_tokens",
    "batch_failures",
    "build_chunks",
    "build_search_graphql_query",
    "clean_alias",
    "clean_topic",
    "clean_topic_text",
    "confidence_for",
    "context_penalty_and_reasons",
    "create_schema_payload",
    "dedupe_pattern_results",
    "detected_record_id",
    "distance_for",
    "embed_query",
    "embed_texts",
    "exact_match_terms",
    "execute_graphql_search",
    "get_pattern_record",
    "infer_markdown_rule_family",
    "ingest_chunks",
    "is_successful_batch_row",
    "languages_in_text",
    "lexical_score",
    "lint_candidates_in_text",
    "load_pattern_records",
    "local_keyword_rows",
    "markdown_aliases",
    "markdown_chunks",
    "markdown_sections",
    "merge_search_rows",
    "metadata_boost_and_reasons",
    "normalize_token",
    "object_id_for",
    "pattern_record_chunks",
    "query_tokens",
    "rank_pattern_rows",
    "require_httpx",
    "reset_collection",
    "row_matches_filters",
    "score_pattern_row",
    "search_chunks",
    "search_pattern_records",
    "search_rows_from_payload",
    "searchable_row_text",
    "semantic_blocks",
    "semantic_similarity",
    "sha256_text",
    "slug",
    "slugless",
    "split_section_body",
    "unique_strings",
]
@@ -0,0 +1,235 @@
1
+ #!/usr/bin/env python3
2
+ from __future__ import annotations
3
+
4
+ import argparse
5
+ import json
6
+ from typing import Any
7
+
8
+ from mcp_server import semantic
9
+ from mcp_server.server_payloads import (
10
+ default_lint_targets,
11
+ facet_counts,
12
+ lint_rule_recommendation,
13
+ search_result,
14
+ )
15
+
16
# Largest `limit` value the search tools in this module accept.
MAX_SEARCH_LIMIT = 25
# Module-level aliases for names provided by `mcp_server.semantic`. The tool
# and resource functions below use these aliases as parameter defaults and
# call targets instead of reaching into `semantic` at every use.
COLLECTION_NAME = semantic.COLLECTION_NAME
DEFAULT_EMBEDDING_MODEL = semantic.DEFAULT_EMBEDDING_MODEL
DEFAULT_WEAVIATE_URL = semantic.DEFAULT_WEAVIATE_URL
build_chunks = semantic.build_chunks
create_schema_payload = semantic.create_schema_payload
get_pattern_record = semantic.get_pattern_record
search_pattern_records = semantic.search_pattern_records
search_chunks = semantic.search_chunks
25
+
26
+
27
# FastMCP is imported behind a guard: when the package is missing, importing
# this module exits with an install hint instead of a bare ImportError.
try:
    from fastmcp import FastMCP
except ImportError as exc:  # pragma: no cover - exercised by CLI users without deps
    raise SystemExit(
        "Install FastMCP to run the server: python3 -m pip install 'fastmcp>=2.0.0'"
    ) from exc


# Server instance that the `@mcp.resource` / `@mcp.tool` decorators below
# register with, and that `main()` runs.
mcp = FastMCP("clean-code-tools")
36
+
37
+
38
@mcp.resource("clean-code://corpus/summary")
def corpus_summary() -> str:
    """Return a compact summary of the local clean-code semantic corpus."""
    # The result is a JSON string with sorted keys: the total chunk count, a
    # per-chunk_kind tally, and the default collection / embedding model names.
    corpus = build_chunks()
    kind_totals: dict[str, int] = {}
    for item in corpus:
        kind_totals.setdefault(item.chunk_kind, 0)
        kind_totals[item.chunk_kind] += 1
    summary = {
        "chunks": len(corpus),
        "by_kind": kind_totals,
        "default_collection": COLLECTION_NAME,
        "default_embedding_model": DEFAULT_EMBEDDING_MODEL,
    }
    return json.dumps(summary, sort_keys=True)
55
+
56
+
57
@mcp.resource("clean-code://weaviate/schema")
def weaviate_schema() -> str:
    """Return the Weaviate schema payload used by the ingest script."""
    # Serialised as JSON text with sorted keys and a two-space indent.
    schema = create_schema_payload()
    return json.dumps(schema, sort_keys=True, indent=2)
62
+
63
+
64
@mcp.resource("clean-code://patterns/{pattern_id}")
def clean_code_pattern_resource(pattern_id: str) -> str:
    """Return one canonical clean-code pattern by ID."""
    # Validation and lookup are delegated to pattern_by_id; the record is
    # serialised as JSON text with sorted keys and a two-space indent.
    record = pattern_by_id(pattern_id)
    return json.dumps(record, sort_keys=True, indent=2)
69
+
70
+
71
@mcp.tool
def clean_code_corpus_summary() -> dict[str, Any]:
    """Return chunk counts for the local clean-code corpus."""
    # Decodes the JSON text produced by the corpus_summary resource so both
    # entry points report the same data.
    # NOTE(review): this calls the decorated `corpus_summary` name directly,
    # which assumes the decorator leaves it callable - confirm against the
    # pinned FastMCP version.
    summary_json = corpus_summary()
    return json.loads(summary_json)
76
+
77
+
78
@mcp.tool
def clean_code_weaviate_schema() -> dict[str, Any]:
    """Return the Weaviate collection schema used for clean-code search."""
    # Same payload as the weaviate_schema resource, returned as a dict
    # instead of JSON text.
    schema = create_schema_payload()
    return schema
83
+
84
+
85
# Parameters:
#   query        - free-text search string; must contain non-whitespace text.
#   limit        - number of rows to request, from 1 to MAX_SEARCH_LIMIT.
#   weaviate_url - passed to search_chunks as `url`.
#   collection   - passed to search_chunks as `collection_name`.
#   model        - passed to search_chunks as `model_name`.
# Returns one `search_result` dict per row returned by search_chunks.
# Raises ValueError for a blank query or an out-of-range limit.
# The docstring is kept verbatim because FastMCP publishes it as the tool
# description.
@mcp.tool
def search_clean_code(
    query: str,
    limit: int = 8,
    weaviate_url: str = DEFAULT_WEAVIATE_URL,
    collection: str = COLLECTION_NAME,
    model: str = DEFAULT_EMBEDDING_MODEL,
) -> list[dict[str, Any]]:
    """Search the local Weaviate clean-code collection.

    Requires a running Weaviate instance populated with
    `scripts/weaviate_ingest_clean_code.py --reset`.
    """

    if not query.strip():
        raise ValueError("query must not be empty")
    if not 1 <= limit <= MAX_SEARCH_LIMIT:
        # The message is derived from the constant so it cannot drift from the
        # bound that is actually enforced.
        raise ValueError(f"limit must be between 1 and {MAX_SEARCH_LIMIT}")
    rows = search_chunks(
        query=query,
        url=weaviate_url,
        collection_name=collection,
        model_name=model,
        limit=limit,
    )
    return [search_result(row) for row in rows]
111
+
112
+
113
# Parameters:
#   query        - free-text concern; must contain non-whitespace text.
#   limit        - number of results to request, from 1 to MAX_SEARCH_LIMIT.
#   language     - forwarded unchanged to search_pattern_records.
#   rule_families, topics, lintability - optional filters; None is forwarded
#                  as an empty tuple.
#   source_kinds - optional filter; None or an empty list falls back to
#                  ("clean_code_pattern",).
#   weaviate_url, collection, model - forwarded as `url`, `collection_name`
#                  and `model_name`.
# Returns whatever search_pattern_records returns.
# Raises ValueError for a blank query or an out-of-range limit.
# The docstring is kept verbatim because FastMCP publishes it as the tool
# description.
@mcp.tool
# pylint: disable-next=too-many-arguments
def search_clean_code_patterns(
    query: str,
    limit: int = 8,
    language: str = "any",
    rule_families: list[str] | None = None,
    topics: list[str] | None = None,
    lintability: list[str] | None = None,
    source_kinds: list[str] | None = None,
    weaviate_url: str = DEFAULT_WEAVIATE_URL,
    collection: str = COLLECTION_NAME,
    model: str = DEFAULT_EMBEDDING_MODEL,
) -> dict[str, Any]:
    """Find canonical clean-code patterns relevant to a concrete code concern."""

    if not query.strip():
        raise ValueError("query must not be empty")
    if not 1 <= limit <= MAX_SEARCH_LIMIT:
        # The message is derived from the constant so it cannot drift from the
        # bound that is actually enforced.
        raise ValueError(f"limit must be between 1 and {MAX_SEARCH_LIMIT}")
    return search_pattern_records(
        query=query,
        url=weaviate_url,
        collection_name=collection,
        model_name=model,
        limit=limit,
        language=language,
        rule_families=tuple(rule_families or ()),
        topics=tuple(topics or ()),
        lintability=tuple(lintability or ()),
        source_kinds=tuple(source_kinds or ("clean_code_pattern",)),
    )
145
+
146
+
147
@mcp.tool
def get_clean_code_pattern(pattern_id: str) -> dict[str, Any]:
    """Return the full canonical clean-code pattern record for a `CC-###` ID."""
    # Validation and lookup, including the ValueError cases, live in
    # pattern_by_id.
    record = pattern_by_id(pattern_id)
    return record
152
+
153
+
154
@mcp.tool
# pylint: disable-next=too-many-arguments
def recommend_clean_code_lint_rules(
    query: str,
    language: str = "any",
    targets: list[str] | None = None,
    limit: int = 8,
    weaviate_url: str = DEFAULT_WEAVIATE_URL,
    collection: str = COLLECTION_NAME,
    model: str = DEFAULT_EMBEDDING_MODEL,
) -> dict[str, Any]:
    """Recommend lint-rule candidates for repeated clean-code concerns."""
    # The pattern search is restricted to high/medium lintability records of
    # the "clean_code_pattern" source kind.
    # NOTE(review): this calls the decorated `search_clean_code_patterns` name
    # directly, which assumes the decorator leaves it callable - confirm
    # against the pinned FastMCP version.
    pattern_search = search_clean_code_patterns(
        query=query,
        limit=limit,
        language=language,
        lintability=["high", "medium"],
        source_kinds=["clean_code_pattern"],
        weaviate_url=weaviate_url,
        collection=collection,
        model=model,
    )
    # Explicit targets win; otherwise use the language's default linters.
    requested_targets = targets or default_lint_targets(language)
    response: dict[str, Any] = {
        "query": query,
        "language": language,
        "targets": requested_targets,
    }
    if pattern_search["no_strong_match"]:
        # No strong match: return an empty result list with an explanation.
        response["results"] = []
        response["no_strong_match"] = True
        response["no_recommendation"] = "No high-confidence lint-rule candidate matched this query."
        return response
    response["results"] = [
        lint_rule_recommendation(hit, requested_targets) for hit in pattern_search["results"]
    ]
    response["no_strong_match"] = pattern_search["no_strong_match"]
    return response
194
+
195
+
196
@mcp.tool
def list_clean_code_facets() -> dict[str, Any]:
    """Return available filter facets for clean-code pattern search."""
    # Facet tallies are computed over the chunks returned by build_chunks().
    corpus = build_chunks()
    return facet_counts(corpus)
201
+
202
+
203
def pattern_by_id(pattern_id: str) -> dict[str, Any]:
    """Look up one pattern record by its identifier.

    The identifier is stripped and upper-cased, must fully match
    ``semantic.CC_ID_RE``, and is then resolved with ``get_pattern_record``.

    Raises:
        ValueError: If the identifier has the wrong format or no record
            exists for it.
    """
    canonical_id = pattern_id.strip().upper()
    if semantic.CC_ID_RE.fullmatch(canonical_id) is None:
        raise ValueError("pattern_id must use the CC-### format")
    found = get_pattern_record(canonical_id)
    if found is not None:
        return found
    raise ValueError(f"pattern not found: {canonical_id}")
211
+
212
+
213
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse the server's command-line options.

    Args:
        argv: Argument strings to parse. ``None`` (the default) makes
            argparse read ``sys.argv[1:]``, as before; passing a list lets
            callers and tests drive the parser without touching global state.

    Returns:
        A namespace with ``transport`` ("stdio", "http" or "sse"; default
        "stdio"), ``host`` (default "127.0.0.1") and ``port`` (int, default
        8765).
    """
    parser = argparse.ArgumentParser(description="Run the clean-code FastMCP server.")
    parser.add_argument(
        "--transport",
        choices=("stdio", "http", "sse"),
        default="stdio",
        help="FastMCP transport to run.",
    )
    parser.add_argument("--host", default="127.0.0.1")
    parser.add_argument("--port", type=int, default=8765)
    return parser.parse_args(argv)
224
+
225
+
226
def main() -> None:
    """Parse the command line and run the FastMCP server.

    The "stdio" transport runs with FastMCP's defaults; any other transport
    also receives the parsed host and port.
    """
    options = parse_args()
    if options.transport != "stdio":
        mcp.run(transport=options.transport, host=options.host, port=options.port)
        return
    mcp.run()


if __name__ == "__main__":
    main()
@@ -0,0 +1,83 @@
1
+ #!/usr/bin/env python3
2
+ from __future__ import annotations
3
+
4
+ from typing import Any
5
+
6
+ from mcp_server.models import CleanCodeChunk
7
+ from mcp_server.utils.increment import increment
8
+
9
+
10
def facet_counts(chunks: list[CleanCodeChunk]) -> dict[str, dict[str, int]]:
    """Tally how often each facet value occurs across ``chunks``.

    Returns a mapping from facet name (topics, rule_families, lintability,
    source_kinds, languages, chunk_kinds) to a ``{value: count}`` dict.
    Counting is delegated to ``increment``.
    """
    # Facet name -> chunk attribute holding a single value for that facet.
    single_valued = (
        ("topics", "topic"),
        ("rule_families", "rule_family"),
        ("lintability", "lintability"),
        ("source_kinds", "source_kind"),
        ("chunk_kinds", "chunk_kind"),
    )
    facet_names = (
        "topics",
        "rule_families",
        "lintability",
        "source_kinds",
        "languages",
        "chunk_kinds",
    )
    tallies: dict[str, dict[str, int]] = {name: {} for name in facet_names}
    for chunk in chunks:
        for facet_name, attribute in single_valued:
            increment(tallies[facet_name], getattr(chunk, attribute))
        # `languages` is the only facet with several values per chunk.
        for language in chunk.languages:
            increment(tallies["languages"], language)
    return tallies
28
+
29
+
30
def search_result(row: dict[str, Any]) -> dict[str, Any]:
    """Convert one raw search row into the snake_case result returned to clients.

    Args:
        row: Mapping with camelCase keys (``chunkId``, ``recordId``, ``title``,
            ``topic``, ``sourceFile``, ``sourceKind``, ``ruleFamily``,
            ``lintability``, ``contentText``) and an optional ``_additional``
            mapping that carries ``distance``.

    Returns:
        A dict with the snake_case equivalents, ``distance`` (``None`` when
        absent) and ``snippet``: the whitespace-collapsed content cut to 500
        characters.
    """
    additional = row.get("_additional") or {}
    raw_content = row.get("contentText")
    # An explicit None (presumably possible when the backend returns null for
    # an unset property - confirm) must produce an empty snippet, not the
    # literal text "None" that str(None) yields.
    content = "" if raw_content is None else " ".join(str(raw_content).split())
    return {
        "chunk_id": row.get("chunkId", ""),
        "record_id": row.get("recordId", ""),
        "title": row.get("title", ""),
        "topic": row.get("topic", ""),
        "source_file": row.get("sourceFile", ""),
        "source_kind": row.get("sourceKind", ""),
        "rule_family": row.get("ruleFamily", ""),
        "lintability": row.get("lintability", ""),
        "distance": additional.get("distance"),
        "snippet": content[:500],
    }
45
+
46
+
47
def lint_rule_recommendation(result: dict[str, Any], targets: list[str]) -> dict[str, Any]:
    """Build one lint-rule recommendation from a pattern search result.

    ``pattern_id``, ``title``, ``rule_family``, ``lintability``,
    ``confidence`` and ``score`` are required keys of ``result`` (KeyError
    when missing); ``lint_candidates`` and ``match_reasons`` default to
    empty lists.
    """
    required_keys = ("pattern_id", "title", "rule_family", "lintability", "confidence", "score")
    recommendation: dict[str, Any] = {key: result[key] for key in required_keys}
    recommendation.update(
        targets=targets,
        static_signals=result.get("lint_candidates", []),
        false_positive_risks=false_positive_risks(result),
        suppression_strategy=suppression_strategy(targets),
        autofix="review required; only offer autofix for syntax-preserving local rewrites",
        match_reasons=result.get("match_reasons", []),
    )
    return recommendation
62
+
63
+
64
def default_lint_targets(language: str) -> list[str]:
    """Return the default lint tools for ``language``.

    Only the exact strings "typescript" and "python" select a narrowed list;
    any other value yields every supported tool.
    """
    chosen = ["eslint", "ruff", "pylint", "semgrep"]
    if language == "typescript":
        chosen = ["eslint", "semgrep"]
    elif language == "python":
        chosen = ["ruff", "pylint", "semgrep"]
    return chosen
70
+
71
+
72
def false_positive_risks(result: dict[str, Any]) -> list[str]:
    """List false-positive caveats to attach to a recommendation.

    Two generic caveats are always returned; a third is appended when the
    result's ``lintability`` is "medium".
    """
    always = (
        "local project conventions may intentionally allow this shape",
        "tests, generated files, fixtures, and framework adapters may be safe contexts",
    )
    if result.get("lintability") == "medium":
        return [*always, "medium-lintability patterns need narrower project-specific allowlists"]
    return list(always)
80
+
81
+
82
def suppression_strategy(targets: list[str]) -> str:
    """Return the suppression guidance sentence naming every tool in ``targets``."""
    tool_names = ", ".join(targets)
    return f"use the narrowest inline suppression supported by {tool_names} and require a reason"
@@ -0,0 +1,104 @@
1
+ #!/usr/bin/env python3
2
+ from __future__ import annotations
3
+
4
+ import re
5
+ from typing import Any
6
+
7
# Minimum number of overlapping terms before lexical_score adds its 0.2 bonus.
PHRASE_BONUS_MIN_OVERLAP = 2
# normalize_token strips a trailing "s" only from tokens longer than this.
PLURAL_NORMALIZATION_MIN_LENGTH = 4
# Runs of characters outside [a-z0-9]; slug() replaces each run with "-".
SLUG_RE = re.compile(r"[^a-z0-9]+")
# Lower-case alphanumeric runs; query_tokens() applies it to lower-cased text.
WORD_RE = re.compile(r"[a-z0-9]+")
# A word-bounded "CC-" followed by exactly three digits, captured as group 1.
CC_ID_RE = re.compile(r"\b(CC-\d{3})\b")
12
+
13
+
14
def semantic_similarity(distance: float | None) -> float:
    """Map a distance to a similarity in [0.0, 1.0] as ``1 - distance``, clamped.

    A missing distance (``None``) scores 0.0.
    """
    if distance is None:
        return 0.0
    similarity = 1.0 - distance
    capped = min(1.0, similarity)
    return max(0.0, capped)
18
+
19
+
20
def lexical_score(query_terms: set[str], haystack: str) -> float:
    """Score the share of ``query_terms`` found in ``haystack``, capped at 1.0.

    The haystack is tokenised with ``query_tokens``. The base score is the
    overlap size divided by the number of query terms; 0.2 is added when the
    overlap holds at least ``PHRASE_BONUS_MIN_OVERLAP`` terms. Empty query
    terms or an empty haystack score 0.0.
    """
    # The haystack is only tokenised when there is something to match.
    candidate_terms = set(query_tokens(haystack)) if query_terms else set()
    if not candidate_terms:
        return 0.0
    shared = query_terms & candidate_terms
    score = len(shared) / len(query_terms)
    if shared and len(shared) >= PHRASE_BONUS_MIN_OVERLAP:
        score += 0.2
    return min(1.0, score)
29
+
30
+
31
def searchable_row_text(row: dict[str, Any]) -> str:
    """Concatenate a row's searchable fields into one space-separated string.

    Fields, in order: ``recordId``, ``title``, ``topic``, ``ruleFamily``,
    ``lintability``, the items of ``aliases``, the items of
    ``lintCandidates`` and ``contentText``. Missing or ``None`` fields
    contribute an empty string.

    Args:
        row: Search row with camelCase keys.

    Returns:
        The eight field texts joined by single spaces.
    """

    def scalar_text(key: str) -> str:
        # str(None) would add the literal word "None" to the searchable text.
        value = row.get(key)
        return "" if value is None else str(value)

    def list_text(key: str) -> str:
        # A key that is present but None (presumably possible when the backend
        # returns null for an unset array - confirm) must not raise TypeError.
        return " ".join(str(item) for item in (row.get(key) or ()))

    return " ".join(
        [
            scalar_text("recordId"),
            scalar_text("title"),
            scalar_text("topic"),
            scalar_text("ruleFamily"),
            scalar_text("lintability"),
            list_text("aliases"),
            list_text("lintCandidates"),
            scalar_text("contentText"),
        ]
    )
44
+
45
+
46
def query_tokens(value: str) -> list[str]:
    """Lower-case ``value``, extract its ``WORD_RE`` matches and normalise each token."""
    raw_tokens = WORD_RE.findall(value.lower())
    return list(map(normalize_token, raw_tokens))
48
+
49
+
50
def normalize_token(value: str) -> str:
    """Drop one trailing "s" from tokens longer than ``PLURAL_NORMALIZATION_MIN_LENGTH``.

    Shorter tokens and tokens not ending in "s" are returned unchanged.
    """
    looks_plural = len(value) > PLURAL_NORMALIZATION_MIN_LENGTH and value.endswith("s")
    return value[:-1] if looks_plural else value
54
+
55
+
56
def detected_record_id(value: str) -> str:
    """Return the first ``CC_ID_RE`` identifier found in ``value``, or "" if none."""
    found = CC_ID_RE.search(value)
    if found is None:
        return ""
    return found.group(1)
59
+
60
+
61
def languages_in_text(text: str) -> tuple[str, ...]:
    """Detect which languages ``text`` mentions, by case-sensitive substring match.

    Returns "typescript" and/or "python", always in that order.
    """
    # Language name -> substrings whose presence signals that language.
    markers = (
        ("typescript", ("```ts", "TypeScript")),
        ("python", ("```python", "Python")),
    )
    return tuple(
        name for name, needles in markers if any(needle in text for needle in needles)
    )
68
+
69
+
70
def lint_candidates_in_text(text: str) -> tuple[str, ...]:
    """Collect the value of every line that starts with "Lint candidates:".

    Each value is the text after that prefix's colon, stripped of
    surrounding whitespace; values keep their order of appearance.
    """
    prefix = "Lint candidates:"
    collected: list[str] = []
    for line in text.splitlines():
        if line.startswith(prefix):
            # The prefix ends at the line's first colon, so slicing it off is
            # the same as splitting once on ":".
            collected.append(line[len(prefix):].strip())
    return tuple(collected)
76
+
77
+
78
def slug(value: str) -> str:
    """Turn ``value`` into a lower-case, hyphen-separated slug.

    Every ``SLUG_RE`` match becomes "-", edge hyphens are removed and the
    result is cut to 96 characters. An empty result becomes "section".
    """
    hyphenated = SLUG_RE.sub("-", value.lower())
    trimmed = hyphenated.strip("-")[:96]
    return trimmed if trimmed else "section"
81
+
82
+
83
def slugless(value: str) -> str:
    """Strip leading and trailing runs of "#", backticks and whitespace from ``value``."""
    edge_markup = r"^[#`\s]+|[#`\s]+$"
    return re.sub(edge_markup, "", value)
85
+
86
+
87
def clean_topic(value: str) -> str:
    """Remove a leading "Chapter N:" and then a leading "Smells and Heuristics -".

    Both prefixes match case-insensitively, and the text is stripped of
    surrounding whitespace after each removal.
    """
    topic = value
    for prefix_pattern in (r"^chapter\s+\d+:\s*", r"^smells and heuristics\s*-\s*"):
        topic = re.sub(prefix_pattern, "", topic, flags=re.IGNORECASE).strip()
    return topic
90
+
91
+
92
def clean_alias(value: str) -> str:
    """Clean ``value`` with ``clean_topic`` and reject bare "Chapter N" aliases.

    Returns "" when the cleaned text is only "chapter" plus a number
    (case-insensitive); otherwise returns the cleaned text.
    """
    candidate = clean_topic(value)
    if re.fullmatch(r"chapter\s+\d+", candidate, flags=re.IGNORECASE):
        return ""
    return candidate
95
+
96
+
97
def clean_topic_text(value: str) -> str:
    """Remove chapter and "Smells and Heuristics -" labels anywhere in ``value``.

    Matching is case-sensitive and the result is not stripped.
    """
    label_patterns = (
        r"Chapter\s+\d+:\s*",
        r"\bChapter\s+\d+\s+",
        r"Smells and Heuristics\s*-\s*",
    )
    cleaned = value
    for label_pattern in label_patterns:
        cleaned = re.sub(label_pattern, "", cleaned)
    return cleaned
101
+
102
+
103
def approximate_tokens(value: str) -> int:
    """Estimate a token count as the number of whitespace-separated runs, minimum 1."""
    run_count = sum(1 for _ in re.finditer(r"\S+", value))
    return max(1, run_count)
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env python3
2
+ from __future__ import annotations
3
+
4
+ from typing import Any
5
+
6
# Exit message shown when httpx cannot be imported.
HTTPX_INSTALL_MESSAGE = "Install httpx to talk to Weaviate: python3 -m pip install httpx"


def require_httpx() -> Any:
    """Import ``httpx`` on demand and return the module.

    Raises:
        SystemExit: With ``HTTPX_INSTALL_MESSAGE`` when httpx is not installed.
    """
    try:
        import httpx  # noqa: PLC0415
    except ImportError as exc:
        raise SystemExit(HTTPX_INSTALL_MESSAGE) from exc
    else:
        return httpx
@@ -0,0 +1,7 @@
1
+ #!/usr/bin/env python3
2
+ from __future__ import annotations
3
+
4
+
5
def increment(counter: dict[str, int], value: str) -> None:
    """Add one to ``counter[value]`` in place; falsy values are ignored."""
    if not value:
        return
    counter[value] = counter.get(value, 0) + 1  # pylint: disable=clean-code-output-argument-mutation
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env python3
2
+ from __future__ import annotations
3
+
4
+ import hashlib
5
+
6
+
7
def sha256_text(value: str) -> str:
    """Return the hexadecimal SHA-256 digest of ``value`` encoded as UTF-8."""
    hasher = hashlib.sha256()
    hasher.update(value.encode("utf-8"))
    return hasher.hexdigest()
@@ -0,0 +1,15 @@
1
+ #!/usr/bin/env python3
2
+ from __future__ import annotations
3
+
4
+ import re
5
+
6
+
7
def unique_strings(values: list[str]) -> list[str]:
    """Return the distinct whitespace-normalised strings, in first-seen order.

    Each value is stripped and its internal whitespace runs are collapsed to
    single spaces; values that end up empty are dropped.
    """
    # A dict keeps insertion order, so it serves as the ordered "seen" set.
    ordered: dict[str, None] = {}
    for raw in values:
        collapsed = re.sub(r"\s+", " ", raw.strip())
        if collapsed:
            ordered.setdefault(collapsed, None)
    return list(ordered)