clean-code-tools 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77) hide show
  1. package/README.md +66 -0
  2. package/configs/eslint.clean-code.recommended.mjs +211 -0
  3. package/configs/python.clean-code.pyproject.toml +143 -0
  4. package/data/clean-code-patterns.jsonl +264 -0
  5. package/data/vector-record.schema.json +77 -0
  6. package/docs/README.md +29 -0
  7. package/docs/eslint-custom-rules.md +74 -0
  8. package/docs/eslint-recommended-config.md +87 -0
  9. package/docs/fastmcp-local-server.md +104 -0
  10. package/docs/publishing.md +125 -0
  11. package/docs/python-lint-recommended-config.md +57 -0
  12. package/docs/python-pylint-custom-rules.md +77 -0
  13. package/docs/semantic-weaviate.md +80 -0
  14. package/docs/static-trigger-semantic-review.md +97 -0
  15. package/evals/clean-code-retrieval.jsonl +13 -0
  16. package/ops/dev/weaviate/README.md +34 -0
  17. package/ops/dev/weaviate/compose.yaml +34 -0
  18. package/ops/dev/weaviate/smoke.sh +28 -0
  19. package/package.json +96 -0
  20. package/pyproject.toml +303 -0
  21. package/sample-apps/README.md +40 -0
  22. package/sample-apps/python-app/pyproject.toml +113 -0
  23. package/sample-apps/python-app/src/clean_pricing.py +10 -0
  24. package/sample-apps/python-app/src/smelly_pricing.py +8 -0
  25. package/sample-apps/ts-backend/eslint.config.mjs +3 -0
  26. package/sample-apps/ts-backend/package.json +18 -0
  27. package/sample-apps/ts-backend/src/clean-handler.ts +19 -0
  28. package/sample-apps/ts-backend/src/smelly-handler.ts +29 -0
  29. package/sample-apps/ts-backend/tsconfig.json +9 -0
  30. package/sample-apps/ts-frontend/eslint.config.mjs +3 -0
  31. package/sample-apps/ts-frontend/package.json +18 -0
  32. package/sample-apps/ts-frontend/src/CleanWidget.tsx +18 -0
  33. package/sample-apps/ts-frontend/src/SmellyWidget.tsx +27 -0
  34. package/sample-apps/ts-frontend/tsconfig.json +10 -0
  35. package/scripts/_mcp_app.py +21 -0
  36. package/scripts/check_clean_code_review_candidates.py +302 -0
  37. package/scripts/check_fastmcp_server.py +106 -0
  38. package/scripts/check_packages.py +137 -0
  39. package/scripts/check_python_config.py +130 -0
  40. package/scripts/check_repo_python_lint.py +46 -0
  41. package/scripts/check_retrieval_evals.py +132 -0
  42. package/scripts/check_sample_apps.py +169 -0
  43. package/scripts/check_semantic_search_tooling.py +102 -0
  44. package/scripts/clean_code_eslint_triggers.py +272 -0
  45. package/scripts/clean_code_mcp_server.py +7 -0
  46. package/scripts/clean_code_python_triggers.py +318 -0
  47. package/scripts/clean_code_review_candidates.py +291 -0
  48. package/scripts/clean_code_review_io.py +36 -0
  49. package/scripts/clean_code_review_models.py +43 -0
  50. package/scripts/clean_code_semantic.py +27 -0
  51. package/scripts/set_package_versions.py +82 -0
  52. package/scripts/weaviate_ingest_clean_code.py +44 -0
  53. package/scripts/weaviate_search_clean_code.py +51 -0
  54. package/skills/clean-code-mcp-reviewer/SKILL.md +209 -0
  55. package/skills/clean-code-mcp-reviewer/evals/evals.json +30 -0
  56. package/src/js/eslint-plugin-clean-code.mjs +758 -0
  57. package/src/python/clean_code_tools_pylint/__init__.py +14 -0
  58. package/src/python/clean_code_tools_pylint/ast_checker.py +122 -0
  59. package/src/python/clean_code_tools_pylint/comments.py +83 -0
  60. package/src/python/clean_code_tools_pylint/helpers.py +196 -0
  61. package/src/python/mcp_server/__init__.py +1 -0
  62. package/src/python/mcp_server/corpus.py +160 -0
  63. package/src/python/mcp_server/markdown.py +126 -0
  64. package/src/python/mcp_server/models.py +73 -0
  65. package/src/python/mcp_server/ranking.py +125 -0
  66. package/src/python/mcp_server/ranking_scoring.py +232 -0
  67. package/src/python/mcp_server/semantic.py +192 -0
  68. package/src/python/mcp_server/server.py +235 -0
  69. package/src/python/mcp_server/server_payloads.py +83 -0
  70. package/src/python/mcp_server/text.py +104 -0
  71. package/src/python/mcp_server/utils/__init__.py +1 -0
  72. package/src/python/mcp_server/utils/httpx_loader.py +14 -0
  73. package/src/python/mcp_server/utils/increment.py +7 -0
  74. package/src/python/mcp_server/utils/sha256_text.py +8 -0
  75. package/src/python/mcp_server/utils/unique_strings.py +15 -0
  76. package/src/python/mcp_server/weaviate.py +182 -0
  77. package/uv.lock +2012 -0
@@ -0,0 +1,14 @@
1
+ from __future__ import annotations
2
+
3
+ from pylint.lint import PyLinter
4
+
5
+ from .ast_checker import CleanCodeAstChecker
6
+ from .comments import CleanCodeCommentChecker
7
+
8
+
9
def register(linter: PyLinter) -> None:
    """Attach the clean-code checkers to ``linter``, at most once per linter.

    A private marker attribute is stored on the linter after the first call,
    so calling this again with the same linter does nothing.
    """
    already_registered = getattr(linter, "_clean_code_tools_registered", False)  # pylint: disable=clean-code-boolean-flag-argument
    if not already_registered:
        linter._clean_code_tools_registered = True
        # Comment checker first, then the AST checker.
        for checker_class in (CleanCodeCommentChecker, CleanCodeAstChecker):
            linter.register_checker(checker_class(linter))
@@ -0,0 +1,122 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import ClassVar
4
+
5
+ from astroid import nodes
6
+ from pylint.checkers import BaseChecker
7
+ from pylint.lint import PyLinter
8
+
9
+ from .helpers import (
10
+ MAX_ATTRIBUTE_CHAIN_DEPTH,
11
+ MUTATOR_METHODS,
12
+ annotation_is_bool,
13
+ attribute_depth,
14
+ is_allowed_literal_context,
15
+ is_policy_literal_context,
16
+ literal_looks_like_policy,
17
+ name_looks_like_selector,
18
+ root_name,
19
+ )
20
+
21
+
22
class CleanCodeAstChecker(BaseChecker):
    """AST checker reporting boolean flags, output-argument mutation,
    hard-coded policy literals, and deep attribute chains."""

    name = "clean-code-ast"
    msgs: ClassVar = {
        "C9003": (
            "Boolean selector argument changes behavior by mode; prefer named operations or an explicit options object.",
            "clean-code-boolean-flag-argument",
            "Discourage boolean selector arguments and boolean mode parameters.",
        ),
        "C9004": (
            "Avoid mutating parameter '%s' as an output argument; return a value or create a local copy instead.",
            "clean-code-output-argument-mutation",
            "Flag parameter mutation that treats arguments as output containers.",
        ),
        "C9007": (
            "Policy literal '%s' should usually be a named constant so the rule is searchable.",
            "clean-code-business-policy-literal",
            "Flag hard-coded policy literals in branch, return, and call expressions.",
        ),
        "C9008": (
            "Deep attribute chain exposes object internals; prefer a named query on the owning object.",
            "clean-code-train-wreck",
            "Flag deep attribute chains that expose transitive object structure.",
        ),
    }

    def __init__(self, linter: PyLinter) -> None:
        super().__init__(linter)
        # Parallel stacks with one entry per function currently being visited:
        # the parameter names it declares and the names assigned inside it.
        self._function_params: list[set[str]] = []
        self._function_locals: list[set[str]] = []

    def visit_functiondef(self, node: nodes.FunctionDef) -> None:
        """Open a new scope for ``node`` and check its boolean parameters."""
        params = {argument.name for argument in node.args.args + node.args.kwonlyargs}
        self._function_params.append(params)
        self._function_locals.append(set())
        self.check_boolean_params(node)

    visit_asyncfunctiondef = visit_functiondef

    def leave_functiondef(self, _node: nodes.FunctionDef) -> None:
        """Close the scope opened by ``visit_functiondef``."""
        self._function_params.pop()
        self._function_locals.pop()

    leave_asyncfunctiondef = leave_functiondef

    def visit_assignname(self, node: nodes.AssignName) -> None:
        """Record a name bound inside the current function as a local."""
        # Parameter declarations are AssignName nodes too; they are not locals.
        if isinstance(node.parent, nodes.Arguments):
            return
        if self._function_locals:
            self._function_locals[-1].add(node.name)

    def visit_call(self, node: nodes.Call) -> None:
        """Flag literal boolean positional arguments and mutator-method calls."""
        for argument in node.args:
            if isinstance(argument, nodes.Const) and isinstance(argument.value, bool):
                self.add_message("clean-code-boolean-flag-argument", node=argument)
        if isinstance(node.func, nodes.Attribute) and node.func.attrname in MUTATOR_METHODS:
            self.report_if_param_mutation(node.func.expr, node.func.expr)

    def visit_assignattr(self, node: nodes.AssignAttr) -> None:
        """Flag ``param.attr = value`` style mutation of a parameter."""
        # Resolve the root from the object being assigned to, not from the
        # AssignAttr node itself: root_name only unwraps Attribute/Subscript
        # nodes and accepts a Name at the root, so passing the AssignAttr node
        # yields None and nothing would ever be reported.
        self.report_if_param_mutation(node, node.expr)

    def visit_assign(self, node: nodes.Assign) -> None:
        """Check each assignment target for parameter mutation."""
        for target in node.targets:
            self.report_if_param_mutation(target, target)

    def visit_augassign(self, node: nodes.AugAssign) -> None:
        """Check the target of an augmented assignment for parameter mutation."""
        self.report_if_param_mutation(node.target, node.target)

    def visit_const(self, node: nodes.Const) -> None:
        """Flag constants that look like policy values in a policy context."""
        if (
            literal_looks_like_policy(node.value)
            and is_policy_literal_context(node)
            and not is_allowed_literal_context(node)
        ):
            self.add_message("clean-code-business-policy-literal", node=node, args=(str(node.value),))

    def visit_attribute(self, node: nodes.Attribute) -> None:
        """Flag attribute chains deeper than ``MAX_ATTRIBUTE_CHAIN_DEPTH``."""
        # Only the outermost link of a chain is measured, so each chain is
        # reported at most once.
        if isinstance(node.parent, nodes.Attribute):
            return
        if attribute_depth(node) > MAX_ATTRIBUTE_CHAIN_DEPTH:
            self.add_message("clean-code-train-wreck", node=node)

    def check_boolean_params(self, node: nodes.FunctionDef) -> None:
        """Flag ``bool``-annotated parameters whose names look like selectors."""
        arguments = node.args.args + node.args.kwonlyargs
        annotations = node.args.annotations + node.args.kwonlyargs_annotations
        for argument, annotation in zip(arguments, annotations, strict=False):
            if argument.name in {"self", "cls"}:
                continue
            if annotation_is_bool(annotation) and name_looks_like_selector(argument.name):
                self.add_message("clean-code-boolean-flag-argument", node=argument)

    def report_if_param_mutation(self, node: nodes.NodeNG, expression: nodes.NodeNG) -> None:
        """Report ``node`` when ``expression`` is rooted at a function parameter.

        Scopes are searched innermost first. A name already recorded as a local
        in a scope suppresses the report; otherwise the first scope declaring
        the name as a parameter triggers it. ``self`` and ``cls`` are ignored.
        """
        name = root_name(expression)
        if not name or name in {"self", "cls"}:
            return
        scopes = zip(reversed(self._function_params), reversed(self._function_locals), strict=False)
        for params, locals_ in scopes:
            if name in locals_:
                return
            if name in params:
                self.add_message("clean-code-output-argument-mutation", node=node, args=(name,))
                return
@@ -0,0 +1,83 @@
1
+ from __future__ import annotations
2
+
3
+ import tokenize
4
+ from io import BytesIO
5
+ from typing import ClassVar
6
+
7
+ from astroid import nodes
8
+ from pylint.checkers import BaseRawFileChecker
9
+
10
+ from .helpers import (
11
+ MIN_SHARED_COMMENT_WORDS,
12
+ REDUNDANT_COMMENT_OVERLAP_RATIO,
13
+ TODO_PATTERN,
14
+ TODO_SEGMENT,
15
+ clean_comment,
16
+ is_byline_or_date,
17
+ is_likely_code_comment,
18
+ is_separator_comment,
19
+ normalized_words,
20
+ )
21
+
22
+
23
class CleanCodeCommentChecker(BaseRawFileChecker):
    """Raw-file checker that inspects every comment token of a module."""

    name = "clean-code-comments"
    msgs: ClassVar = {
        "C9001": (
            "TODO/FIXME comments should include an owner or issue ID, for example TODO(PROJ-123): remove fallback.",
            "clean-code-todo-format",
            "Require TODO, FIXME, and XXX comments to include an owner or issue identifier.",
        ),
        "C9002": (
            "Remove commented-out code; version history should preserve old implementations.",
            "clean-code-commented-out-code",
            "Flag comments that look like disabled Python code.",
        ),
        "C9005": (
            "Comment mostly repeats the next line; prefer making the code name carry the intent.",
            "clean-code-redundant-comment",
            "Flag comments that mostly repeat the following line of code.",
        ),
        "C9006": (
            "Avoid noisy separator, byline, or date comments; use structure and version control instead.",
            "clean-code-noisy-comment",
            "Flag separator, byline, and date comments.",
        ),
    }

    def process_module(self, node: nodes.Module) -> None:
        """Tokenize the module source and run every comment check on each comment."""
        # Close the stream once the bytes are read instead of leaking the handle.
        with node.stream() as stream:
            raw_bytes = stream.read()
        lines = raw_bytes.decode("utf-8", errors="replace").splitlines()
        for token in tokenize.tokenize(BytesIO(raw_bytes).readline):
            if token.type != tokenize.COMMENT:
                continue
            text = clean_comment(token.string)
            line_number = token.start[0]
            self.check_todo(text, line_number)
            self.check_comment_shape(text, line_number)
            self.check_redundant_comment(text, line_number, lines)

    def check_todo(self, text: str, line_number: int) -> None:
        """Report the comment when any TODO/FIXME/XXX segment is malformed."""
        todo_segments = TODO_SEGMENT.findall(text)
        if any(not TODO_PATTERN.match(segment.strip()) for segment in todo_segments):
            self.add_message("clean-code-todo-format", line=line_number)

    def check_comment_shape(self, text: str, line_number: int) -> None:
        """Report commented-out code and separator/byline/date comments."""
        # Comments carrying a TODO-style marker are left to check_todo.
        if TODO_SEGMENT.search(text):
            return
        if is_likely_code_comment(text):
            self.add_message("clean-code-commented-out-code", line=line_number)
        if is_separator_comment(text) or is_byline_or_date(text):
            self.add_message("clean-code-noisy-comment", line=line_number)

    def check_redundant_comment(self, text: str, line_number: int, lines: list[str]) -> None:
        """Report a comment whose words mostly reappear on the following line."""
        comment_words = normalized_words(text)
        # line_number is 1-based while ``lines`` is 0-based, so
        # lines[line_number] is the line after the comment; skip when absent.
        if len(comment_words) < MIN_SHARED_COMMENT_WORDS or line_number >= len(lines):
            return
        next_line_words = set(normalized_words(lines[line_number]))
        shared_words = [word for word in comment_words if word in next_line_words]
        if (
            len(shared_words) >= MIN_SHARED_COMMENT_WORDS
            and len(shared_words) / len(comment_words) >= REDUNDANT_COMMENT_OVERLAP_RATIO
        ):
            self.add_message("clean-code-redundant-comment", line=line_number)
@@ -0,0 +1,196 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ from typing import Any
5
+
6
+ from astroid import nodes
7
+
8
# A well-formed marker: TODO/FIXME/XXX, a parenthesised ID shaped like PROJ-123,
# a colon, whitespace, then at least one non-space character. IGNORECASE applies
# to the whole pattern, so lowercase markers and IDs match as well.
# NOTE(review): only issue-style IDs match; an owner-only tag such as
# TODO(alice) does not — confirm this is intended.
TODO_PATTERN = re.compile(r"^(TODO|FIXME|XXX)\([A-Z][A-Z0-9]+-\d+\):\s+\S", re.IGNORECASE)
# A TODO/FIXME/XXX word plus the text after it, up to a newline or semicolon.
TODO_SEGMENT = re.compile(r"\b(?:TODO|FIXME|XXX)\b[^\n;]*", re.IGNORECASE)
# Fragments that name_looks_like_selector searches for as substrings of a
# lower-cased parameter name.
SELECTOR_PARAM_NAMES = (
    "flag",
    "mode",
    "option",
    "type",
    "kind",
    "variant",
    "selector",
    "enabled",
    "disabled",
    "dry_run",
    "verbose",
    "silent",
    "force",
    "skip",
    "include",
    "exclude",
)
# Method names treated as in-place mutation when called on an object.
MUTATOR_METHODS = {
    "add",
    "append",
    "clear",
    "discard",
    "extend",
    "insert",
    "pop",
    "popitem",
    "remove",
    "reverse",
    "setdefault",
    "sort",
    "update",
}
# Call names under which a literal is exempt from the policy-literal check
# (see is_allowed_literal_context).
ALLOWED_LITERAL_CALLS = {
    "bool",
    "bytes",
    "dict",
    "float",
    "int",
    "len",
    "list",
    "print",
    "range",
    "repr",
    "set",
    "str",
    "tuple",
}
# A status word delimited by string start/end, underscore, whitespace, or hyphen.
STATUS_WORD = re.compile(
    r"(?:^|[_\s-])(active|approved|cancelled|canceled|draft|failed|paid|pending|rejected|retry|suspended)(?:$|[_\s-])",
    re.IGNORECASE,
)
# Comments shorter than this (after stripping) are never treated as code.
MIN_CODE_COMMENT_LENGTH = 4
# Minimum whitespace-free length for a separator comment.
MIN_SEPARATOR_LENGTH = 8
# Strings shorter than this are never policy literals.
MIN_POLICY_STRING_LENGTH = 2
# Minimum word count, for both the comment and the shared words, in the
# redundant-comment check.
MIN_SHARED_COMMENT_WORDS = 2
# Fraction of comment words that must reappear on the next line.
REDUNDANT_COMMENT_OVERLAP_RATIO = 0.65
# Attribute chains deeper than this are reported as train wrecks.
MAX_ATTRIBUTE_CHAIN_DEPTH = 3
68
+
69
+
70
def clean_comment(comment: str) -> str:
    """Return ``comment`` without leading ``#`` markers or surrounding whitespace."""
    without_markers = comment.lstrip("#")
    return without_markers.strip()
72
+
73
+
74
def normalized_words(value: str) -> list[str]:
    """Split ``value`` into lower-case words of at least two characters.

    Runs of ``_`` and ``$`` act as separators. A word starts with a letter and
    continues with letters or digits.
    """
    separated = re.sub(r"[_$]+", " ", value)
    lowered = separated.lower()
    return re.findall(r"[a-z][a-z0-9]+", lowered)
76
+
77
+
78
def is_likely_code_comment(text: str) -> bool:
    """Return True when ``text`` matches any heuristic for commented-out code.

    Text shorter than ``MIN_CODE_COMMENT_LENGTH`` after stripping never matches.
    """
    if len(text.strip()) < MIN_CODE_COMMENT_LENGTH:
        return False
    heuristics = (
        # A Python keyword appearing as a whole word.
        r"\b(await|def|class|return|raise|if|for|while|match|import|from|with|try|except)\b",
        # A call expression at the end of the text.
        r"(?:^|\s)[\w.]+\([^)]*\)\s*$",
        # A simple ``name = value`` assignment spanning the text.
        r"^\s*[\w.]+\s*=\s*.+$",
        # Any brace or square bracket.
        r"[{}\[\]]",
        # A return-annotation arrow.
        r"->",
    )
    for heuristic in heuristics:
        if re.search(heuristic, text):
            return True
    return False
89
+
90
+
91
def is_separator_comment(text: str) -> bool:
    """Return True when ``text`` is only separator punctuation and long enough.

    Whitespace is removed first; the remainder must be at least
    ``MIN_SEPARATOR_LENGTH`` characters drawn from ``-=*_/#``.
    """
    squeezed = re.sub(r"\s+", "", text)
    if len(squeezed) < MIN_SEPARATOR_LENGTH:
        return False
    return re.fullmatch(r"[-=*_/#]+", squeezed) is not None
94
+
95
+
96
def is_byline_or_date(text: str) -> bool:
    """Return True when ``text`` has a byline phrase or a numeric date.

    Byline phrases are matched case-insensitively. Dates are year-first
    (``2024-01-31``) or year-last (``1/31/24``), separated by ``-`` or ``/``.
    """
    byline_pattern = r"\b(author|created by|written by|modified by|last modified|since)\b"
    date_pattern = r"\b(?:\d{4}[-/]\d{1,2}[-/]\d{1,2}|\d{1,2}[-/]\d{1,2}[-/]\d{2,4})\b"
    if re.search(byline_pattern, text, re.IGNORECASE):
        return True
    return re.search(date_pattern, text) is not None
100
+
101
+
102
def annotation_is_bool(annotation: nodes.NodeNG | None) -> bool:
    """Return True only when ``annotation`` is the bare name ``bool``."""
    if not isinstance(annotation, nodes.Name):
        return False
    return annotation.name == "bool"
104
+
105
+
106
def name_looks_like_selector(name: str) -> bool:
    """Return True when the lower-cased ``name`` contains any selector fragment."""
    lowered = name.lower()
    for fragment in SELECTOR_PARAM_NAMES:
        if fragment in lowered:
            return True
    return False
109
+
110
+
111
def call_name(node: nodes.Call) -> str | None:
    """Return the called name: the identifier for ``f()``, the attribute for ``x.f()``.

    Any other callee expression yields ``None``.
    """
    callee = node.func
    if isinstance(callee, nodes.Attribute):
        return callee.attrname
    return callee.name if isinstance(callee, nodes.Name) else None
118
+
119
+
120
def root_name(node: nodes.NodeNG) -> str | None:
    """Return the base identifier of an attribute/subscript chain.

    ``Attribute`` and ``Subscript`` wrappers are peeled off one at a time.
    The result is the innermost name, or ``None`` when the chain does not
    bottom out in a ``Name`` node.
    """
    base = node
    while True:
        if isinstance(base, nodes.Attribute):
            base = base.expr
        elif isinstance(base, nodes.Subscript):
            base = base.value
        else:
            break
    return base.name if isinstance(base, nodes.Name) else None
127
+
128
+
129
def is_uppercase_assignment(node: nodes.Const) -> bool:
    """Return True when ``node`` is assigned directly to an UPPER_CASE name.

    Only a direct ``Assign`` or ``AnnAssign`` parent counts.
    """
    statement = node.parent
    if isinstance(statement, nodes.Assign):
        candidates = statement.targets
    elif isinstance(statement, nodes.AnnAssign):
        candidates = [statement.target]
    else:
        return False
    for candidate in candidates:
        if isinstance(candidate, nodes.AssignName) and candidate.name.isupper():
            return True
    return False
135
+
136
+
137
def literal_looks_like_policy(value: Any) -> bool:
    """Return True when a constant's value resembles a business-policy literal.

    Booleans never qualify. Numbers qualify unless they equal -1, 0, or 1.
    Strings of at least ``MIN_POLICY_STRING_LENGTH`` characters qualify when
    they are an UPPER_CASE token, an ISO-style date, or contain a status word.
    """
    # bool is checked first because it is a subclass of int.
    if isinstance(value, bool):
        return False
    if isinstance(value, (int, float)):
        return value not in {-1, 0, 1}
    if isinstance(value, str) and len(value) >= MIN_POLICY_STRING_LENGTH:
        if re.fullmatch(r"[A-Z][A-Z0-9_]+", value):
            return True
        if re.fullmatch(r"\d{4}-\d{2}-\d{2}", value):
            return True
        return STATUS_WORD.search(value) is not None
    return False
149
+
150
+
151
def is_allowed_literal_context(node: nodes.Const) -> bool:
    """Return True when the literal sits somewhere it is acceptable.

    That means it is assigned to an UPPER_CASE name, or any ancestor is a call
    whose name is in ``ALLOWED_LITERAL_CALLS``.
    """
    if is_uppercase_assignment(node):
        return True
    ancestor = node.parent
    while ancestor is not None:
        inside_allowed_call = isinstance(ancestor, nodes.Call) and call_name(ancestor) in ALLOWED_LITERAL_CALLS
        if inside_allowed_call:
            return True
        ancestor = ancestor.parent
    return False
160
+
161
+
162
def is_policy_literal_context(node: nodes.Const) -> bool:
    """Return True when the literal is used where a policy decision is made.

    Numeric literals must also sit inside a comparison. The search climbs the
    ancestors up to the enclosing class, function, or module. A comparison,
    ``if``, ``return``, or call qualifies. The first plain assignment met
    decides the result: it qualifies only when it writes to an attribute or
    subscript.
    """
    scope_boundaries = (nodes.ClassDef, nodes.FunctionDef, nodes.AsyncFunctionDef, nodes.Module)
    policy_parents = (nodes.Compare, nodes.If, nodes.Return, nodes.Call)

    is_number = isinstance(node.value, (int, float))
    if is_number and not literal_looks_like_named_threshold(node):
        return False

    ancestor = node.parent
    while ancestor is not None:
        if isinstance(ancestor, scope_boundaries):
            break
        if isinstance(ancestor, policy_parents):
            return True
        if isinstance(ancestor, nodes.Assign):
            for target in ancestor.targets:
                if isinstance(target, (nodes.AssignAttr, nodes.Subscript)):
                    return True
            return False
        ancestor = ancestor.parent
    return False
176
+
177
+
178
def literal_looks_like_named_threshold(node: nodes.Const) -> bool:
    """Return True when ``node`` lies inside a comparison in its own scope.

    The search stops at the enclosing class, function, or module.
    """
    scope_boundaries = (nodes.ClassDef, nodes.FunctionDef, nodes.AsyncFunctionDef, nodes.Module)
    ancestor = node.parent
    while ancestor is not None:
        if isinstance(ancestor, scope_boundaries):
            break
        if isinstance(ancestor, nodes.Compare):
            return True
        ancestor = ancestor.parent
    return False
188
+
189
+
190
def attribute_depth(node: nodes.Attribute) -> int:
    """Count the consecutive ``Attribute`` links starting at ``node``.

    Counting stops at the first expression that is not an ``Attribute``.
    """
    links = 0
    cursor: nodes.NodeNG = node
    while isinstance(cursor, nodes.Attribute):
        cursor = cursor.expr
        links += 1
    return links
@@ -0,0 +1 @@
1
+
@@ -0,0 +1,160 @@
1
+ #!/usr/bin/env python3
2
+ from __future__ import annotations
3
+
4
+ import json
5
+ import uuid
6
+ from pathlib import Path
7
+
8
+ from mcp_server.markdown import (
9
+ infer_markdown_rule_family,
10
+ markdown_aliases,
11
+ markdown_sections,
12
+ split_section_body,
13
+ )
14
+ from mcp_server.models import CleanCodeChunk, JsonDict
15
+ from mcp_server.text import (
16
+ clean_alias,
17
+ clean_topic,
18
+ clean_topic_text,
19
+ detected_record_id,
20
+ languages_in_text,
21
+ lint_candidates_in_text,
22
+ slug,
23
+ )
24
+ from mcp_server.utils.sha256_text import sha256_text
25
+
26
# Directory three levels above this module's own directory.
ROOT = Path(__file__).resolve().parents[3]
# JSONL file of pattern records, one JSON document per line.
PATTERN_RECORDS = ROOT / "data" / "clean-code-patterns.jsonl"
# Markdown documents that build_chunks indexes when they exist.
MARKDOWN_SOURCES = (
    ROOT / "README.md",
    ROOT / "docs" / "eslint-custom-rules.md",
    ROOT / "docs" / "eslint-recommended-config.md",
    ROOT / "docs" / "python-lint-recommended-config.md",
    ROOT / "docs" / "python-pylint-custom-rules.md",
    ROOT / "docs" / "static-trigger-semantic-review.md",
)
# Fixed UUID namespace so object_id_for derives the same ID for the same chunk ID.
CHUNK_ID_NAMESPACE = uuid.UUID("fd1b279f-073e-5aa4-bf70-9f70446a3d8f")
37
+
38
+
39
def build_chunks(root: Path = ROOT) -> list[CleanCodeChunk]:
    """Collect all chunks under ``root``: pattern records first, then markdown.

    Source paths are re-based from ``ROOT`` onto ``root``. Markdown files that
    do not exist are skipped.
    """
    patterns_path = root / PATTERN_RECORDS.relative_to(ROOT)
    collected = list(pattern_record_chunks(patterns_path))
    markdown_paths = (root / source.relative_to(ROOT) for source in MARKDOWN_SOURCES)
    for markdown_path in markdown_paths:
        if markdown_path.exists():
            collected.extend(markdown_chunks(markdown_path, root=root))
    return collected
46
+
47
+
48
def pattern_record_chunks(path: Path) -> list[CleanCodeChunk]:
    """Build one chunk per non-blank line of the JSONL pattern file at ``path``.

    Raises:
        OSError: if ``path`` cannot be opened.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a record lacks one of the required fields.
    """
    chunks: list[CleanCodeChunk] = []
    # Read as UTF-8 explicitly; the bare open() default depends on the platform locale.
    with path.open(encoding="utf-8") as handle:
        for index, line in enumerate(handle):
            if not line.strip():
                continue
            record = json.loads(line)
            chunk_id = f"pattern:{record['id']}"
            topic = clean_topic(str(record["topic"]))
            # Clean each alias once and drop those that clean to an empty value.
            aliases = tuple(
                alias
                for alias in (clean_alias(str(item)) for item in record["aliases"])
                if alias
            )
            embedding_text = clean_topic_text(str(record["embedding_text"]).strip())
            display_text = clean_topic_text(str(record["display_text"]).strip())
            # A language is listed when the record has a good or bad example for it.
            languages = tuple(
                language
                for language in ("typescript", "python")
                if record.get("good_examples", {}).get(language)
                or record.get("bad_examples", {}).get(language)
            )
            chunks.append(
                CleanCodeChunk(
                    chunk_id=chunk_id,
                    object_id=object_id_for(chunk_id),
                    source_file=path.name,
                    source_kind="clean_code_pattern",
                    record_id=str(record["id"]),
                    title=str(record["title"]),
                    topic=topic,
                    section_path=(topic, str(record["title"])),
                    chunk_kind="pattern_record",
                    # Index of the line in the file; skipped blank lines still count.
                    chunk_index=index,
                    rule_family=str(record["rule_family"]),
                    lintability=str(record["lintability"]),
                    aliases=aliases,
                    languages=languages,
                    lint_candidates=tuple(str(item) for item in record["lint_candidates"]),
                    content_text=display_text,
                    embedding_text=embedding_text,
                    display_text=display_text,
                    text_hash=sha256_text(embedding_text),
                )
            )
    return chunks
94
+
95
+
96
def load_pattern_records(path: Path = PATTERN_RECORDS) -> list[JsonDict]:
    """Return every JSON object found in the JSONL file at ``path``.

    Blank lines are skipped, as are lines whose JSON value is not an object.

    Raises:
        OSError: if ``path`` cannot be opened.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    records: list[JsonDict] = []
    # Read as UTF-8 explicitly; the bare open() default depends on the platform locale.
    with path.open(encoding="utf-8") as handle:
        for line in handle:
            if line.strip():
                record = json.loads(line)
                if isinstance(record, dict):
                    records.append(record)
    return records
105
+
106
+
107
def get_pattern_record(pattern_id: str, *, root: Path = ROOT) -> JsonDict | None:
    """Return the first pattern record whose ``id`` matches ``pattern_id``.

    The match ignores case and surrounding whitespace in ``pattern_id``.
    ``None`` is returned when no record matches.
    """
    wanted = pattern_id.strip().upper()
    records = load_pattern_records(root / PATTERN_RECORDS.relative_to(ROOT))
    matches = (record for record in records if str(record.get("id", "")).upper() == wanted)
    return next(matches, None)
113
+
114
+
115
def markdown_chunks(path: Path, *, root: Path = ROOT) -> list[CleanCodeChunk]:
    """Build chunks for each section of the markdown file at ``path``.

    A section yields one chunk per piece returned by ``split_section_body``.
    The first piece keeps the plain section ID; later pieces get a
    ``:<n>`` suffix starting at 2.

    Raises:
        ValueError: if ``path`` is not located under ``root``.
    """
    chunks: list[CleanCodeChunk] = []
    relative_path = path.relative_to(root).as_posix()
    for section_index, section in enumerate(markdown_sections(path, root=root)):
        # Everything below depends only on the section, so compute it once
        # rather than once per split (assumes the text helpers are pure).
        heading_text = " > ".join(section.section_path)
        base_chunk_id = f"md:{relative_path}:{slug(heading_text)}"
        record_id = detected_record_id(section.heading)
        topic = clean_topic(section.section_path[0]) if section.section_path else clean_topic(section.heading)
        section_path = tuple(clean_topic(item) for item in section.section_path)
        rule_family = infer_markdown_rule_family(section)
        # Clean each alias once and drop those that clean to an empty value.
        aliases = tuple(
            alias
            for alias in (clean_alias(item) for item in markdown_aliases(section))
            if alias
        )
        for split_index, body in enumerate(split_section_body(section.body)):
            chunk_id = base_chunk_id if split_index == 0 else f"{base_chunk_id}:{split_index + 1}"
            content_text = clean_topic_text(body.strip())
            embedding_text = (
                f"Markdown section: {heading_text}\n"
                f"Source: {relative_path}:{section.start_line}-{section.end_line}\n\n"
                f"{content_text}"
            )
            chunks.append(
                CleanCodeChunk(
                    chunk_id=chunk_id,
                    object_id=object_id_for(chunk_id),
                    source_file=relative_path,
                    source_kind="markdown_doc",
                    record_id=record_id,
                    title=section.heading,
                    topic=topic,
                    section_path=section_path,
                    chunk_kind="markdown_section" if split_index == 0 else "markdown_section_part",
                    chunk_index=section_index * 100 + split_index,
                    rule_family=rule_family,
                    lintability="",
                    aliases=aliases,
                    languages=languages_in_text(content_text),
                    lint_candidates=lint_candidates_in_text(content_text),
                    content_text=content_text,
                    embedding_text=embedding_text,
                    display_text=embedding_text,
                    text_hash=sha256_text(embedding_text),
                )
            )
    return chunks
157
+
158
+
159
def object_id_for(chunk_id: str) -> str:
    """Return the UUIDv5 string derived from ``chunk_id`` in ``CHUNK_ID_NAMESPACE``."""
    derived = uuid.uuid5(CHUNK_ID_NAMESPACE, chunk_id)
    return str(derived)