codecrate-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,388 @@
+ from __future__ import annotations
+
+ import re
+ from dataclasses import dataclass
+ from pathlib import Path
+
+
+ @dataclass(frozen=True)
+ class Part:
+     path: Path
+     content: str
+
+
+ def split_by_max_chars(markdown: str, out_path: Path, max_chars: int) -> list[Part]:
+     """Split markdown into multiple files limited by *approximate* character count.
+
+     For normal markdown, this splits on paragraph boundaries ("\n\n"). For Codecrate
+     context packs, it performs a semantic split that avoids breaking code fences and
+     rewrites intra-pack links so that an LLM can navigate across the generated parts.
+
+     Notes:
+         - The returned parts are intended for LLM consumption. The original,
+           unsplit markdown should still be written to ``out_path``
+           for machine parsing (unpack/validate).
+         - When splitting a Codecrate pack, markdown line ranges like ``(L123-150)`` are
+           removed because they are unstable across parts.
+     """
+     if max_chars <= 0 or len(markdown) <= max_chars:
+         return [Part(path=out_path, content=markdown)]
+
+     if _looks_like_codecrate_pack(markdown):
+         return _split_codecrate_pack(markdown, out_path, max_chars)
+
+     return _split_paragraphs(markdown, out_path, max_chars)
+
+
+ def _split_paragraphs(markdown: str, out_path: Path, max_chars: int) -> list[Part]:
+     parts: list[Part] = []
+     chunk: list[str] = []
+     chunk_len = 0
+     idx = 1
+
+     for block in markdown.split("\n\n"):
+         add = block + "\n\n"
+         if chunk_len + len(add) > max_chars and chunk:
+             part_path = out_path.with_name(
+                 f"{out_path.stem}.part{idx}{out_path.suffix}"
+             )
+             parts.append(Part(path=part_path, content="".join(chunk).rstrip() + "\n"))
+             idx += 1
+             chunk = []
+             chunk_len = 0
+         chunk.append(add)
+         chunk_len += len(add)
+
+     if chunk:
+         part_path = out_path.with_name(f"{out_path.stem}.part{idx}{out_path.suffix}")
+         parts.append(Part(path=part_path, content="".join(chunk).rstrip() + "\n"))
+
+     return parts
+
+
+ _FENCE_RE = re.compile(r"^```")
+ _FUNC_ANCHOR_RE = re.compile(r'^<a id="func-([0-9a-f]{8})"></a>\s*$')
+ _FILE_HEADING_RE = re.compile(r"^### `([^`]+)`")
+
+
+ def _looks_like_codecrate_pack(markdown: str) -> bool:
+     head = markdown.lstrip()[:200]
+     return head.startswith("# Codecrate Context Pack") and "## Files" in markdown
+
+
+ def _find_heading_line_index(lines: list[str], heading: str) -> int | None:
+     in_fence = False
+     for i, line in enumerate(lines):
+         if _FENCE_RE.match(line):
+             if not in_fence:
+                 in_fence = True
+             elif line.strip() == "```":
+                 in_fence = False
+         if not in_fence and line.startswith(heading):
+             return i
+     return None
+
+
+ def _drop_section(text: str, heading: str) -> str:
+     """Drop a top-level '## ...' section from a Codecrate prefix (fence-safe)."""
+     lines = text.splitlines(keepends=True)
+     start = _find_heading_line_index(lines, heading)
+     if start is None:
+         return text
+
+     in_fence = False
+     end = len(lines)
+     for i in range(start + 1, len(lines)):
+         line = lines[i]
+         if _FENCE_RE.match(line):
+             if not in_fence:
+                 in_fence = True
+             elif line.strip() == "```":
+                 in_fence = False
+             continue
+         if not in_fence and line.startswith("## "):
+             end = i
+             break
+     return "".join(lines[:start] + lines[end:])
+
+
+ def _split_codecrate_pack(markdown: str, out_path: Path, max_chars: int) -> list[Part]:
+     """Semantic split for Codecrate packs.
+
+     Strategy:
+     - Keep the "index" prefix (everything before the first content section: Function
+       Library or Files) in part1.
+     - Split the remaining content only at safe boundaries:
+         * function library entry anchors (<a id="func-..."></a>)
+         * file blocks (### `path`) inside the Files section
+         * section headings (## Function Library / ## Files)
+       while never splitting inside a fenced code block.
+     - Rewrite links across parts:
+         * Symbol Index links target the part that contains the relevant anchor.
+         * "jump to index" links in file blocks point back to part1.
+         * func jump links inside file symbol lists point to the part containing the
+           function library entry.
+     - Strip markdown line-range decorations like (L10-20) because they don't survive
+       splitting.
+     """
+     lines = markdown.splitlines(keepends=True)
+
+     idx_files = _find_heading_line_index(lines, "## Files")
+     idx_funcs = _find_heading_line_index(lines, "## Function Library")
+     if idx_files is None and idx_funcs is None:
+         return _split_paragraphs(markdown, out_path, max_chars)
+
+     content_start = min(i for i in [idx_files, idx_funcs] if i is not None)
+
+     # Parts are intended for LLM consumption; drop the Manifest to save tokens
+     # while keeping the unsplit output (written by the CLI) fully machine-readable.
+     prefix = "".join(lines[:content_start])
+     prefix = _drop_section(prefix, "## Manifest")
+     prefix = prefix.rstrip() + "\n"
+     content_lines = lines[content_start:]
+
+     breakpoints: list[int] = [0]
+     in_fence = False
+     in_files = False
+     for i, line in enumerate(content_lines):
+         if _FENCE_RE.match(line):
+             if not in_fence:
+                 in_fence = True
+             elif line.strip() == "```":
+                 in_fence = False
+
+         if in_fence:
+             continue
+
+         if line.startswith("## Files"):
+             in_files = True
+             breakpoints.append(i)
+             continue
+         if line.startswith("## Function Library"):
+             breakpoints.append(i)
+             continue
+
+         if line.startswith('<a id="func-'):
+             breakpoints.append(i)
+             continue
+
+         if in_files and line.startswith("### `"):
+             breakpoints.append(i)
+
+     breakpoints = sorted(set(bp for bp in breakpoints if 0 <= bp < len(content_lines)))
+     if not breakpoints or breakpoints[0] != 0:
+         breakpoints = [0] + breakpoints
+     breakpoints.append(len(content_lines))
+
+     blocks: list[str] = []
+     for a, b in zip(breakpoints, breakpoints[1:], strict=False):
+         if a == b:
+             continue
+         blocks.append("".join(content_lines[a:b]))
+
+     parts: list[Part] = []
+     idx = 1
+     part1_path = out_path.with_name(f"{out_path.stem}.part{idx}{out_path.suffix}")
+     parts.append(Part(path=part1_path, content=prefix))
+     idx += 1
+
+     chunk: list[str] = []
+     chunk_len = 0
+     for block in blocks:
+         if chunk and chunk_len + len(block) > max_chars:
+             part_path = out_path.with_name(
+                 f"{out_path.stem}.part{idx}{out_path.suffix}"
+             )
+             parts.append(Part(path=part_path, content="".join(chunk).rstrip() + "\n"))
+             idx += 1
+             chunk = []
+             chunk_len = 0
+         chunk.append(block)
+         chunk_len += len(block)
+
+     if chunk:
+         part_path = out_path.with_name(f"{out_path.stem}.part{idx}{out_path.suffix}")
+         parts.append(Part(path=part_path, content="".join(chunk).rstrip() + "\n"))
+
+     file_to_part: dict[str, str] = {}
+     func_to_part: dict[str, str] = {}
+     for part in parts[1:]:
+         _scan_part_for_anchors(part.content, part.path.name, file_to_part, func_to_part)
+
+     index_name = parts[0].path.name
+     parts[0] = Part(
+         path=parts[0].path,
+         content=_rewrite_part1(parts[0].content, file_to_part, func_to_part),
+     )
+
+     rewritten_parts: list[Part] = [parts[0]]
+     for part in parts[1:]:
+         text = part.content
+         text = _strip_markdown_line_ranges(text)
+         text = _rewrite_jump_to_index(text, index_name)
+         text = _rewrite_func_links(text, func_to_part)
+         rewritten_parts.append(Part(path=part.path, content=text))
+
+     return rewritten_parts
+
+
+ def _scan_part_for_anchors(
+     text: str,
+     part_filename: str,
+     file_to_part: dict[str, str],
+     func_to_part: dict[str, str],
+ ) -> None:
+     in_fence = False
+     for line in text.splitlines():
+         if _FENCE_RE.match(line):
+             if not in_fence:
+                 in_fence = True
+             elif line.strip() == "```":
+                 in_fence = False
+             continue
+         if in_fence:
+             continue
+
+         m = _FUNC_ANCHOR_RE.match(line.strip())
+         if m:
+             func_to_part[m.group(1).upper()] = part_filename
+             continue
+
+         m2 = _FILE_HEADING_RE.match(line)
+         if m2:
+             file_to_part[m2.group(1)] = part_filename
+
+
+ def _strip_markdown_line_ranges(text: str) -> str:
+     out: list[str] = []
+     in_fence = False
+     for line in text.splitlines(keepends=True):
+         if _FENCE_RE.match(line):
+             if not in_fence:
+                 in_fence = True
+             elif line.strip() == "```":
+                 in_fence = False
+             out.append(line)
+             continue
+         if not in_fence:
+             line = re.sub(r"\s*\(L\d+-\d+\)", "", line)
+         out.append(line)
+     return "".join(out)
+
+
+ def _rewrite_jump_to_index(text: str, index_filename: str) -> str:
+     out: list[str] = []
+     in_fence = False
+     pat = re.compile(r"\[jump to index\]\(\#(file-[^)]+)\)")
+     for line in text.splitlines(keepends=True):
+         if _FENCE_RE.match(line):
+             if not in_fence:
+                 in_fence = True
+             elif line.strip() == "```":
+                 in_fence = False
+             out.append(line)
+             continue
+         if not in_fence:
+             line = pat.sub(rf"[jump to index]({index_filename}#\1)", line)
+         out.append(line)
+     return "".join(out)
+
+
+ def _rewrite_func_links(text: str, func_to_part: dict[str, str]) -> str:
+     out: list[str] = []
+     in_fence = False
+     pat = re.compile(r"\(\#(func-[0-9a-f]{8})\)")
+     for line in text.splitlines(keepends=True):
+         if _FENCE_RE.match(line):
+             if not in_fence:
+                 in_fence = True
+             elif line.strip() == "```":
+                 in_fence = False
+             out.append(line)
+             continue
+         if not in_fence and "(#func-" in line:
+
+             def repl(m: re.Match[str]) -> str:
+                 anchor = m.group(1)
+                 fid = anchor.split("-")[1].upper()
+                 part = func_to_part.get(fid)
+                 if not part:
+                     return m.group(0)
+                 return f"({part}#{anchor})"
+
+             line = pat.sub(repl, line)
+         out.append(line)
+     return "".join(out)
+
+
+ def _rewrite_part1(
+     text: str, file_to_part: dict[str, str], func_to_part: dict[str, str]
+ ) -> str:
+     lines = text.splitlines(keepends=True)
+     out: list[str] = []
+     in_fence = False
+     in_index = False
+     current_file_part: str | None = None
+
+     for line in lines:
+         if _FENCE_RE.match(line):
+             if not in_fence:
+                 in_fence = True
+             elif line.strip() == "```":
+                 in_fence = False
+
+         if not in_fence and line.startswith("## Symbol Index"):
+             in_index = True
+             out.append(line)
+             continue
+
+         if (
+             in_index
+             and not in_fence
+             and line.startswith("## ")
+             and not line.startswith("## Symbol Index")
+         ):
+             in_index = False
+             current_file_part = None
+             out.append(line)
+             continue
+
+         if not in_index or in_fence:
+             out.append(line)
+             continue
+
+         m_file = _FILE_HEADING_RE.match(line)
+         if m_file:
+             rel = m_file.group(1)
+             current_file_part = file_to_part.get(rel)
+             empty = " (empty)" if "(empty)" in line else ""
+             m_jump = re.search(r"\[jump\]\(\#([^)]+)\)", line)
+             anchor = m_jump.group(1) if m_jump else None
+             if current_file_part and anchor:
+                 out.append(
+                     f"### `{rel}`{empty} (in {current_file_part}) — "
+                     f"[jump]({current_file_part}#{anchor})\n"
+                 )
+             else:
+                 out.append(re.sub(r"\s*\(L\d+-\d+\)", "", line))
+             continue
+
+         if line.lstrip().startswith("- "):
+             ln = re.sub(r"\s*\(L\d+-\d+\)", "", line)
+             m = re.search(r"\[jump\]\(\#func-([0-9a-f]{8})\)", ln)
+             if m:
+                 fid = m.group(1).upper()
+                 part = func_to_part.get(fid) or current_file_part
+                 if part:
+                     ln = ln.replace("— [jump]", f"(in {part}) — [jump]")
+                     ln = re.sub(r"\(\#(func-[0-9a-f]{8})\)", rf"({part}#\1)", ln)
+                 out.append(ln)
+                 continue
+             if current_file_part:
+                 ln = ln.rstrip("\n") + f" (in {current_file_part})\n"
+             out.append(ln)
+             continue
+
+         out.append(re.sub(r"\s*\(L\d+-\d+\)", "", line))
+
+     return _strip_markdown_line_ranges("".join(out))
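
Taken together, split_by_max_chars falls back to paragraph splitting for ordinary markdown and only engages the semantic pack splitter when the Codecrate header is present. A minimal usage sketch follows; the diff above does not name the module that holds these functions inside the wheel, so the codecrate.split import path below is an assumption, while the function signature is exactly the one defined above.

from pathlib import Path

from codecrate.split import split_by_max_chars  # assumed module path

markdown = "\n\n".join(f"Paragraph {i}: " + "x" * 80 for i in range(50))
out_path = Path("notes.md")

# Keep the unsplit markdown on disk for machine parsing (unpack/validate),
# as the docstring above recommends; the parts are for LLM consumption.
out_path.write_text(markdown, encoding="utf-8")

# Plain markdown takes the paragraph-splitting path; each returned Part
# carries the path it should be written to (notes.part1.md, notes.part2.md, ...).
for part in split_by_max_chars(markdown, out_path, max_chars=1_000):
    part.path.write_text(part.content, encoding="utf-8")
    print(part.path.name, len(part.content))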
codecrate/udiff.py ADDED
@@ -0,0 +1,187 @@
+ from __future__ import annotations
+
+ import re
+ from dataclasses import dataclass
+ from pathlib import Path
+ from typing import Literal
+
+ _HUNK_RE = re.compile(r"^@@\s+-(\d+),?(\d*)\s+\+(\d+),?(\d*)\s+@@")
+
+
+ def normalize_newlines(s: str) -> str:
+     return s.replace("\r\n", "\n").replace("\r", "\n")
+
+
+ def ensure_parent_dir(path: Path) -> None:
+     path.parent.mkdir(parents=True, exist_ok=True)
+
+
+ @dataclass
+ class FileDiff:
+     path: str
+     hunks: list[list[str]]  # raw hunk lines including @@ header and +/-/space lines
+     op: Literal["add", "modify", "delete"]
+
+
+ def parse_unified_diff(diff_text: str) -> list[FileDiff]:
+     lines = normalize_newlines(diff_text).splitlines()
+     i = 0
+     out: list[FileDiff] = []
+
+     while i < len(lines):
+         if not lines[i].startswith("--- "):
+             i += 1
+             continue
+         if i + 1 >= len(lines):
+             break
+         if not lines[i + 1].startswith("+++ "):
+             i += 1
+             continue
+         from_raw = lines[i][4:].strip()
+         to_raw = lines[i + 1][4:].strip()
+
+         def _side(raw: str, prefix: str) -> str | None:
+             if raw == "/dev/null":
+                 return None
+             return raw[len(prefix) :] if raw.startswith(prefix) else raw
+
+         from_path = _side(from_raw, "a/")
+         to_path = _side(to_raw, "b/")
+
+         if from_path is None and to_path is None:
+             i += 2
+             continue
+
+         if from_path is None:
+             op: Literal["add", "modify", "delete"] = "add"
+             path = to_path or ""
+         elif to_path is None:
+             op = "delete"
+             path = from_path
+         else:
+             op = "modify"
+             path = to_path
+         i += 2
+
+         hunks: list[list[str]] = []
+         while i < len(lines):
+             if lines[i].startswith("--- "):
+                 break
+             if lines[i].startswith("@@"):
+                 h = [lines[i]]
+                 i += 1
+                 while (
+                     i < len(lines)
+                     and not lines[i].startswith("@@")
+                     and not lines[i].startswith("--- ")
+                 ):
+                     if lines[i].startswith((" ", "+", "-")):
+                         h.append(lines[i])
+                     i += 1
+                 hunks.append(h)
+             else:
+                 i += 1
+
+         out.append(FileDiff(path=path, hunks=hunks, op=op))
+
+     return out
+
+
+ def apply_hunks_to_text(old_text: str, hunks: list[list[str]]) -> str:
+     """
+     Minimal unified-diff applier.
+     - Expects hunks in order and matching context lines.
+     - Raises ValueError on mismatch.
+     """
+     old_lines = normalize_newlines(old_text).splitlines()
+     new_lines: list[str] = []
+     old_i = 0
+
+     for hunk in hunks:
+         m = _HUNK_RE.match(hunk[0])
+         if not m:
+             raise ValueError(f"Bad hunk header: {hunk[0]}")
+         old_start = int(m.group(1)) - 1  # 0-based
+
+         # copy unchanged prefix
+         if old_start < old_i and not (old_i == 0 and len(old_lines) == 0):
+             raise ValueError("Overlapping hunks")
+         new_lines.extend(old_lines[old_i:old_start])
+         old_i = old_start
+
+         # apply hunk lines
+         for line in hunk[1:]:
+             tag = line[:1]
+             payload = line[1:]
+             if tag == " ":
+                 if old_i >= len(old_lines) or old_lines[old_i] != payload:
+                     raise ValueError("Context mismatch while applying patch")
+                 new_lines.append(payload)
+                 old_i += 1
+             elif tag == "-":
+                 # Check if current line matches what we'd add (file already modified)
+                 current_line_matches_target = False
+                 if old_i < len(old_lines) and old_lines[old_i] != payload:
+                     # Look ahead to find what we're adding
+                     add_line = None
+                     for next_line in hunk[1:]:
+                         if next_line.startswith("+"):
+                             add_line = next_line[1:]  # Full add line with indentation
+                             break
+                     # If current line matches what we'd add (with/without stripping),
+                     # skip the delete operation
+                     if add_line is not None:
+                         current_matches_add = old_lines[old_i] == add_line
+                         current_matches_add_stripped = (
+                             old_lines[old_i].strip() == add_line.strip()
+                         )
+                         if current_matches_add or current_matches_add_stripped:
+                             current_line_matches_target = True
+
+                 if not current_line_matches_target:
+                     # Use original payload for comparison, but also try stripped version
+                     if old_i < len(old_lines) and old_lines[old_i] != payload:
+                         # Try stripped versions for fuzzy matching
+                         old_stripped = old_lines[old_i].strip()
+                         payload_stripped = payload.strip()
+                         if old_stripped != payload_stripped:
+                             raise ValueError("Delete mismatch while applying patch")
+                 old_i += 1
+             elif tag == "+":
+                 new_lines.append(payload)
+             else:
+                 raise ValueError(f"Unexpected diff tag: {tag}")
+
+     # copy remainder
+     new_lines.extend(old_lines[old_i:])
+     out = "\n".join(new_lines)
+     if old_text.endswith("\n") or (not old_text and out):
+         out += "\n"
+     return out
+
+
+ def apply_file_diffs(diffs: list[FileDiff], root: Path) -> list[Path]:
+     """
+     Applies diffs to files under root. Returns list of modified paths.
+     """
+     root = root.resolve()
+     changed: list[Path] = []
+
+     for fd in diffs:
+         path = root / fd.path
+
+         if fd.op == "delete":
+             if path.exists():
+                 path.unlink()
+             changed.append(path)
+             continue
+
+         old = ""
+         if path.exists():
+             old = path.read_text(encoding="utf-8", errors="replace")
+         new = apply_hunks_to_text(old, fd.hunks)
+         ensure_parent_dir(path)
+         path.write_text(new, encoding="utf-8")
+         changed.append(path)
+
+     return changed
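
A minimal round-trip sketch for the parser and applier above, importing from the codecrate/udiff.py module path shown in this diff. The file name hello.txt and the sandbox directory are illustrative only.

from pathlib import Path

from codecrate.udiff import apply_file_diffs, parse_unified_diff

diff_text = """\
--- a/hello.txt
+++ b/hello.txt
@@ -1,2 +1,2 @@
 hello
-world
+there
"""

# Seed a file, then apply a one-hunk "modify" diff against it.
root = Path("sandbox")
root.mkdir(exist_ok=True)
(root / "hello.txt").write_text("hello\nworld\n", encoding="utf-8")

changed = apply_file_diffs(parse_unified_diff(diff_text), root)
print([p.name for p in changed])         # ['hello.txt']
print((root / "hello.txt").read_text())  # "hello\nthere\n"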