kc-cli 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65) hide show
  1. kc/__init__.py +5 -0
  2. kc/__main__.py +11 -0
  3. kc/artifacts/__init__.py +1 -0
  4. kc/artifacts/diff.py +76 -0
  5. kc/artifacts/frontmatter.py +26 -0
  6. kc/artifacts/markdown.py +116 -0
  7. kc/atomic_write.py +33 -0
  8. kc/cli.py +284 -0
  9. kc/commands/__init__.py +1 -0
  10. kc/commands/artifact.py +1190 -0
  11. kc/commands/citation.py +231 -0
  12. kc/commands/common.py +346 -0
  13. kc/commands/conformance.py +293 -0
  14. kc/commands/context.py +190 -0
  15. kc/commands/doctor.py +81 -0
  16. kc/commands/eval.py +133 -0
  17. kc/commands/export.py +97 -0
  18. kc/commands/guide.py +571 -0
  19. kc/commands/index.py +54 -0
  20. kc/commands/init.py +207 -0
  21. kc/commands/lint.py +238 -0
  22. kc/commands/source.py +464 -0
  23. kc/commands/status.py +52 -0
  24. kc/commands/task.py +260 -0
  25. kc/config.py +127 -0
  26. kc/embedding_models/potion-base-8M/README.md +97 -0
  27. kc/embedding_models/potion-base-8M/config.json +13 -0
  28. kc/embedding_models/potion-base-8M/model.safetensors +0 -0
  29. kc/embedding_models/potion-base-8M/modules.json +14 -0
  30. kc/embedding_models/potion-base-8M/tokenizer.json +1 -0
  31. kc/errors.py +141 -0
  32. kc/fingerprints.py +35 -0
  33. kc/ids.py +23 -0
  34. kc/locks.py +65 -0
  35. kc/models/__init__.py +17 -0
  36. kc/models/artifact.py +34 -0
  37. kc/models/citation.py +60 -0
  38. kc/models/context.py +23 -0
  39. kc/models/eval.py +21 -0
  40. kc/models/plan.py +37 -0
  41. kc/models/source.py +37 -0
  42. kc/models/source_range.py +29 -0
  43. kc/models/source_revision.py +19 -0
  44. kc/models/task.py +35 -0
  45. kc/output.py +838 -0
  46. kc/paths.py +126 -0
  47. kc/provenance/__init__.py +1 -0
  48. kc/provenance/citations.py +296 -0
  49. kc/search/__init__.py +1 -0
  50. kc/search/extract.py +268 -0
  51. kc/search/fts.py +284 -0
  52. kc/search/semantic.py +346 -0
  53. kc/store/__init__.py +1 -0
  54. kc/store/jsonl.py +55 -0
  55. kc/store/sqlite.py +444 -0
  56. kc/store/transaction.py +67 -0
  57. kc/templates/agents/skills/kc/SKILL.md +282 -0
  58. kc/templates/agents/skills/kc/agents/openai.yaml +5 -0
  59. kc/templates/agents/skills/kc/scripts/resolve_query_citations.py +134 -0
  60. kc/workspace.py +98 -0
  61. kc_cli-0.4.0.dist-info/METADATA +522 -0
  62. kc_cli-0.4.0.dist-info/RECORD +65 -0
  63. kc_cli-0.4.0.dist-info/WHEEL +4 -0
  64. kc_cli-0.4.0.dist-info/entry_points.txt +2 -0
  65. kc_cli-0.4.0.dist-info/licenses/LICENSE +21 -0
kc/paths.py ADDED
@@ -0,0 +1,126 @@
1
+ """Repository path resolution and traversal checks."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+ from pathlib import Path
7
+
8
+ from kc.errors import KcError
9
+
10
+
11
@dataclass(frozen=True)
class KcPaths:
    """Immutable bundle of the three base directories of a kc workspace.

    Only ``root``, ``data_dir`` and ``state_dir`` are stored; every other
    location is derived from one of them by the read-only properties below.
    """

    root: Path
    data_dir: Path
    state_dir: Path

    # ---- derived from ``root`` -------------------------------------------
    @property
    def config_path(self) -> Path:
        """``kc.toml`` directly under ``root``."""
        return self.root.joinpath("kc.toml")

    # ---- derived from ``data_dir`` ---------------------------------------
    @property
    def sources_jsonl(self) -> Path:
        """``sources.jsonl`` inside ``data_dir``."""
        return self.data_dir.joinpath("sources.jsonl")

    @property
    def ranges_jsonl(self) -> Path:
        """``source_ranges.jsonl`` inside ``data_dir``."""
        return self.data_dir.joinpath("source_ranges.jsonl")

    @property
    def source_revisions_jsonl(self) -> Path:
        """``source_revisions.jsonl`` inside ``data_dir``."""
        return self.data_dir.joinpath("source_revisions.jsonl")

    @property
    def artifacts_jsonl(self) -> Path:
        """``artifacts.jsonl`` inside ``data_dir``."""
        return self.data_dir.joinpath("artifacts.jsonl")

    @property
    def citation_edges_jsonl(self) -> Path:
        """``citation_edges.jsonl`` inside ``data_dir``."""
        return self.data_dir.joinpath("citation_edges.jsonl")

    # ---- derived from ``state_dir`` --------------------------------------
    @property
    def sqlite_path(self) -> Path:
        """``state.sqlite`` inside ``state_dir``."""
        return self.state_dir.joinpath("state.sqlite")

    @property
    def locks_dir(self) -> Path:
        """``locks`` directory inside ``state_dir``."""
        return self.state_dir.joinpath("locks")

    @property
    def plans_dir(self) -> Path:
        """``plans`` directory inside ``state_dir``."""
        return self.state_dir.joinpath("plans")

    @property
    def snapshots_dir(self) -> Path:
        """``snapshots`` directory inside ``state_dir``."""
        return self.state_dir.joinpath("snapshots")

    @property
    def tasks_dir(self) -> Path:
        """``tasks`` directory inside ``state_dir``."""
        return self.state_dir.joinpath("tasks")

    @property
    def context_dir(self) -> Path:
        """``context`` directory inside ``state_dir``."""
        return self.state_dir.joinpath("context")

    @property
    def operations_dir(self) -> Path:
        """``operations`` directory inside ``state_dir``."""
        return self.state_dir.joinpath("operations")

    # ---- wiki (lives under ``data_dir``) ---------------------------------
    @property
    def wiki_dir(self) -> Path:
        """``wiki`` directory inside ``data_dir``."""
        return self.data_dir.joinpath("wiki")

    @property
    def log_path(self) -> Path:
        """``log.md`` inside :attr:`wiki_dir`."""
        return self.wiki_dir.joinpath("log.md")
76
+
77
+
78
def current_paths() -> KcPaths:
    """Return the ``paths`` attribute of the currently resolved workspace."""
    workspace = current_workspace()
    return workspace.paths
80
+
81
+
82
def current_workspace():
    """Resolve and return the active workspace via ``kc.workspace``.

    The import happens at call time rather than at module import time.
    NOTE(review): presumably this avoids a circular import between
    ``kc.paths`` and ``kc.workspace`` - confirm.
    """
    from kc.workspace import resolve_workspace as _resolve

    return _resolve()
86
+
87
+
88
def ensure_data_dir_exists() -> KcPaths:
    """Return the current paths, insisting that the data directory exists.

    Raises:
        KcError: with code ``KC_CONFIG_NOT_FOUND`` when ``data_dir`` is
            missing on disk.
    """
    paths = current_paths()
    if paths.data_dir.exists():
        return paths

    # Report the location relative to the repository root.
    shown = repo_relative(paths.data_dir)
    raise KcError(
        code="KC_CONFIG_NOT_FOUND",
        message=f"Knowledge data directory not found: {shown}",
        details={"data_dir": shown},
    )
97
+
98
+
99
def ensure_under_root(path: Path, root: Path | None = None) -> Path:
    """Resolve *path* and verify that it lies inside the repository root.

    Args:
        path: Path to check; it is resolved before the comparison.
        root: Repository root. Defaults to the root of the current
            workspace paths when omitted (or falsy).

    Returns:
        The resolved form of *path*.

    Raises:
        KcError: with code ``KC_PATH_OUTSIDE_REPO`` when the resolved path is
            not located under the resolved root.
    """
    root = root or current_paths().root
    resolved = path.resolve()
    # Resolve the root too: comparing a fully resolved path against a
    # relative or symlinked root would reject paths that are actually inside
    # it, because ``relative_to`` is a purely lexical comparison.
    resolved_root = root.resolve()
    try:
        resolved.relative_to(resolved_root)
    except ValueError as exc:
        raise KcError(
            code="KC_PATH_OUTSIDE_REPO",
            message=f"Path is outside repository root: {path}",
            details={"path": str(path), "repo_root": str(root)},
        ) from exc
    return resolved
111
+
112
+
113
def resolve_repo_path(path: Path | str, root: Path | None = None) -> Path:
    """Turn *path* into a resolved path that is checked to be under the root.

    ``~`` is expanded first; a relative result is interpreted relative to
    *root* (the current workspace root when *root* is omitted). The
    containment check is delegated to :func:`ensure_under_root`, whose
    error propagates unchanged.
    """
    root = root or current_paths().root
    expanded = Path(path).expanduser()
    anchored = expanded if expanded.is_absolute() else root / expanded
    return ensure_under_root(anchored.resolve(), root=root)
119
+
120
+
121
def repo_relative(path: Path, root: Path | None = None) -> str:
    """Render *path* as a POSIX string relative to the repository root.

    When the resolved path is not located under *root* (the current
    workspace root if omitted), the unresolved *path* is returned in POSIX
    form instead of raising.
    """
    base = root or current_paths().root
    try:
        inside = path.resolve().relative_to(base)
    except ValueError:
        # Not under the root: fall back to the path exactly as given.
        return path.as_posix()
    return inside.as_posix()
kc/provenance/__init__.py ADDED
@@ -0,0 +1 @@
1
+ """Provenance and citation validation."""
kc/provenance/citations.py ADDED
@@ -0,0 +1,296 @@
1
+ """Citation token parsing and validation."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import re
6
+ from datetime import UTC, datetime
7
+ from pathlib import Path
8
+ from typing import Any
9
+ from urllib.parse import unquote
10
+
11
+ from kc.fingerprints import raw_fingerprint
12
+ from kc.ids import new_id
13
+ from kc.models.citation import ArtifactLocator, CitationEdgeRecord, ParsedCitation
14
+ from kc.models.source import SourceRecord
15
+ from kc.models.source_range import SourceRangeRecord
16
+ from kc.paths import resolve_repo_path
17
+ from kc.store.jsonl import read_jsonl
18
+
19
# Full citation token: ``[kc:<source>:<rest>]`` where ``<rest>`` is either
#   * a range id, optionally followed by ``:`` and a locator (the ``v2_*``
#     groups capture that locator), or
#   * a bare locator with no range id (the un-prefixed groups).
# A locator is one of ``L<a>-L<b>``, ``JP:<pointer>`` or ``CSV:R<a>-R<b>``.
CITATION_RE = re.compile(
    # source id
    r"\[kc:(?P<source>src_[A-Za-z0-9_]+):"
    # alternative 1: range id with optional locator
    r"(?:(?P<range>rng_[A-Za-z0-9_]+)(?::"
    r"(?:(?:L(?P<v2_line_start>\d+)-L(?P<v2_line_end>\d+))|"
    r"(?:JP:(?P<v2_pointer>[^\]]+))|"
    r"(?:CSV:R(?P<v2_row_start>\d+)-R(?P<v2_row_end>\d+))))?|"
    # alternative 2: locator only
    r"(?:(?:L(?P<line_start>\d+)-L(?P<line_end>\d+))|"
    r"(?:JP:(?P<pointer>[^\]]+))|"
    r"(?:CSV:R(?P<row_start>\d+)-R(?P<row_end>\d+))))\]"
)
# Anything shaped like ``[kc:...]``, whether or not it is well formed.
KC_TOKEN_RE = re.compile(r"\[kc:[^\]]+\]")
# The three non-citation marker tokens.
MARKER_RE = re.compile(r"\[kc:(inference|todo|uncited)\]")
31
+
32
+
33
def parse_markdown_citations(text: str) -> list[ParsedCitation]:
    """Extract every well-formed citation token from *text*.

    The text is scanned line by line (1-based line numbers). Each match of
    ``CITATION_RE`` becomes one ``ParsedCitation``. Tokens carrying a range
    id are tagged ``"v2"``, all others ``"v1"``. A token that has a range id
    but no locator is recorded with kind ``"line_range"`` and no line
    numbers. JSON pointers are percent-decoded.
    """
    found: list[ParsedCitation] = []
    for line_no, line in enumerate(text.splitlines(), start=1):
        for match in CITATION_RE.finditer(line):
            groups = match.groupdict()
            range_id = groups["range"]
            # Fields shared by every variant of the record.
            common = {
                "token": match.group(0),
                "source_id": groups["source"],
                "range_id": range_id,
                "token_version": "v2" if range_id else "v1",
                "line": line_no,
            }
            has_v2_locator = (
                groups["v2_line_start"] is not None
                or groups["v2_pointer"] is not None
                or groups["v2_row_start"] is not None
            )
            line_start = groups["v2_line_start"] or groups["line_start"]
            pointer = groups["v2_pointer"] or groups["pointer"]

            if range_id and not has_v2_locator:
                # Range id only: nothing but the id identifies the range.
                found.append(ParsedCitation(kind="line_range", **common))
            elif line_start is not None:
                line_end = groups["v2_line_end"] or groups["line_end"]
                found.append(
                    ParsedCitation(
                        kind="line_range",
                        start_line=int(line_start),
                        end_line=int(line_end),
                        **common,
                    )
                )
            elif pointer is not None:
                found.append(ParsedCitation(kind="json_pointer", pointer=unquote(pointer), **common))
            else:
                # The only remaining alternative of the pattern is a CSV row range.
                row_start = groups["v2_row_start"] or groups["row_start"]
                row_end = groups["v2_row_end"] or groups["row_end"]
                found.append(
                    ParsedCitation(
                        kind="csv_row_range",
                        start_row=int(row_start),
                        end_row=int(row_end),
                        **common,
                    )
                )
    return found
98
+
99
+
100
def invalid_markdown_citation_tokens(text: str) -> list[dict[str, Any]]:
    """Report ``[kc:...]`` tokens that are neither a marker nor a citation.

    Returns one problem dict per offending token, in document order, with
    the keys ``code``, ``message``, ``line`` (1-based) and ``token``.
    """
    problems: list[dict[str, Any]] = []
    for line_no, line in enumerate(text.splitlines(), start=1):
        # KC_TOKEN_RE has no groups, so findall yields the full token text.
        for token in KC_TOKEN_RE.findall(line):
            recognised = MARKER_RE.fullmatch(token) or CITATION_RE.fullmatch(token)
            if not recognised:
                problems.append(
                    {
                        "code": "KC_CITATION_INVALID_TOKEN",
                        "message": f"Invalid kc citation token: {token}",
                        "line": line_no,
                        "token": token,
                    }
                )
    return problems
116
+
117
+
118
def has_citation_or_marker(text: str) -> bool:
    """Tell whether *text* contains a valid citation or a marker token."""
    if CITATION_RE.search(text):
        return True
    return MARKER_RE.search(text) is not None
120
+
121
+
122
def find_range_for_token(
    citation: ParsedCitation,
    ranges: list[SourceRangeRecord],
) -> SourceRangeRecord | None:
    """Find the extracted range that a parsed citation refers to.

    With a range id, the record is looked up by source id and range id. A
    citation without a locator matches on the ids alone; otherwise the
    record's locator must also agree with the citation's locator. Without
    a range id, the first record of the same source whose locator kind and
    coordinates equal the citation's is returned.

    Returns ``None`` when nothing matches.
    """

    def same_coordinates(loc) -> bool:
        # Compare only the coordinates relevant to the citation's kind.
        if citation.kind == "line_range":
            return loc.start_line == citation.start_line and loc.end_line == citation.end_line
        if citation.kind == "json_pointer":
            return loc.pointer == citation.pointer
        if citation.kind == "csv_row_range":
            return loc.start_row == citation.start_row and loc.end_row == citation.end_row
        return False

    if citation.range_id:
        id_only = citation.start_line is None and citation.pointer is None and citation.start_row is None
        for candidate in ranges:
            if candidate.source_id == citation.source_id and candidate.range_id == citation.range_id:
                if id_only:
                    return candidate
                if candidate.locator.kind != citation.kind:
                    return None
                if same_coordinates(candidate.locator):
                    return candidate
        # NOTE(review): the original indentation was not recoverable from the
        # diff rendering; this assumes a token with a range id never falls
        # through to the locator-only search below - confirm.
        return None

    for candidate in ranges:
        if (
            candidate.source_id == citation.source_id
            and candidate.locator.kind == citation.kind
            and same_coordinates(candidate.locator)
        ):
            return candidate
    return None
153
+
154
+
155
def _previous_valid_edges(
    citation_edges_path: Path | None,
    artifact_path: str,
) -> dict[str, CitationEdgeRecord]:
    """Index the stored ``valid`` citation edges of one artifact by token.

    Returns an empty mapping when no edges file is given. If several stored
    edges share a token, the one read last wins.
    """
    by_token: dict[str, CitationEdgeRecord] = {}
    if citation_edges_path is not None:
        for edge in read_jsonl(citation_edges_path, CitationEdgeRecord):
            if edge.artifact_path == artifact_path and edge.status == "valid":
                by_token[edge.citation_token] = edge
    return by_token
166
+
167
+
168
def _current_source_fingerprint(source: SourceRecord) -> str | None:
    """Fingerprint the file a source was registered from, if it can be found.

    Returns ``None`` when the source metadata has no string
    ``original_path`` or when that path does not exist. Errors from
    resolving the path inside the repository are not caught here.
    """
    recorded = source.metadata.get("original_path")
    if isinstance(recorded, str):
        target = resolve_repo_path(recorded)
        if target.exists():
            return raw_fingerprint(target)
    return None
176
+
177
+
178
def validate_citations(
    artifact_path: str,
    artifact_text: str,
    *,
    sources_path: Path,
    ranges_path: Path,
    citation_edges_path: Path | None = None,
    artifact_id: str | None = None,
) -> tuple[list[CitationEdgeRecord], list[dict[str, Any]]]:
    """Validate every ``[kc:...]`` token of an artifact against the stores.

    Args:
        artifact_path: Path recorded on every produced edge; also used to
            select previously stored edges of the same artifact.
        artifact_text: Markdown text to scan.
        sources_path: JSONL file of source records.
        ranges_path: JSONL file of source range records.
        citation_edges_path: Optional JSONL file of earlier edges, used to
            detect v1 tokens that now resolve to a different range.
        artifact_id: Optional artifact id copied onto every edge.

    Returns:
        ``(edges, problems)``. There is one edge per malformed token (status
        ``invalid_token``) followed by one edge per parsed citation (status
        ``valid``, ``missing_source``, ``missing_range`` or
        ``stale_source``). ``problems`` lists a dict for every issue found;
        a single citation can contribute more than one.
    """
    sources = read_jsonl(sources_path, SourceRecord)
    ranges = read_jsonl(ranges_path, SourceRangeRecord)
    source_by_id = {s.source_id: s for s in sources}
    parsed = parse_markdown_citations(artifact_text)
    previous_edges = _previous_valid_edges(citation_edges_path, artifact_path)
    edges: list[CitationEdgeRecord] = []
    problems: list[dict[str, Any]] = invalid_markdown_citation_tokens(artifact_text)
    timestamp = datetime.now(UTC).isoformat()
    # Fingerprinting reads the source file; remember the result per source so
    # an artifact citing the same source many times reads it only once.
    current_fingerprints: dict[str, str | None] = {}

    # At this point ``problems`` holds only the malformed tokens; each gets
    # an ``invalid_token`` edge. Later problems are appended after this loop.
    for problem in problems:
        token = str(problem.get("token", ""))
        # Best effort: recover a source id from the malformed token.
        source_match = re.search(r"\[kc:(src_[A-Za-z0-9_]+)", token)
        edges.append(
            CitationEdgeRecord(
                edge_id=new_id("cite"),
                artifact_id=artifact_id,
                artifact_path=artifact_path,
                artifact_locator=ArtifactLocator(
                    start_line=int(problem.get("line", 1)),
                    end_line=int(problem.get("line", 1)),
                ),
                citation_token=token,
                source_id=source_match.group(1) if source_match else "",
                range_id=None,
                source_fingerprint_at_validation=None,
                validated_at=timestamp,
                status="invalid_token",
            )
        )

    for citation in parsed:
        source = source_by_id.get(citation.source_id)
        range_record = find_range_for_token(citation, ranges)
        status = "valid"
        if source is None:
            status = "missing_source"
            problems.append(
                {
                    "code": "KC_CITATION_SOURCE_MISSING",
                    "message": f"Citation source does not exist: {citation.source_id}",
                    "line": citation.line,
                    "token": citation.token,
                }
            )
        elif range_record is None:
            status = "missing_range"
            problems.append(
                {
                    "code": "KC_CITATION_RANGE_MISSING",
                    "message": f"Citation range does not exist: {citation.token}",
                    "line": citation.line,
                    "token": citation.token,
                }
            )
        elif range_record.source_fingerprint != source.fingerprint:
            status = "stale_source"
            problems.append(
                {
                    "code": "KC_CITATION_STALE_SOURCE",
                    "message": f"Citation points to stale source fingerprint: {citation.token}",
                    "line": citation.line,
                    "token": citation.token,
                }
            )
        else:
            previous_edge = previous_edges.get(citation.token)
            # A v1 (locator-only) token that used to resolve to another range
            # id now points at different text.
            if (
                citation.token_version == "v1"
                and previous_edge is not None
                and previous_edge.range_id
                and previous_edge.range_id != range_record.range_id
            ):
                status = "stale_source"
                problems.append(
                    {
                        "code": "KC_CITATION_STALE_SOURCE",
                        "message": f"Legacy locator citation now resolves to different source text: {citation.token}",
                        "line": citation.line,
                        "token": citation.token,
                        "previous_range_id": previous_edge.range_id,
                        "current_range_id": range_record.range_id,
                    }
                )
            # NOTE(review): indentation was lost in the diff rendering; this
            # check is assumed to apply to every otherwise-valid citation,
            # not only to legacy v1 tokens - confirm.
            if source.source_id not in current_fingerprints:
                current_fingerprints[source.source_id] = _current_source_fingerprint(source)
            current_fingerprint = current_fingerprints[source.source_id]
            if current_fingerprint is not None and current_fingerprint != source.fingerprint:
                status = "stale_source"
                problems.append(
                    {
                        "code": "KC_CITATION_STALE_SOURCE",
                        "message": f"Citation source file fingerprint has changed: {citation.token}",
                        "line": citation.line,
                        "token": citation.token,
                        "source_id": source.source_id,
                        "registered_fingerprint": source.fingerprint,
                        "current_fingerprint": current_fingerprint,
                    }
                )
        edges.append(
            CitationEdgeRecord(
                edge_id=new_id("cite"),
                artifact_id=artifact_id,
                artifact_path=artifact_path,
                artifact_locator=ArtifactLocator(start_line=citation.line, end_line=citation.line),
                citation_token=citation.token,
                source_id=citation.source_id,
                range_id=range_record.range_id if range_record else None,
                source_fingerprint_at_validation=source.fingerprint if source else None,
                validated_at=timestamp,
                status=status,  # type: ignore[arg-type]
            )
        )
    return edges, problems
kc/search/__init__.py ADDED
@@ -0,0 +1 @@
1
+ """Search and extraction primitives."""
kc/search/extract.py ADDED
@@ -0,0 +1,268 @@
1
+ """Deterministic local source extraction."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import csv
6
+ import hashlib
7
+ import json
8
+ import mimetypes
9
+ import tomllib
10
+ from datetime import UTC, datetime
11
+ from pathlib import Path
12
+ from typing import Any
13
+
14
+ import yaml
15
+
16
+ from kc.fingerprints import normalize_text, text_hash
17
+ from kc.models.source_range import Locator, SourceRangeRecord
18
+
19
# File suffixes (lower-case, including the dot) that are treated as text
# regardless of the media type guessed for them.
TEXT_EXTENSIONS = {
    # prose
    ".md",
    ".markdown",
    ".txt",
    ".rst",
    # source code
    ".py",
    ".js",
    ".ts",
    # structured data
    ".json",
    ".yaml",
    ".yml",
    ".toml",
    ".csv",
}
33
+
34
+
35
def guess_media_type(path: Path) -> str:
    """Guess a media type from the file name of *path*.

    The ``mimetypes`` guess wins when there is one. Otherwise known text
    suffixes map to ``text/plain`` and everything else to
    ``application/octet-stream``.
    """
    detected = mimetypes.guess_type(path.name)[0]
    if not detected:
        known_text = path.suffix.lower() in TEXT_EXTENSIONS
        detected = "text/plain" if known_text else "application/octet-stream"
    return detected
42
+
43
+
44
def is_text_like(path: Path, media_type: str) -> bool:
    """Tell whether a file should be handled as text.

    True for any ``text/*`` media type, and for files whose lower-cased
    suffix is listed in ``TEXT_EXTENSIONS``.
    """
    if media_type.startswith("text/"):
        return True
    return path.suffix.lower() in TEXT_EXTENSIONS
46
+
47
+
48
def extract_ranges(
    path: Path,
    source_id: str,
    source_fingerprint: str,
    *,
    revision_id: str | None = None,
) -> list[SourceRangeRecord]:
    """Extract citeable ranges from a file, choosing a strategy by suffix.

    ``.json``, ``.yaml``/``.yml`` and ``.toml`` files are parsed and walked
    as structured data; ``.csv`` files are split into rows. If any of those
    attempts raises, the file is extracted as plain text instead. Other
    files are extracted as text when they look text-like and yield an
    empty list otherwise.
    """
    media_type = guess_media_type(path)
    suffix = path.suffix.lower()

    def as_text() -> list[SourceRangeRecord]:
        return _extract_text_ranges(path, source_id, source_fingerprint, revision_id=revision_id)

    # Parser for each structured format, keyed by suffix.
    parsers = {
        ".json": json.loads,
        ".yaml": yaml.safe_load,
        ".yml": yaml.safe_load,
        ".toml": tomllib.loads,
    }
    parse = parsers.get(suffix)
    if parse is not None:
        try:
            return _extract_structured_ranges(
                parse(path.read_text(encoding="utf-8-sig")),
                source_id,
                source_fingerprint,
                revision_id=revision_id,
            )
        except Exception:
            # Deliberate best effort: unparseable structured files are still
            # extracted, just as plain text.
            return as_text()

    if suffix == ".csv":
        try:
            return _extract_csv_ranges(path, source_id, source_fingerprint, revision_id=revision_id)
        except Exception:
            return as_text()

    if not is_text_like(path, media_type):
        return []
    return as_text()
94
+
95
+
96
+ def _now() -> str:
97
+ return datetime.now(UTC).isoformat()
98
+
99
+
100
def _estimate_tokens(text: str) -> int:
    """Estimate a token count as the number of whitespace-separated words.

    The result is never smaller than 1.
    """
    words = len(text.split())
    return words if words > 1 else 1
102
+
103
+
104
def _range_id(source_id: str, revision_id: str | None, locator: Locator, excerpt_hash: str) -> str:
    """Derive a deterministic range id from what identifies a range.

    The id is ``rng_`` followed by the first 26 upper-cased hex digits of
    the SHA-256 of ``source_id:revision_id:locator-json:excerpt_hash``. A
    missing revision id contributes an empty string.
    """
    locator_json = locator.model_dump_json(exclude_none=True)
    seed = "{}:{}:{}:{}".format(source_id, revision_id or "", locator_json, excerpt_hash)
    digest = hashlib.sha256(seed.encode()).hexdigest()
    return "rng_" + digest[:26].upper()
109
+
110
+
111
def _extract_text_ranges(
    path: Path, source_id: str, source_fingerprint: str, *, revision_id: str | None = None
) -> list[SourceRangeRecord]:
    """Split a text file into line-range records.

    The normalized text is cut into chunks of consecutive non-blank lines.
    A chunk ends at a blank line, at end of file, or once it holds 24
    lines. In ``.md``/``.markdown`` files a line starting with ``#`` also
    starts a new chunk and updates the heading trail stored in each
    record's ``heading_path`` metadata. Line numbers are 1-based and
    inclusive.
    """
    text = normalize_text(path.read_text(encoding="utf-8-sig"))
    lines = text.split("\n")
    is_markdown = path.suffix.lower() in {".md", ".markdown"}

    spans: list[tuple[int, int, list[str]]] = []
    open_start: int | None = None
    open_lines: list[str] = []
    open_headings: list[str] = []
    heading_trail: list[str] = []

    def close_chunk(last_line: int) -> None:
        """Finish the open chunk at *last_line*, keeping it only if non-blank."""
        nonlocal open_start, open_lines
        if open_start is None:
            return
        if "\n".join(open_lines).strip():
            spans.append((open_start, last_line, list(open_headings)))
        open_start, open_lines = None, []

    for number, raw in enumerate(lines, start=1):
        trimmed = raw.rstrip()
        if is_markdown and trimmed.startswith("#"):
            close_chunk(number - 1)
            after_hashes = trimmed.lstrip("#")
            depth = len(trimmed) - len(after_hashes)
            title = after_hashes.strip()
            if title:
                # Keep the ancestors above this depth, then add the heading.
                heading_trail = heading_trail[: max(depth - 1, 0)] + [title]
            open_headings = list(heading_trail)
            # The heading line itself opens the next chunk.
            open_start, open_lines = number, [raw]
        elif not trimmed.strip():
            close_chunk(number - 1)
        else:
            if open_start is None:
                open_start = number
                open_headings = list(heading_trail)
                open_lines = []
            open_lines.append(raw)
            if len(open_lines) >= 24:
                close_chunk(number)

    close_chunk(len(lines))

    if not spans and text.strip():
        # Fallback: treat the whole file as one chunk.
        spans = [(1, len(lines), [])]

    extracted_at = _now()
    records: list[SourceRangeRecord] = []
    for first, last, headings in spans:
        excerpt = "\n".join(lines[first - 1 : last]).strip()
        if not excerpt:
            continue
        locator = Locator(kind="line_range", start_line=first, end_line=last)
        excerpt_hash = text_hash(excerpt)
        records.append(
            SourceRangeRecord(
                range_id=_range_id(source_id, revision_id, locator, excerpt_hash),
                source_id=source_id,
                revision_id=revision_id,
                source_fingerprint=source_fingerprint,
                locator=locator,
                text_hash=excerpt_hash,
                excerpt=excerpt,
                tokens_estimate=_estimate_tokens(excerpt),
                extracted_at=extracted_at,
                metadata={"heading_path": headings},
            )
        )
    return records
186
+
187
+
188
def _extract_structured_ranges(
    data: Any,
    source_id: str,
    source_fingerprint: str,
    *,
    revision_id: str | None = None,
) -> list[SourceRangeRecord]:
    """Create one record per non-empty leaf value of parsed structured data.

    Dicts and lists are walked depth-first in their own order. Each leaf is
    addressed by a slash-separated pointer in which ``~`` and ``/`` inside
    keys are escaped as ``~0`` and ``~1``. String leaves are used as-is,
    other leaves are JSON-encoded; leaves that are empty after stripping
    are skipped. A top-level scalar is recorded under the pointer ``/``.
    """
    extracted_at = _now()

    def leaves(node: Any, pointer: str):
        """Yield ``(pointer, value)`` for every leaf below *node*."""
        if isinstance(node, dict):
            for key, child in node.items():
                token = str(key).replace("~", "~0").replace("/", "~1")
                yield from leaves(child, f"{pointer}/{token}")
        elif isinstance(node, list):
            for position, child in enumerate(node):
                yield from leaves(child, f"{pointer}/{position}")
        else:
            yield pointer, node

    records: list[SourceRangeRecord] = []
    for pointer, value in leaves(data, ""):
        if isinstance(value, str):
            excerpt = value
        else:
            excerpt = json.dumps(value, ensure_ascii=False, sort_keys=True)
        excerpt = excerpt.strip()
        if not excerpt:
            continue
        label = pointer or "/"
        locator = Locator(kind="json_pointer", pointer=label)
        excerpt_hash = text_hash(excerpt)
        records.append(
            SourceRangeRecord(
                range_id=_range_id(source_id, revision_id, locator, excerpt_hash),
                source_id=source_id,
                revision_id=revision_id,
                source_fingerprint=source_fingerprint,
                locator=locator,
                text_hash=excerpt_hash,
                excerpt=excerpt,
                tokens_estimate=_estimate_tokens(excerpt),
                extracted_at=extracted_at,
                metadata={"heading_path": [label]},
            )
        )
    return records
230
+
231
+
232
def _extract_csv_ranges(
    path: Path,
    source_id: str,
    source_fingerprint: str,
    *,
    revision_id: str | None = None,
) -> list[SourceRangeRecord]:
    """Create one record per data row of a CSV file.

    The first record is the header. Every later record becomes a range
    whose excerpt is a JSON object mapping header names to cell values;
    cells without a (non-empty) header name are keyed ``column_<n>``. Row
    numbers count CSV records with the header as row 1, so the first data
    row is row 2.

    Returns an empty list for a file without any record.
    """
    import io

    text = path.read_text(encoding="utf-8-sig")
    # Let the csv module find the record boundaries itself. Pre-splitting
    # with ``str.splitlines`` drops newlines embedded in quoted fields and
    # also breaks records at every Unicode line boundary.
    rows = list(csv.reader(io.StringIO(text, newline="")))
    if not rows:
        return []
    header = [cell.strip() for cell in rows[0]]
    records: list[SourceRangeRecord] = []
    extracted_at = _now()
    for row_index, row in enumerate(rows[1:], start=2):
        values = {
            header[index] if index < len(header) and header[index] else f"column_{index + 1}": value
            for index, value in enumerate(row)
        }
        excerpt = json.dumps(values, ensure_ascii=False, sort_keys=True)
        locator = Locator(kind="csv_row_range", start_row=row_index, end_row=row_index)
        excerpt_hash = text_hash(excerpt)
        records.append(
            SourceRangeRecord(
                range_id=_range_id(source_id, revision_id, locator, excerpt_hash),
                source_id=source_id,
                revision_id=revision_id,
                source_fingerprint=source_fingerprint,
                locator=locator,
                text_hash=excerpt_hash,
                excerpt=excerpt,
                tokens_estimate=_estimate_tokens(excerpt),
                extracted_at=extracted_at,
                metadata={"heading_path": ["csv", f"row {row_index}"]},
            )
        )
    return records