kc-cli 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kc/__init__.py +5 -0
- kc/__main__.py +11 -0
- kc/artifacts/__init__.py +1 -0
- kc/artifacts/diff.py +76 -0
- kc/artifacts/frontmatter.py +26 -0
- kc/artifacts/markdown.py +116 -0
- kc/atomic_write.py +33 -0
- kc/cli.py +284 -0
- kc/commands/__init__.py +1 -0
- kc/commands/artifact.py +1190 -0
- kc/commands/citation.py +231 -0
- kc/commands/common.py +346 -0
- kc/commands/conformance.py +293 -0
- kc/commands/context.py +190 -0
- kc/commands/doctor.py +81 -0
- kc/commands/eval.py +133 -0
- kc/commands/export.py +97 -0
- kc/commands/guide.py +571 -0
- kc/commands/index.py +54 -0
- kc/commands/init.py +207 -0
- kc/commands/lint.py +238 -0
- kc/commands/source.py +464 -0
- kc/commands/status.py +52 -0
- kc/commands/task.py +260 -0
- kc/config.py +127 -0
- kc/embedding_models/potion-base-8M/README.md +97 -0
- kc/embedding_models/potion-base-8M/config.json +13 -0
- kc/embedding_models/potion-base-8M/model.safetensors +0 -0
- kc/embedding_models/potion-base-8M/modules.json +14 -0
- kc/embedding_models/potion-base-8M/tokenizer.json +1 -0
- kc/errors.py +141 -0
- kc/fingerprints.py +35 -0
- kc/ids.py +23 -0
- kc/locks.py +65 -0
- kc/models/__init__.py +17 -0
- kc/models/artifact.py +34 -0
- kc/models/citation.py +60 -0
- kc/models/context.py +23 -0
- kc/models/eval.py +21 -0
- kc/models/plan.py +37 -0
- kc/models/source.py +37 -0
- kc/models/source_range.py +29 -0
- kc/models/source_revision.py +19 -0
- kc/models/task.py +35 -0
- kc/output.py +838 -0
- kc/paths.py +126 -0
- kc/provenance/__init__.py +1 -0
- kc/provenance/citations.py +296 -0
- kc/search/__init__.py +1 -0
- kc/search/extract.py +268 -0
- kc/search/fts.py +284 -0
- kc/search/semantic.py +346 -0
- kc/store/__init__.py +1 -0
- kc/store/jsonl.py +55 -0
- kc/store/sqlite.py +444 -0
- kc/store/transaction.py +67 -0
- kc/templates/agents/skills/kc/SKILL.md +282 -0
- kc/templates/agents/skills/kc/agents/openai.yaml +5 -0
- kc/templates/agents/skills/kc/scripts/resolve_query_citations.py +134 -0
- kc/workspace.py +98 -0
- kc_cli-0.4.0.dist-info/METADATA +522 -0
- kc_cli-0.4.0.dist-info/RECORD +65 -0
- kc_cli-0.4.0.dist-info/WHEEL +4 -0
- kc_cli-0.4.0.dist-info/entry_points.txt +2 -0
- kc_cli-0.4.0.dist-info/licenses/LICENSE +21 -0
kc/paths.py
ADDED
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
"""Repository path resolution and traversal checks."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
from kc.errors import KcError
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@dataclass(frozen=True)
class KcPaths:
    """Immutable bundle of the filesystem locations derived for a workspace.

    Only three directories are stored; every other location is computed on
    access by joining a fixed name onto one of them. No property touches the
    filesystem.
    """

    root: Path  # base of ``config_path``
    data_dir: Path  # base of the ``*.jsonl`` registries and ``wiki_dir``
    state_dir: Path  # base of the SQLite file and the working directories

    @property
    def config_path(self) -> Path:
        """``kc.toml`` directly under ``root``."""
        return self.root.joinpath("kc.toml")

    @property
    def sources_jsonl(self) -> Path:
        """``sources.jsonl`` under ``data_dir``."""
        return self.data_dir.joinpath("sources.jsonl")

    @property
    def ranges_jsonl(self) -> Path:
        """``source_ranges.jsonl`` under ``data_dir``."""
        return self.data_dir.joinpath("source_ranges.jsonl")

    @property
    def source_revisions_jsonl(self) -> Path:
        """``source_revisions.jsonl`` under ``data_dir``."""
        return self.data_dir.joinpath("source_revisions.jsonl")

    @property
    def artifacts_jsonl(self) -> Path:
        """``artifacts.jsonl`` under ``data_dir``."""
        return self.data_dir.joinpath("artifacts.jsonl")

    @property
    def citation_edges_jsonl(self) -> Path:
        """``citation_edges.jsonl`` under ``data_dir``."""
        return self.data_dir.joinpath("citation_edges.jsonl")

    @property
    def sqlite_path(self) -> Path:
        """``state.sqlite`` under ``state_dir``."""
        return self.state_dir.joinpath("state.sqlite")

    @property
    def locks_dir(self) -> Path:
        """``locks`` directory under ``state_dir``."""
        return self.state_dir.joinpath("locks")

    @property
    def plans_dir(self) -> Path:
        """``plans`` directory under ``state_dir``."""
        return self.state_dir.joinpath("plans")

    @property
    def snapshots_dir(self) -> Path:
        """``snapshots`` directory under ``state_dir``."""
        return self.state_dir.joinpath("snapshots")

    @property
    def tasks_dir(self) -> Path:
        """``tasks`` directory under ``state_dir``."""
        return self.state_dir.joinpath("tasks")

    @property
    def context_dir(self) -> Path:
        """``context`` directory under ``state_dir``."""
        return self.state_dir.joinpath("context")

    @property
    def operations_dir(self) -> Path:
        """``operations`` directory under ``state_dir``."""
        return self.state_dir.joinpath("operations")

    @property
    def wiki_dir(self) -> Path:
        """``wiki`` directory under ``data_dir``."""
        return self.data_dir.joinpath("wiki")

    @property
    def log_path(self) -> Path:
        """``log.md`` inside ``wiki_dir``."""
        return self.wiki_dir.joinpath("log.md")
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def current_paths() -> KcPaths:
    """Return the ``paths`` attribute of the currently resolved workspace."""
    workspace = current_workspace()
    return workspace.paths
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def current_workspace():
    """Resolve and return the active workspace via ``kc.workspace``.

    The import is performed at call time rather than at module import time.
    """
    from kc.workspace import resolve_workspace as _resolve_workspace

    return _resolve_workspace()
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def ensure_data_dir_exists() -> KcPaths:
    """Return the current paths, failing when the data directory is absent.

    Raises:
        KcError: with code ``KC_CONFIG_NOT_FOUND`` when ``data_dir`` does not
            exist; the repo-relative directory is reported in the message
            and in ``details["data_dir"]``.
    """
    paths = current_paths()
    if paths.data_dir.exists():
        return paths
    missing = repo_relative(paths.data_dir)
    raise KcError(
        code="KC_CONFIG_NOT_FOUND",
        message=f"Knowledge data directory not found: {missing}",
        details={"data_dir": missing},
    )
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def ensure_under_root(path: Path, root: Path | None = None) -> Path:
    """Return ``path`` fully resolved, rejecting anything outside ``root``.

    Args:
        path: Candidate path; it is resolved before the containment check.
        root: Directory the path must live under. Defaults to the root of
            the current workspace paths.

    Returns:
        The resolved form of ``path``.

    Raises:
        KcError: with code ``KC_PATH_OUTSIDE_REPO`` when the resolved path is
            not located under the resolved root.
    """
    root = root or current_paths().root
    resolved = path.resolve()
    # ``resolved`` is absolute and symlink-free, so the root has to be put in
    # the same form; otherwise a relative or symlinked root would reject
    # paths that really are inside it.
    resolved_root = root.resolve()
    try:
        resolved.relative_to(resolved_root)
    except ValueError as exc:
        raise KcError(
            code="KC_PATH_OUTSIDE_REPO",
            message=f"Path is outside repository root: {path}",
            details={"path": str(path), "repo_root": str(root)},
        ) from exc
    return resolved
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def resolve_repo_path(path: Path | str, root: Path | None = None) -> Path:
    """Turn a user-supplied path into a resolved path inside ``root``.

    ``~`` is expanded first; a path that is still relative afterwards is
    joined onto ``root``. The resolved result is then passed through
    ``ensure_under_root``, whose return value is returned here.
    """
    root = root or current_paths().root
    expanded = Path(path).expanduser()
    anchored = expanded if expanded.is_absolute() else root / expanded
    return ensure_under_root(anchored.resolve(), root=root)
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def repo_relative(path: Path, root: Path | None = None) -> str:
    """Render ``path`` as a POSIX string relative to ``root`` when possible.

    Args:
        path: Path to render.
        root: Directory to relativize against. Defaults to the root of the
            current workspace paths.

    Returns:
        The root-relative POSIX form of the resolved path, or ``path`` itself
        in POSIX form when it does not live under the root.
    """
    root = root or current_paths().root
    try:
        # Resolve both sides: the resolved path can only be relative to a
        # root that is itself absolute and symlink-free.
        return path.resolve().relative_to(root.resolve()).as_posix()
    except ValueError:
        return path.as_posix()
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Provenance and citation validation."""
|
|
@@ -0,0 +1,296 @@
|
|
|
1
|
+
"""Citation token parsing and validation."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
from datetime import UTC, datetime
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Any
|
|
9
|
+
from urllib.parse import unquote
|
|
10
|
+
|
|
11
|
+
from kc.fingerprints import raw_fingerprint
|
|
12
|
+
from kc.ids import new_id
|
|
13
|
+
from kc.models.citation import ArtifactLocator, CitationEdgeRecord, ParsedCitation
|
|
14
|
+
from kc.models.source import SourceRecord
|
|
15
|
+
from kc.models.source_range import SourceRangeRecord
|
|
16
|
+
from kc.paths import resolve_repo_path
|
|
17
|
+
from kc.store.jsonl import read_jsonl
|
|
18
|
+
|
|
19
|
+
def _locator_alternation(prefix: str = "") -> str:
    """Return the ``L..-L..`` / ``JP:..`` / ``CSV:R..-R..`` alternation.

    ``prefix`` is prepended to every named group so the same alternation can
    appear twice in one pattern without group-name clashes.
    """
    return (
        rf"(?:(?:L(?P<{prefix}line_start>\d+)-L(?P<{prefix}line_end>\d+))|"
        rf"(?:JP:(?P<{prefix}pointer>[^\]]+))|"
        rf"(?:CSV:R(?P<{prefix}row_start>\d+)-R(?P<{prefix}row_end>\d+)))"
    )


# Full citation token: ``[kc:<source>:<range>[:<locator>]]`` (groups prefixed
# ``v2_``) or ``[kc:<source>:<locator>]`` (unprefixed groups).
CITATION_RE = re.compile(
    r"\[kc:(?P<source>src_[A-Za-z0-9_]+):"
    r"(?:(?P<range>rng_[A-Za-z0-9_]+)(?::"
    + _locator_alternation("v2_")
    + r")?|"
    + _locator_alternation()
    + r")\]"
)
# Anything shaped like ``[kc:...]``, whether or not it is well formed.
KC_TOKEN_RE = re.compile(r"\[kc:[^\]]+\]")
# The three bare marker tokens.
MARKER_RE = re.compile(r"\[kc:(inference|todo|uncited)\]")
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def parse_markdown_citations(text: str) -> list[ParsedCitation]:
    """Extract every well-formed citation token from ``text``.

    The text is scanned line by line (1-based line numbers). A token that
    carries a range id is tagged ``"v2"``, otherwise ``"v1"``. A token with a
    range id but no locator suffix is recorded as a ``line_range`` citation
    without start/end lines.
    """
    found: list[ParsedCitation] = []
    for line_no, line in enumerate(text.splitlines(), start=1):
        for match in CITATION_RE.finditer(line):
            groups = match.groupdict()
            range_id = groups["range"]
            # Fields shared by every flavour of citation.
            shared = {
                "token": match.group(0),
                "source_id": groups["source"],
                "range_id": range_id,
                "token_version": "v2" if range_id else "v1",
                "line": line_no,
            }

            has_no_suffix = all(
                groups[name] is None
                for name in ("v2_line_start", "v2_pointer", "v2_row_start")
            )
            if range_id and has_no_suffix:
                found.append(ParsedCitation(kind="line_range", **shared))
                continue

            first_line = groups["v2_line_start"] or groups["line_start"]
            if first_line is not None:
                last_line = groups["v2_line_end"] or groups["line_end"]
                found.append(
                    ParsedCitation(
                        kind="line_range",
                        start_line=int(first_line),
                        end_line=int(last_line),
                        **shared,
                    )
                )
                continue

            pointer = groups["v2_pointer"] or groups["pointer"]
            if pointer is not None:
                # Pointers are percent-decoded before being stored.
                found.append(
                    ParsedCitation(kind="json_pointer", pointer=unquote(pointer), **shared)
                )
                continue

            # Only the CSV row form is left once the other locators are ruled out.
            found.append(
                ParsedCitation(
                    kind="csv_row_range",
                    start_row=int(groups["v2_row_start"] or groups["row_start"]),
                    end_row=int(groups["v2_row_end"] or groups["row_end"]),
                    **shared,
                )
            )
    return found
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def invalid_markdown_citation_tokens(text: str) -> list[dict[str, Any]]:
    """Report every ``[kc:...]`` token that is neither a marker nor a citation.

    Each entry carries the ``KC_CITATION_INVALID_TOKEN`` code, a message, the
    1-based line number, and the offending token.
    """
    rejected: list[dict[str, Any]] = []
    for line_no, line in enumerate(text.splitlines(), start=1):
        # KC_TOKEN_RE has no groups, so findall yields the full token text.
        for token in KC_TOKEN_RE.findall(line):
            is_known = MARKER_RE.fullmatch(token) or CITATION_RE.fullmatch(token)
            if not is_known:
                rejected.append(
                    {
                        "code": "KC_CITATION_INVALID_TOKEN",
                        "message": f"Invalid kc citation token: {token}",
                        "line": line_no,
                        "token": token,
                    }
                )
    return rejected
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def has_citation_or_marker(text: str) -> bool:
    """Tell whether ``text`` contains a citation token or a marker token."""
    if CITATION_RE.search(text):
        return True
    return MARKER_RE.search(text) is not None
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def _locator_matches(loc: Any, citation: ParsedCitation) -> bool:
    """Compare the locator fields relevant to the citation's kind."""
    kind = citation.kind
    if kind == "line_range":
        return loc.start_line == citation.start_line and loc.end_line == citation.end_line
    if kind == "json_pointer":
        return loc.pointer == citation.pointer
    if kind == "csv_row_range":
        return loc.start_row == citation.start_row and loc.end_row == citation.end_row
    return False


def find_range_for_token(
    citation: ParsedCitation,
    ranges: list[SourceRangeRecord],
) -> SourceRangeRecord | None:
    """Find the range record a parsed citation refers to.

    With a range id, the first record matching both source id and range id
    decides the outcome: it is returned when the citation has no locator or
    when its locator agrees with the record, otherwise ``None``. Without a
    range id, the first record of the same source and kind whose locator
    agrees is returned.
    """
    if citation.range_id:
        target = next(
            (
                record
                for record in ranges
                if record.source_id == citation.source_id
                and record.range_id == citation.range_id
            ),
            None,
        )
        if target is None:
            return None
        bare = (
            citation.start_line is None
            and citation.pointer is None
            and citation.start_row is None
        )
        if bare:
            return target
        loc = target.locator
        if loc.kind == citation.kind and _locator_matches(loc, citation):
            return target
        return None

    for record in ranges:
        if record.source_id != citation.source_id:
            continue
        loc = record.locator
        if loc.kind == citation.kind and _locator_matches(loc, citation):
            return record
    return None
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def _previous_valid_edges(
|
|
156
|
+
citation_edges_path: Path | None,
|
|
157
|
+
artifact_path: str,
|
|
158
|
+
) -> dict[str, CitationEdgeRecord]:
|
|
159
|
+
if citation_edges_path is None:
|
|
160
|
+
return {}
|
|
161
|
+
return {
|
|
162
|
+
edge.citation_token: edge
|
|
163
|
+
for edge in read_jsonl(citation_edges_path, CitationEdgeRecord)
|
|
164
|
+
if edge.artifact_path == artifact_path and edge.status == "valid"
|
|
165
|
+
}
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
def _current_source_fingerprint(source: SourceRecord) -> str | None:
|
|
169
|
+
original = source.metadata.get("original_path")
|
|
170
|
+
if not isinstance(original, str):
|
|
171
|
+
return None
|
|
172
|
+
path = resolve_repo_path(original)
|
|
173
|
+
if not path.exists():
|
|
174
|
+
return None
|
|
175
|
+
return raw_fingerprint(path)
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
def _citation_problem(
    code: str,
    message: str,
    citation: ParsedCitation,
    **extra: Any,
) -> dict[str, Any]:
    """Build one problem entry: code, message, line, token, then any extras."""
    return {
        "code": code,
        "message": message,
        "line": citation.line,
        "token": citation.token,
        **extra,
    }


def validate_citations(
    artifact_path: str,
    artifact_text: str,
    *,
    sources_path: Path,
    ranges_path: Path,
    citation_edges_path: Path | None = None,
    artifact_id: str | None = None,
) -> tuple[list[CitationEdgeRecord], list[dict[str, Any]]]:
    """Validate every ``[kc:...]`` token in an artifact.

    Args:
        artifact_path: Path recorded on every produced edge and used to look
            up previously stored edges.
        artifact_text: Markdown text to scan.
        sources_path: JSONL file of ``SourceRecord`` entries.
        ranges_path: JSONL file of ``SourceRangeRecord`` entries.
        citation_edges_path: Optional JSONL file of earlier edges; enables
            the check that a legacy (``v1``) token still resolves to the same
            range as before.
        artifact_id: Optional identifier copied onto every edge.

    Returns:
        ``(edges, problems)``. One edge is produced per token: malformed
        tokens first (status ``invalid_token``), then parsed citations with
        status ``valid``, ``missing_source``, ``missing_range`` or
        ``stale_source``. ``problems`` lists a dict per detected issue.
    """
    sources = read_jsonl(sources_path, SourceRecord)
    ranges = read_jsonl(ranges_path, SourceRangeRecord)
    source_by_id = {record.source_id: record for record in sources}
    parsed = parse_markdown_citations(artifact_text)
    previous_edges = _previous_valid_edges(citation_edges_path, artifact_path)
    edges: list[CitationEdgeRecord] = []
    problems: list[dict[str, Any]] = invalid_markdown_citation_tokens(artifact_text)
    timestamp = datetime.now(UTC).isoformat()

    # At this point ``problems`` holds only malformed tokens; each one still
    # gets an edge so it is tracked alongside the parsed citations.
    for problem in problems:
        token = str(problem.get("token", ""))
        source_match = re.search(r"\[kc:(src_[A-Za-z0-9_]+)", token)
        problem_line = int(problem.get("line", 1))
        edges.append(
            CitationEdgeRecord(
                edge_id=new_id("cite"),
                artifact_id=artifact_id,
                artifact_path=artifact_path,
                artifact_locator=ArtifactLocator(start_line=problem_line, end_line=problem_line),
                citation_token=token,
                source_id=source_match.group(1) if source_match else "",
                range_id=None,
                source_fingerprint_at_validation=None,
                validated_at=timestamp,
                status="invalid_token",
            )
        )

    # Fingerprinting reads the source file, so do it once per source rather
    # than once per citation.
    fingerprint_by_source: dict[str, str | None] = {}

    for citation in parsed:
        source = source_by_id.get(citation.source_id)
        range_record = find_range_for_token(citation, ranges)
        status = "valid"
        if source is None:
            status = "missing_source"
            problems.append(
                _citation_problem(
                    "KC_CITATION_SOURCE_MISSING",
                    f"Citation source does not exist: {citation.source_id}",
                    citation,
                )
            )
        elif range_record is None:
            status = "missing_range"
            problems.append(
                _citation_problem(
                    "KC_CITATION_RANGE_MISSING",
                    f"Citation range does not exist: {citation.token}",
                    citation,
                )
            )
        elif range_record.source_fingerprint != source.fingerprint:
            status = "stale_source"
            problems.append(
                _citation_problem(
                    "KC_CITATION_STALE_SOURCE",
                    f"Citation points to stale source fingerprint: {citation.token}",
                    citation,
                )
            )
        else:
            previous_edge = previous_edges.get(citation.token)
            if (
                citation.token_version == "v1"
                and previous_edge is not None
                and previous_edge.range_id
                and previous_edge.range_id != range_record.range_id
            ):
                # Same locator text, different range id than last time.
                status = "stale_source"
                problems.append(
                    _citation_problem(
                        "KC_CITATION_STALE_SOURCE",
                        f"Legacy locator citation now resolves to different source text: {citation.token}",
                        citation,
                        previous_range_id=previous_edge.range_id,
                        current_range_id=range_record.range_id,
                    )
                )
            if source.source_id not in fingerprint_by_source:
                fingerprint_by_source[source.source_id] = _current_source_fingerprint(source)
            current_fingerprint = fingerprint_by_source[source.source_id]
            if current_fingerprint is not None and current_fingerprint != source.fingerprint:
                status = "stale_source"
                problems.append(
                    _citation_problem(
                        "KC_CITATION_STALE_SOURCE",
                        f"Citation source file fingerprint has changed: {citation.token}",
                        citation,
                        source_id=source.source_id,
                        registered_fingerprint=source.fingerprint,
                        current_fingerprint=current_fingerprint,
                    )
                )
        edges.append(
            CitationEdgeRecord(
                edge_id=new_id("cite"),
                artifact_id=artifact_id,
                artifact_path=artifact_path,
                artifact_locator=ArtifactLocator(start_line=citation.line, end_line=citation.line),
                citation_token=citation.token,
                source_id=citation.source_id,
                range_id=range_record.range_id if range_record else None,
                source_fingerprint_at_validation=source.fingerprint if source else None,
                validated_at=timestamp,
                status=status,  # type: ignore[arg-type]
            )
        )
    return edges, problems
|
kc/search/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Search and extraction primitives."""
|
kc/search/extract.py
ADDED
|
@@ -0,0 +1,268 @@
|
|
|
1
|
+
"""Deterministic local source extraction."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import csv
|
|
6
|
+
import hashlib
|
|
7
|
+
import json
|
|
8
|
+
import mimetypes
|
|
9
|
+
import tomllib
|
|
10
|
+
from datetime import UTC, datetime
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
from typing import Any
|
|
13
|
+
|
|
14
|
+
import yaml
|
|
15
|
+
|
|
16
|
+
from kc.fingerprints import normalize_text, text_hash
|
|
17
|
+
from kc.models.source_range import Locator, SourceRangeRecord
|
|
18
|
+
|
|
19
|
+
# Lower-case suffixes that are treated as text regardless of the guessed
# media type.
TEXT_EXTENSIONS = {
    # prose
    ".md", ".markdown", ".txt", ".rst",
    # source code
    ".py", ".js", ".ts",
    # structured data
    ".json", ".yaml", ".yml", ".toml", ".csv",
}
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def guess_media_type(path: Path) -> str:
    """Guess a media type from the file name alone.

    The ``mimetypes`` guess wins when there is one; otherwise a suffix listed
    in ``TEXT_EXTENSIONS`` yields ``text/plain`` and anything else
    ``application/octet-stream``.
    """
    media_type = mimetypes.guess_type(path.name)[0]
    if media_type:
        return media_type
    known_text = path.suffix.lower() in TEXT_EXTENSIONS
    return "text/plain" if known_text else "application/octet-stream"
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def is_text_like(path: Path, media_type: str) -> bool:
    """Tell whether a file should be treated as text.

    True for any ``text/*`` media type, or for a suffix listed in
    ``TEXT_EXTENSIONS``.
    """
    if media_type.startswith("text/"):
        return True
    return path.suffix.lower() in TEXT_EXTENSIONS
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def extract_ranges(
    path: Path,
    source_id: str,
    source_fingerprint: str,
    *,
    revision_id: str | None = None,
) -> list[SourceRangeRecord]:
    """Extract citeable ranges from a file, choosing a strategy by suffix.

    JSON, YAML and TOML files are parsed and walked as structured data; CSV
    files are read row by row. If any of those strategies raises, the file
    is extracted as plain text instead. Other files are extracted as text
    when they look text-like and yield no ranges otherwise.
    """
    media_type = guess_media_type(path)
    suffix = path.suffix.lower()

    def as_text() -> list[SourceRangeRecord]:
        return _extract_text_ranges(path, source_id, source_fingerprint, revision_id=revision_id)

    structured_loaders = {
        ".json": json.loads,
        ".yaml": yaml.safe_load,
        ".yml": yaml.safe_load,
        ".toml": tomllib.loads,
    }
    loader = structured_loaders.get(suffix)
    if loader is not None:
        try:
            return _extract_structured_ranges(
                loader(path.read_text(encoding="utf-8-sig")),
                source_id,
                source_fingerprint,
                revision_id=revision_id,
            )
        except Exception:
            # Best effort: a file that cannot be handled as structured data
            # is still extracted line by line.
            return as_text()

    if suffix == ".csv":
        try:
            return _extract_csv_ranges(path, source_id, source_fingerprint, revision_id=revision_id)
        except Exception:
            return as_text()

    if is_text_like(path, media_type):
        return as_text()
    return []
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def _now() -> str:
|
|
97
|
+
return datetime.now(UTC).isoformat()
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def _estimate_tokens(text: str) -> int:
|
|
101
|
+
return max(1, len(text.split()))
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def _range_id(source_id: str, revision_id: str | None, locator: Locator, excerpt_hash: str) -> str:
|
|
105
|
+
digest = hashlib.sha256(
|
|
106
|
+
f"{source_id}:{revision_id or ''}:{locator.model_dump_json(exclude_none=True)}:{excerpt_hash}".encode()
|
|
107
|
+
).hexdigest()
|
|
108
|
+
return f"rng_{digest[:26].upper()}"
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def _extract_text_ranges(
    path: Path, source_id: str, source_fingerprint: str, *, revision_id: str | None = None
) -> list[SourceRangeRecord]:
    """Split a text file into line-range records.

    A chunk ends at a blank line, after 24 collected lines, or at the end of
    the file. In ``.md``/``.markdown`` files a line starting with ``#`` also
    ends the running chunk, starts a new one, and updates the heading path
    stored in each record's metadata.
    """
    text = normalize_text(path.read_text(encoding="utf-8-sig"))
    lines = text.split("\n")
    is_markdown = path.suffix.lower() in {".md", ".markdown"}

    chunks: list[tuple[int, int, list[str]]] = []
    open_start: int | None = None
    open_lines: list[str] = []
    heading_stack: list[str] = []
    open_heading: list[str] = []

    def close_chunk(last_line: int) -> None:
        """Record the open chunk (if it has content) and reset the state."""
        nonlocal open_start, open_lines
        if open_start is None:
            return
        if "\n".join(open_lines).strip():
            chunks.append((open_start, last_line, list(open_heading)))
        open_start = None
        open_lines = []

    for line_no, raw_line in enumerate(lines, start=1):
        trimmed = raw_line.rstrip()

        if is_markdown and trimmed.startswith("#"):
            close_chunk(line_no - 1)
            without_hashes = trimmed.lstrip("#")
            depth = len(trimmed) - len(without_hashes)
            title = without_hashes.strip()
            if title:
                # Keep the ancestors above this depth, then add the title.
                heading_stack = [*heading_stack[: max(depth - 1, 0)], title]
            open_heading = list(heading_stack)
            open_start = line_no
            open_lines = [raw_line]
            continue

        if not trimmed:
            # Blank (or whitespace-only) line: paragraph boundary.
            close_chunk(line_no - 1)
            continue

        if open_start is None:
            open_start = line_no
            open_heading = list(heading_stack)
            open_lines = []
        open_lines.append(raw_line)

        if len(open_lines) >= 24:
            close_chunk(line_no)

    close_chunk(len(lines))

    if not chunks and text.strip():
        chunks = [(1, len(lines), [])]

    extracted_at = _now()
    records: list[SourceRangeRecord] = []
    for start_line, end_line, headings in chunks:
        excerpt = "\n".join(lines[start_line - 1 : end_line]).strip()
        if not excerpt:
            continue
        locator = Locator(kind="line_range", start_line=start_line, end_line=end_line)
        excerpt_hash = text_hash(excerpt)
        record = SourceRangeRecord(
            range_id=_range_id(source_id, revision_id, locator, excerpt_hash),
            source_id=source_id,
            revision_id=revision_id,
            source_fingerprint=source_fingerprint,
            locator=locator,
            text_hash=excerpt_hash,
            excerpt=excerpt,
            tokens_estimate=_estimate_tokens(excerpt),
            extracted_at=extracted_at,
            metadata={"heading_path": headings},
        )
        records.append(record)
    return records
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
def _extract_structured_ranges(
    data: Any,
    source_id: str,
    source_fingerprint: str,
    *,
    revision_id: str | None = None,
) -> list[SourceRangeRecord]:
    """Emit one ``json_pointer`` record per non-empty leaf of parsed data.

    Dicts and lists are walked recursively; ``~`` and ``/`` in dict keys are
    escaped as ``~0`` and ``~1`` when building the pointer. Strings are used
    verbatim as the excerpt, date/time values in ISO 8601 form, and any
    other leaf as its JSON encoding. Leaves whose excerpt is empty after
    stripping are skipped.

    Args:
        data: Parsed document (the result of a JSON/YAML/TOML loader).
        source_id: Identifier copied onto every record.
        source_fingerprint: Fingerprint copied onto every record.
        revision_id: Optional revision copied onto every record and mixed
            into the range id.

    Raises:
        TypeError: for a leaf that is neither a string, a date/time value,
            nor JSON-serialisable.
    """
    # Local import: the module-level name ``datetime`` is the class, and the
    # ``date``/``time`` types are needed here as well.
    import datetime as _dt

    records: list[SourceRangeRecord] = []
    extracted_at = _now()

    def leaf_excerpt(value: Any) -> str:
        if isinstance(value, str):
            return value
        # YAML and TOML loaders return date/time objects for timestamp
        # scalars; json.dumps would raise TypeError on them and the whole
        # document would fall back to line-based extraction.
        if isinstance(value, (_dt.date, _dt.time)):
            return value.isoformat()
        return json.dumps(value, ensure_ascii=False, sort_keys=True)

    def visit(value: Any, pointer: str) -> None:
        if isinstance(value, dict):
            for key, child in value.items():
                escaped = str(key).replace("~", "~0").replace("/", "~1")
                visit(child, f"{pointer}/{escaped}")
            return
        if isinstance(value, list):
            for idx, child in enumerate(value):
                visit(child, f"{pointer}/{idx}")
            return

        excerpt = leaf_excerpt(value).strip()
        if not excerpt:
            return
        # A scalar document has an empty pointer; it is stored as "/".
        locator = Locator(kind="json_pointer", pointer=pointer or "/")
        excerpt_hash = text_hash(excerpt)
        records.append(
            SourceRangeRecord(
                range_id=_range_id(source_id, revision_id, locator, excerpt_hash),
                source_id=source_id,
                revision_id=revision_id,
                source_fingerprint=source_fingerprint,
                locator=locator,
                text_hash=excerpt_hash,
                excerpt=excerpt,
                tokens_estimate=_estimate_tokens(excerpt),
                extracted_at=extracted_at,
                metadata={"heading_path": [pointer or "/"]},
            )
        )

    visit(data, "")
    return records
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
def _extract_csv_ranges(
    path: Path,
    source_id: str,
    source_fingerprint: str,
    *,
    revision_id: str | None = None,
) -> list[SourceRangeRecord]:
    """Emit one ``csv_row_range`` record per data row of a CSV file.

    The first row is the header. Each later row becomes a JSON object keyed
    by header name (``column_<n>`` when the header cell is missing or
    empty). Row numbers are 1-based and count the header, so the first data
    row is row 2.

    Args:
        path: CSV file to read (UTF-8, optional BOM).
        source_id: Identifier copied onto every record.
        source_fingerprint: Fingerprint copied onto every record.
        revision_id: Optional revision copied onto every record and mixed
            into the range id.

    Returns:
        The extracted records; empty when the file has no rows.
    """
    import io

    text = path.read_text(encoding="utf-8-sig")
    # Hand the reader a newline-preserving stream: pre-splitting with
    # ``str.splitlines`` drops newlines inside quoted fields and also breaks
    # rows on characters such as form feed or U+2028.
    rows = list(csv.reader(io.StringIO(text, newline="")))
    if not rows:
        return []
    header = [cell.strip() for cell in rows[0]]

    def column_name(index: int) -> str:
        if index < len(header) and header[index]:
            return header[index]
        return f"column_{index + 1}"

    records: list[SourceRangeRecord] = []
    extracted_at = _now()
    for row_index, row in enumerate(rows[1:], start=2):
        values = {column_name(index): value for index, value in enumerate(row)}
        excerpt = json.dumps(values, ensure_ascii=False, sort_keys=True)
        locator = Locator(kind="csv_row_range", start_row=row_index, end_row=row_index)
        excerpt_hash = text_hash(excerpt)
        records.append(
            SourceRangeRecord(
                range_id=_range_id(source_id, revision_id, locator, excerpt_hash),
                source_id=source_id,
                revision_id=revision_id,
                source_fingerprint=source_fingerprint,
                locator=locator,
                text_hash=excerpt_hash,
                excerpt=excerpt,
                tokens_estimate=_estimate_tokens(excerpt),
                extracted_at=extracted_at,
                metadata={"heading_path": ["csv", f"row {row_index}"]},
            )
        )
    return records
|