arc_context-1.0.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- arc/__init__.py +55 -0
- arc/builder.py +584 -0
- arc/cache.py +100 -0
- arc/cas.py +168 -0
- arc/cli.py +391 -0
- arc/compressor.py +71 -0
- arc/config.py +108 -0
- arc/diff.py +157 -0
- arc/embeddings.py +328 -0
- arc/extractor.py +484 -0
- arc/loader.py +528 -0
- arc/manifest.py +97 -0
- arc/models.py +368 -0
- arc/provenance.py +36 -0
- arc/py.typed +0 -0
- arc/reasoning.py +42 -0
- arc/refinement.py +458 -0
- arc/retrieval_pipeline.py +284 -0
- arc/scope.py +314 -0
- arc_context-1.0.0.dist-info/METADATA +212 -0
- arc_context-1.0.0.dist-info/RECORD +25 -0
- arc_context-1.0.0.dist-info/WHEEL +5 -0
- arc_context-1.0.0.dist-info/entry_points.txt +2 -0
- arc_context-1.0.0.dist-info/licenses/LICENSE +190 -0
- arc_context-1.0.0.dist-info/top_level.txt +1 -0
arc/__init__.py
ADDED
@@ -0,0 +1,55 @@
"""ARC (Agent Reasoning Context) — portable, verifiable context packaging for AI agents."""

import logging

__version__ = "1.0.0"

# Configure library logging — NullHandler by default so applications
# can attach their own handlers without seeing unexpected output.
logging.getLogger(__name__).addHandler(logging.NullHandler())

# Public API
from .builder import BuildResult, build_archive
from .cas import ContentAddressedStore, VerificationResult, sha256_digest
from .diff import diff_archives
from .loader import LoadedArchive, load, restore_sources, verify
from .models import (
    Claim,
    Decision,
    EvidencePointer,
    Layer,
    Manifest,
    PolicyRule,
    Resource,
    TextUnit,
    ToolDeclaration,
    WorkflowStep,
)

__all__ = [
    # Builder
    "build_archive",
    "BuildResult",
    # CAS
    "ContentAddressedStore",
    "VerificationResult",
    "sha256_digest",
    # Diff
    "diff_archives",
    # Loader
    "load",
    "verify",
    "restore_sources",
    "LoadedArchive",
    # Models
    "Claim",
    "Decision",
    "EvidencePointer",
    "Layer",
    "Manifest",
    "PolicyRule",
    "Resource",
    "TextUnit",
    "ToolDeclaration",
    "WorkflowStep",
]
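The re-exports above define the package's round-trip surface: build an archive, verify its digests, load it back. A minimal sketch of that flow, assuming the wheel is installed; the directory paths are placeholders, and the argument shapes of load() and verify() are assumptions here, since their definitions sit in arc/loader.py rather than in this listing:

# Hypothetical round trip over the public API; load()/verify() taking an
# archive directory path is an assumption, not confirmed by this diff.
from arc import build_archive, load, verify

result = build_archive("docs/", "out/archive")  # BuildResult, per arc/builder.py below
if result.valid:
    verification = verify("out/archive")        # VerificationResult (assumed signature)
    archive = load("out/archive")               # LoadedArchive (assumed signature)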
arc/builder.py
ADDED
@@ -0,0 +1,584 @@
"""Builder pipeline — 8-stage: ingest → normalize → chunk → extract → deduplicate → index → assemble → validate."""

from __future__ import annotations

import json
import logging
import os
import re
from dataclasses import dataclass, field
from pathlib import Path
from typing import Callable, Optional

from .cas import ContentAddressedStore, sha256_digest
from .compressor import DeduplicationResult, deduplicate_claims
from .embeddings import VectorStore, get_embedder
from .extractor import extract_claims, extract_decisions, extract_policies, extract_tools, extract_workflow
from .manifest import build_manifest, validate_manifest, write_manifest_to_cas
from .models import (
    Claim,
    Decision,
    Layer,
    Manifest,
    PolicyRule,
    Resource,
    TextUnit,
    ToolDeclaration,
    WorkflowStep,
    _generate_id,
)
from .provenance import BuildProvenance

logger = logging.getLogger(__name__)


@dataclass
class BuildResult:
    """Result of building an archive."""

    archive_path: str
    manifest: Manifest
    resources: list[Resource]
    text_units: list[TextUnit]
    claims: list[Claim]
    decisions: list[Decision]
    deduplication: Optional[DeduplicationResult] = None
    tools: list[ToolDeclaration] = field(default_factory=list)
    policies: list[PolicyRule] = field(default_factory=list)
    workflow_steps: list[WorkflowStep] = field(default_factory=list)
    errors: list[str] = field(default_factory=list)
    valid: bool = True


def build_archive(
    source_dir: str | Path,
    output_dir: str | Path,
    archive_id: Optional[str] = None,
    archive_version: str = "1.0.0",
    parent_archive: Optional[str | Path] = None,
    force_tfidf: bool = False,
    on_progress: Optional[Callable[[str, str], None]] = None,
) -> BuildResult:
    """Build an ARC archive from a source directory.

    8-stage pipeline:
    1. Ingest: scan source directory, create Resource objects
    2. Normalize: detect file types, extract structure
    3. Chunk: produce TextUnit objects with provenance
    4. Extract: claims and decisions
    5. Deduplicate: remove duplicate claims, exclude contested
    6. Index: generate embeddings
    7. Assemble: write blobs to CAS, build manifest
    8. Validate: verify all references

    Args:
        on_progress: optional callback(stage, detail) for progress reporting.
    """
    source_dir = Path(source_dir)
    output_dir = Path(output_dir)

    def _progress(stage: str, detail: str = "") -> None:
        if on_progress:
            on_progress(stage, detail)

    if not archive_id:
        archive_id = f"arc://{source_dir.name}"

    # Initialize CAS
    cas = ContentAddressedStore(output_dir)
    cas.initialize()

    # Load parent CAS for incremental builds
    parent_cas = None
    if parent_archive:
        parent_cas = ContentAddressedStore(Path(parent_archive))

    # Initialize provenance
    provenance = BuildProvenance(parameters={
        "source_dir": str(source_dir),
    })

    # === Stage 1: Ingest ===
    _progress("1/8", "Scanning files...")
    resources = _ingest(source_dir, provenance)
    _progress("1/8", f"Found {len(resources)} files")

    # === Stage 2 & 3: Normalize + Chunk ===
    _progress("2/8", "Chunking...")
    text_units = _chunk(resources, source_dir)
    _progress("2/8", f"Produced {len(text_units)} chunks")

    # === Stage 4: Extract ===
    _progress("3/8", "Extracting claims and decisions...")
    claims = extract_claims(text_units)
    decisions = extract_decisions(text_units)
    tools = extract_tools(text_units, resources, source_dir)
    policies = extract_policies(text_units, resources, source_dir)
    workflow_steps = extract_workflow(text_units, resources, source_dir)
    _progress("3/8", f"Extracted {len(claims)} claims, {len(decisions)} decisions")

    # === Stage 5: Deduplicate ===
    _progress("4/8", "Deduplicating claims...")
    dedup = deduplicate_claims(claims)
    deduped_claims = dedup.claims
    if dedup.duplicates_removed > 0:
        _progress("4/8", f"Removed {dedup.duplicates_removed} duplicates")

    # === Stage 6: Index (embeddings) ===
    _progress("5/8", "Building embeddings...")
    embedder = get_embedder(dimensions=256, force_tfidf=force_tfidf)
    all_texts = [tu.content for tu in text_units] + [c.text for c in deduped_claims]
    if all_texts:
        embedder.fit(all_texts)
        _progress("5/8", f"Indexed {len(all_texts)} items")

    vector_store = VectorStore(index_info=embedder.get_index_info())
    if hasattr(embedder, "vocab") and hasattr(embedder, "idf"):
        vector_store.embedder_state = {
            "vocab": embedder.vocab,
            "idf": {k: round(v, 6) for k, v in embedder.idf.items()},
            "dimensions": embedder.dimensions,
        }

    for claim in deduped_claims:
        vec = embedder.embed(claim.text)
        vector_store.add(claim.id, vec, claim.text, {"kind": claim.kind})

    for decision in decisions:
        vec = embedder.embed(f"{decision.title} {decision.context}")
        vector_store.add(decision.id, vec, decision.title, {"kind": "decision"})

    for tool in tools:
        vec = embedder.embed(f"{tool.name} {tool.description}")
        vector_store.add(tool.id, vec, f"tool:{tool.name}: {tool.description}", {"kind": "tool"})

    for step in workflow_steps:
        vec = embedder.embed(f"{step.name} {step.description}")
        vector_store.add(step.id, vec, f"workflow:{step.name}: {step.description}", {"kind": "workflow"})

    # === Stage 7: Assemble ===
    _progress("6/8", "Assembling archive...")
    layers = []

    # Resources layer (file metadata with locators)
    res_data = json.dumps([r.to_dict() for r in resources], indent=2, sort_keys=True).encode()
    res_digest = cas.store_blob(res_data)
    layers.append(Layer(
        name="resources",
        type="semantic.resources",
        digest=res_digest,
        required=True,
    ))

    # Source units layer
    su_data = json.dumps([tu.to_dict() for tu in text_units], indent=2, sort_keys=True).encode()
    su_digest = cas.store_blob(su_data)
    layers.append(Layer(
        name="source-units",
        type="semantic.source_units",
        digest=su_digest,
        required=True,
        depends_on=["resources"],
    ))

    # Claims layer (deduplicated)
    claims_data = json.dumps(
        [c.to_dict() for c in deduped_claims], indent=2, sort_keys=True
    ).encode()
    claims_digest = cas.store_blob(claims_data)
    layers.append(Layer(
        name="claims",
        type="semantic.claims",
        digest=claims_digest,
        required=True,
        depends_on=["source-units"],
    ))

    # Decisions layer
    if decisions:
        decisions_data = json.dumps(
            [d.to_dict() for d in decisions], indent=2, sort_keys=True
        ).encode()
        decisions_digest = cas.store_blob(decisions_data)
        layers.append(Layer(
            name="decisions",
            type="semantic.decisions",
            digest=decisions_digest,
            required=False,
            depends_on=["source-units"],
        ))

    # Embeddings layer
    embeddings_data = json.dumps(vector_store.to_dict(), sort_keys=True).encode()
    embeddings_digest = cas.store_blob(embeddings_data)
    layers.append(Layer(
        name="embeddings",
        type="index.embeddings",
        digest=embeddings_digest,
        required=False,
        depends_on=["claims"],
    ))

    # Operational layers
    if tools:
        tools_data = json.dumps([t.to_dict() for t in tools], indent=2, sort_keys=True).encode()
        layers.append(Layer(
            name="tools",
            type="operational.tools",
            digest=cas.store_blob(tools_data),
            required=False,
            depends_on=["source-units"],
        ))

    if policies:
        policy_deps = ["tools"] if tools else ["source-units"]
        policies_data = json.dumps([p.to_dict() for p in policies], indent=2, sort_keys=True).encode()
        layers.append(Layer(
            name="policy",
            type="operational.policy",
            digest=cas.store_blob(policies_data),
            required=False,
            depends_on=policy_deps,
        ))

    if workflow_steps:
        workflow_data = json.dumps([w.to_dict() for w in workflow_steps], indent=2, sort_keys=True).encode()
        layers.append(Layer(
            name="workflow",
            type="operational.workflow",
            digest=cas.store_blob(workflow_data),
            required=False,
            depends_on=["source-units"],
        ))

    # Provenance
    cas.store_json(provenance.to_dict(), "provenance.json")

    # Parent reference for incremental builds
    parent_ref = None
    if parent_cas:
        parent_manifest = parent_cas.read_manifest()
        if parent_manifest:
            parent_ref = parent_manifest.get("root_digest")

    # Build manifest
    manifest = build_manifest(
        archive_id=archive_id,
        archive_version=archive_version,
        layers=layers,
        provenance=provenance.to_dict(),
        parent_archive=parent_ref,
    )

    # === Stage 8: Validate ===
    _progress("7/8", "Validating references...")
    errors = validate_manifest(manifest)

    # Check all blob references exist
    for layer in layers:
        if not cas.has_blob(layer.digest):
            errors.append(f"Missing blob for layer '{layer.name}': {layer.digest}")

    # Check all evidence pointers reference valid source units
    su_ids = {tu.id for tu in text_units}
    for claim in deduped_claims:
        for ev in claim.evidence:
            if ev.source_unit_id not in su_ids:
                errors.append(f"Claim '{claim.id}' references unknown source unit '{ev.source_unit_id}'")

    if errors:
        return BuildResult(
            archive_path=str(output_dir),
            manifest=manifest,
            resources=resources,
            text_units=text_units,
            claims=deduped_claims,
            decisions=decisions,
            deduplication=dedup,
            tools=tools,
            policies=policies,
            workflow_steps=workflow_steps,
            errors=errors,
            valid=False,
        )

    # Write manifest
    _progress("8/8", "Writing manifest...")
    write_manifest_to_cas(manifest, cas)

    _progress("done", "Archive built")
    return BuildResult(
        archive_path=str(output_dir),
        manifest=manifest,
        resources=resources,
        text_units=text_units,
        claims=deduped_claims,
        decisions=decisions,
        deduplication=dedup,
        tools=tools,
        policies=policies,
        workflow_steps=workflow_steps,
        valid=True,
    )


_SOURCE_EXTENSIONS = {
    ".md", ".txt", ".py", ".js", ".ts", ".tsx", ".jsx",
    ".json", ".yaml", ".yml", ".toml",
    ".kt", ".kts", ".java", ".swift", ".go", ".rs", ".rb",
    ".c", ".h", ".cpp", ".hpp", ".cs", ".scala",
    ".sh", ".bash", ".zsh",
    ".sql", ".graphql", ".proto",
    ".xml", ".gradle",
}

# Directories always skipped during ingest (build artifacts, deps, caches)
_SKIP_DIRS = {
    "build", "dist", "out", "target",
    "node_modules", "__pycache__", ".gradle",
    "Pods", "DerivedData",
    "intermediates", "generated", "tmp", "outputs",
    ".git", ".hg", ".svn",
    "vendor", "venv", ".venv", "env",
}


def _ingest(source_dir: Path, provenance: BuildProvenance) -> list[Resource]:
    """Stage 1: Scan source directory and create Resource objects."""
    resources = []

    for root, dirs, files in os.walk(source_dir):
        root_path = Path(root)

        # Skip hidden and build artifact directories (prune in-place)
        dirs[:] = [
            d for d in dirs
            if not d.startswith(".") and d not in _SKIP_DIRS
        ]

        if any(part.startswith(".") for part in root_path.relative_to(source_dir).parts):
            continue

        for fname in sorted(files):
            fpath = root_path / fname
            if fpath.suffix not in _SOURCE_EXTENSIONS:
                continue

            content = fpath.read_bytes()
            digest = sha256_digest(content)

            kind = "document" if fpath.suffix in {".md", ".txt"} else "file"
            locator = str(fpath.relative_to(source_dir))
            resource = Resource(
                id=_generate_id(f"resource:{locator}"),
                kind=kind,
                locator=locator,
                content_digest=digest,
                metadata={"size": len(content), "extension": fpath.suffix},
            )
            resources.append(resource)
            provenance.add_source(resource.locator, digest, kind)

    return resources


def _chunk(resources: list[Resource], source_dir: Path) -> list[TextUnit]:
    """Stage 2 & 3: Normalize files and produce TextUnit objects."""
    text_units = []

    for resource in resources:
        fpath = source_dir / resource.locator
        if not fpath.exists():
            logger.warning("skipping missing file: %s", resource.locator)
            continue

        content = fpath.read_text(encoding="utf-8", errors="replace")
        ext = resource.metadata.get("extension", "")

        if ext in (".md", ".txt"):
            units = _chunk_markdown(content, resource.id)
        elif ext in (".py",):
            units = _chunk_python(content, resource.id)
        elif ext in (".kt", ".kts", ".java", ".swift", ".go", ".rs", ".scala", ".cs"):
            units = _chunk_curly_brace(content, resource.id)
        else:
            units = _chunk_generic(content, resource.id)

        text_units.extend(units)

    return text_units


def _chunk_markdown(content: str, resource_id: str) -> list[TextUnit]:
    """Chunk markdown by sections (#, ##, or ### headers)."""
    units = []
    lines = content.split("\n")

    # Detect frontmatter
    if lines and lines[0].strip() == "---":
        end_idx = -1
        for i in range(1, len(lines)):
            if lines[i].strip() == "---":
                end_idx = i
                break
        if end_idx > 0:
            fm_content = "\n".join(lines[:end_idx + 1])
            units.append(TextUnit(
                id=_generate_id(),
                resource_id=resource_id,
                kind="frontmatter",
                content=fm_content,
                span=(1, end_idx + 1),
            ))
            lines = lines[end_idx + 1:]

    # Split by headers
    current_section: list[str] = []
    section_start = 1
    for i, line in enumerate(lines, start=1):
        if re.match(r'^#{1,3}\s+', line) and current_section:
            section_content = "\n".join(current_section).strip()
            if section_content and len(section_content) > 10:
                units.append(TextUnit(
                    id=_generate_id(),
                    resource_id=resource_id,
                    kind="section",
                    content=section_content,
                    span=(section_start, section_start + len(current_section) - 1),
                ))
            current_section = []
            section_start = i
        current_section.append(line)

    # Last section
    if current_section:
        section_content = "\n".join(current_section).strip()
        if section_content and len(section_content) > 10:
            units.append(TextUnit(
                id=_generate_id(),
                resource_id=resource_id,
                kind="section",
                content=section_content,
                span=(section_start, section_start + len(current_section) - 1),
            ))

    return units


def _chunk_python(content: str, resource_id: str) -> list[TextUnit]:
    """Chunk Python by top-level functions and classes."""
    units = []
    lines = content.split("\n")

    # Module docstring
    if content.strip().startswith('"""') or content.strip().startswith("'''"):
        quote = '"""' if content.strip().startswith('"""') else "'''"
        end = content.find(quote, content.find(quote) + 3)
        if end > 0:
            doc = content[:end + 3]
            units.append(TextUnit(
                id=_generate_id(),
                resource_id=resource_id,
                kind="docstring",
                content=doc,
                span=(1, doc.count("\n") + 1),
            ))

    # Top-level defs
    current_block: list[str] = []
    block_start = 1
    for i, line in enumerate(lines, start=1):
        if re.match(r'^(def |class |@)', line) and current_block:
            block_content = "\n".join(current_block).strip()
            if block_content and len(block_content) > 20:
                kind = "function" if any(l.startswith("def ") for l in current_block) else "section"
                units.append(TextUnit(
                    id=_generate_id(),
                    resource_id=resource_id,
                    kind=kind,
                    content=block_content,
                    span=(block_start, i - 1),
                ))
            current_block = [line]
            block_start = i
        else:
            current_block.append(line)

    if current_block:
        block_content = "\n".join(current_block).strip()
        if block_content and len(block_content) > 20:
            units.append(TextUnit(
                id=_generate_id(),
                resource_id=resource_id,
                kind="section",
                content=block_content,
                span=(block_start, block_start + len(current_block) - 1),
            ))

    return units


def _chunk_curly_brace(content: str, resource_id: str) -> list[TextUnit]:
    """Chunk Kotlin/Java/Swift/Go/Rust/etc. by top-level declarations."""
    units = []
    lines = content.split("\n")

    # Top-level declarations: fun, class, interface, object, enum, struct, impl, func
    _TOP_LEVEL = re.compile(
        r'^(?:(?:public|private|protected|internal|open|abstract|override|suspend|data|sealed|inline|actual|expect)\s+)*'
        r'(?:fun |class |interface |object |enum |struct |impl |func |fn |extension )'
    )

    current_block: list[str] = []
    block_start = 1
    for i, line in enumerate(lines, start=1):
        if _TOP_LEVEL.match(line) and current_block:
            block_content = "\n".join(current_block).strip()
            if block_content and len(block_content) > 20:
                kind = "function" if re.search(r'\b(fun |func |fn )\b', block_content[:200]) else "class"
                units.append(TextUnit(
                    id=_generate_id(),
                    resource_id=resource_id,
                    kind=kind,
                    content=block_content,
                    span=(block_start, i - 1),
                ))
            current_block = [line]
            block_start = i
        else:
            current_block.append(line)

    if current_block:
        block_content = "\n".join(current_block).strip()
        if block_content and len(block_content) > 20:
            units.append(TextUnit(
                id=_generate_id(),
                resource_id=resource_id,
                kind="section",
                content=block_content,
                span=(block_start, block_start + len(current_block) - 1),
            ))

    return units


def _chunk_generic(content: str, resource_id: str) -> list[TextUnit]:
    """Chunk generic files by paragraph breaks."""
    paragraphs = re.split(r'\n\s*\n', content)
    units = []
    line_offset = 1
    for para in paragraphs:
        para = para.strip()
        if para and len(para) > 10:
            n_lines = para.count("\n") + 1
            units.append(TextUnit(
                id=_generate_id(),
                resource_id=resource_id,
                kind="paragraph",
                content=para,
                span=(line_offset, line_offset + n_lines - 1),
            ))
            line_offset += n_lines + 1
        else:
            line_offset += para.count("\n") + 2

    return units