arc_context-1.0.0-py3-none-any.whl

This diff shows the contents of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
arc/__init__.py ADDED
@@ -0,0 +1,55 @@
+"""ARC (Agent Reasoning Context) — portable, verifiable context packaging for AI agents."""
+
+import logging
+
+__version__ = "1.0.0"
+
+# Configure library logging — NullHandler by default so applications
+# can attach their own handlers without seeing unexpected output.
+logging.getLogger(__name__).addHandler(logging.NullHandler())
+
+# Public API
+from .builder import BuildResult, build_archive
+from .cas import ContentAddressedStore, VerificationResult, sha256_digest
+from .diff import diff_archives
+from .loader import LoadedArchive, load, restore_sources, verify
+from .models import (
+    Claim,
+    Decision,
+    EvidencePointer,
+    Layer,
+    Manifest,
+    PolicyRule,
+    Resource,
+    TextUnit,
+    ToolDeclaration,
+    WorkflowStep,
+)
+
+__all__ = [
+    # Builder
+    "build_archive",
+    "BuildResult",
+    # CAS
+    "ContentAddressedStore",
+    "VerificationResult",
+    "sha256_digest",
+    # Diff
+    "diff_archives",
+    # Loader
+    "load",
+    "verify",
+    "restore_sources",
+    "LoadedArchive",
+    # Models
+    "Claim",
+    "Decision",
+    "EvidencePointer",
+    "Layer",
+    "Manifest",
+    "PolicyRule",
+    "Resource",
+    "TextUnit",
+    "ToolDeclaration",
+    "WorkflowStep",
+]
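
A minimal usage sketch of this public API, based only on what is visible in this diff (the paths are made up, and since arc/loader.py is not shown here, the verify/load signatures are assumptions):

    import logging

    import arc

    # The package installs a NullHandler (above); opt in to its logs explicitly.
    logging.getLogger("arc").addHandler(logging.StreamHandler())

    # build_archive's signature is shown in arc/builder.py below.
    result = arc.build_archive("./my-project", "./my-project.arc")
    print(result.valid, len(result.claims), "claims")

    # Assumed signatures: verify and load are re-exported from arc.loader,
    # which this diff does not include.
    report = arc.verify("./my-project.arc")
    archive = arc.load("./my-project.arc")
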
arc/builder.py ADDED
@@ -0,0 +1,586 @@
+"""Builder pipeline — 8-stage: ingest → normalize → chunk → extract → deduplicate → index → assemble → validate."""
+
+from __future__ import annotations
+
+import json
+import logging
+import os
+import re
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Callable, Optional
+
+from .cas import ContentAddressedStore, sha256_digest
+from .compressor import DeduplicationResult, deduplicate_claims
+from .embeddings import VectorStore, get_embedder
+from .extractor import extract_claims, extract_decisions, extract_policies, extract_tools, extract_workflow
+from .manifest import build_manifest, validate_manifest, write_manifest_to_cas
+from .models import (
+    Claim,
+    Decision,
+    Layer,
+    Manifest,
+    PolicyRule,
+    Resource,
+    TextUnit,
+    ToolDeclaration,
+    WorkflowStep,
+    _generate_id,
+)
+from .provenance import BuildProvenance
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class BuildResult:
+    """Result of building an archive."""
+
+    archive_path: str
+    manifest: Manifest
+    resources: list[Resource]
+    text_units: list[TextUnit]
+    claims: list[Claim]
+    decisions: list[Decision]
+    deduplication: Optional[DeduplicationResult] = None
+    tools: list[ToolDeclaration] = field(default_factory=list)
+    policies: list[PolicyRule] = field(default_factory=list)
+    workflow_steps: list[WorkflowStep] = field(default_factory=list)
+    errors: list[str] = field(default_factory=list)
+    valid: bool = True
+
+
+def build_archive(
+    source_dir: str | Path,
+    output_dir: str | Path,
+    archive_id: Optional[str] = None,
+    archive_version: str = "1.0.0",
+    parent_archive: Optional[str | Path] = None,
+    force_tfidf: bool = False,
+    on_progress: Optional[Callable[[str, str], None]] = None,
+) -> BuildResult:
+    """Build an ARC archive from a source directory.
+
+    8-stage pipeline:
+    1. Ingest: scan the source directory, create Resource objects
+    2. Normalize: detect file types, extract structure
+    3. Chunk: produce TextUnit objects with provenance
+    4. Extract: claims and decisions
+    5. Deduplicate: remove duplicate claims, exclude contested
+    6. Index: generate embeddings
+    7. Assemble: write blobs to CAS, build manifest
+    8. Validate: verify all references
+
+    Args:
+        on_progress: optional callback(stage, detail) for progress reporting.
+    """
+    source_dir = Path(source_dir)
+    output_dir = Path(output_dir)
+
+    def _progress(stage: str, detail: str = "") -> None:
+        if on_progress:
+            on_progress(stage, detail)
+
+    if not archive_id:
+        archive_id = f"arc://{source_dir.name}"
+
+    # Initialize CAS
+    cas = ContentAddressedStore(output_dir)
+    cas.initialize()
+
+    # Load parent CAS for incremental builds
+    parent_cas = None
+    if parent_archive:
+        parent_cas = ContentAddressedStore(Path(parent_archive))
+
+    # Initialize provenance
+    provenance = BuildProvenance(parameters={
+        "source_dir": str(source_dir),
+    })
+
+    # === Stage 1: Ingest ===
+    _progress("1/8", "Scanning files...")
+    resources = _ingest(source_dir, provenance)
+    _progress("1/8", f"Found {len(resources)} files")
+
+    # === Stages 2 & 3: Normalize + Chunk ===
+    _progress("2/8", "Chunking...")
+    text_units = _chunk(resources, source_dir)
+    _progress("2/8", f"Produced {len(text_units)} chunks")
+
+    # === Stage 4: Extract ===
+    _progress("4/8", "Extracting claims and decisions...")
+    claims = extract_claims(text_units)
+    decisions = extract_decisions(text_units)
+    tools = extract_tools(text_units, resources, source_dir)
+    policies = extract_policies(text_units, resources, source_dir)
+    workflow_steps = extract_workflow(text_units, resources, source_dir)
+    _progress("4/8", f"Extracted {len(claims)} claims, {len(decisions)} decisions")
+
+    # === Stage 5: Deduplicate ===
+    _progress("5/8", "Deduplicating claims...")
+    dedup = deduplicate_claims(claims)
+    deduped_claims = dedup.claims
+    if dedup.duplicates_removed > 0:
+        _progress("5/8", f"Removed {dedup.duplicates_removed} duplicates")
+
+    # === Stage 6: Index (embeddings) ===
+    _progress("6/8", "Building embeddings...")
+    embedder = get_embedder(dimensions=256, force_tfidf=force_tfidf)
+    all_texts = [tu.content for tu in text_units] + [c.text for c in deduped_claims]
+    if all_texts:
+        embedder.fit(all_texts)
+    _progress("6/8", f"Indexed {len(all_texts)} items")
+
+    vector_store = VectorStore(index_info=embedder.get_index_info())
+    if hasattr(embedder, 'vocab') and hasattr(embedder, 'idf'):
+        vector_store.embedder_state = {
+            "vocab": embedder.vocab,
+            "idf": {k: round(v, 6) for k, v in embedder.idf.items()},
+            "dimensions": embedder.dimensions,
+        }
+    for claim in deduped_claims:
+        vec = embedder.embed(claim.text)
+        vector_store.add(claim.id, vec, claim.text, {"kind": claim.kind})
+
+    for decision in decisions:
+        vec = embedder.embed(f"{decision.title} {decision.context}")
+        vector_store.add(decision.id, vec, decision.title, {"kind": "decision"})
+
+    for tool in tools:
+        vec = embedder.embed(f"{tool.name} {tool.description}")
+        vector_store.add(tool.id, vec, f"tool:{tool.name}: {tool.description}", {"kind": "tool"})
+
+    for step in workflow_steps:
+        vec = embedder.embed(f"{step.name} {step.description}")
+        vector_store.add(step.id, vec, f"workflow:{step.name}: {step.description}", {"kind": "workflow"})
+
+    # === Stage 7: Assemble ===
+    _progress("7/8", "Assembling archive...")
+    layers = []
+
+    # Resources layer (file metadata with locators)
+    res_data = json.dumps([r.to_dict() for r in resources], indent=2, sort_keys=True).encode()
+    res_digest = cas.store_blob(res_data)
+    layers.append(Layer(
+        name="resources",
+        type="semantic.resources",
+        digest=res_digest,
+        required=True,
+    ))
+
+    # Source units layer
+    su_data = json.dumps([tu.to_dict() for tu in text_units], indent=2, sort_keys=True).encode()
+    su_digest = cas.store_blob(su_data)
+    layers.append(Layer(
+        name="source-units",
+        type="semantic.source_units",
+        digest=su_digest,
+        required=True,
+        depends_on=["resources"],
+    ))
+
+    # Claims layer (deduplicated)
+    claims_data = json.dumps(
+        [c.to_dict() for c in deduped_claims], indent=2, sort_keys=True
+    ).encode()
+    claims_digest = cas.store_blob(claims_data)
+    layers.append(Layer(
+        name="claims",
+        type="semantic.claims",
+        digest=claims_digest,
+        required=True,
+        depends_on=["source-units"],
+    ))
+
+    # Decisions layer
+    if decisions:
+        decisions_data = json.dumps(
+            [d.to_dict() for d in decisions], indent=2, sort_keys=True
+        ).encode()
+        decisions_digest = cas.store_blob(decisions_data)
+        layers.append(Layer(
+            name="decisions",
+            type="semantic.decisions",
+            digest=decisions_digest,
+            required=False,
+            depends_on=["source-units"],
+        ))
+
+    # Embeddings layer
+    embeddings_data = json.dumps(vector_store.to_dict(), sort_keys=True).encode()
+    embeddings_digest = cas.store_blob(embeddings_data)
+    layers.append(Layer(
+        name="embeddings",
+        type="index.embeddings",
+        digest=embeddings_digest,
+        required=False,
+        depends_on=["claims"],
+    ))
+
+    # Operational layers
+    if tools:
+        tools_data = json.dumps([t.to_dict() for t in tools], indent=2, sort_keys=True).encode()
+        layers.append(Layer(
+            name="tools",
+            type="operational.tools",
+            digest=cas.store_blob(tools_data),
+            required=False,
+            depends_on=["source-units"],
+        ))
+
+    if policies:
+        policy_deps = ["tools"] if tools else ["source-units"]
+        policies_data = json.dumps([p.to_dict() for p in policies], indent=2, sort_keys=True).encode()
+        layers.append(Layer(
+            name="policy",
+            type="operational.policy",
+            digest=cas.store_blob(policies_data),
+            required=False,
+            depends_on=policy_deps,
+        ))
+
+    if workflow_steps:
+        workflow_data = json.dumps([w.to_dict() for w in workflow_steps], indent=2, sort_keys=True).encode()
+        layers.append(Layer(
+            name="workflow",
+            type="operational.workflow",
+            digest=cas.store_blob(workflow_data),
+            required=False,
+            depends_on=["source-units"],
+        ))
+
+    # Provenance
+    cas.store_json(provenance.to_dict(), "provenance.json")
+
+    # Parent reference for incremental builds
+    parent_ref = None
+    if parent_cas:
+        parent_manifest = parent_cas.read_manifest()
+        if parent_manifest:
+            parent_ref = parent_manifest.get("root_digest")
+
+    # Build manifest
+    manifest = build_manifest(
+        archive_id=archive_id,
+        archive_version=archive_version,
+        layers=layers,
+        provenance=provenance.to_dict(),
+        parent_archive=parent_ref,
+    )
+
+    # === Stage 8: Validate ===
+    _progress("8/8", "Validating references...")
+    errors = validate_manifest(manifest)
+
+    # Check all blob references exist
+    for layer in layers:
+        if not cas.has_blob(layer.digest):
+            errors.append(f"Missing blob for layer '{layer.name}': {layer.digest}")
+
+    # Check all evidence pointers reference valid source units
+    su_ids = {tu.id for tu in text_units}
+    for claim in deduped_claims:
+        for ev in claim.evidence:
+            if ev.source_unit_id not in su_ids:
+                errors.append(f"Claim '{claim.id}' references unknown source unit '{ev.source_unit_id}'")
+
+    if errors:
+        return BuildResult(
+            archive_path=str(output_dir),
+            manifest=manifest,
+            resources=resources,
+            text_units=text_units,
+            claims=deduped_claims,
+            decisions=decisions,
+            deduplication=dedup,
+            tools=tools,
+            policies=policies,
+            workflow_steps=workflow_steps,
+            errors=errors,
+            valid=False,
+        )
+
+    # Write manifest
+    _progress("8/8", "Writing manifest...")
+    write_manifest_to_cas(manifest, cas)
+
+    _progress("done", "Archive built")
+    return BuildResult(
+        archive_path=str(output_dir),
+        manifest=manifest,
+        resources=resources,
+        text_units=text_units,
+        claims=deduped_claims,
+        decisions=decisions,
+        deduplication=dedup,
+        tools=tools,
+        policies=policies,
+        workflow_steps=workflow_steps,
+        valid=True,
+    )
+
+
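The on_progress hook receives a stage label and a free-form detail string, which makes the build easy to wire into a CLI. A sketch using only what the signature and BuildResult guarantee (the paths are hypothetical):

    from arc.builder import build_archive

    def show(stage: str, detail: str) -> None:
        # stage is a label like "1/8" or "done"; detail is human-readable text.
        print(f"[{stage}] {detail}")

    result = build_archive("./docs", "./docs.arc", on_progress=show)
    if not result.valid:
        # Stage-8 validation failures are returned on the result, not raised.
        for err in result.errors:
            print("error:", err)
    elif result.deduplication:
        print("duplicates removed:", result.deduplication.duplicates_removed)
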
+_SOURCE_EXTENSIONS = {
+    ".md", ".txt", ".py", ".js", ".ts", ".tsx", ".jsx",
+    ".json", ".yaml", ".yml", ".toml",
+    ".kt", ".kts", ".java", ".swift", ".go", ".rs", ".rb",
+    ".c", ".h", ".cpp", ".hpp", ".cs", ".scala",
+    ".sh", ".bash", ".zsh",
+    ".sql", ".graphql", ".proto",
+    ".xml", ".gradle",
+}
+
+# Directories always skipped during ingest (build artifacts, deps, caches)
+_SKIP_DIRS = {
+    "build", "dist", "out", "target",
+    "node_modules", "__pycache__", ".gradle",
+    "Pods", "DerivedData",
+    "intermediates", "generated", "tmp", "outputs",
+    ".git", ".hg", ".svn",
+    "vendor", "venv", ".venv", "env",
+}
+
+
+def _ingest(source_dir: Path, provenance: BuildProvenance) -> list[Resource]:
+    """Stage 1: Scan the source directory and create Resource objects."""
+    resources = []
+
+    for root, dirs, files in os.walk(source_dir):
+        root_path = Path(root)
+
+        # Skip hidden and build artifact directories (prune in-place)
+        dirs[:] = [
+            d for d in dirs
+            if not d.startswith(".") and d not in _SKIP_DIRS
+        ]
+
+        if any(part.startswith(".") for part in root_path.relative_to(source_dir).parts):
+            continue
+
+        for fname in sorted(files):
+            fpath = root_path / fname
+            if fpath.suffix not in _SOURCE_EXTENSIONS:
+                continue
+
+            content = fpath.read_bytes()
+            digest = sha256_digest(content)
+
+            kind = "document" if fpath.suffix in {".md", ".txt"} else "file"
+            locator = str(fpath.relative_to(source_dir))
+            resource = Resource(
+                id=_generate_id(f"resource:{locator}"),
+                kind=kind,
+                locator=locator,
+                content_digest=digest,
+                metadata={"size": len(content), "extension": fpath.suffix},
+            )
+            resources.append(resource)
+            provenance.add_source(resource.locator, digest, kind)
+
+    return resources
+
+
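The dirs[:] slice assignment above is what makes the pruning work: os.walk yields dirs before descending (topdown is the default) and re-reads the mutated list to decide which subtrees to visit. A standalone sketch of the idiom (the directory names are illustrative):

    import os

    # Mutating dirs in place, rather than rebinding the name, tells
    # os.walk to skip the removed subtrees entirely.
    for root, dirs, files in os.walk("."):
        dirs[:] = [d for d in dirs if not d.startswith(".") and d != "node_modules"]
        print(root, files)  # never descends into .git/ or node_modules/
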
+def _chunk(resources: list[Resource], source_dir: Path) -> list[TextUnit]:
+    """Stages 2 & 3: Normalize files and produce TextUnit objects."""
+    text_units = []
+
+    for resource in resources:
+        fpath = source_dir / resource.locator
+        if not fpath.exists():
+            logger.warning("skipping missing file: %s", resource.locator)
+            continue
+
+        content = fpath.read_text(encoding="utf-8", errors="replace")
+        ext = resource.metadata.get("extension", "")
+
+        if ext in (".md", ".txt"):
+            units = _chunk_markdown(content, resource.id)
+        elif ext in (".py",):
+            units = _chunk_python(content, resource.id)
+        elif ext in (".kt", ".kts", ".java", ".swift", ".go", ".rs", ".scala", ".cs"):
+            units = _chunk_curly_brace(content, resource.id)
+        else:
+            units = _chunk_generic(content, resource.id)
+
+        text_units.extend(units)
+
+    return text_units
+
+
+def _chunk_markdown(content: str, resource_id: str) -> list[TextUnit]:
+    """Chunk markdown by sections (#, ##, and ### headers)."""
+    units = []
+    lines = content.split("\n")
+    offset = 0  # line offset of `lines` within the original file
+
+    # Detect frontmatter
+    if lines and lines[0].strip() == "---":
+        end_idx = -1
+        for i in range(1, len(lines)):
+            if lines[i].strip() == "---":
+                end_idx = i
+                break
+        if end_idx > 0:
+            fm_content = "\n".join(lines[:end_idx + 1])
+            units.append(TextUnit(
+                id=_generate_id(),
+                resource_id=resource_id,
+                kind="frontmatter",
+                content=fm_content,
+                span=(1, end_idx + 1),
+            ))
+            lines = lines[end_idx + 1:]
+            offset = end_idx + 1  # keep section spans aligned with the file
+
+    # Split by headers
+    current_section: list[str] = []
+    section_start = offset + 1
+    for i, line in enumerate(lines, start=offset + 1):
+        if re.match(r'^#{1,3}\s+', line) and current_section:
+            section_content = "\n".join(current_section).strip()
+            if section_content and len(section_content) > 10:
+                units.append(TextUnit(
+                    id=_generate_id(),
+                    resource_id=resource_id,
+                    kind="section",
+                    content=section_content,
+                    span=(section_start, section_start + len(current_section) - 1),
+                ))
+            current_section = []
+            section_start = i
+        current_section.append(line)
+
+    # Last section
+    if current_section:
+        section_content = "\n".join(current_section).strip()
+        if section_content and len(section_content) > 10:
+            units.append(TextUnit(
+                id=_generate_id(),
+                resource_id=resource_id,
+                kind="section",
+                content=section_content,
+                span=(section_start, section_start + len(current_section) - 1),
+            ))
+
+    return units
+
+
+def _chunk_python(content: str, resource_id: str) -> list[TextUnit]:
+    """Chunk Python by top-level functions and classes."""
+    units = []
+    lines = content.split("\n")
+
+    # Module docstring
+    if content.strip().startswith('"""') or content.strip().startswith("'''"):
+        quote = '"""' if content.strip().startswith('"""') else "'''"
+        end = content.find(quote, content.find(quote) + 3)
+        if end > 0:
+            doc = content[:end + 3]
+            units.append(TextUnit(
+                id=_generate_id(),
+                resource_id=resource_id,
+                kind="docstring",
+                content=doc,
+                span=(1, doc.count("\n") + 1),
+            ))
+
+    # Top-level defs
+    current_block: list[str] = []
+    block_start = 1
+    for i, line in enumerate(lines, start=1):
+        if re.match(r'^(def |class |@)', line) and current_block:
+            block_content = "\n".join(current_block).strip()
+            if block_content and len(block_content) > 20:
+                kind = "function" if any(l.startswith("def ") for l in current_block) else "section"
+                units.append(TextUnit(
+                    id=_generate_id(),
+                    resource_id=resource_id,
+                    kind=kind,
+                    content=block_content,
+                    span=(block_start, i - 1),
+                ))
+            current_block = [line]
+            block_start = i
+        else:
+            current_block.append(line)
+
+    if current_block:
+        block_content = "\n".join(current_block).strip()
+        if block_content and len(block_content) > 20:
+            units.append(TextUnit(
+                id=_generate_id(),
+                resource_id=resource_id,
+                kind="section",
+                content=block_content,
+                span=(block_start, block_start + len(current_block) - 1),
+            ))
+
+    return units
+
+
+def _chunk_curly_brace(content: str, resource_id: str) -> list[TextUnit]:
+    """Chunk Kotlin/Java/Swift/Go/Rust/etc. by top-level declarations."""
+    units = []
+    lines = content.split("\n")
+
+    # Top-level declarations: fun, class, interface, object, enum, struct, impl, func
+    _TOP_LEVEL = re.compile(
+        r'^(?:(?:public|private|protected|internal|open|abstract|override|suspend|data|sealed|inline|actual|expect)\s+)*'
+        r'(?:fun |class |interface |object |enum |struct |impl |func |fn |extension )'
+    )
+
+    current_block: list[str] = []
+    block_start = 1
+    for i, line in enumerate(lines, start=1):
+        if _TOP_LEVEL.match(line) and current_block:
+            block_content = "\n".join(current_block).strip()
+            if block_content and len(block_content) > 20:
+                kind = "function" if re.search(r'\b(fun |func |fn )\b', block_content[:200]) else "class"
+                units.append(TextUnit(
+                    id=_generate_id(),
+                    resource_id=resource_id,
+                    kind=kind,
+                    content=block_content,
+                    span=(block_start, i - 1),
+                ))
+            current_block = [line]
+            block_start = i
+        else:
+            current_block.append(line)
+
+    if current_block:
+        block_content = "\n".join(current_block).strip()
+        if block_content and len(block_content) > 20:
+            units.append(TextUnit(
+                id=_generate_id(),
+                resource_id=resource_id,
+                kind="section",
+                content=block_content,
+                span=(block_start, block_start + len(current_block) - 1),
+            ))
+
+    return units
+
+
+def _chunk_generic(content: str, resource_id: str) -> list[TextUnit]:
+    """Chunk generic files by paragraph breaks."""
+    paragraphs = re.split(r'\n\s*\n', content)
+    units = []
+    line_offset = 1
+    for para in paragraphs:
+        para = para.strip()
+        if para and len(para) > 10:
+            n_lines = para.count("\n") + 1
+            units.append(TextUnit(
+                id=_generate_id(),
+                resource_id=resource_id,
+                kind="paragraph",
+                content=para,
+                span=(line_offset, line_offset + n_lines - 1),
+            ))
+            line_offset += n_lines + 1
+        else:
+            line_offset += para.count("\n") + 2
+
+    return units
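
For reference, _chunk_markdown yields one unit for a frontmatter block and one per header-delimited section, with spans given as original file line numbers. A quick sketch of the expected shape (_chunk_markdown is module-private, so importing it is purely illustrative, and the input string is made up):

    from arc.builder import _chunk_markdown

    md = "---\ntitle: demo\n---\n# Intro\nSome introductory text.\n## Details\nMore detailed text here."
    for unit in _chunk_markdown(md, resource_id="res-1"):
        print(unit.kind, unit.span)
    # Expected: frontmatter (1, 3), section (4, 5), section (6, 7)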