locus-etl 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (147) hide show
  1. locus_etl-0.0.1/.github/workflows/ci.yml +36 -0
  2. locus_etl-0.0.1/.gitignore +24 -0
  3. locus_etl-0.0.1/.kiro/specs/locus-image-runtime/.config.kiro +1 -0
  4. locus_etl-0.0.1/.kiro/specs/locus-image-runtime/architecture-notes.md +37 -0
  5. locus_etl-0.0.1/.kiro/specs/locus-image-runtime/design.md +342 -0
  6. locus_etl-0.0.1/.kiro/specs/locus-image-runtime/image-catalog.md +120 -0
  7. locus_etl-0.0.1/.kiro/specs/locus-image-runtime/requirements.md +189 -0
  8. locus_etl-0.0.1/.kiro/specs/locus-image-runtime/tasks.md +161 -0
  9. locus_etl-0.0.1/.kiro/specs/unstructured-to-tabular-etl/.config.kiro +1 -0
  10. locus_etl-0.0.1/.kiro/specs/unstructured-to-tabular-etl/architecture-notes.md +683 -0
  11. locus_etl-0.0.1/.kiro/specs/unstructured-to-tabular-etl/design.md +657 -0
  12. locus_etl-0.0.1/.kiro/specs/unstructured-to-tabular-etl/requirements.md +248 -0
  13. locus_etl-0.0.1/.kiro/specs/unstructured-to-tabular-etl/tasks.md +282 -0
  14. locus_etl-0.0.1/.python-version +1 -0
  15. locus_etl-0.0.1/CHANGELOG.md +71 -0
  16. locus_etl-0.0.1/CONTRIBUTING.md +40 -0
  17. locus_etl-0.0.1/LICENSE +21 -0
  18. locus_etl-0.0.1/PKG-INFO +132 -0
  19. locus_etl-0.0.1/PUBLISHING.md +51 -0
  20. locus_etl-0.0.1/README.md +78 -0
  21. locus_etl-0.0.1/docs/architecture.png +0 -0
  22. locus_etl-0.0.1/docs/architecture.svg +4525 -0
  23. locus_etl-0.0.1/docs/generate_architecture_diagram.py +182 -0
  24. locus_etl-0.0.1/docs/requirements-diagram.txt +6 -0
  25. locus_etl-0.0.1/pyproject.toml +99 -0
  26. locus_etl-0.0.1/src/locus/__init__.py +12 -0
  27. locus_etl-0.0.1/src/locus/artifacts.py +73 -0
  28. locus_etl-0.0.1/src/locus/backends.py +91 -0
  29. locus_etl-0.0.1/src/locus/builder.py +113 -0
  30. locus_etl-0.0.1/src/locus/builtins.py +133 -0
  31. locus_etl-0.0.1/src/locus/catalog/__init__.py +53 -0
  32. locus_etl-0.0.1/src/locus/catalog/common.py +59 -0
  33. locus_etl-0.0.1/src/locus/catalog/converters.py +42 -0
  34. locus_etl-0.0.1/src/locus/catalog/extractors.py +84 -0
  35. locus_etl-0.0.1/src/locus/catalog/transforms.py +223 -0
  36. locus_etl-0.0.1/src/locus/cli.py +284 -0
  37. locus_etl-0.0.1/src/locus/emit_constants.py +11 -0
  38. locus_etl-0.0.1/src/locus/errors.py +69 -0
  39. locus_etl-0.0.1/src/locus/executor.py +115 -0
  40. locus_etl-0.0.1/src/locus/hub.py +181 -0
  41. locus_etl-0.0.1/src/locus/image.py +62 -0
  42. locus_etl-0.0.1/src/locus/loader.py +94 -0
  43. locus_etl-0.0.1/src/locus/locusfile.py +58 -0
  44. locus_etl-0.0.1/src/locus/manifest.py +40 -0
  45. locus_etl-0.0.1/src/locus/oci_store.py +191 -0
  46. locus_etl-0.0.1/src/locus/packaging.py +57 -0
  47. locus_etl-0.0.1/src/locus/planner.py +172 -0
  48. locus_etl-0.0.1/src/locus/privacy.py +62 -0
  49. locus_etl-0.0.1/src/locus/provenance.py +123 -0
  50. locus_etl-0.0.1/src/locus/runner.py +94 -0
  51. locus_etl-0.0.1/src/locus/seed.py +55 -0
  52. locus_etl-0.0.1/src/locus/serve.py +126 -0
  53. locus_etl-0.0.1/src/locus/store.py +184 -0
  54. locus_etl-0.0.1/src/locus/workspace.py +72 -0
  55. locus_etl-0.0.1/src/locus_engine/__init__.py +178 -0
  56. locus_etl-0.0.1/src/locus_engine/clean/__init__.py +8 -0
  57. locus_etl-0.0.1/src/locus_engine/clean/cleaner.py +129 -0
  58. locus_etl-0.0.1/src/locus_engine/clean/dedup.py +87 -0
  59. locus_etl-0.0.1/src/locus_engine/composer.py +90 -0
  60. locus_etl-0.0.1/src/locus_engine/config.py +64 -0
  61. locus_etl-0.0.1/src/locus_engine/conformance.py +143 -0
  62. locus_etl-0.0.1/src/locus_engine/connectors/__init__.py +9 -0
  63. locus_etl-0.0.1/src/locus_engine/connectors/files.py +62 -0
  64. locus_etl-0.0.1/src/locus_engine/connectors/http.py +92 -0
  65. locus_etl-0.0.1/src/locus_engine/connectors/sql.py +74 -0
  66. locus_etl-0.0.1/src/locus_engine/emit/__init__.py +9 -0
  67. locus_etl-0.0.1/src/locus_engine/emit/common.py +39 -0
  68. locus_etl-0.0.1/src/locus_engine/emit/dataframe.py +40 -0
  69. locus_etl-0.0.1/src/locus_engine/emit/parquet.py +42 -0
  70. locus_etl-0.0.1/src/locus_engine/emit/sql.py +85 -0
  71. locus_etl-0.0.1/src/locus_engine/errors.py +77 -0
  72. locus_etl-0.0.1/src/locus_engine/extract/__init__.py +7 -0
  73. locus_etl-0.0.1/src/locus_engine/extract/deterministic.py +116 -0
  74. locus_etl-0.0.1/src/locus_engine/extract/dual.py +89 -0
  75. locus_etl-0.0.1/src/locus_engine/ir.py +55 -0
  76. locus_etl-0.0.1/src/locus_engine/lineage.py +81 -0
  77. locus_etl-0.0.1/src/locus_engine/llm/__init__.py +13 -0
  78. locus_etl-0.0.1/src/locus_engine/llm/credentials.py +111 -0
  79. locus_etl-0.0.1/src/locus_engine/llm/engine.py +81 -0
  80. locus_etl-0.0.1/src/locus_engine/llm/router.py +57 -0
  81. locus_etl-0.0.1/src/locus_engine/observability.py +119 -0
  82. locus_etl-0.0.1/src/locus_engine/parsers/__init__.py +11 -0
  83. locus_etl-0.0.1/src/locus_engine/parsers/csv_parser.py +62 -0
  84. locus_etl-0.0.1/src/locus_engine/parsers/html.py +127 -0
  85. locus_etl-0.0.1/src/locus_engine/parsers/pdf.py +106 -0
  86. locus_etl-0.0.1/src/locus_engine/parsers/records.py +54 -0
  87. locus_etl-0.0.1/src/locus_engine/parsers/router.py +40 -0
  88. locus_etl-0.0.1/src/locus_engine/pipeline.py +228 -0
  89. locus_etl-0.0.1/src/locus_engine/plugins.py +145 -0
  90. locus_etl-0.0.1/src/locus_engine/provenance.py +85 -0
  91. locus_etl-0.0.1/src/locus_engine/registry.py +89 -0
  92. locus_etl-0.0.1/src/locus_engine/results.py +56 -0
  93. locus_etl-0.0.1/src/locus_engine/review.py +108 -0
  94. locus_etl-0.0.1/src/locus_engine/schema_infer.py +62 -0
  95. locus_etl-0.0.1/src/locus_engine/table.py +63 -0
  96. locus_etl-0.0.1/src/locus_engine/validate/__init__.py +7 -0
  97. locus_etl-0.0.1/src/locus_engine/validate/grounding.py +113 -0
  98. locus_etl-0.0.1/tests/__init__.py +0 -0
  99. locus_etl-0.0.1/tests/fixtures/_make_pdf.py +68 -0
  100. locus_etl-0.0.1/tests/fixtures/dupes.csv +4 -0
  101. locus_etl-0.0.1/tests/fixtures/invoices.csv +4 -0
  102. locus_etl-0.0.1/tests/fixtures/sample.pdf +39 -0
  103. locus_etl-0.0.1/tests/runtime/__init__.py +0 -0
  104. locus_etl-0.0.1/tests/runtime/test_backends_privacy.py +97 -0
  105. locus_etl-0.0.1/tests/runtime/test_builder.py +69 -0
  106. locus_etl-0.0.1/tests/runtime/test_catalog.py +128 -0
  107. locus_etl-0.0.1/tests/runtime/test_cli_scaffold.py +76 -0
  108. locus_etl-0.0.1/tests/runtime/test_cross_stage_provenance.py +71 -0
  109. locus_etl-0.0.1/tests/runtime/test_discovery.py +51 -0
  110. locus_etl-0.0.1/tests/runtime/test_executor.py +36 -0
  111. locus_etl-0.0.1/tests/runtime/test_hub.py +70 -0
  112. locus_etl-0.0.1/tests/runtime/test_loader.py +81 -0
  113. locus_etl-0.0.1/tests/runtime/test_models.py +97 -0
  114. locus_etl-0.0.1/tests/runtime/test_oci_store.py +147 -0
  115. locus_etl-0.0.1/tests/runtime/test_planner.py +149 -0
  116. locus_etl-0.0.1/tests/runtime/test_properties_runtime.py +161 -0
  117. locus_etl-0.0.1/tests/runtime/test_run_single.py +65 -0
  118. locus_etl-0.0.1/tests/runtime/test_seed.py +34 -0
  119. locus_etl-0.0.1/tests/runtime/test_serve.py +56 -0
  120. locus_etl-0.0.1/tests/runtime/test_store.py +108 -0
  121. locus_etl-0.0.1/tests/test_clean.py +86 -0
  122. locus_etl-0.0.1/tests/test_composer.py +79 -0
  123. locus_etl-0.0.1/tests/test_config.py +49 -0
  124. locus_etl-0.0.1/tests/test_conformance.py +71 -0
  125. locus_etl-0.0.1/tests/test_connectors.py +34 -0
  126. locus_etl-0.0.1/tests/test_emit.py +60 -0
  127. locus_etl-0.0.1/tests/test_errors.py +57 -0
  128. locus_etl-0.0.1/tests/test_extract.py +67 -0
  129. locus_etl-0.0.1/tests/test_grounding.py +70 -0
  130. locus_etl-0.0.1/tests/test_ir.py +46 -0
  131. locus_etl-0.0.1/tests/test_lineage.py +70 -0
  132. locus_etl-0.0.1/tests/test_llm.py +157 -0
  133. locus_etl-0.0.1/tests/test_observability.py +49 -0
  134. locus_etl-0.0.1/tests/test_parsers.py +64 -0
  135. locus_etl-0.0.1/tests/test_pdf.py +47 -0
  136. locus_etl-0.0.1/tests/test_pipeline_clean_dedup.py +49 -0
  137. locus_etl-0.0.1/tests/test_pipeline_e2e.py +70 -0
  138. locus_etl-0.0.1/tests/test_pipeline_llm.py +87 -0
  139. locus_etl-0.0.1/tests/test_properties.py +122 -0
  140. locus_etl-0.0.1/tests/test_provenance.py +57 -0
  141. locus_etl-0.0.1/tests/test_registry.py +73 -0
  142. locus_etl-0.0.1/tests/test_results.py +34 -0
  143. locus_etl-0.0.1/tests/test_review.py +70 -0
  144. locus_etl-0.0.1/tests/test_schema_infer.py +35 -0
  145. locus_etl-0.0.1/tests/test_stage9.py +148 -0
  146. locus_etl-0.0.1/tests/test_table.py +51 -0
  147. locus_etl-0.0.1/uv.lock +5177 -0
@@ -0,0 +1,36 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+ branches: [main]
8
+
9
+ jobs:
10
+ test:
11
+ runs-on: ubuntu-latest
12
+ strategy:
13
+ matrix:
14
+ python-version: ["3.11", "3.12"]
15
+ steps:
16
+ - uses: actions/checkout@v4
17
+
18
+ - name: Install uv
19
+ uses: astral-sh/setup-uv@v5
20
+ with:
21
+ enable-cache: true
22
+
23
+ - name: Set Python version
24
+ run: uv python install ${{ matrix.python-version }}
25
+
26
+ - name: Sync dependencies
27
+ run: uv sync --extra dedup --extra pdf --extra serve
28
+
29
+ - name: Ruff (lint)
30
+ run: uv run ruff check src tests
31
+
32
+ - name: Mypy (types)
33
+ run: uv run mypy
34
+
35
+ - name: Pytest (with coverage)
36
+ run: uv run pytest --cov --cov-report=term-missing
@@ -0,0 +1,24 @@
1
+ # Diagram-generation virtualenv (transient build tooling)
2
+ .diagram-venv/
3
+
4
+ # Python
5
+ __pycache__/
6
+ *.py[cod]
7
+ .venv/
8
+ venv/
9
+ .mypy_cache/
10
+ .pytest_cache/
11
+ .ruff_cache/
12
+ *.egg-info/
13
+ htmlcov/
14
+ .coverage
15
+
16
+ # OS
17
+ .DS_Store
18
+
19
+ # Local scratch / demo space (not committed)
20
+ scratch/
21
+
22
+ # Build artifacts
23
+ dist/
24
+ build/
@@ -0,0 +1 @@
1
+ {"specId": "2c1324e3-b19f-554d-a057-476f4bf65262", "workflowType": "requirements-first", "specType": "feature"}
@@ -0,0 +1,37 @@
1
+ # Architecture & Vision Notes — Layer 2 (locus-image-runtime)
2
+
3
+ > The canonical, cross-layer decision log lives in the sibling spec:
4
+ > `.kiro/specs/unstructured-to-tabular-etl/architecture-notes.md`
5
+ > Read that file for the full decision trail (project name, runtime model, key handling,
6
+ > LiteLLM, composition solutions, Hub/Harbor, privacy model, etc.). It spans BOTH layers.
7
+ > This file only records what is specific to Layer 2 navigation.
8
+
9
+ ## What this spec owns (Layer 2 = packaging / distribution / runtime)
10
+ - Client install + invocation (single CLI, local-process default, Docker optional backend).
11
+ - Image pull + local cache; version resolution.
12
+ - Locusfile run-configuration surface (source, volumes, ports, schema ref, llm, export,
13
+ review) — minimal required fields = source + image.
14
+ - Local LLM credential handling (.env primary; env var / keyring fallbacks; raw-key
15
+ prohibition + gitignore guardrails).
16
+ - Multi-image composition = pipeline DAG (NOT docker-compose semantics; compose-FAMILIAR
17
+ syntax only). `needs:`-style dependency edges, parallel branches, cycle detection,
18
+ stage output caching.
19
+ - Stage interchange contract: single versioned `Locus_Artifact` envelope, fixed
20
+ Artifact_Type set, Arrow/Parquet payloads, static pre-run type check (fail fast).
21
+ - Cross-stage provenance propagation: framework-managed (SDK) lineage, run-scoped
22
+ append-only Lineage_Store, conformance certification, strict/permissive non-conformant.
23
+ - Result serving / preview UI / export (local, on mapped port).
24
+ - Image authoring + building (Image_Manifest, dependency pinning, conformance cert).
25
+ - Publishing to Locus Hub: public + private (Harbor), namespaces, RBAC, self-host path.
26
+ - Image discovery; runtime privacy disclosure / consent.
27
+
28
+ ## What the SIBLING spec (`unstructured-to-tabular-etl`, Layer 1) owns
29
+ - The processing engine an Image embeds: connectors, parser routing, IR, schema-driven
30
+ extraction, cleaning, dedup, the cell-level grounding/faithfulness contract, HITL review,
31
+ plugin architecture, observability, dual-engine (deterministic default + LLM opt-in),
32
+ LLM guardrails. Req 1-15 there.
33
+
34
+ ## Key cross-references
35
+ - Layer 2 Req 6 (interchange) + Req 7 (cross-stage provenance) depend on Layer 1 Req 3 (IR)
36
+ and Req 8 (emission/provenance) being the STABLE VERSIONED contract.
37
+ - `image-catalog.md` (this folder) = 65 candidate images / 9 tiers / locked build order.
@@ -0,0 +1,342 @@
1
+ # Design Document
2
+
3
+ ## Overview
4
+
5
+ This document specifies the design of **Layer 2 of Locus: the image runtime, packaging, distribution, and registry**. It builds on the Layer 1 engine (`unstructured-to-tabular-etl`, now implemented as the `locus_engine` package) and turns engine capabilities into reusable, versioned **images** that users pull, point at their own data via a **Locusfile**, run locally, preview, and export. Multiple images compose into a **pipeline DAG** with a versioned interchange contract and cross-stage provenance.
6
+
7
+ This layer is a separate installable package, `locus` (the CLI), which depends on `locus_engine`. The engine stays free of any packaging/registry concern; the runtime orchestrates engine-backed images.
8
+
9
+ ### Tech baseline (inherited + additions)
10
+
11
+ - **Python 3.11+**, `uv` + `pyproject.toml`, Pydantic v2 (same as Layer 1).
12
+ - **CLI:** Typer (argument parsing, subcommands) + Rich (terminal output).
13
+ - **Registry transport:** ORAS-style OCI client (`oras` Python library) against Harbor; the CLI is registry-agnostic OCI so Harbor is swappable.
14
+ - **Artifact serialization:** Apache Arrow / Parquet for tabular payloads (already produced by the engine's emitters).
15
+ - **Result UI:** embedded FastAPI + a static front end, served locally via uvicorn.
16
+ - **Optional Docker backend:** the `docker` Python SDK, used only when `--runtime=docker`.
17
+
18
+ ### Design principles
19
+
20
+ 1. **The CLI is the only install.** A single `pip install locus-etl` provides the `locus` CLI. No daemon, no Linux VM by default (Req 1).
21
+ 2. **Local-first, registry-agnostic.** Runs on the user's machine; talks plain OCI so the default Harbor hub is swappable (architecture notes).
22
+ 3. **Composition is a typed DAG, not docker-compose.** Compose-familiar YAML, pipeline execution semantics (Req 5).
23
+ 4. **Trust is enforced at the seams.** Static type-check across stage edges before running (Req 6); provenance composes across stages or the build fails (Req 7).
24
+ 5. **The engine does the work; the runtime orchestrates.** No data transformation logic lives here — it delegates to `locus_engine`.
25
+
26
+ ## Architecture
27
+
28
+ ```mermaid
29
+ flowchart TB
30
+ subgraph cli["locus CLI (Typer)"]
31
+ PULL[pull]
32
+ RUN[run]
33
+ BUILD[build]
34
+ PUSH[push]
35
+ SEARCH[search]
36
+ INIT[init]
37
+ end
38
+
39
+ subgraph rt["Runtime"]
40
+ LF[LocusfileLoader\n+ validation]
41
+ DAG[PipelinePlanner\nDAG + static type-check]
42
+ EXEC[StageExecutor\nlocal-process | docker]
43
+ PROV[CrossStageProvenance\n+ run LineageStore]
44
+ WS[(Run_Workspace\nArrow artifacts)]
45
+ end
46
+
47
+ subgraph img["Image"]
48
+ MAN[Image_Manifest\naccepts/emits, engine modes, privacy]
49
+ CODE[engine-backed capability code]
50
+ end
51
+
52
+ subgraph dist["Distribution"]
53
+ CACHE[(local image cache)]
54
+ ORAS[OCI client / ORAS]
55
+ HUB[(Locus Hub / Harbor)]
56
+ end
57
+
58
+ subgraph serve["Serving"]
59
+ UI[Result UI\nFastAPI + static]
60
+ EXP[Exporter]
61
+ end
62
+
63
+ ENG[[locus_engine\nLayer 1]]
64
+
65
+ RUN --> LF --> DAG --> EXEC
66
+ EXEC --> img
67
+ img --> ENG
68
+ EXEC <--> WS
69
+ EXEC --> PROV
70
+ PULL --> ORAS <--> HUB
71
+ PULL --> CACHE
72
+ BUILD --> MAN
73
+ PUSH --> ORAS
74
+ SEARCH --> HUB
75
+ EXEC --> UI
76
+ EXEC --> EXP
77
+ ```
78
+
79
+ The CLI dispatches subcommands. `run` is the core path: load + validate the Locusfile, plan the DAG (resolving and type-checking every stage edge), then execute stages in dependency order through the selected runtime backend, materializing typed `Locus_Artifact`s in the run workspace and threading provenance through a run-scoped lineage store. `pull`/`push`/`search` talk OCI to Harbor. `build` packages an image from its manifest and certifies provenance conformance.
80
+
81
+ ## Data Models
82
+
83
+ All models are Pydantic v2. The `Locus_Artifact`, `Artifact_Type`, and provenance structures align with the engine's `ProvenancedTable`/`IntermediateRepresentation` so the contract is shared, not re-invented.
84
+
85
+ ```python
86
+ from __future__ import annotations
87
+ from enum import StrEnum
88
+ from typing import Any
89
+ from pydantic import BaseModel, Field
90
+
91
+
92
+ class ArtifactKind(StrEnum):
93
+ IR = "ir"
94
+ TABLE = "table"
95
+ CHUNKS = "chunks"
96
+ EMBEDDINGS = "embeddings"
97
+ GRAPH = "graph"
98
+
99
+
100
+ class ArtifactType(BaseModel):
101
+ """A versioned artifact type, e.g. kind=table version=(1,0) -> 'table/v1'."""
102
+ kind: ArtifactKind
103
+ major: int = 1
104
+ minor: int = 0
105
+
106
+ def tag(self) -> str:
107
+ return f"{self.kind.value}/v{self.major}"
108
+
109
+ def compatible_with(self, consumer: "ArtifactType") -> "Compat":
110
+ if self.kind != consumer.kind or self.major != consumer.major:
111
+ return Compat.INCOMPATIBLE
112
+ if self.minor != consumer.minor:
113
+ return Compat.MINOR_DIFF
114
+ return Compat.OK
115
+
116
+
117
+ class Compat(StrEnum):
118
+ OK = "ok"
119
+ MINOR_DIFF = "minor_diff" # compatible with warning (Req 6.6)
120
+ INCOMPATIBLE = "incompatible"
121
+
122
+
123
+ class LocusArtifact(BaseModel):
124
+ """The only structure that crosses a stage boundary (Req 6.1)."""
125
+ type: ArtifactType
126
+ payload_path: str # Arrow/Parquet file in the Run_Workspace (Req 6.2)
127
+ lineage_path: str | None = None # serialized lineage graph for this artifact
128
+ produced_by: str # stage id
129
+ engine_mode: str = "deterministic" # "deterministic" | "llm"
130
+
131
+
132
+ class PrivacyClass(StrEnum):
133
+ LOCAL_ONLY = "local_only"
134
+ CALLS_EXTERNAL = "calls_external" # an LLM-backed image (Req 12.2)
135
+
136
+
137
+ class ImageManifest(BaseModel):
138
+ """Build-time declaration of an image (Req 9.2)."""
139
+ name: str
140
+ version: str
141
+ description: str = ""
142
+ accepts: list[ArtifactType] = Field(default_factory=list)
143
+ emits: ArtifactType
144
+ engine_modes: list[str] = Field(default_factory=lambda: ["deterministic"])
145
+ privacy_class: PrivacyClass = PrivacyClass.LOCAL_ONLY
146
+ provenance_conformant: bool = False # set by build-time certification (Req 9.4)
147
+ entrypoint: str # import path of the capability callable
148
+ dependencies: dict[str, str] = Field(default_factory=dict) # pinned (Req 9.3)
149
+
150
+
151
+ class StageSpec(BaseModel):
152
+ """One stage in a Locusfile pipeline."""
153
+ id: str
154
+ image: str # "name:version"
155
+ needs: list[str] = Field(default_factory=list) # dependency ids (Req 5.4)
156
+ source: "SourceSpec | None" = None # override pipeline source
157
+ config: dict[str, Any] = Field(default_factory=dict)
158
+
159
+
160
+ class SourceSpec(BaseModel):
161
+ type: str # "files" | "url" | "api" | "sql"
162
+ path: str | None = None
163
+ uri: str | None = None
164
+
165
+
166
+ class PortSpec(BaseModel):
167
+ ui: int | None = None
168
+ api: int | None = None
169
+
170
+
171
+ class Locusfile(BaseModel):
172
+ """The run-configuration surface (Req 3)."""
173
+ image: str | None = None # single-image shorthand
174
+ source: SourceSpec | None = None
175
+ pipeline: list[StageSpec] = Field(default_factory=list)
176
+ volumes: list[str] = Field(default_factory=list)
177
+ ports: PortSpec = Field(default_factory=PortSpec)
178
+ schema_mode: str = "infer"
179
+ schema_ref: str | None = None
180
+ llm: dict[str, Any] | None = None
181
+ env_file: str | None = None
182
+ export: dict[str, Any] | None = None
183
+ review: dict[str, Any] | None = None
184
+ mode: str = "strict" # "strict" | "permissive" (Req 7.7/7.8)
185
+ ```
186
+
187
+ ## Components and Interfaces
188
+
189
+ ### CLI commands (Typer)
190
+
191
+ ```python
192
+ # locus <command>
193
+ def pull(image: str) -> None: ... # Req 2
194
+ def run(locusfile: str = "locusfile.yaml",
195
+ runtime: str = "process") -> None: ... # Req 1, 5, 8
196
+ def build(manifest: str = "locus.image.yaml") -> None: ... # Req 9
197
+ def push(image: str, private: bool = False) -> None: ... # Req 10
198
+ def search(query: str = "") -> None: ... # Req 11
199
+ def login(registry: str | None = None) -> None: ... # Req 10.1
200
+ def init(path: str = ".") -> None: ... # Req 4.4 (.env + gitignore)
201
+ def config_set_key(provider: str, key: str) -> None: ... # keyring (Req 4.1)
202
+ ```
203
+
204
+ ### LocusfileLoader
205
+
206
+ Loads YAML, validates against the `Locusfile` model, performs credential safety checks (reject raw keys, enforce `.env` gitignore/untracked — Req 4.3/4.5), and resolves a single-image shorthand into a one-stage pipeline. Raises `ConfigError` with the offending setting (Req 3.7).
207
+
208
+ ### PipelinePlanner
209
+
210
+ ```python
211
+ class PipelinePlanner:
212
+ def plan(self, lf: Locusfile, manifests: dict[str, ImageManifest]) -> PipelinePlan:
213
+ """Build the DAG, detect cycles (Req 5.6), topologically order stages, and
214
+ STATIC-CHECK every edge's artifact-type compatibility BEFORE execution
215
+ (Req 6.4/6.5). A major-version mismatch fails; a minor mismatch warns
216
+ (Req 6.6). Also verifies provenance-conformance per mode (Req 7.7/7.8)."""
217
+ ```
218
+
219
+ `PipelinePlan` holds the ordered stages, parallelizable groups (waves of independent stages, Req 5.5), and the validated edge map. Planning fails fast: no stage runs if any edge is incompatible or a cycle exists.
220
+
221
+ ### StageExecutor and Runtime backends
222
+
223
+ ```python
224
+ class RuntimeBackend(Protocol):
225
+ name: str
226
+ def run_stage(self, image: ResolvedImage, inputs: list[LocusArtifact],
227
+ ctx: StageContext) -> LocusArtifact: ...
228
+
229
+ class ProcessBackend: # default (Req 1.3) — per-image venv, in-process engine call
230
+ ...
231
+ class DockerBackend: # optional (Req 1.4); errors clearly if Docker absent (Req 1.5)
232
+ ...
233
+ ```
234
+
235
+ `StageExecutor` walks the plan wave by wave: gathers each stage's input artifacts (the pipeline source for root stages, dependency outputs otherwise — Req 5.3/5.4), invokes the backend, writes the output `Locus_Artifact` (Arrow file) to the run workspace, and records lineage. Stage caching (Req 5.8): if a stage's input artifact hashes and image version match a prior run, reuse the cached output.
236
+
237
+ ### CrossStageProvenance
238
+
239
+ Wraps the engine's `ProvenanceComposer`/`LineageStore`. As artifacts flow between stages, cell ids and lineage edges are preserved in a run-scoped `Lineage_Store`; the terminal artifact's cells resolve to original `Source_Location`s (Req 7.5/7.6). Conformance is checked at build time (Req 9.4) and enforced at plan time per mode (Req 7.7/7.8).
240
+
241
+ ### Distribution: OCI client + cache
242
+
243
+ ```python
244
+ class ImageStore:
245
+ def pull(self, ref: str) -> ResolvedImage: ... # ORAS pull -> cache (Req 2.1)
246
+ def cached(self, ref: str) -> ResolvedImage | None: ... # (Req 2.2)
247
+ def push(self, image: BuiltImage, *, private: bool) -> None: ... # (Req 10)
248
+ def search(self, query: str) -> list[ImageSummary]: ... # (Req 11)
249
+ ```
250
+
251
+ Images are OCI artifacts: the manifest + a layer with the capability code + pinned deps. Private/public visibility maps to Harbor project visibility (Req 10.3/10.4); pulls authenticate via the `Registry_Credential`, which is wholly separate from LLM keys (Req 10.8).
252
+
253
+ ### ImageBuilder
254
+
255
+ Packages an `ImageManifest` + entrypoint into a `BuiltImage`, pins dependency versions (Req 9.3), and runs the engine's `conformance` harness (`probe_extraction_engine`) against the capability to set `provenance_conformant` (Req 9.4/9.5).
256
+
257
+ ### Result serving and export
258
+
259
+ `ResultServer` (FastAPI) serves the final artifact on the mapped UI port (Req 8.1): rows, per-cell faithfulness + source location, flagged rows, and the engine mode (Req 8.2/8.3), reading directly from the engine's emitted DataFrame + `_lineage` column. `Exporter` writes the configured format (Req 8.4). Both are local; no hosted service (Req 8.5).
260
+
261
+ ## Error Handling
262
+
263
+ | Class | Examples | Behavior |
264
+ |-------|----------|----------|
265
+ | Plan-fatal, fail-fast | cycle in DAG, artifact-type mismatch, non-conformant stage in strict mode, missing image | raise before any stage runs (Req 5.6, 6.5, 7.7, 2.4) |
266
+ | Config | invalid Locusfile, raw key in file, git-tracked `.env` | reject with offending setting (Req 3.7, 4.3, 4.5) |
267
+ | Backend | Docker selected but absent | error identifying the backend; no silent fallback (Req 1.5) |
268
+ | Permissive degradation | non-conformant stage in permissive mode | continue; mark downstream cells lineage-broken (Req 7.8) |
269
+ | Runtime privacy | external-LLM stage about to send data | consent notice before egress (Req 12.1) |
270
+
271
+ Errors derive from a `LocusRuntimeError` hierarchy, distinct from but mirroring the engine's `LocusError`.
272
+
273
+ ## Correctness Properties
274
+
275
+ ### Property 1: Fail-fast type safety
276
+
277
+ For any composed pipeline, no stage executes unless every dependency edge is artifact-type compatible (major version equal, kind equal). An incompatible edge raises before execution.
278
+
279
+ **Validates: Requirements 6.4, 6.5**
280
+
281
+ ### Property 2: Acyclic execution
282
+
283
+ A pipeline graph containing a cycle never executes any stage and reports the cycle.
284
+
285
+ **Validates: Requirements 5.6**
286
+
287
+ ### Property 3: Dependency ordering
288
+
289
+ A stage executes only after all stages in its `needs` have completed, and its inputs are exactly those dependencies' outputs (or the pipeline source for root stages).
290
+
291
+ **Validates: Requirements 5.3, 5.4**
292
+
293
+ ### Property 4: End-to-end provenance survival
294
+
295
+ Every cell in the terminal artifact resolves, through the run lineage store, to at least one originating `Source_Location`, across all intermediate stages.
296
+
297
+ **Validates: Requirements 7.1, 7.5, 7.6**
298
+
299
+ ### Property 5: Conformance gating
300
+
301
+ In strict mode a non-conformant stage fails the run; in permissive mode it continues and the affected downstream cells are marked lineage-broken.
302
+
303
+ **Validates: Requirements 7.7, 7.8**
304
+
305
+ ### Property 6: Credential locality
306
+
307
+ No LLM credential or user data is transmitted to any hosted Locus service; credentials resolve only from local sources and a raw key in the Locusfile is rejected.
308
+
309
+ **Validates: Requirements 4.1, 4.2, 4.3**
310
+
311
+ ### Property 7: Registry/credential separation
312
+
313
+ The registry credential authorizes only image pull/push and never grants access to LLM provider credentials.
314
+
315
+ **Validates: Requirements 10.8**
316
+
317
+ ### Property 8: Privacy disclosure honesty
318
+
319
+ A run containing an external-LLM image never presents a blanket data-stays-local claim and surfaces a consent notice before data leaves.
320
+
321
+ **Validates: Requirements 12.1, 12.3**
322
+
323
+ ## Testing strategy
324
+
325
+ 1. **Locusfile validation** — minimal (source + image) accepted; raw-key rejection; `.env` gitignore/untracked guardrails; single-image shorthand expands to one stage.
326
+ 2. **Planner unit tests** — DAG construction, cycle detection, topological waves, edge type-check (compatible / minor-warn / major-fail), conformance gating per mode.
327
+ 3. **Interchange round-trip** — a `Locus_Artifact` written by one stage (Arrow) reads back identically in the next; lineage path resolves.
328
+ 4. **Cross-stage provenance** — a two-stage pipeline (extract → redact) where the final masked cell still resolves to the original source; reuses the engine's conformance assertions.
329
+ 5. **Backend tests** — process backend runs a stub image end-to-end; docker backend errors cleanly when Docker is absent (mocked).
330
+ 6. **Distribution** — pull/cache hit-miss against a mocked OCI client; private-image auth; version resolution default.
331
+ 7. **Serving/export** — FastAPI test client returns rows + provenance + engine mode; exporter writes the configured format.
332
+ 8. **Privacy** — consent fires before an external-LLM stage; no blanket local claim when such a stage is present.
333
+ 9. **Golden two-stage pipeline** — using two real engine-backed stub images, assert the typed DAG, provenance survival, and final emitted output end-to-end.
334
+
335
+ ## Key design decisions and rationale
336
+
337
+ - **Separate `locus` package depending on `locus_engine`.** Keeps the engine reusable and the runtime's heavier deps (Typer, FastAPI, oras, optional docker) out of the engine.
338
+ - **Artifacts pass by file path in a run workspace, not in memory.** Enables the Docker backend (cross-process), stage caching (content-addressed), and large datasets — and reuses the engine's Arrow/Parquet emitters.
339
+ - **Static type-check before any execution.** Composition is only trustworthy if mismatches fail fast; this is a pipeline-level type system over the fixed `ArtifactType` set.
340
+ - **Provenance enforced at build (certification) and plan (gating).** Reuses the Layer 1 conformance harness so the guarantee is identical end to end; non-conformant images are visible, never silent.
341
+ - **OCI/Harbor borrowed, CLI registry-agnostic.** No registry is built; the default hub is swappable to GHCR/ECR/self-hosted Harbor via config.
342
+
@@ -0,0 +1,120 @@
1
+ # Locus Image Catalog (working list)
2
+
3
+ > Candidate "capability images" to build and publish to Locus Hub. Each image contains
4
+ > LOGIC (parser + extractor + validator + emitter + pinned config); the USER brings their
5
+ > own data. Default engine = deterministic Python; LLM engine activates only when the user
6
+ > supplies a key (see architecture-notes.md). Every image inherits the cell-level
7
+ > provenance + faithfulness grounding contract from the Layer-1 engine.
8
+ >
9
+ > Status legend: [ ] not started | [~] in progress | [x] shipped
10
+ > This catalog belongs with the `locus-image-runtime` spec (earlier working name: `data-image-runtime`).
11
+
12
+ ## Guiding principle
13
+ - ~80% of enterprise data is unstructured; extraction (not the model) is the bottleneck.
14
+ - Highest recurring-spend jobs: invoices, receipts, bank statements, contracts, reports.
15
+ - DO NOT build breadth-first. Ship one fully-grounded image at a time.
16
+ - "Solve ALL data problems" is rejected as a goal — this list is the SUPERSET of what's
17
+ possible; discipline = ship incrementally starting with `doc-to-tables`.
18
+
19
+ ---
20
+
21
+ ## Tier 1 — Document -> Structured (highest, proven demand)
22
+ 1. [ ] `doc-to-tables` — tables from PDF/scan/Word -> rows. FIRST image to build.
23
+ 2. [ ] `invoice-extractor` — invoices -> fields + line items.
24
+ 3. [ ] `receipt-extractor` — receipts -> expense records.
25
+ 4. [ ] `bank-statement-extractor` — statements -> transaction tables.
26
+ 5. [ ] `resume-parser` — CVs -> candidate schema.
27
+ 6. [ ] `contract-extractor` — clauses, parties, dates, obligations, renewal terms.
28
+ 7. [ ] `form-extractor` — tax/customs/medical/court forms -> fields.
29
+ 8. [ ] `report-extractor` — annual/financial reports -> tables + KPIs.
30
+ 9. [ ] `scanned-ocr-to-table` — OCR-first for image-only scans (no text layer).
31
+
32
+ ## Tier 2 — Format Conversion & Reshaping (broad everyday demand)
33
+ 10. [ ] `any-to-json` — any source -> clean schema-stable JSON (dominant AI-ready format).
34
+ 11. [ ] `any-to-csv` / `any-to-parquet` — tabular export for analytics.
35
+ 12. [ ] `any-to-yaml` — config-shaped output.
36
+ 13. [ ] `any-to-markdown` — doc/web -> clean Markdown (standard RAG ingestion format).
37
+ 14. [ ] `json-reshaper` — JSON -> JSON remap/flatten/nest.
38
+ 15. [ ] `schema-mapper` — map source schema -> target schema (rename, type-align).
39
+
40
+ ## Tier 3 — Cleaning, Quality & Identity (the "data is messy" problems)
41
+ 16. [ ] `data-cleaner` — type coercion, normalization, missing-value handling.
42
+ 17. [ ] `deduplicator` / `entity-resolver` — match + merge same-real-world-entity records.
43
+ 18. [ ] `data-validator` — assert dataset vs rules/constraints, report violations.
44
+ 19. [ ] `data-enricher` — augment records with derived/looked-up fields.
45
+ 20. [ ] `normalizer` — standardize units, currencies, dates, addresses, phone numbers.
46
+
47
+ ## Tier 4 — RAG / AI-Prep (serves the "feed the LLM" goal directly)
48
+ 21. [ ] `chunker` — layout-aware chunking (make-or-break RAG step).
49
+ 22. [ ] `embedder` — chunk -> vector, output to Parquet / vector DB.
50
+ 23. [ ] `knowledge-graph-builder` — text -> entities + relationships (triples) for GraphRAG.
51
+ 24. [ ] `metadata-enricher` — LLM-generated metadata to improve retrieval.
52
+ 25. [ ] `qa-pair-generator` — corpus -> synthetic Q&A pairs for fine-tune/eval.
53
+
54
+ ## Tier 5 — Multimodal (next frontier; less crowded)
55
+ 26. [ ] `audio-to-table` — transcription + diarization -> structured transcript/records.
56
+ 27. [ ] `video-to-table` — frames + ASR -> structured events/timestamps.
57
+ 28. [ ] `image-to-data` — charts/diagrams/photos -> structured values.
58
+ 29. [ ] `email-extractor` — emails/threads -> structured fields.
59
+ 30. [ ] `log-parser` — app/system logs -> structured events.
60
+
61
+ ## Tier 6 — Unique / hard-problem images (Locus differentiation; lean on grounding engine)
62
+ 31. [ ] `pii-redactor` — detect + mask PII in structured + unstructured data.
63
+ 32. [ ] `provenance-tracker` — emit data with full cell-level lineage as the headline output.
64
+ 33. [ ] `fact-grounder` — score faithfulness of every cell in an EXISTING table vs sources
65
+ (validation-as-a-service). Flagship/marketing wedge.
66
+ 34. [ ] `synthetic-data-generator` — schema -> realistic synthetic rows, referential
67
+ integrity across multi-table schemas.
68
+ 35. [ ] `cross-doc-reconciler` — reconcile conflicting values across multiple docs, surface
69
+ the conflict with provenance.
70
+ 36. [ ] `table-joiner` — semantic/fuzzy join of tables lacking shared keys.
71
+ 37. [ ] `time-series-structurer` — messy event text -> clean time series (without flattening peaks).
72
+ 38. [ ] `taxonomy-classifier` — auto-classify records into a user-defined taxonomy tree.
73
+ 39. [ ] `change-diff` — diff two versions of a doc/dataset -> structured changeset.
74
+ 40. [ ] `anomaly-flagger` — flag outlier/suspect rows with reasons (quality + fraud signal).
75
+
76
+ ## Tier 7 — Domain-specific verticals (additional; high willingness-to-pay)
77
+ 41. [ ] `medical-record-extractor` — clinical notes/EHR text -> structured codes (ICD/SNOMED-style).
78
+ 42. [ ] `legal-doc-extractor` — case law / filings -> citations, holdings, parties.
79
+ 43. [ ] `scientific-paper-extractor` — papers -> structured methods/results tables, citations.
80
+ 44. [ ] `financial-filing-extractor` — 10-K/earnings -> normalized financial line items.
81
+ 45. [ ] `real-estate-doc-extractor` — leases/deeds -> property + terms schema.
82
+ 46. [ ] `insurance-claim-extractor` — claims/policies -> structured claim records.
83
+ 47. [ ] `shipping-logistics-extractor` — BOL / customs / manifests -> structured shipment data.
84
+ 48. [ ] `survey-response-structurer` — free-text survey answers -> coded categorical data.
85
+ 49. [ ] `product-catalog-extractor` — supplier sheets/specs -> normalized product attributes.
86
+ 50. [ ] `menu-extractor` — restaurant menus (img/PDF) -> items + prices + modifiers.
87
+
88
+ ## Tier 8 — Web & API sources (additional)
89
+ 51. [ ] `web-to-table` — web pages/listings -> structured rows (price/spec scraping).
90
+ 52. [ ] `api-to-table` — paginated REST/GraphQL responses -> flattened tables.
91
+ 53. [ ] `sitemap-crawler-to-corpus` — crawl a site -> clean corpus for downstream images.
92
+ 54. [ ] `social-feed-structurer` — posts/threads -> structured engagement/content records.
93
+ 55. [ ] `spreadsheet-normalizer` — messy human Excel (merged cells, multi-header) -> tidy table.
94
+
95
+ ## Tier 9 — Operational / pipeline-utility images (additional; glue + governance)
96
+ 56. [ ] `schema-inferer` — sample data -> proposed Pydantic schema (bootstraps other images).
97
+ 57. [ ] `data-profiler` — dataset -> stats/quality report (nulls, cardinality, distributions).
98
+ 58. [ ] `data-masker` / `anonymizer` — reversible/irreversible masking beyond PII (e.g. k-anonymity).
99
+ 59. [ ] `format-detector` — sniff + classify unknown file/content types for routing.
100
+ 60. [ ] `language-detector-translator` — detect language, optionally normalize to one language.
101
+ 61. [ ] `unit-test-data-extractor` — code repos -> structured fixtures/test data.
102
+ 62. [ ] `csv-repair` — fix broken/ragged CSVs (bad quoting, mixed delimiters, encoding).
103
+ 63. [ ] `encoding-normalizer` — normalize text encodings / mojibake repair.
104
+ 64. [ ] `dataset-merger` — union/merge heterogeneous datasets into one schema.
105
+ 65. [ ] `incremental-sync` — detect new/changed source records since last run (CDC-style).
106
+
107
+ ## Build-order recommendation (LOCKED priority)
108
+ 1. `doc-to-tables` (Tier 1 #1) — proves full engine end-to-end; everything reuses its plumbing.
109
+ 2. Tier-1 specializations `invoice-extractor`, `receipt-extractor` — base + baked-in schema; highest revenue.
110
+ 3. 2-3 conversion images (e.g. `any-to-json`, `any-to-markdown`) — high pull volume, low build cost; these drive adoption.
111
+ 4. One flagship unique image `fact-grounder` (or `provenance-tracker`) — the marketing wedge competitors lack.
112
+ 5. The remaining images (Tiers 3-9) follow once the base engine + packaging are battle-tested.
113
+
114
+ ## Notes / open
115
+ - Several Tier 6 images (`fact-grounder`, `provenance-tracker`, `cross-doc-reconciler`) are the
116
+ defensible differentiation — they ARE the grounding engine, packaged.
117
+ - Each image should advertise: engine mode support (deterministic / LLM / hybrid),
118
+ privacy class (local-only vs calls-external-LLM), and required vs inferred schema.
119
+ - Verticals (Tier 7) are mostly Tier-1 base + a baked-in domain schema + tuned prompts;
120
+ cheap to produce once base exists, high willingness-to-pay.