locus-etl 0.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- locus_etl-0.0.1/.github/workflows/ci.yml +36 -0
- locus_etl-0.0.1/.gitignore +24 -0
- locus_etl-0.0.1/.kiro/specs/locus-image-runtime/.config.kiro +1 -0
- locus_etl-0.0.1/.kiro/specs/locus-image-runtime/architecture-notes.md +37 -0
- locus_etl-0.0.1/.kiro/specs/locus-image-runtime/design.md +342 -0
- locus_etl-0.0.1/.kiro/specs/locus-image-runtime/image-catalog.md +120 -0
- locus_etl-0.0.1/.kiro/specs/locus-image-runtime/requirements.md +189 -0
- locus_etl-0.0.1/.kiro/specs/locus-image-runtime/tasks.md +161 -0
- locus_etl-0.0.1/.kiro/specs/unstructured-to-tabular-etl/.config.kiro +1 -0
- locus_etl-0.0.1/.kiro/specs/unstructured-to-tabular-etl/architecture-notes.md +683 -0
- locus_etl-0.0.1/.kiro/specs/unstructured-to-tabular-etl/design.md +657 -0
- locus_etl-0.0.1/.kiro/specs/unstructured-to-tabular-etl/requirements.md +248 -0
- locus_etl-0.0.1/.kiro/specs/unstructured-to-tabular-etl/tasks.md +282 -0
- locus_etl-0.0.1/.python-version +1 -0
- locus_etl-0.0.1/CHANGELOG.md +71 -0
- locus_etl-0.0.1/CONTRIBUTING.md +40 -0
- locus_etl-0.0.1/LICENSE +21 -0
- locus_etl-0.0.1/PKG-INFO +132 -0
- locus_etl-0.0.1/PUBLISHING.md +51 -0
- locus_etl-0.0.1/README.md +78 -0
- locus_etl-0.0.1/docs/architecture.png +0 -0
- locus_etl-0.0.1/docs/architecture.svg +4525 -0
- locus_etl-0.0.1/docs/generate_architecture_diagram.py +182 -0
- locus_etl-0.0.1/docs/requirements-diagram.txt +6 -0
- locus_etl-0.0.1/pyproject.toml +99 -0
- locus_etl-0.0.1/src/locus/__init__.py +12 -0
- locus_etl-0.0.1/src/locus/artifacts.py +73 -0
- locus_etl-0.0.1/src/locus/backends.py +91 -0
- locus_etl-0.0.1/src/locus/builder.py +113 -0
- locus_etl-0.0.1/src/locus/builtins.py +133 -0
- locus_etl-0.0.1/src/locus/catalog/__init__.py +53 -0
- locus_etl-0.0.1/src/locus/catalog/common.py +59 -0
- locus_etl-0.0.1/src/locus/catalog/converters.py +42 -0
- locus_etl-0.0.1/src/locus/catalog/extractors.py +84 -0
- locus_etl-0.0.1/src/locus/catalog/transforms.py +223 -0
- locus_etl-0.0.1/src/locus/cli.py +284 -0
- locus_etl-0.0.1/src/locus/emit_constants.py +11 -0
- locus_etl-0.0.1/src/locus/errors.py +69 -0
- locus_etl-0.0.1/src/locus/executor.py +115 -0
- locus_etl-0.0.1/src/locus/hub.py +181 -0
- locus_etl-0.0.1/src/locus/image.py +62 -0
- locus_etl-0.0.1/src/locus/loader.py +94 -0
- locus_etl-0.0.1/src/locus/locusfile.py +58 -0
- locus_etl-0.0.1/src/locus/manifest.py +40 -0
- locus_etl-0.0.1/src/locus/oci_store.py +191 -0
- locus_etl-0.0.1/src/locus/packaging.py +57 -0
- locus_etl-0.0.1/src/locus/planner.py +172 -0
- locus_etl-0.0.1/src/locus/privacy.py +62 -0
- locus_etl-0.0.1/src/locus/provenance.py +123 -0
- locus_etl-0.0.1/src/locus/runner.py +94 -0
- locus_etl-0.0.1/src/locus/seed.py +55 -0
- locus_etl-0.0.1/src/locus/serve.py +126 -0
- locus_etl-0.0.1/src/locus/store.py +184 -0
- locus_etl-0.0.1/src/locus/workspace.py +72 -0
- locus_etl-0.0.1/src/locus_engine/__init__.py +178 -0
- locus_etl-0.0.1/src/locus_engine/clean/__init__.py +8 -0
- locus_etl-0.0.1/src/locus_engine/clean/cleaner.py +129 -0
- locus_etl-0.0.1/src/locus_engine/clean/dedup.py +87 -0
- locus_etl-0.0.1/src/locus_engine/composer.py +90 -0
- locus_etl-0.0.1/src/locus_engine/config.py +64 -0
- locus_etl-0.0.1/src/locus_engine/conformance.py +143 -0
- locus_etl-0.0.1/src/locus_engine/connectors/__init__.py +9 -0
- locus_etl-0.0.1/src/locus_engine/connectors/files.py +62 -0
- locus_etl-0.0.1/src/locus_engine/connectors/http.py +92 -0
- locus_etl-0.0.1/src/locus_engine/connectors/sql.py +74 -0
- locus_etl-0.0.1/src/locus_engine/emit/__init__.py +9 -0
- locus_etl-0.0.1/src/locus_engine/emit/common.py +39 -0
- locus_etl-0.0.1/src/locus_engine/emit/dataframe.py +40 -0
- locus_etl-0.0.1/src/locus_engine/emit/parquet.py +42 -0
- locus_etl-0.0.1/src/locus_engine/emit/sql.py +85 -0
- locus_etl-0.0.1/src/locus_engine/errors.py +77 -0
- locus_etl-0.0.1/src/locus_engine/extract/__init__.py +7 -0
- locus_etl-0.0.1/src/locus_engine/extract/deterministic.py +116 -0
- locus_etl-0.0.1/src/locus_engine/extract/dual.py +89 -0
- locus_etl-0.0.1/src/locus_engine/ir.py +55 -0
- locus_etl-0.0.1/src/locus_engine/lineage.py +81 -0
- locus_etl-0.0.1/src/locus_engine/llm/__init__.py +13 -0
- locus_etl-0.0.1/src/locus_engine/llm/credentials.py +111 -0
- locus_etl-0.0.1/src/locus_engine/llm/engine.py +81 -0
- locus_etl-0.0.1/src/locus_engine/llm/router.py +57 -0
- locus_etl-0.0.1/src/locus_engine/observability.py +119 -0
- locus_etl-0.0.1/src/locus_engine/parsers/__init__.py +11 -0
- locus_etl-0.0.1/src/locus_engine/parsers/csv_parser.py +62 -0
- locus_etl-0.0.1/src/locus_engine/parsers/html.py +127 -0
- locus_etl-0.0.1/src/locus_engine/parsers/pdf.py +106 -0
- locus_etl-0.0.1/src/locus_engine/parsers/records.py +54 -0
- locus_etl-0.0.1/src/locus_engine/parsers/router.py +40 -0
- locus_etl-0.0.1/src/locus_engine/pipeline.py +228 -0
- locus_etl-0.0.1/src/locus_engine/plugins.py +145 -0
- locus_etl-0.0.1/src/locus_engine/provenance.py +85 -0
- locus_etl-0.0.1/src/locus_engine/registry.py +89 -0
- locus_etl-0.0.1/src/locus_engine/results.py +56 -0
- locus_etl-0.0.1/src/locus_engine/review.py +108 -0
- locus_etl-0.0.1/src/locus_engine/schema_infer.py +62 -0
- locus_etl-0.0.1/src/locus_engine/table.py +63 -0
- locus_etl-0.0.1/src/locus_engine/validate/__init__.py +7 -0
- locus_etl-0.0.1/src/locus_engine/validate/grounding.py +113 -0
- locus_etl-0.0.1/tests/__init__.py +0 -0
- locus_etl-0.0.1/tests/fixtures/_make_pdf.py +68 -0
- locus_etl-0.0.1/tests/fixtures/dupes.csv +4 -0
- locus_etl-0.0.1/tests/fixtures/invoices.csv +4 -0
- locus_etl-0.0.1/tests/fixtures/sample.pdf +39 -0
- locus_etl-0.0.1/tests/runtime/__init__.py +0 -0
- locus_etl-0.0.1/tests/runtime/test_backends_privacy.py +97 -0
- locus_etl-0.0.1/tests/runtime/test_builder.py +69 -0
- locus_etl-0.0.1/tests/runtime/test_catalog.py +128 -0
- locus_etl-0.0.1/tests/runtime/test_cli_scaffold.py +76 -0
- locus_etl-0.0.1/tests/runtime/test_cross_stage_provenance.py +71 -0
- locus_etl-0.0.1/tests/runtime/test_discovery.py +51 -0
- locus_etl-0.0.1/tests/runtime/test_executor.py +36 -0
- locus_etl-0.0.1/tests/runtime/test_hub.py +70 -0
- locus_etl-0.0.1/tests/runtime/test_loader.py +81 -0
- locus_etl-0.0.1/tests/runtime/test_models.py +97 -0
- locus_etl-0.0.1/tests/runtime/test_oci_store.py +147 -0
- locus_etl-0.0.1/tests/runtime/test_planner.py +149 -0
- locus_etl-0.0.1/tests/runtime/test_properties_runtime.py +161 -0
- locus_etl-0.0.1/tests/runtime/test_run_single.py +65 -0
- locus_etl-0.0.1/tests/runtime/test_seed.py +34 -0
- locus_etl-0.0.1/tests/runtime/test_serve.py +56 -0
- locus_etl-0.0.1/tests/runtime/test_store.py +108 -0
- locus_etl-0.0.1/tests/test_clean.py +86 -0
- locus_etl-0.0.1/tests/test_composer.py +79 -0
- locus_etl-0.0.1/tests/test_config.py +49 -0
- locus_etl-0.0.1/tests/test_conformance.py +71 -0
- locus_etl-0.0.1/tests/test_connectors.py +34 -0
- locus_etl-0.0.1/tests/test_emit.py +60 -0
- locus_etl-0.0.1/tests/test_errors.py +57 -0
- locus_etl-0.0.1/tests/test_extract.py +67 -0
- locus_etl-0.0.1/tests/test_grounding.py +70 -0
- locus_etl-0.0.1/tests/test_ir.py +46 -0
- locus_etl-0.0.1/tests/test_lineage.py +70 -0
- locus_etl-0.0.1/tests/test_llm.py +157 -0
- locus_etl-0.0.1/tests/test_observability.py +49 -0
- locus_etl-0.0.1/tests/test_parsers.py +64 -0
- locus_etl-0.0.1/tests/test_pdf.py +47 -0
- locus_etl-0.0.1/tests/test_pipeline_clean_dedup.py +49 -0
- locus_etl-0.0.1/tests/test_pipeline_e2e.py +70 -0
- locus_etl-0.0.1/tests/test_pipeline_llm.py +87 -0
- locus_etl-0.0.1/tests/test_properties.py +122 -0
- locus_etl-0.0.1/tests/test_provenance.py +57 -0
- locus_etl-0.0.1/tests/test_registry.py +73 -0
- locus_etl-0.0.1/tests/test_results.py +34 -0
- locus_etl-0.0.1/tests/test_review.py +70 -0
- locus_etl-0.0.1/tests/test_schema_infer.py +35 -0
- locus_etl-0.0.1/tests/test_stage9.py +148 -0
- locus_etl-0.0.1/tests/test_table.py +51 -0
- locus_etl-0.0.1/uv.lock +5177 -0
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
pull_request:
|
|
7
|
+
branches: [main]
|
|
8
|
+
|
|
9
|
+
jobs:
|
|
10
|
+
test:
|
|
11
|
+
runs-on: ubuntu-latest
|
|
12
|
+
strategy:
|
|
13
|
+
matrix:
|
|
14
|
+
python-version: ["3.11", "3.12"]
|
|
15
|
+
steps:
|
|
16
|
+
- uses: actions/checkout@v4
|
|
17
|
+
|
|
18
|
+
- name: Install uv
|
|
19
|
+
uses: astral-sh/setup-uv@v5
|
|
20
|
+
with:
|
|
21
|
+
enable-cache: true
|
|
22
|
+
|
|
23
|
+
- name: Set Python version
|
|
24
|
+
run: uv python install ${{ matrix.python-version }}
|
|
25
|
+
|
|
26
|
+
- name: Sync dependencies
|
|
27
|
+
run: uv sync --extra dedup --extra pdf --extra serve
|
|
28
|
+
|
|
29
|
+
- name: Ruff (lint)
|
|
30
|
+
run: uv run ruff check src tests
|
|
31
|
+
|
|
32
|
+
- name: Mypy (types)
|
|
33
|
+
run: uv run mypy
|
|
34
|
+
|
|
35
|
+
- name: Pytest (with coverage)
|
|
36
|
+
run: uv run pytest --cov --cov-report=term-missing
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
# Diagram-generation virtualenv (transient build tooling)
|
|
2
|
+
.diagram-venv/
|
|
3
|
+
|
|
4
|
+
# Python
|
|
5
|
+
__pycache__/
|
|
6
|
+
*.py[cod]
|
|
7
|
+
.venv/
|
|
8
|
+
venv/
|
|
9
|
+
.mypy_cache/
|
|
10
|
+
.pytest_cache/
|
|
11
|
+
.ruff_cache/
|
|
12
|
+
*.egg-info/
|
|
13
|
+
htmlcov/
|
|
14
|
+
.coverage
|
|
15
|
+
|
|
16
|
+
# OS
|
|
17
|
+
.DS_Store
|
|
18
|
+
|
|
19
|
+
# Local scratch / demo space (not committed)
|
|
20
|
+
scratch/
|
|
21
|
+
|
|
22
|
+
# Build artifacts
|
|
23
|
+
dist/
|
|
24
|
+
build/
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"specId": "2c1324e3-b19f-554d-a057-476f4bf65262", "workflowType": "requirements-first", "specType": "feature"}
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
# Architecture & Vision Notes — Layer 2 (locus-image-runtime)
|
|
2
|
+
|
|
3
|
+
> The canonical, cross-layer decision log lives in the sibling spec:
|
|
4
|
+
> `.kiro/specs/unstructured-to-tabular-etl/architecture-notes.md`
|
|
5
|
+
> Read that file for the full decision trail (project name, runtime model, key handling,
|
|
6
|
+
> LiteLLM, composition solutions, Hub/Harbor, privacy model, etc.). It spans BOTH layers.
|
|
7
|
+
> This file only records what is specific to Layer 2 navigation.
|
|
8
|
+
|
|
9
|
+
## What this spec owns (Layer 2 = packaging / distribution / runtime)
|
|
10
|
+
- Client install + invocation (single CLI, local-process default, Docker optional backend).
|
|
11
|
+
- Image pull + local cache; version resolution.
|
|
12
|
+
- Locusfile run-configuration surface (source, volumes, ports, schema ref, llm, export,
|
|
13
|
+
review) — minimal required fields = source + image.
|
|
14
|
+
- Local LLM credential handling (.env primary; env var / keyring fallbacks; raw-key
|
|
15
|
+
prohibition + gitignore guardrails).
|
|
16
|
+
- Multi-image composition = pipeline DAG (NOT docker-compose semantics; compose-FAMILIAR
|
|
17
|
+
syntax only). `needs:`-style dependency edges, parallel branches, cycle detection,
|
|
18
|
+
stage output caching.
|
|
19
|
+
- Stage interchange contract: single versioned `Locus_Artifact` envelope, fixed
|
|
20
|
+
Artifact_Type set, Arrow/Parquet payloads, static pre-run type check (fail fast).
|
|
21
|
+
- Cross-stage provenance propagation: framework-managed (SDK) lineage, run-scoped
|
|
22
|
+
append-only Lineage_Store, conformance certification, strict/permissive non-conformant.
|
|
23
|
+
- Result serving / preview UI / export (local, on mapped port).
|
|
24
|
+
- Image authoring + building (Image_Manifest, dependency pinning, conformance cert).
|
|
25
|
+
- Publishing to Locus Hub: public + private (Harbor), namespaces, RBAC, self-host path.
|
|
26
|
+
- Image discovery; runtime privacy disclosure / consent.
|
|
27
|
+
|
|
28
|
+
## What the SIBLING spec (`unstructured-to-tabular-etl`, Layer 1) owns
|
|
29
|
+
- The processing engine an Image embeds: connectors, parser routing, IR, schema-driven
|
|
30
|
+
extraction, cleaning, dedup, the cell-level grounding/faithfulness contract, HITL review,
|
|
31
|
+
plugin architecture, observability, dual-engine (deterministic default + LLM opt-in),
|
|
32
|
+
LLM guardrails. Req 1-15 there.
|
|
33
|
+
|
|
34
|
+
## Key cross-references
|
|
35
|
+
- Layer 2 Req 6 (interchange) + Req 7 (cross-stage provenance) depend on Layer 1 Req 3 (IR)
|
|
36
|
+
and Req 8 (emission/provenance) being the STABLE VERSIONED contract.
|
|
37
|
+
- `image-catalog.md` (this folder) = 65 candidate images / 9 tiers / locked build order.
|
|
@@ -0,0 +1,342 @@
|
|
|
1
|
+
# Design Document
|
|
2
|
+
|
|
3
|
+
## Overview
|
|
4
|
+
|
|
5
|
+
This document specifies the design of **Layer 2 of Locus: the image runtime, packaging, distribution, and registry**. It builds on the Layer 1 engine (`unstructured-to-tabular-etl`, now implemented as the `locus_engine` package) and turns engine capabilities into reusable, versioned **images** that users pull, point at their own data via a **Locusfile**, run locally, preview, and export. Multiple images compose into a **pipeline DAG** with a versioned interchange contract and cross-stage provenance.
|
|
6
|
+
|
|
7
|
+
This layer is a separate installable package, `locus` (the CLI), which depends on `locus_engine`. The engine stays free of any packaging/registry concern; the runtime orchestrates engine-backed images.
|
|
8
|
+
|
|
9
|
+
### Tech baseline (inherited + additions)
|
|
10
|
+
|
|
11
|
+
- **Python 3.11+**, `uv` + `pyproject.toml`, Pydantic v2 (same as Layer 1).
|
|
12
|
+
- **CLI:** Typer (argument parsing, subcommands) + Rich (terminal output).
|
|
13
|
+
- **Registry transport:** ORAS-style OCI client (`oras` Python library) against Harbor; the CLI is registry-agnostic OCI so Harbor is swappable.
|
|
14
|
+
- **Artifact serialization:** Apache Arrow / Parquet for tabular payloads (already produced by the engine's emitters).
|
|
15
|
+
- **Result UI:** embedded FastAPI + a static front end, served locally via uvicorn.
|
|
16
|
+
- **Optional Docker backend:** the `docker` Python SDK, used only when `--runtime=docker`.
|
|
17
|
+
|
|
18
|
+
### Design principles
|
|
19
|
+
|
|
20
|
+
1. **The CLI is the only install.** A single `pip install locus-etl`. No daemon, no Linux VM by default (Req 1).
|
|
21
|
+
2. **Local-first, registry-agnostic.** Runs on the user's machine; talks plain OCI so the default Harbor hub is swappable (architecture notes).
|
|
22
|
+
3. **Composition is a typed DAG, not docker-compose.** Compose-familiar YAML, pipeline execution semantics (Req 5).
|
|
23
|
+
4. **Trust is enforced at the seams.** Static type-check across stage edges before running (Req 6); provenance composes across stages or the build fails (Req 7).
|
|
24
|
+
5. **The engine does the work; the runtime orchestrates.** No data transformation logic lives here — it delegates to `locus_engine`.
|
|
25
|
+
|
|
26
|
+
## Architecture
|
|
27
|
+
|
|
28
|
+
```mermaid
|
|
29
|
+
flowchart TB
|
|
30
|
+
subgraph cli["locus CLI (Typer)"]
|
|
31
|
+
PULL[pull]
|
|
32
|
+
RUN[run]
|
|
33
|
+
BUILD[build]
|
|
34
|
+
PUSH[push]
|
|
35
|
+
SEARCH[search]
|
|
36
|
+
INIT[init]
|
|
37
|
+
end
|
|
38
|
+
|
|
39
|
+
subgraph rt["Runtime"]
|
|
40
|
+
LF[LocusfileLoader\n+ validation]
|
|
41
|
+
DAG[PipelinePlanner\nDAG + static type-check]
|
|
42
|
+
EXEC[StageExecutor\nlocal-process | docker]
|
|
43
|
+
PROV[CrossStageProvenance\n+ run LineageStore]
|
|
44
|
+
WS[(Run_Workspace\nArrow artifacts)]
|
|
45
|
+
end
|
|
46
|
+
|
|
47
|
+
subgraph img["Image"]
|
|
48
|
+
MAN[Image_Manifest\naccepts/emits, engine modes, privacy]
|
|
49
|
+
CODE[engine-backed capability code]
|
|
50
|
+
end
|
|
51
|
+
|
|
52
|
+
subgraph dist["Distribution"]
|
|
53
|
+
CACHE[(local image cache)]
|
|
54
|
+
ORAS[OCI client / ORAS]
|
|
55
|
+
HUB[(Locus Hub / Harbor)]
|
|
56
|
+
end
|
|
57
|
+
|
|
58
|
+
subgraph serve["Serving"]
|
|
59
|
+
UI[Result UI\nFastAPI + static]
|
|
60
|
+
EXP[Exporter]
|
|
61
|
+
end
|
|
62
|
+
|
|
63
|
+
ENG[[locus_engine\nLayer 1]]
|
|
64
|
+
|
|
65
|
+
RUN --> LF --> DAG --> EXEC
|
|
66
|
+
EXEC --> img
|
|
67
|
+
img --> ENG
|
|
68
|
+
EXEC <--> WS
|
|
69
|
+
EXEC --> PROV
|
|
70
|
+
PULL --> ORAS <--> HUB
|
|
71
|
+
PULL --> CACHE
|
|
72
|
+
BUILD --> MAN
|
|
73
|
+
PUSH --> ORAS
|
|
74
|
+
SEARCH --> HUB
|
|
75
|
+
EXEC --> UI
|
|
76
|
+
EXEC --> EXP
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
The CLI dispatches subcommands. `run` is the core path: load + validate the Locusfile, plan the DAG (resolving and type-checking every stage edge), then execute stages in dependency order through the selected runtime backend, materializing typed `Locus_Artifact`s in the run workspace and threading provenance through a run-scoped lineage store. `pull`/`push`/`search` talk OCI to Harbor. `build` packages an image from its manifest and certifies provenance conformance.
|
|
80
|
+
|
|
81
|
+
## Data Models
|
|
82
|
+
|
|
83
|
+
All models are Pydantic v2. The `Locus_Artifact`, `Artifact_Type`, and provenance structures align with the engine's `ProvenancedTable`/`IntermediateRepresentation` so the contract is shared, not re-invented.
|
|
84
|
+
|
|
85
|
+
```python
|
|
86
|
+
from __future__ import annotations
|
|
87
|
+
from enum import StrEnum
|
|
88
|
+
from typing import Any
|
|
89
|
+
from pydantic import BaseModel, Field
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
class ArtifactKind(StrEnum):
|
|
93
|
+
IR = "ir"
|
|
94
|
+
TABLE = "table"
|
|
95
|
+
CHUNKS = "chunks"
|
|
96
|
+
EMBEDDINGS = "embeddings"
|
|
97
|
+
GRAPH = "graph"
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
class ArtifactType(BaseModel):
|
|
101
|
+
"""A versioned artifact type, e.g. kind=table, major=1, minor=0 -> tag 'table/v1'."""
|
|
102
|
+
kind: ArtifactKind
|
|
103
|
+
major: int = 1
|
|
104
|
+
minor: int = 0
|
|
105
|
+
|
|
106
|
+
def tag(self) -> str:
|
|
107
|
+
return f"{self.kind.value}/v{self.major}"
|
|
108
|
+
|
|
109
|
+
def compatible_with(self, consumer: "ArtifactType") -> "Compat":
|
|
110
|
+
if self.kind != consumer.kind or self.major != consumer.major:
|
|
111
|
+
return Compat.INCOMPATIBLE
|
|
112
|
+
if self.minor != consumer.minor:
|
|
113
|
+
return Compat.MINOR_DIFF
|
|
114
|
+
return Compat.OK
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
class Compat(StrEnum):
|
|
118
|
+
OK = "ok"
|
|
119
|
+
MINOR_DIFF = "minor_diff" # compatible with warning (Req 6.6)
|
|
120
|
+
INCOMPATIBLE = "incompatible"
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
class LocusArtifact(BaseModel):
|
|
124
|
+
"""The only structure that crosses a stage boundary (Req 6.1)."""
|
|
125
|
+
type: ArtifactType
|
|
126
|
+
payload_path: str # Arrow/Parquet file in the Run_Workspace (Req 6.2)
|
|
127
|
+
lineage_path: str | None = None # serialized lineage graph for this artifact
|
|
128
|
+
produced_by: str # stage id
|
|
129
|
+
engine_mode: str = "deterministic" # "deterministic" | "llm"
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
class PrivacyClass(StrEnum):
|
|
133
|
+
LOCAL_ONLY = "local_only"
|
|
134
|
+
CALLS_EXTERNAL = "calls_external" # an LLM-backed image (Req 12.2)
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
class ImageManifest(BaseModel):
|
|
138
|
+
"""Build-time declaration of an image (Req 9.2)."""
|
|
139
|
+
name: str
|
|
140
|
+
version: str
|
|
141
|
+
description: str = ""
|
|
142
|
+
accepts: list[ArtifactType] = Field(default_factory=list)
|
|
143
|
+
emits: ArtifactType
|
|
144
|
+
engine_modes: list[str] = Field(default_factory=lambda: ["deterministic"])
|
|
145
|
+
privacy_class: PrivacyClass = PrivacyClass.LOCAL_ONLY
|
|
146
|
+
provenance_conformant: bool = False # set by build-time certification (Req 9.4)
|
|
147
|
+
entrypoint: str # import path of the capability callable
|
|
148
|
+
dependencies: dict[str, str] = Field(default_factory=dict) # pinned (Req 9.3)
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
class StageSpec(BaseModel):
|
|
152
|
+
"""One stage in a Locusfile pipeline."""
|
|
153
|
+
id: str
|
|
154
|
+
image: str # "name:version"
|
|
155
|
+
needs: list[str] = Field(default_factory=list) # dependency ids (Req 5.4)
|
|
156
|
+
source: "SourceSpec | None" = None # override pipeline source
|
|
157
|
+
config: dict[str, Any] = Field(default_factory=dict)
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
class SourceSpec(BaseModel):
|
|
161
|
+
type: str # "files" | "url" | "api" | "sql"
|
|
162
|
+
path: str | None = None
|
|
163
|
+
uri: str | None = None
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
class PortSpec(BaseModel):
|
|
167
|
+
ui: int | None = None
|
|
168
|
+
api: int | None = None
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
class Locusfile(BaseModel):
|
|
172
|
+
"""The run-configuration surface (Req 3)."""
|
|
173
|
+
image: str | None = None # single-image shorthand
|
|
174
|
+
source: SourceSpec | None = None
|
|
175
|
+
pipeline: list[StageSpec] = Field(default_factory=list)
|
|
176
|
+
volumes: list[str] = Field(default_factory=list)
|
|
177
|
+
ports: PortSpec = Field(default_factory=PortSpec)
|
|
178
|
+
schema_mode: str = "infer"
|
|
179
|
+
schema_ref: str | None = None
|
|
180
|
+
llm: dict[str, Any] | None = None
|
|
181
|
+
env_file: str | None = None
|
|
182
|
+
export: dict[str, Any] | None = None
|
|
183
|
+
review: dict[str, Any] | None = None
|
|
184
|
+
mode: str = "strict" # "strict" | "permissive" (Req 7.7/7.8)
|
|
185
|
+
```
|
|
186
|
+
|
|
187
|
+
## Components and Interfaces
|
|
188
|
+
|
|
189
|
+
### CLI commands (Typer)
|
|
190
|
+
|
|
191
|
+
```python
|
|
192
|
+
# locus <command>
|
|
193
|
+
def pull(image: str) -> None: ... # Req 2
|
|
194
|
+
def run(locusfile: str = "locusfile.yaml",
|
|
195
|
+
runtime: str = "process") -> None: ... # Req 1, 5, 8
|
|
196
|
+
def build(manifest: str = "locus.image.yaml") -> None: ... # Req 9
|
|
197
|
+
def push(image: str, private: bool = False) -> None: ... # Req 10
|
|
198
|
+
def search(query: str = "") -> None: ... # Req 11
|
|
199
|
+
def login(registry: str | None = None) -> None: ... # Req 10.1
|
|
200
|
+
def init(path: str = ".") -> None: ... # Req 4.4 (.env + gitignore)
|
|
201
|
+
def config_set_key(provider: str, key: str) -> None: ... # keyring (Req 4.1)
|
|
202
|
+
```
|
|
203
|
+
|
|
204
|
+
### LocusfileLoader
|
|
205
|
+
|
|
206
|
+
Loads YAML, validates against the `Locusfile` model, performs credential safety checks (reject raw keys, enforce `.env` gitignore/untracked — Req 4.3/4.5), and resolves a single-image shorthand into a one-stage pipeline. Raises `ConfigError` with the offending setting (Req 3.7).
|
|
207
|
+
|
|
208
|
+
### PipelinePlanner
|
|
209
|
+
|
|
210
|
+
```python
|
|
211
|
+
class PipelinePlanner:
|
|
212
|
+
def plan(self, lf: Locusfile, manifests: dict[str, ImageManifest]) -> PipelinePlan:
|
|
213
|
+
"""Build the DAG, detect cycles (Req 5.6), topologically order stages, and
|
|
214
|
+
STATIC-CHECK every edge's artifact-type compatibility BEFORE execution
|
|
215
|
+
(Req 6.4/6.5). A major-version mismatch fails; a minor mismatch warns
|
|
216
|
+
(Req 6.6). Also verifies provenance-conformance per mode (Req 7.7/7.8)."""
|
|
217
|
+
```
|
|
218
|
+
|
|
219
|
+
`PipelinePlan` holds the ordered stages, parallelizable groups (waves of independent stages, Req 5.5), and the validated edge map. Planning fails fast: no stage runs if any edge is incompatible or a cycle exists.
|
|
220
|
+
|
|
221
|
+
### StageExecutor and Runtime backends
|
|
222
|
+
|
|
223
|
+
```python
|
|
224
|
+
class RuntimeBackend(Protocol):
|
|
225
|
+
name: str
|
|
226
|
+
def run_stage(self, image: ResolvedImage, inputs: list[LocusArtifact],
|
|
227
|
+
ctx: StageContext) -> LocusArtifact: ...
|
|
228
|
+
|
|
229
|
+
class ProcessBackend: # default (Req 1.3) — per-image venv, in-process engine call
|
|
230
|
+
...
|
|
231
|
+
class DockerBackend: # optional (Req 1.4); errors clearly if Docker absent (Req 1.5)
|
|
232
|
+
...
|
|
233
|
+
```
|
|
234
|
+
|
|
235
|
+
`StageExecutor` walks the plan wave by wave: gathers each stage's input artifacts (the pipeline source for root stages, dependency outputs otherwise — Req 5.3/5.4), invokes the backend, writes the output `Locus_Artifact` (Arrow file) to the run workspace, and records lineage. Stage caching (Req 5.8): if a stage's input artifact hashes and image version match a prior run, reuse the cached output.
|
|
236
|
+
|
|
237
|
+
### CrossStageProvenance
|
|
238
|
+
|
|
239
|
+
Wraps the engine's `ProvenanceComposer`/`LineageStore`. As artifacts flow between stages, cell ids and lineage edges are preserved in a run-scoped `Lineage_Store`; the terminal artifact's cells resolve to original `Source_Location`s (Req 7.5/7.6). Conformance is checked at build time (Req 9.4) and enforced at plan time per mode (Req 7.7/7.8).
|
|
240
|
+
|
|
241
|
+
### Distribution: OCI client + cache
|
|
242
|
+
|
|
243
|
+
```python
|
|
244
|
+
class ImageStore:
|
|
245
|
+
def pull(self, ref: str) -> ResolvedImage: ... # ORAS pull -> cache (Req 2.1)
|
|
246
|
+
def cached(self, ref: str) -> ResolvedImage | None: ... # (Req 2.2)
|
|
247
|
+
def push(self, image: BuiltImage, *, private: bool) -> None: ... # (Req 10)
|
|
248
|
+
def search(self, query: str) -> list[ImageSummary]: ... # (Req 11)
|
|
249
|
+
```
|
|
250
|
+
|
|
251
|
+
Images are OCI artifacts: the manifest + a layer with the capability code + pinned deps. Private/public visibility maps to Harbor project visibility (Req 10.3/10.4); pulls authenticate via the `Registry_Credential`, which is wholly separate from LLM keys (Req 10.8).
|
|
252
|
+
|
|
253
|
+
### ImageBuilder
|
|
254
|
+
|
|
255
|
+
Packages an `ImageManifest` + entrypoint into a `BuiltImage`, pins dependency versions (Req 9.3), and runs the engine's `conformance` harness (`probe_extraction_engine`) against the capability to set `provenance_conformant` (Req 9.4/9.5).
|
|
256
|
+
|
|
257
|
+
### Result serving and export
|
|
258
|
+
|
|
259
|
+
`ResultServer` (FastAPI) serves the final artifact on the mapped UI port (Req 8.1): rows, per-cell faithfulness + source location, flagged rows, and the engine mode (Req 8.2/8.3), reading directly from the engine's emitted DataFrame + `_lineage` column. `Exporter` writes the configured format (Req 8.4). Both are local; no hosted service (Req 8.5).
|
|
260
|
+
|
|
261
|
+
## Error Handling
|
|
262
|
+
|
|
263
|
+
| Class | Examples | Behavior |
|
|
264
|
+
|-------|----------|----------|
|
|
265
|
+
| Plan-fatal, fail-fast | cycle in DAG, artifact-type mismatch, non-conformant stage in strict mode, missing image | raise before any stage runs (Req 5.6, 6.5, 7.7, 2.4) |
|
|
266
|
+
| Config | invalid Locusfile, raw key in file, git-tracked `.env` | reject with offending setting (Req 3.7, 4.3, 4.5) |
|
|
267
|
+
| Backend | Docker selected but absent | error identifying the backend; no silent fallback (Req 1.5) |
|
|
268
|
+
| Permissive degradation | non-conformant stage in permissive mode | continue; mark downstream cells lineage-broken (Req 7.8) |
|
|
269
|
+
| Runtime privacy | external-LLM stage about to send data | consent notice before egress (Req 12.1) |
|
|
270
|
+
|
|
271
|
+
Errors derive from a `LocusRuntimeError` hierarchy, distinct from but mirroring the engine's `LocusError`.
|
|
272
|
+
|
|
273
|
+
## Correctness Properties
|
|
274
|
+
|
|
275
|
+
### Property 1: Fail-fast type safety
|
|
276
|
+
|
|
277
|
+
For any composed pipeline, no stage executes unless every dependency edge is artifact-type compatible (major version equal, kind equal). An incompatible edge raises before execution.
|
|
278
|
+
|
|
279
|
+
**Validates: Requirements 6.4, 6.5**
|
|
280
|
+
|
|
281
|
+
### Property 2: Acyclic execution
|
|
282
|
+
|
|
283
|
+
A pipeline graph containing a cycle never executes any stage and reports the cycle.
|
|
284
|
+
|
|
285
|
+
**Validates: Requirements 5.6**
|
|
286
|
+
|
|
287
|
+
### Property 3: Dependency ordering
|
|
288
|
+
|
|
289
|
+
A stage executes only after all stages in its `needs` have completed, and its inputs are exactly those dependencies' outputs (or the pipeline source for root stages).
|
|
290
|
+
|
|
291
|
+
**Validates: Requirements 5.3, 5.4**
|
|
292
|
+
|
|
293
|
+
### Property 4: End-to-end provenance survival
|
|
294
|
+
|
|
295
|
+
Every cell in the terminal artifact resolves, through the run lineage store, to at least one originating `Source_Location`, across all intermediate stages.
|
|
296
|
+
|
|
297
|
+
**Validates: Requirements 7.1, 7.5, 7.6**
|
|
298
|
+
|
|
299
|
+
### Property 5: Conformance gating
|
|
300
|
+
|
|
301
|
+
In strict mode a non-conformant stage fails the run; in permissive mode it continues and the affected downstream cells are marked lineage-broken.
|
|
302
|
+
|
|
303
|
+
**Validates: Requirements 7.7, 7.8**
|
|
304
|
+
|
|
305
|
+
### Property 6: Credential locality
|
|
306
|
+
|
|
307
|
+
No LLM credential or user data is transmitted to any hosted Locus service; credentials resolve only from local sources and a raw key in the Locusfile is rejected.
|
|
308
|
+
|
|
309
|
+
**Validates: Requirements 4.1, 4.2, 4.3**
|
|
310
|
+
|
|
311
|
+
### Property 7: Registry/credential separation
|
|
312
|
+
|
|
313
|
+
The registry credential authorizes only image pull/push and never grants access to LLM provider credentials.
|
|
314
|
+
|
|
315
|
+
**Validates: Requirements 10.8**
|
|
316
|
+
|
|
317
|
+
### Property 8: Privacy disclosure honesty
|
|
318
|
+
|
|
319
|
+
A run containing an external-LLM image never presents a blanket data-stays-local claim and surfaces a consent notice before data leaves.
|
|
320
|
+
|
|
321
|
+
**Validates: Requirements 12.1, 12.3**
|
|
322
|
+
|
|
323
|
+
## Testing strategy
|
|
324
|
+
|
|
325
|
+
1. **Locusfile validation** — minimal (source + image) accepted; raw-key rejection; `.env` gitignore/untracked guardrails; single-image shorthand expands to one stage.
|
|
326
|
+
2. **Planner unit tests** — DAG construction, cycle detection, topological waves, edge type-check (compatible / minor-warn / major-fail), conformance gating per mode.
|
|
327
|
+
3. **Interchange round-trip** — a `Locus_Artifact` written by one stage (Arrow) reads back identically in the next; lineage path resolves.
|
|
328
|
+
4. **Cross-stage provenance** — a two-stage pipeline (extract → redact) where the final masked cell still resolves to the original source; reuses the engine's conformance assertions.
|
|
329
|
+
5. **Backend tests** — process backend runs a stub image end-to-end; docker backend errors cleanly when Docker is absent (mocked).
|
|
330
|
+
6. **Distribution** — pull/cache hit-miss against a mocked OCI client; private-image auth; version resolution default.
|
|
331
|
+
7. **Serving/export** — FastAPI test client returns rows + provenance + engine mode; exporter writes the configured format.
|
|
332
|
+
8. **Privacy** — consent fires before an external-LLM stage; no blanket local claim when such a stage is present.
|
|
333
|
+
9. **Golden two-stage pipeline** — using two real engine-backed stub images, assert the typed DAG, provenance survival, and final emitted output end-to-end.
|
|
334
|
+
|
|
335
|
+
## Key design decisions and rationale
|
|
336
|
+
|
|
337
|
+
- **Separate `locus` package depending on `locus_engine`.** Keeps the engine reusable and the runtime's heavier deps (Typer, FastAPI, oras, optional docker) out of the engine.
|
|
338
|
+
- **Artifacts pass by file path in a run workspace, not in memory.** Enables the Docker backend (cross-process), stage caching (content-addressed), and large datasets — and reuses the engine's Arrow/Parquet emitters.
|
|
339
|
+
- **Static type-check before any execution.** Composition is only trustworthy if mismatches fail fast; this is a pipeline-level type system over the fixed `ArtifactType` set.
|
|
340
|
+
- **Provenance enforced at build (certification) and plan (gating).** Reuses the Layer 1 conformance harness so the guarantee is identical end to end; non-conformant images are visible, never silent.
|
|
341
|
+
- **OCI/Harbor borrowed, CLI registry-agnostic.** No registry is built; the default hub is swappable to GHCR/ECR/self-hosted Harbor via config.
|
|
342
|
+
|
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
# Locus Image Catalog (working list)
|
|
2
|
+
|
|
3
|
+
> Candidate "capability images" to build and publish to Locus Hub. Each image contains
|
|
4
|
+
> LOGIC (parser + extractor + validator + emitter + pinned config); the USER brings their
|
|
5
|
+
> own data. Default engine = deterministic Python; LLM engine activates only when the user
|
|
6
|
+
> supplies a key (see architecture-notes.md). Every image inherits the cell-level
|
|
7
|
+
> provenance + faithfulness grounding contract from the Layer-1 engine.
|
|
8
|
+
>
|
|
9
|
+
> Status legend: [ ] not started | [~] in progress | [x] shipped
|
|
10
|
+
> This catalog lives alongside the `locus-image-runtime` spec (previously referred to here as `data-image-runtime`).
|
|
11
|
+
|
|
12
|
+
## Guiding principle
|
|
13
|
+
- ~80% of enterprise data is unstructured; extraction (not the model) is the bottleneck.
|
|
14
|
+
- Highest recurring-spend jobs: invoices, receipts, bank statements, contracts, reports.
|
|
15
|
+
- DO NOT build breadth-first. Ship one fully-grounded image at a time.
|
|
16
|
+
- "Solve ALL data problems" is rejected as a goal — this list is the SUPERSET of what's
|
|
17
|
+
possible; discipline = ship incrementally starting with `doc-to-tables`.
|
|
18
|
+
|
|
19
|
+
---
|
|
20
|
+
|
|
21
|
+
## Tier 1 — Document -> Structured (highest, proven demand)
|
|
22
|
+
1. [ ] `doc-to-tables` — tables from PDF/scan/Word -> rows. FIRST image to build.
|
|
23
|
+
2. [ ] `invoice-extractor` — invoices -> fields + line items.
|
|
24
|
+
3. [ ] `receipt-extractor` — receipts -> expense records.
|
|
25
|
+
4. [ ] `bank-statement-extractor` — statements -> transaction tables.
|
|
26
|
+
5. [ ] `resume-parser` — CVs -> candidate schema.
|
|
27
|
+
6. [ ] `contract-extractor` — clauses, parties, dates, obligations, renewal terms.
|
|
28
|
+
7. [ ] `form-extractor` — tax/customs/medical/court forms -> fields.
|
|
29
|
+
8. [ ] `report-extractor` — annual/financial reports -> tables + KPIs.
|
|
30
|
+
9. [ ] `scanned-ocr-to-table` — OCR-first for image-only scans (no text layer).
|
|
31
|
+
|
|
32
|
+
## Tier 2 — Format Conversion & Reshaping (broad everyday demand)
|
|
33
|
+
10. [ ] `any-to-json` — any source -> clean schema-stable JSON (dominant AI-ready format).
|
|
34
|
+
11. [ ] `any-to-csv` / `any-to-parquet` — tabular export for analytics.
|
|
35
|
+
12. [ ] `any-to-yaml` — config-shaped output.
|
|
36
|
+
13. [ ] `any-to-markdown` — doc/web -> clean Markdown (standard RAG ingestion format).
|
|
37
|
+
14. [ ] `json-reshaper` — JSON -> JSON remap/flatten/nest.
|
|
38
|
+
15. [ ] `schema-mapper` — map source schema -> target schema (rename, type-align).
|
|
39
|
+
|
|
40
|
+
## Tier 3 — Cleaning, Quality & Identity (the "data is messy" problems)
|
|
41
|
+
16. [ ] `data-cleaner` — type coercion, normalization, missing-value handling.
|
|
42
|
+
17. [ ] `deduplicator` / `entity-resolver` — match + merge same-real-world-entity records.
|
|
43
|
+
18. [ ] `data-validator` — assert dataset vs rules/constraints, report violations.
|
|
44
|
+
19. [ ] `data-enricher` — augment records with derived/looked-up fields.
|
|
45
|
+
20. [ ] `normalizer` — standardize units, currencies, dates, addresses, phone numbers.
|
|
46
|
+
|
|
47
|
+
## Tier 4 — RAG / AI-Prep (serves the "feed the LLM" goal directly)
|
|
48
|
+
21. [ ] `chunker` — layout-aware chunking (make-or-break RAG step).
|
|
49
|
+
22. [ ] `embedder` — chunk -> vector, output to Parquet / vector DB.
|
|
50
|
+
23. [ ] `knowledge-graph-builder` — text -> entities + relationships (triples) for GraphRAG.
|
|
51
|
+
24. [ ] `metadata-enricher` — LLM-generated metadata to improve retrieval.
|
|
52
|
+
25. [ ] `qa-pair-generator` — corpus -> synthetic Q&A pairs for fine-tune/eval.
|
|
53
|
+
|
|
54
|
+
## Tier 5 — Multimodal (next frontier; less crowded)
|
|
55
|
+
26. [ ] `audio-to-table` — transcription + diarization -> structured transcript/records.
|
|
56
|
+
27. [ ] `video-to-table` — frames + ASR -> structured events/timestamps.
|
|
57
|
+
28. [ ] `image-to-data` — charts/diagrams/photos -> structured values.
|
|
58
|
+
29. [ ] `email-extractor` — emails/threads -> structured fields.
|
|
59
|
+
30. [ ] `log-parser` — app/system logs -> structured events.
|
|
60
|
+
|
|
61
|
+
## Tier 6 — Unique / hard-problem images (Locus differentiation; lean on grounding engine)
|
|
62
|
+
31. [ ] `pii-redactor` — detect + mask PII in structured + unstructured data.
|
|
63
|
+
32. [ ] `provenance-tracker` — emit data with full cell-level lineage as the headline output.
|
|
64
|
+
33. [ ] `fact-grounder` — score faithfulness of every cell in an EXISTING table vs sources
|
|
65
|
+
(validation-as-a-service). Flagship/marketing wedge.
|
|
66
|
+
34. [ ] `synthetic-data-generator` — schema -> realistic synthetic rows, referential
|
|
67
|
+
integrity across multi-table schemas.
|
|
68
|
+
35. [ ] `cross-doc-reconciler` — reconcile conflicting values across multiple docs, surface
|
|
69
|
+
the conflict with provenance.
|
|
70
|
+
36. [ ] `table-joiner` — semantic/fuzzy join of tables lacking shared keys.
|
|
71
|
+
37. [ ] `time-series-structurer` — messy event text -> clean time series (without flattening peaks).
|
|
72
|
+
38. [ ] `taxonomy-classifier` — auto-classify records into a user-defined taxonomy tree.
|
|
73
|
+
39. [ ] `change-diff` — diff two versions of a doc/dataset -> structured changeset.
|
|
74
|
+
40. [ ] `anomaly-flagger` — flag outlier/suspect rows with reasons (quality + fraud signal).
|
|
75
|
+
|
|
76
|
+
## Tier 7 — Domain-specific verticals (additional; high willingness-to-pay)
|
|
77
|
+
41. [ ] `medical-record-extractor` — clinical notes/EHR text -> structured codes (ICD/SNOMED-style).
|
|
78
|
+
42. [ ] `legal-doc-extractor` — case law / filings -> citations, holdings, parties.
|
|
79
|
+
43. [ ] `scientific-paper-extractor` — papers -> structured methods/results tables, citations.
|
|
80
|
+
44. [ ] `financial-filing-extractor` — 10-K/earnings -> normalized financial line items.
|
|
81
|
+
45. [ ] `real-estate-doc-extractor` — leases/deeds -> property + terms schema.
|
|
82
|
+
46. [ ] `insurance-claim-extractor` — claims/policies -> structured claim records.
|
|
83
|
+
47. [ ] `shipping-logistics-extractor` — BOL / customs / manifests -> structured shipment data.
|
|
84
|
+
48. [ ] `survey-response-structurer` — free-text survey answers -> coded categorical data.
|
|
85
|
+
49. [ ] `product-catalog-extractor` — supplier sheets/specs -> normalized product attributes.
|
|
86
|
+
50. [ ] `menu-extractor` — restaurant menus (img/PDF) -> items + prices + modifiers.
|
|
87
|
+
|
|
88
|
+
## Tier 8 — Web & API sources (additional)
|
|
89
|
+
51. [ ] `web-to-table` — web pages/listings -> structured rows (price/spec scraping).
|
|
90
|
+
52. [ ] `api-to-table` — paginated REST/GraphQL responses -> flattened tables.
|
|
91
|
+
53. [ ] `sitemap-crawler-to-corpus` — crawl a site -> clean corpus for downstream images.
|
|
92
|
+
54. [ ] `social-feed-structurer` — posts/threads -> structured engagement/content records.
|
|
93
|
+
55. [ ] `spreadsheet-normalizer` — messy human Excel (merged cells, multi-header) -> tidy table.
|
|
94
|
+
|
|
95
|
+
## Tier 9 — Operational / pipeline-utility images (additional; glue + governance)
|
|
96
|
+
56. [ ] `schema-inferer` — sample data -> proposed Pydantic schema (bootstraps other images).
|
|
97
|
+
57. [ ] `data-profiler` — dataset -> stats/quality report (nulls, cardinality, distributions).
|
|
98
|
+
58. [ ] `data-masker` / `anonymizer` — reversible/irreversible masking beyond PII (e.g. k-anonymity).
|
|
99
|
+
59. [ ] `format-detector` — sniff + classify unknown file/content types for routing.
|
|
100
|
+
60. [ ] `language-detector-translator` — detect language, optionally normalize to one language.
|
|
101
|
+
61. [ ] `unit-test-data-extractor` — code repos -> structured fixtures/test data.
|
|
102
|
+
62. [ ] `csv-repair` — fix broken/ragged CSVs (bad quoting, mixed delimiters, encoding).
|
|
103
|
+
63. [ ] `encoding-normalizer` — normalize text encodings / mojibake repair.
|
|
104
|
+
64. [ ] `dataset-merger` — union/merge heterogeneous datasets into one schema.
|
|
105
|
+
65. [ ] `incremental-sync` — detect new/changed source records since last run (CDC-style).
|
|
106
|
+
|
|
107
|
+
## Build-order recommendation (LOCKED priority)
|
|
108
|
+
1. `doc-to-tables` (Tier 1 #1) — proves full engine end-to-end; everything reuses its plumbing.
|
|
109
|
+
2. Tier-1 specializations `invoice-extractor`, `receipt-extractor` — base + baked-in schema; highest revenue.
|
|
110
|
+
3. Two or three conversion images (e.g. `any-to-json`, `any-to-markdown`) — high pull volume, low build cost; drives adoption.
|
|
111
|
+
4. One flagship unique image `fact-grounder` (or `provenance-tracker`) — the marketing wedge competitors lack.
|
|
112
|
+
5. The remaining images in Tiers 3-9 follow once the base engine + packaging are battle-tested.
|
|
113
|
+
|
|
114
|
+
## Notes / open
|
|
115
|
+
- Several Tier 6 images (fact-grounder, provenance-tracker, cross-doc-reconciler) are the
|
|
116
|
+
defensible differentiation — they ARE the grounding engine, packaged.
|
|
117
|
+
- Each image should advertise: engine mode support (deterministic / LLM / hybrid),
|
|
118
|
+
privacy class (local-only vs calls-external-LLM), and required vs inferred schema.
|
|
119
|
+
- Verticals (Tier 7) are mostly Tier-1 base + a baked-in domain schema + tuned prompts;
|
|
120
|
+
cheap to produce once base exists, high willingness-to-pay.
|