fraclab-sdk 0.1.0 (fraclab_sdk-0.1.0-py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- README.md +1601 -0
- fraclab_sdk/__init__.py +34 -0
- fraclab_sdk/algorithm/__init__.py +13 -0
- fraclab_sdk/algorithm/export.py +1 -0
- fraclab_sdk/algorithm/library.py +378 -0
- fraclab_sdk/cli.py +381 -0
- fraclab_sdk/config.py +54 -0
- fraclab_sdk/devkit/__init__.py +25 -0
- fraclab_sdk/devkit/compile.py +342 -0
- fraclab_sdk/devkit/export.py +354 -0
- fraclab_sdk/devkit/validate.py +1043 -0
- fraclab_sdk/errors.py +124 -0
- fraclab_sdk/materialize/__init__.py +8 -0
- fraclab_sdk/materialize/fsops.py +125 -0
- fraclab_sdk/materialize/hash.py +28 -0
- fraclab_sdk/materialize/materializer.py +241 -0
- fraclab_sdk/models/__init__.py +52 -0
- fraclab_sdk/models/bundle_manifest.py +51 -0
- fraclab_sdk/models/dataspec.py +65 -0
- fraclab_sdk/models/drs.py +47 -0
- fraclab_sdk/models/output_contract.py +111 -0
- fraclab_sdk/models/run_output_manifest.py +119 -0
- fraclab_sdk/results/__init__.py +25 -0
- fraclab_sdk/results/preview.py +150 -0
- fraclab_sdk/results/reader.py +329 -0
- fraclab_sdk/run/__init__.py +10 -0
- fraclab_sdk/run/logs.py +42 -0
- fraclab_sdk/run/manager.py +403 -0
- fraclab_sdk/run/subprocess_runner.py +153 -0
- fraclab_sdk/runtime/__init__.py +11 -0
- fraclab_sdk/runtime/artifacts.py +303 -0
- fraclab_sdk/runtime/data_client.py +123 -0
- fraclab_sdk/runtime/runner_main.py +286 -0
- fraclab_sdk/runtime/snapshot_provider.py +1 -0
- fraclab_sdk/selection/__init__.py +11 -0
- fraclab_sdk/selection/model.py +247 -0
- fraclab_sdk/selection/validate.py +54 -0
- fraclab_sdk/snapshot/__init__.py +12 -0
- fraclab_sdk/snapshot/index.py +94 -0
- fraclab_sdk/snapshot/library.py +205 -0
- fraclab_sdk/snapshot/loader.py +217 -0
- fraclab_sdk/specs/manifest.py +89 -0
- fraclab_sdk/utils/io.py +32 -0
- fraclab_sdk-0.1.0.dist-info/METADATA +1622 -0
- fraclab_sdk-0.1.0.dist-info/RECORD +47 -0
- fraclab_sdk-0.1.0.dist-info/WHEEL +4 -0
- fraclab_sdk-0.1.0.dist-info/entry_points.txt +4 -0
fraclab_sdk/runtime/artifacts.py
@@ -0,0 +1,303 @@
"""Artifact writer for algorithm runtime."""

import json
from dataclasses import dataclass
from pathlib import Path
from typing import Any

from fraclab_sdk.errors import OutputContainmentError


@dataclass
class ArtifactRecord:
    """Record of a written artifact."""

    dataset_key: str
    owner: dict[str, Any] | None
    dims: dict[str, Any] | None
    meta: dict[str, Any] | None
    inline: dict[str, Any] | None
    item_key: str | None
    artifact_key: str
    artifact_type: str  # "scalar", "blob", "json"
    mime_type: str | None = None
    file_uri: str | None = None
    value: Any = None


class ArtifactWriter:
    """Writer for algorithm output artifacts with containment enforcement."""

    def __init__(self, output_dir: Path) -> None:
        """Initialize artifact writer.

        Args:
            output_dir: The run output directory. All writes must be under this.
        """
        self._output_dir = output_dir.resolve()
        self._artifacts_dir = self._output_dir / "artifacts"
        self._artifacts_dir.mkdir(parents=True, exist_ok=True)
        self._records: list[ArtifactRecord] = []

    def _validate_path(self, path: Path) -> Path:
        """Validate that path is within output directory.

        Args:
            path: Path to validate.

        Returns:
            Resolved path.

        Raises:
            OutputContainmentError: If path is outside output directory.
        """
        resolved = path.resolve()
        try:
            resolved.relative_to(self._output_dir)
        except ValueError:
            raise OutputContainmentError(str(resolved), str(self._output_dir)) from None
        return resolved

    def write_scalar(
        self,
        artifact_key: str,
        value: Any,
        *,
        dataset_key: str = "artifacts",
        owner: dict[str, Any] | None = None,
        dims: dict[str, Any] | None = None,
        meta: dict[str, Any] | None = None,
        inline: dict[str, Any] | None = None,
        item_key: str | None = None,
    ) -> None:
        """Write a scalar artifact (number, string, bool).

        Args:
            artifact_key: Unique key for the artifact.
            value: Scalar value to store.
        """
        self._records.append(
            ArtifactRecord(
                dataset_key=dataset_key,
                owner=owner,
                dims=dims,
                meta=meta,
                inline=inline,
                item_key=item_key,
                artifact_key=artifact_key,
                artifact_type="scalar",
                value=value,
            )
        )

    def write_json(
        self,
        artifact_key: str,
        data: Any,
        filename: str | None = None,
        *,
        dataset_key: str = "artifacts",
        owner: dict[str, Any] | None = None,
        dims: dict[str, Any] | None = None,
        meta: dict[str, Any] | None = None,
        inline: dict[str, Any] | None = None,
        item_key: str | None = None,
    ) -> Path:
        """Write a JSON artifact.

        Args:
            artifact_key: Unique key for the artifact.
            data: Data to serialize as JSON.
            filename: Optional filename. Defaults to {artifact_key}.json.
            dataset_key: Dataset key this artifact belongs to.
            owner: Optional owner map (stageId/wellId/platformId).
            dims: Optional dimensions values.
            meta: Optional meta info.
            inline: Optional inline payload.
            item_key: Optional item key override.

        Returns:
            Path to the written file.
        """
        if filename is None:
            filename = f"{artifact_key}.json"

        file_path = self._artifacts_dir / filename
        file_path = self._validate_path(file_path)

        content = json.dumps(data, indent=2, ensure_ascii=False)
        file_path.write_text(content, encoding="utf-8")

        file_uri = f"file://{file_path}"
        self._records.append(
            ArtifactRecord(
                dataset_key=dataset_key,
                owner=owner,
                dims=dims,
                meta=meta,
                inline=inline,
                item_key=item_key,
                artifact_key=artifact_key,
                artifact_type="json",
                mime_type="application/json",
                file_uri=file_uri,
            )
        )
        return file_path

    def write_blob(
        self,
        artifact_key: str,
        data: bytes,
        filename: str,
        mime_type: str | None = None,
        *,
        dataset_key: str = "artifacts",
        owner: dict[str, Any] | None = None,
        dims: dict[str, Any] | None = None,
        meta: dict[str, Any] | None = None,
        inline: dict[str, Any] | None = None,
        item_key: str | None = None,
    ) -> Path:
        """Write a binary blob artifact.

        Args:
            artifact_key: Unique key for the artifact.
            data: Binary data to write.
            filename: Filename for the blob.
            mime_type: MIME type of the data.
            dataset_key: Dataset key this artifact belongs to.
            owner: Optional owner map (stageId/wellId/platformId).
            dims: Optional dimensions values.
            meta: Optional meta info.
            inline: Optional inline payload.
            item_key: Optional item key override.

        Returns:
            Path to the written file.
        """
        file_path = self._artifacts_dir / filename
        file_path = self._validate_path(file_path)

        file_path.write_bytes(data)

        file_uri = f"file://{file_path}"
        self._records.append(
            ArtifactRecord(
                dataset_key=dataset_key,
                owner=owner,
                dims=dims,
                meta=meta,
                inline=inline,
                item_key=item_key,
                artifact_key=artifact_key,
                artifact_type="blob",
                mime_type=mime_type,
                file_uri=file_uri,
            )
        )
        return file_path

    def write_file(
        self,
        artifact_key: str,
        source_path: Path,
        filename: str | None = None,
        mime_type: str | None = None,
        *,
        dataset_key: str = "artifacts",
        owner: dict[str, Any] | None = None,
        dims: dict[str, Any] | None = None,
        meta: dict[str, Any] | None = None,
        inline: dict[str, Any] | None = None,
        item_key: str | None = None,
    ) -> Path:
        """Copy a file as an artifact.

        Args:
            artifact_key: Unique key for the artifact.
            source_path: Path to source file.
            filename: Optional destination filename. Defaults to source name.
            mime_type: MIME type of the file.
            dataset_key: Dataset key this artifact belongs to.
            owner: Optional owner map (stageId/wellId/platformId).
            dims: Optional dimensions values.
            meta: Optional meta info.
            inline: Optional inline payload.
            item_key: Optional item key override.

        Returns:
            Path to the copied file.
        """
        import shutil

        if filename is None:
            filename = source_path.name

        dest_path = self._artifacts_dir / filename
        dest_path = self._validate_path(dest_path)

        shutil.copy2(source_path, dest_path)

        file_uri = f"file://{dest_path}"
        self._records.append(
            ArtifactRecord(
                dataset_key=dataset_key,
                owner=owner,
                dims=dims,
                meta=meta,
                inline=inline,
                item_key=item_key,
                artifact_key=artifact_key,
                artifact_type="blob",
                mime_type=mime_type,
                file_uri=file_uri,
            )
        )
        return dest_path

    def get_records(self) -> list[ArtifactRecord]:
        """Get all artifact records."""
        return self._records.copy()

    def build_manifest_datasets(self) -> list[dict]:
        """Build dataset -> items structure for output manifest."""
        by_ds: dict[str, list[ArtifactRecord]] = {}
        for rec in self._records:
            by_ds.setdefault(rec.dataset_key, []).append(rec)

        datasets: list[dict[str, Any]] = []
        for ds_key, records in by_ds.items():
            items: list[dict[str, Any]] = []
            for rec in records:
                artifact = {
                    "artifactKey": rec.artifact_key,
                    "type": rec.artifact_type,
                }
                if rec.mime_type:
                    artifact["mimeType"] = rec.mime_type
                if rec.file_uri is not None:
                    artifact["uri"] = rec.file_uri
                if rec.value is not None:
                    artifact["value"] = rec.value
                if rec.inline is not None:
                    artifact["inline"] = rec.inline

                item: dict[str, Any] = {
                    "itemKey": rec.item_key or rec.artifact_key,
                    "artifact": artifact,
                }
                if rec.owner:
                    item["owner"] = rec.owner
                if rec.dims:
                    item["dims"] = rec.dims
                if rec.meta:
                    item["meta"] = rec.meta
                if rec.inline is not None:
                    item["inline"] = rec.inline

                items.append(item)

            datasets.append({"datasetKey": ds_key, "items": items})

        return datasets
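For orientation, here is a minimal usage sketch of the ArtifactWriter shown above. It is not part of the package diff; the output path, artifact keys, and values are hypothetical placeholders.

```python
from pathlib import Path

from fraclab_sdk.runtime.artifacts import ArtifactWriter

# Hypothetical run output directory; the runner normally creates and owns this.
writer = ArtifactWriter(Path("/tmp/run-0001/output"))

# A scalar is only recorded in memory until the manifest is built.
writer.write_scalar("stage_count", 42)

# JSON artifacts are written under output/artifacts/ and recorded with a file:// URI.
writer.write_json("summary", {"max_pressure": 83.1}, owner={"stageId": "s-01"})

# Binary payloads go through write_blob with an explicit filename and MIME type.
writer.write_blob("plot", b"\x89PNG...", filename="plot.png", mime_type="image/png")

# Group the records by dataset key, as the runner does for manifest.json.
datasets = writer.build_manifest_datasets()
print(datasets[0]["datasetKey"])  # "artifacts" (the default dataset key)
```

Any filename that resolves outside the run output directory raises OutputContainmentError via _validate_path, which is what the "containment enforcement" in the class docstring refers to.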
fraclab_sdk/runtime/data_client.py
@@ -0,0 +1,123 @@
"""Data client for algorithm runtime."""

import json
from pathlib import Path

from fraclab_sdk.models import DataSpec


class DataClient:
    """Client for reading input data during algorithm execution."""

    def __init__(self, input_dir: Path) -> None:
        """Initialize data client.

        Args:
            input_dir: The run input directory containing ds.json and data/.
        """
        self._input_dir = input_dir
        self._dataspec: DataSpec | None = None

    @property
    def dataspec(self) -> DataSpec:
        """Get the data specification."""
        if self._dataspec is None:
            ds_path = self._input_dir / "ds.json"
            self._dataspec = DataSpec.model_validate_json(ds_path.read_text())
        return self._dataspec

    def get_dataset_keys(self) -> list[str]:
        """Get list of available dataset keys."""
        return self.dataspec.get_dataset_keys()

    def get_item_count(self, dataset_key: str) -> int:
        """Get number of items in a dataset."""
        dataset = self.dataspec.get_dataset(dataset_key)
        if dataset is None:
            raise KeyError(f"Dataset not found: {dataset_key}")
        return len(dataset.items)

    def get_layout(self, dataset_key: str) -> str | None:
        """Get the layout type for a dataset."""
        dataset = self.dataspec.get_dataset(dataset_key)
        if dataset is None:
            raise KeyError(f"Dataset not found: {dataset_key}")
        return dataset.layout

    def read_object(self, dataset_key: str, item_index: int) -> dict:
        """Read an object from ndjson dataset.

        Args:
            dataset_key: The dataset key.
            item_index: The item index (0-based, run-indexed).

        Returns:
            Parsed JSON object.
        """
        layout = self.get_layout(dataset_key)
        if layout != "object_ndjson_lines":
            raise ValueError(
                f"Cannot read object from layout '{layout}', "
                f"expected 'object_ndjson_lines'"
            )

        ndjson_path = self._input_dir / "data" / dataset_key / "object.ndjson"
        with ndjson_path.open() as f:
            for i, line in enumerate(f):
                if i == item_index:
                    return json.loads(line)

        raise IndexError(f"Item index {item_index} not found")

    def get_parquet_dir(self, dataset_key: str, item_index: int) -> Path:
        """Get path to parquet item directory.

        Args:
            dataset_key: The dataset key.
            item_index: The item index (0-based, run-indexed).

        Returns:
            Path to the item directory.
        """
        layout = self.get_layout(dataset_key)
        if layout != "frame_parquet_item_dirs":
            raise ValueError(
                f"Cannot get parquet dir from layout '{layout}', "
                f"expected 'frame_parquet_item_dirs'"
            )

        return self._input_dir / "data" / dataset_key / "parquet" / f"item-{item_index:05d}"

    def get_parquet_files(self, dataset_key: str, item_index: int) -> list[Path]:
        """Get list of parquet files for an item.

        Args:
            dataset_key: The dataset key.
            item_index: The item index (0-based, run-indexed).

        Returns:
            List of parquet file paths.
        """
        item_dir = self.get_parquet_dir(dataset_key, item_index)
        return list(item_dir.rglob("*.parquet"))

    def iterate_objects(self, dataset_key: str):
        """Iterate over all objects in an ndjson dataset.

        Args:
            dataset_key: The dataset key.

        Yields:
            Tuple of (index, object dict).
        """
        layout = self.get_layout(dataset_key)
        if layout != "object_ndjson_lines":
            raise ValueError(
                f"Cannot iterate objects from layout '{layout}', "
                f"expected 'object_ndjson_lines'"
            )

        ndjson_path = self._input_dir / "data" / dataset_key / "object.ndjson"
        with ndjson_path.open() as f:
            for i, line in enumerate(f):
                yield i, json.loads(line)
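A sketch of how an algorithm might read its inputs through the DataClient above. Again not part of the package diff; the input path and dataset keys are hypothetical and would normally come from the run's ds.json.

```python
from pathlib import Path

from fraclab_sdk.runtime.data_client import DataClient

client = DataClient(Path("/tmp/run-0001/input"))  # hypothetical run input dir

for key in client.get_dataset_keys():
    layout = client.get_layout(key)
    if layout == "object_ndjson_lines":
        # Stream JSON objects line by line from data/<key>/object.ndjson.
        for index, obj in client.iterate_objects(key):
            print(key, index, obj)
    elif layout == "frame_parquet_item_dirs":
        # Each item is a directory of parquet files under data/<key>/parquet/item-NNNNN/.
        for index in range(client.get_item_count(key)):
            files = client.get_parquet_files(key, index)
            print(key, index, [f.name for f in files])
```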
fraclab_sdk/runtime/runner_main.py
@@ -0,0 +1,286 @@
"""Main entry point for algorithm runner subprocess."""

import importlib.util
import json
import logging
import os
import sys
import tempfile
import traceback
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import Any

from fraclab_sdk.runtime.artifacts import ArtifactWriter
from fraclab_sdk.runtime.data_client import DataClient


def validate_manifest_against_contract(
    manifest: dict[str, Any],
    contract_path: Path,
    logger: logging.Logger,
) -> tuple[bool, list[str]]:
    """Validate run output manifest against OutputContract.

    The manifest has a flat artifacts list, while the contract has hierarchical
    datasets -> items -> artifacts structure. We validate that all contract
    artifacts are present in the manifest.

    Args:
        manifest: The run output manifest dict.
        contract_path: Path to output_contract.json.
        logger: Logger for diagnostics.

    Returns:
        Tuple of (is_valid, list of error messages).
    """
    if not contract_path.exists():
        logger.debug(f"No contract found at {contract_path}, skipping validation")
        return True, []

    try:
        contract = json.loads(contract_path.read_text())
    except (json.JSONDecodeError, OSError) as e:
        logger.warning(f"Failed to load contract: {e}")
        return True, []  # Don't fail on contract load errors

    errors: list[str] = []

    # Get all artifact keys from manifest (flat list)
    manifest_artifact_keys = {
        a.get("artifactKey") or a.get("key")
        for a in manifest.get("artifacts", [])
    }
    manifest_artifact_keys.discard(None)

    # Also check datasets structure if present (for hierarchical manifests)
    for ds in manifest.get("datasets", []):
        for item in ds.get("items", []):
            for art in item.get("artifacts", []):
                key = art.get("artifactKey") or art.get("key")
                if key:
                    manifest_artifact_keys.add(key)

    # Extract all required artifact keys from contract
    required_artifacts: list[tuple[str, str, str]] = []  # (ds_key, item_key, art_key)
    for ds in contract.get("datasets", []):
        ds_key = ds.get("key", "")
        for item in ds.get("items", []):
            item_key = item.get("key", "")
            for art in item.get("artifacts", []):
                art_key = art.get("key", "")
                if art_key:
                    required_artifacts.append((ds_key, item_key, art_key))

    # Check all required artifacts are present
    for ds_key, item_key, art_key in required_artifacts:
        if art_key not in manifest_artifact_keys:
            errors.append(f"Missing artifact: {ds_key}/{item_key}/{art_key}")

    if errors:
        logger.warning(
            f"Contract validation found {len(errors)} missing artifacts. "
            f"Required: {[a[2] for a in required_artifacts]}, "
            f"Found: {manifest_artifact_keys}"
        )

    return len(errors) == 0, errors


@dataclass
class RunContext:
    """Context provided to algorithm's run() function."""

    data_client: DataClient
    params: dict[str, Any]
    artifacts: ArtifactWriter
    logger: logging.Logger
    run_context: dict[str, Any]


def load_algorithm_module(algorithm_path: Path):
    """Load algorithm.py as a module.

    Args:
        algorithm_path: Path to algorithm.py file.

    Returns:
        Loaded module.
    """
    spec = importlib.util.spec_from_file_location("algorithm", algorithm_path)
    if spec is None or spec.loader is None:
        raise RuntimeError(f"Failed to load algorithm from {algorithm_path}")

    module = importlib.util.module_from_spec(spec)
    sys.modules["algorithm"] = module
    spec.loader.exec_module(module)
    return module


def write_manifest_atomic(output_dir: Path, manifest: dict) -> None:
    """Write manifest.json atomically.

    Writes to temp file, then renames to ensure atomic operation.

    Args:
        output_dir: Output directory.
        manifest: Manifest dict to write.
    """
    manifest_path = output_dir / "manifest.json"
    content = json.dumps(manifest, indent=2, ensure_ascii=False)

    # Write to temp file in same directory (ensures same filesystem)
    fd, tmp_path = tempfile.mkstemp(
        dir=output_dir, prefix="manifest_", suffix=".json.tmp"
    )
    try:
        with os.fdopen(fd, "w", encoding="utf-8") as f:
            f.write(content)
            f.flush()
            os.fsync(f.fileno())

        # Atomic rename
        os.rename(tmp_path, manifest_path)
    except Exception:
        # Clean up temp file on error
        if os.path.exists(tmp_path):
            os.unlink(tmp_path)
        raise


def run_algorithm(run_dir: Path, algorithm_path: Path) -> int:
    """Run the algorithm.

    Args:
        run_dir: The run directory.
        algorithm_path: Path to algorithm.py.

    Returns:
        Exit code (0 for success, 1 for failure).
    """
    input_dir = run_dir / "input"
    output_dir = run_dir / "output"
    output_dir.mkdir(parents=True, exist_ok=True)

    # Set up logging
    logs_dir = output_dir / "_logs"
    logs_dir.mkdir(exist_ok=True)

    logger = logging.getLogger("algorithm")
    logger.setLevel(logging.DEBUG)

    # File handler
    log_file = logs_dir / "algorithm.log"
    file_handler = logging.FileHandler(log_file)
    file_handler.setLevel(logging.DEBUG)
    file_handler.setFormatter(
        logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
    )
    logger.addHandler(file_handler)

    # Console handler
    console_handler = logging.StreamHandler()
    console_handler.setLevel(logging.INFO)
    console_handler.setFormatter(logging.Formatter("%(levelname)s: %(message)s"))
    logger.addHandler(console_handler)

    start_time = datetime.now()
    exit_code = 0
    error_message = None

    try:
        # Load input data
        params = json.loads((input_dir / "params.json").read_text())
        run_context_data = json.loads((input_dir / "run_context.json").read_text())

        # Create context components
        data_client = DataClient(input_dir)
        artifacts = ArtifactWriter(output_dir)

        ctx = RunContext(
            data_client=data_client,
            params=params,
            artifacts=artifacts,
            logger=logger,
            run_context=run_context_data,
        )

        # Load and run algorithm
        logger.info(f"Loading algorithm from {algorithm_path}")
        module = load_algorithm_module(algorithm_path)

        if not hasattr(module, "run"):
            raise RuntimeError("Algorithm module must define a 'run' function")
        logger.info("Starting algorithm execution")
        module.run(ctx)
        logger.info("Algorithm execution completed successfully")

    except Exception as e:
        exit_code = 1
        error_message = f"{type(e).__name__}: {e}"
        logger.error(f"Algorithm failed: {error_message}")
        logger.debug(traceback.format_exc())

    end_time = datetime.now()

    # Build output manifest
    manifest = {
        "schemaVersion": "1.0",
        "run": run_context_data if "run_context_data" in dir() else {},
        "status": "succeeded" if exit_code == 0 else "failed",
        "startedAt": start_time.isoformat(),
        "completedAt": end_time.isoformat(),
        "datasets": artifacts.build_manifest_datasets() if exit_code == 0 else [],
    }

    if error_message:
        manifest["error"] = error_message

    # Validate manifest against OutputContract if algorithm succeeded
    if exit_code == 0:
        # Find output_contract.json in algorithm's dist/ directory
        algorithm_dir = algorithm_path.parent
        contract_path = algorithm_dir / "dist" / "output_contract.json"

        is_valid, validation_errors = validate_manifest_against_contract(
            manifest, contract_path, logger
        )

        if not is_valid:
            exit_code = 1
            error_message = f"Output validation failed: {'; '.join(validation_errors)}"
            manifest["status"] = "failed"
            manifest["error"] = error_message
            manifest["validationErrors"] = validation_errors
            logger.error(f"Output validation failed: {validation_errors}")

    # Write manifest atomically
    write_manifest_atomic(output_dir, manifest)

    return exit_code


def main() -> None:
    """Entry point for fraclab-runner command."""
    if len(sys.argv) != 3:
        print(f"Usage: {sys.argv[0]} <run_dir> <algorithm_path>", file=sys.stderr)
        sys.exit(2)

    run_dir = Path(sys.argv[1])
    algorithm_path = Path(sys.argv[2])

    if not run_dir.exists():
        print(f"Run directory not found: {run_dir}", file=sys.stderr)
        sys.exit(2)

    if not algorithm_path.exists():
        print(f"Algorithm not found: {algorithm_path}", file=sys.stderr)
        sys.exit(2)

    exit_code = run_algorithm(run_dir, algorithm_path)
    sys.exit(exit_code)


if __name__ == "__main__":
    main()
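Putting the three runtime modules together: the runner process builds a RunContext (data_client, params, artifacts, logger, run_context) and calls the algorithm module's run(ctx) function, then writes manifest.json and validates it against the output contract. A minimal, hypothetical algorithm.py consumed by the runner could look like the following sketch; the artifact keys and logic are illustrative only, not part of the package.

```python
# algorithm.py -- hypothetical example; loaded by runner_main.load_algorithm_module()
def run(ctx):
    # ctx is the RunContext defined in runner_main.py.
    ctx.logger.info("Params: %s", ctx.params)

    total_items = 0
    for key in ctx.data_client.get_dataset_keys():
        total_items += ctx.data_client.get_item_count(key)

    # Anything written here ends up under <run_dir>/output/artifacts/
    # and is listed in <run_dir>/output/manifest.json by the runner.
    ctx.artifacts.write_scalar("total_items", total_items)
    ctx.artifacts.write_json("echo_params", ctx.params)
```

Per the usage message in main(), the console script (apparently fraclab-runner, see entry_points.txt) is invoked as `<run_dir> <algorithm_path>`; it exits 0 on success, 1 on algorithm or contract-validation failure, and 2 on usage errors such as a missing run directory or algorithm file.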