PyPI - stacpkg - Versions diffs - 0.1.0__py3-none-any.whl - Mend

stacpkg 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22) hide show

stacpkg/__init__.py +6 -0
stacpkg/__main__.py +7 -0
stacpkg/arrow_io.py +286 -0
stacpkg/assets.py +306 -0
stacpkg/checksums.py +142 -0
stacpkg/cli.py +960 -0
stacpkg/dataset.py +177 -0
stacpkg/enrich.py +232 -0
stacpkg/geoparquet.py +289 -0
stacpkg/items.py +70 -0
stacpkg/locators.py +99 -0
stacpkg/object_store.py +749 -0
stacpkg/oci.py +244 -0
stacpkg/projection.py +560 -0
stacpkg/report.py +134 -0
stacpkg/schemas.py +89 -0
stacpkg/stac_json.py +154 -0
stacpkg-0.1.0.dist-info/METADATA +85 -0
stacpkg-0.1.0.dist-info/RECORD +22 -0
stacpkg-0.1.0.dist-info/WHEEL +4 -0
stacpkg-0.1.0.dist-info/entry_points.txt +2 -0
stacpkg-0.1.0.dist-info/licenses/LICENSE +201 -0

stacpkg/__init__.py ADDED Viewed

@@ -0,0 +1,6 @@
+# Copyright 2026, Versioneer (https://versioneer.at)
+# SPDX-License-Identifier: Apache-2.0
+"""Arrow-native STAC asset lock packaging."""
+__version__ = "0.1.0"

stacpkg/__main__.py ADDED Viewed

@@ -0,0 +1,7 @@
+# Copyright 2026, Versioneer (https://versioneer.at)
+# SPDX-License-Identifier: Apache-2.0
+from stacpkg.cli import main
+if __name__ == "__main__":
+    raise SystemExit(main())

stacpkg/arrow_io.py ADDED Viewed

@@ -0,0 +1,286 @@
+# Copyright 2026, Versioneer (https://versioneer.at)
+# SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations
+import json
+import shutil
+from collections.abc import Callable
+from pathlib import Path
+from typing import BinaryIO, TextIO
+import pyarrow as pa
+import pyarrow.parquet as pq
+DEFAULT_PRETTY_MAX_ROWS = 20
+DEFAULT_PRETTY_MAX_CELL_WIDTH = 48
+DEFAULT_PRETTY_WIDTH = 120
+DEFAULT_STREAM_BATCH_SIZE = 64_000
def read_stream(source: BinaryIO) -> pa.Table:
    """Read every record batch of an Arrow IPC stream into a single table.

    Args:
        source: Binary file-like object positioned at the start of the stream.

    Returns:
        The table produced by the stream reader's ``read_all()``.
    """
    reader = pa.ipc.open_stream(source)
    with reader:
        table = reader.read_all()
    return table
def read_stream_path(path: str | Path) -> pa.Table:
    """Open ``path`` in binary mode and read it as an Arrow IPC stream.

    Args:
        path: Location of the stream file.

    Returns:
        The table returned by ``read_stream`` for the opened file.
    """
    stream_file = Path(path)
    with stream_file.open("rb") as handle:
        return read_stream(handle)
def write_stream(table: pa.Table, sink: BinaryIO) -> None:
    """Write ``table`` to ``sink`` as an Arrow IPC stream.

    The stream is opened with the table's own schema and closed when the
    write finishes.
    """
    stream_writer = pa.ipc.new_stream(sink, table.schema)
    with stream_writer:
        stream_writer.write_table(table)
def align_table_to_schema(table: pa.Table, schema: pa.Schema) -> pa.Table:
    """Return ``table`` rearranged and cast so that it matches ``schema``.

    Columns are emitted in schema order. A field that ``table`` does not have
    becomes an all-null column of the field's type, and a column whose type
    differs from the field's type is cast. If Arrow raises while doing this,
    the table is rebuilt from Python row dicts against ``schema`` instead.

    Args:
        table: Table to align.
        schema: Target schema.

    Returns:
        A table whose schema is ``schema``.
    """
    source_schema = table.schema
    row_total = table.num_rows
    try:
        aligned = []
        for target in schema:
            position = source_schema.get_field_index(target.name)
            if position == -1:
                aligned.append(pa.nulls(row_total, type=target.type))
            else:
                values = table.column(position)
                if not values.type.equals(target.type):
                    values = values.cast(target.type)
                aligned.append(values)
        return pa.Table.from_arrays(aligned, schema=schema)
    except pa.ArrowException:
        # Fallback: round-trip through Python objects and let Arrow coerce them.
        as_rows = table.to_pylist()
        return pa.Table.from_pylist(as_rows, schema=schema)
def align_record_batch_to_schema(batch: pa.RecordBatch, schema: pa.Schema) -> pa.RecordBatch:
    """Return ``batch`` rearranged and cast so that it matches ``schema``.

    Works like ``align_table_to_schema`` but on a single record batch and
    without a fallback: any Arrow error raised by a cast propagates.

    Args:
        batch: Record batch to align.
        schema: Target schema.

    Returns:
        A record batch whose schema is ``schema``.
    """

    def _aligned_column(target):
        # Missing fields are filled with nulls; mismatched types are cast.
        position = batch.schema.get_field_index(target.name)
        if position == -1:
            return pa.nulls(batch.num_rows, type=target.type)
        values = batch.column(position)
        if values.type.equals(target.type):
            return values
        return values.cast(target.type)

    aligned = [_aligned_column(target) for target in schema]
    return pa.RecordBatch.from_arrays(aligned, schema=schema)
def write_transformed_stream(
    source: BinaryIO,
    sink: BinaryIO,
    transform: Callable[[pa.Table], pa.Table],
) -> None:
    """Apply ``transform`` batch by batch to an IPC stream and write the result.

    Each input record batch is wrapped in a one-batch table, transformed, and
    written to ``sink``. The schema of the first transformed table becomes the
    output schema; later results are aligned to it before being written. When
    the input has no batches, ``transform`` is called once on an empty table so
    that an empty stream with the transformed schema is still produced.

    Args:
        source: Binary input holding an Arrow IPC stream.
        sink: Binary output receiving the transformed IPC stream.
        transform: Callable mapping a table to a table.
    """
    with pa.ipc.open_stream(source) as reader:
        input_schema = reader.schema
        stream_writer = None
        target_schema = None
        try:
            for record_batch in reader:
                result = transform(pa.Table.from_batches([record_batch], schema=input_schema))
                if stream_writer is None:
                    # The first transformed batch fixes the output schema.
                    target_schema = result.schema
                    stream_writer = pa.ipc.new_stream(sink, target_schema)
                assert target_schema is not None
                stream_writer.write_table(align_table_to_schema(result, target_schema))
            if stream_writer is None:
                # No batches were read: emit a schema-only stream.
                placeholder = transform(pa.Table.from_batches([], schema=input_schema))
                with pa.ipc.new_stream(sink, placeholder.schema):
                    pass
        finally:
            if stream_writer is not None:
                stream_writer.close()
def write_parquet_stream(
    path: str | Path,
    sink: BinaryIO,
    *,
    batch_size: int = DEFAULT_STREAM_BATCH_SIZE,
    transform: Callable[[pa.Table], pa.Table] | None = None,
) -> None:
    """Stream a Parquet file to ``sink`` as an Arrow IPC stream.

    Without ``transform`` the file's batches are written unchanged using the
    file's Arrow schema. With ``transform`` each batch is wrapped in a table,
    transformed, aligned to the schema of the first transformed table, and
    written; an input without batches still yields an empty stream carrying the
    transformed schema.

    Args:
        path: Parquet file to read.
        sink: Binary output receiving the IPC stream.
        batch_size: Maximum rows per batch read from the file. Values below 1
            are raised to 1, matching ``read_parquet_preview``.
        transform: Optional callable mapping a table to a table.
    """
    parquet = pq.ParquetFile(path)
    # Clamp so a zero or negative value never reaches the Parquet reader.
    rows_per_batch = max(batch_size, 1)
    if transform is None:
        with pa.ipc.new_stream(sink, parquet.schema_arrow) as writer:
            for batch in parquet.iter_batches(batch_size=rows_per_batch):
                writer.write_batch(batch)
        return
    writer = None
    output_schema = None
    try:
        for batch in parquet.iter_batches(batch_size=rows_per_batch):
            table = pa.Table.from_batches([batch], schema=parquet.schema_arrow)
            transformed = transform(table)
            if writer is None:
                # The first transformed batch fixes the output schema.
                output_schema = transformed.schema
                writer = pa.ipc.new_stream(sink, output_schema)
            assert output_schema is not None
            writer.write_table(align_table_to_schema(transformed, output_schema))
        if writer is None:
            # Nothing was read: emit a schema-only stream.
            empty = transform(pa.Table.from_batches([], schema=parquet.schema_arrow))
            with pa.ipc.new_stream(sink, empty.schema):
                pass
    finally:
        if writer is not None:
            writer.close()
def read_parquet_preview(
    path: str | Path,
    *,
    max_rows: int = DEFAULT_PRETTY_MAX_ROWS,
    batch_size: int = DEFAULT_STREAM_BATCH_SIZE,
    transform: Callable[[pa.Table], pa.Table] | None = None,
) -> tuple[pa.Table, int]:
    """Read the first rows of a Parquet file together with a total row count.

    Without ``transform`` reading stops as soon as ``max_rows`` rows have been
    collected and the total comes from the file metadata. With ``transform``
    every batch is transformed so that the total reflects the transformed row
    counts, while only the first ``max_rows`` transformed rows are kept.

    Args:
        path: Parquet file to read.
        max_rows: Number of preview rows to keep; negative values count as 0.
        batch_size: Rows per batch read from the file; values below 1 count as 1.
        transform: Optional callable mapping a table to a table.

    Returns:
        A ``(preview_table, total_rows)`` tuple.
    """
    parquet = pq.ParquetFile(path)
    budget = max(max_rows, 0)
    rows_per_batch = max(batch_size, 1)
    collected = []

    if transform is None:
        if budget:
            for chunk in parquet.iter_batches(batch_size=rows_per_batch):
                if chunk.num_rows > budget:
                    chunk = chunk.slice(0, budget)
                collected.append(chunk)
                budget -= chunk.num_rows
                if budget == 0:
                    break
        preview = pa.Table.from_batches(collected, schema=parquet.schema_arrow)
        return preview, parquet.metadata.num_rows

    preview_schema = None
    row_total = 0
    for chunk in parquet.iter_batches(batch_size=rows_per_batch):
        result = transform(pa.Table.from_batches([chunk], schema=parquet.schema_arrow))
        if preview_schema is None:
            preview_schema = result.schema
        row_total += result.num_rows
        if budget:
            head = result.slice(0, budget)
            collected.extend(head.to_batches())
            budget -= head.num_rows
    if preview_schema is None:
        # No batches at all: derive the schema from an empty transformed table.
        preview_schema = transform(pa.Table.from_batches([], schema=parquet.schema_arrow)).schema
    return pa.Table.from_batches(collected, schema=preview_schema), row_total
def write_parquet_terminal_table(
    path: str | Path,
    sink: TextIO,
    *,
    batch_size: int = DEFAULT_STREAM_BATCH_SIZE,
    max_width: int | None = None,
    transform: Callable[[pa.Table], pa.Table] | None = None,
) -> None:
    """Write a text preview of a Parquet file to ``sink``.

    Args:
        path: Parquet file to preview.
        sink: Text stream receiving the formatted table.
        batch_size: Rows per batch used while reading the preview.
        max_width: Output width; when falsy the terminal width is used, with
            ``DEFAULT_PRETTY_WIDTH`` as the fallback column count.
        transform: Optional callable applied to each batch before previewing.
    """
    preview, row_total = read_parquet_preview(path, batch_size=batch_size, transform=transform)
    if not max_width:
        max_width = shutil.get_terminal_size((DEFAULT_PRETTY_WIDTH, 24)).columns
    rendered = format_table(preview, max_width=max_width, total_rows=row_total)
    sink.write(rendered)
def write_stream_to_parquet(source: BinaryIO, path: str | Path) -> None:
    """Copy an Arrow IPC stream into a Parquet file, one batch at a time.

    Missing parent directories of ``path`` are created first. The Parquet
    file is written with the stream's schema.
    """
    destination = Path(path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    with pa.ipc.open_stream(source) as reader, pq.ParquetWriter(
        destination, reader.schema
    ) as parquet_writer:
        for record_batch in reader:
            parquet_writer.write_batch(record_batch)
+def _cell_text(value: object) -> str:
+    if value is None:
+        return ""
+    if isinstance(value, dict | list | tuple):
+        text = json.dumps(value, sort_keys=True, default=str, separators=(",", ":"))
+    else:
+        text = str(value)
+    return " ".join(text.split())
+def _clip_text(text: str, width: int) -> str:
+    if len(text) <= width:
+        return text
+    if width <= 3:
+        return "." * width
+    return f"{text[: width - 3]}..."
+def _column_widths(
+    columns: list[str],
+    rows: list[list[str]],
+    *,
+    max_width: int,
+    max_cell_width: int,
+) -> list[int]:
+    widths = [
+        min(
+            max([len(column), *(len(row[index]) for row in rows)]),
+            max_cell_width,
+        )
+        for index, column in enumerate(columns)
+    ]
+    available = max(max_width, 20) - (3 * max(len(columns) - 1, 0))
+    minimums = [min(width, 4) for width in widths]
+    while sum(widths) > available and any(
+        width > minimum for width, minimum in zip(widths, minimums)
+    ):
+        widest = max(range(len(widths)), key=lambda index: widths[index] - minimums[index])
+        widths[widest] -= 1
+    return widths
def _format_row(values: list[str], widths: list[int]) -> str:
    """Join cell texts into one line, clipping and left-padding each to its width."""
    cells = [_clip_text(text, width).ljust(width) for text, width in zip(values, widths)]
    return " | ".join(cells)
def format_table(
    table: pa.Table,
    *,
    max_rows: int = DEFAULT_PRETTY_MAX_ROWS,
    max_width: int = DEFAULT_PRETTY_WIDTH,
    max_cell_width: int = DEFAULT_PRETTY_MAX_CELL_WIDTH,
    total_rows: int | None = None,
) -> str:
    """Render ``table`` as fixed-width text with a summary footer.

    The output has a heading line, a rule, up to ``max_rows`` data rows and a
    ``"<rows> rows x <columns> columns"`` footer, which adds
    ``"(showing first N)"`` when fewer rows are shown than exist. A table
    without columns yields only a ``"<rows> rows x 0 columns"`` line.

    Args:
        table: Table to render.
        max_rows: Maximum number of data rows to show; negative counts as 0.
        max_width: Target line width used to size columns.
        max_cell_width: Cap for a single column; values below 4 count as 4.
        total_rows: Row count to report in the footer; defaults to
            ``table.num_rows``.

    Returns:
        The rendered text, terminated by a newline.
    """
    columns = table.schema.names
    total_row_count = table.num_rows if total_rows is None else total_rows
    if not columns:
        return f"{total_row_count} rows x 0 columns\n"
    row_count = min(max(max_rows, 0), table.num_rows)
    rows = [
        [_cell_text(row.get(column)) for column in columns]
        for row in table.slice(0, row_count).to_pylist()
    ]
    widths = _column_widths(
        columns,
        rows,
        max_width=max_width,
        max_cell_width=max(max_cell_width, 4),
    )
    lines = [
        _format_row(columns, widths),
        "-+-".join("-" * width for width in widths),
    ]
    lines.extend(_format_row(row, widths) for row in rows)
    footer = f"{total_row_count} rows x {table.num_columns} columns"
    if row_count < total_row_count:
        footer = f"{footer} (showing first {row_count})"
    lines.append(footer)
    # Plain join instead of a backslash inside an f-string expression, which
    # is a SyntaxError on interpreters older than Python 3.12.
    return "\n".join(lines) + "\n"
def write_terminal_table(table: pa.Table, sink: TextIO, *, max_width: int | None = None) -> None:
    """Write ``table`` to ``sink`` as formatted text.

    When ``max_width`` is falsy the terminal width is used, with
    ``DEFAULT_PRETTY_WIDTH`` as the fallback column count.
    """
    if not max_width:
        max_width = shutil.get_terminal_size((DEFAULT_PRETTY_WIDTH, 24)).columns
    rendered = format_table(table, max_width=max_width)
    sink.write(rendered)
def read_parquet(path: str | Path) -> pa.Table:
    """Load the Parquet file at ``path`` into a table via ``pq.read_table``."""
    table = pq.read_table(path)
    return table
def write_parquet(table: pa.Table, path: str | Path) -> None:
    """Write ``table`` to a Parquet file, creating parent directories as needed."""
    destination = Path(path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    pq.write_table(table, destination)

stacpkg/assets.py ADDED Viewed

@@ -0,0 +1,306 @@
+# Copyright 2026, Versioneer (https://versioneer.at)
+# SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations
+import json
+import logging
+from pathlib import Path
+from typing import BinaryIO
+from urllib.parse import urlparse
+import pyarrow as pa
+import pyarrow.parquet as pq
+from stacpkg.arrow_io import align_record_batch_to_schema
+from stacpkg.items import filter_items
+from stacpkg.locators import (
+    OBSTORE_STORE_TYPES,
+    child_location,
+    href_from_location,
+    location_from_href,
+    normalize_store_type,
+)
+from stacpkg.schemas import (
+    ASSET_LOCK_COLUMNS,
+    ASSET_LOCK_OPTIONAL_COLUMNS,
+    ASSET_LOCK_SCHEMA_VERSION,
+    SchemaKind,
+    asset_lock_schema,
+    with_schema_metadata,
+)
+LOGGER = logging.getLogger(__name__)
+DEFAULT_PROBE_METADATA = True
+METADATA_ASSET_KEY = "metadata"
+def _log_location(value: object) -> object:
+    if not isinstance(value, str):
+        return value
+    parsed = urlparse(value)
+    if parsed.query:
+        return parsed._replace(query="<redacted>").geturl()
+    return value
def asset_lock_table(rows: list[dict[str, object]]) -> pa.Table:
    """Build an asset-lock table from row dicts.

    Every row is first normalised by ``_compact_asset_lock_row``. The table is
    created with ``asset_lock_schema()`` and then tagged with the schema
    metadata produced by ``with_schema_metadata`` for the asset-lock kind and
    version.
    """
    compacted = [_compact_asset_lock_row(entry) for entry in rows]
    table = pa.Table.from_pylist(compacted, schema=asset_lock_schema())
    tagged_schema = with_schema_metadata(
        table.schema,
        SchemaKind.ASSET_LOCK,
        ASSET_LOCK_SCHEMA_VERSION,
    )
    return table.replace_schema_metadata(tagged_schema.metadata)
def write_asset_lock_parquet_stream(source: BinaryIO, output_path: str | Path) -> None:
    """Copy an Arrow IPC stream into a Parquet file using the asset-lock schema.

    Missing parent directories of ``output_path`` are created. Each incoming
    batch is aligned to ``asset_lock_schema()`` before it is written.
    """
    destination = Path(output_path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    lock_schema = asset_lock_schema()
    with pa.ipc.open_stream(source) as reader, pq.ParquetWriter(
        destination, lock_schema
    ) as parquet_writer:
        for record_batch in reader:
            parquet_writer.write_batch(align_record_batch_to_schema(record_batch, lock_schema))
def _compact_asset_lock_row(source: dict[str, object]) -> dict[str, object]:
    """Normalise one raw row into the shape used for asset-lock tables.

    The location comes from the row's ``store_type``/``store_container``/``key``
    values. When both the normalised store type and the key are empty, it is
    derived from ``href`` instead. ``size_bytes`` (or ``size`` when the
    ``size_bytes`` key is absent) is kept only if it converts to an int. Other
    optional columns are copied when present and not ``None``.
    """
    store_type = normalize_store_type(source.get("store_type"))
    key = source.get("key")
    if store_type or key:
        location = {
            "store_type": store_type,
            "store_container": source.get("store_container"),
            "key": key,
        }
    else:
        location = location_from_href(source.get("href"))

    row = {
        "item_id": source.get("item_id"),
        "asset_key": source.get("asset_key"),
        "store_type": location.get("store_type"),
        "store_container": location.get("store_container"),
        "store_endpoint_url": _endpoint_url(source.get("store_endpoint_url")),
        "key": location.get("key"),
    }

    size_bytes = _size(source.get("size_bytes", source.get("size")))
    if size_bytes is not None:
        row["size_bytes"] = size_bytes

    for column in ASSET_LOCK_OPTIONAL_COLUMNS:
        # size_bytes was handled above with its own conversion.
        if column != "size_bytes" and (extra := source.get(column)) is not None:
            row[column] = extra
    return row
+def _item_assets(item: dict[str, object]) -> dict[str, dict[str, object]]:
+    if "assets_json" in item:
+        return json.loads(str(item.get("assets_json") or "{}"))
+    assets = item.get("assets")
+    if isinstance(assets, dict):
+        return {
+            key: asset
+            for key, asset in assets.items()
+            if isinstance(asset, dict) and asset.get("href")
+        }
+    return {}
+def _size(value: object) -> int | None:
+    if value is None:
+        return None
+    try:
+        return int(value)
+    except (TypeError, ValueError):
+        return None
+def _endpoint_url(value: object) -> str | None:
+    if not isinstance(value, str) or not value.strip():
+        return None
+    endpoint = value.strip().rstrip("/")
+    if not urlparse(endpoint).scheme:
+        endpoint = f"https://{endpoint}"
+    return endpoint
def derive_asset_lock(
    items: pa.Table,
    *,
    probe_metadata: bool = DEFAULT_PROBE_METADATA,
    item_ids: set[str] | None = None,
    providers: set[str] | None = None,
    asset_keys: set[str] | None = None,
    include_metadata_assets: bool = False,
    keep_going: bool = False,
    max_workers: int | None = None,
) -> pa.Table:
    """Build an asset-lock table from an items table.

    One row is produced per selected asset, with its location taken from the
    asset's ``href``, plus ``store_endpoint_url`` and ``file:size`` when the
    asset provides them.

    Args:
        items: Items table; each row must expose ``id`` and its assets.
        probe_metadata: When true, the result is passed through
            ``stacpkg.object_store.stat_assets`` before being returned.
        item_ids: Optional item filter forwarded to ``filter_items``.
        providers: Optional provider filter forwarded to ``filter_items``.
        asset_keys: When given, only these asset keys are kept.
        include_metadata_assets: Keep the ``metadata`` asset even when no
            ``asset_keys`` filter is given.
        keep_going: Forwarded to ``stat_assets``.
        max_workers: Forwarded to ``stat_assets`` when not ``None``.

    Returns:
        The asset-lock table.
    """
    if item_ids or providers:
        items = filter_items(items, item_ids=item_ids, providers=providers)

    rows: list[dict[str, object]] = []
    for item in items.to_pylist():
        for asset_key, asset in _item_assets(item).items():
            if asset_keys:
                # An explicit key list is the only filter that applies.
                if asset_key not in asset_keys:
                    continue
            elif asset_key == METADATA_ASSET_KEY and not include_metadata_assets:
                continue
            row: dict[str, object] = {
                "item_id": item["id"],
                "asset_key": asset_key,
                **location_from_href(asset.get("href")),
            }
            if endpoint := asset.get("store_endpoint_url"):
                row["store_endpoint_url"] = endpoint
            size_bytes = _size(asset.get("file:size"))
            if size_bytes is not None:
                row["size_bytes"] = size_bytes
            rows.append(row)

    assets = asset_lock_table(rows)
    if not probe_metadata:
        return assets

    from stacpkg.object_store import stat_assets

    # Leave max_workers out entirely so stat_assets applies its own default.
    stat_options: dict[str, object] = {"keep_going": keep_going}
    if max_workers is not None:
        stat_options["max_workers"] = max_workers
    return stat_assets(assets, **stat_options)
def map_asset_locations(
    asset_lock: pa.Table,
    *,
    target: str,
    source_prefix: str | None = None,
    layout: str = "item-asset",
    target_endpoint_url: str | None = None,
) -> pa.Table:
    """Rewrite asset locations so they point below the ``target`` href.

    Args:
        asset_lock: Asset-lock table to remap.
        target: Destination href, parsed with ``location_from_href``.
        source_prefix: When given, only rows whose href matches this prefix
            are remapped.
        layout: Layout name forwarded to the path computation.
        target_endpoint_url: Optional endpoint recorded on the destination
            after normalisation by ``_endpoint_url``.

    Returns:
        A new asset-lock table.
    """
    destination = location_from_href(target)
    endpoint = _endpoint_url(target_endpoint_url)
    if endpoint is not None:
        destination["store_endpoint_url"] = endpoint
    return _map_asset_locations_to_location(
        asset_lock,
        destination,
        source_prefix=source_prefix,
        layout=layout,
        log_target=target,
    )
def relocate_asset_locations(
    asset_lock: pa.Table,
    *,
    store_type: str,
    store_container: str | None = None,
    store_endpoint_url: str | None = None,
    key: str | None = None,
    source_prefix: str | None = None,
    layout: str = "item-asset",
) -> pa.Table:
    """Rewrite asset locations so they point below an explicit store location.

    Args:
        asset_lock: Asset-lock table to remap.
        store_type: Destination store type; must be accepted by
            ``normalize_store_type``.
        store_container: Destination container; required for the ``s3``,
            ``gs``, ``az``, ``http`` and ``https`` store types.
        store_endpoint_url: Optional endpoint, normalised by ``_endpoint_url``.
        key: Destination key; required for the ``file`` store type.
        source_prefix: When given, only rows whose href matches this prefix
            are remapped.
        layout: Layout name forwarded to the path computation.

    Returns:
        A new asset-lock table.

    Raises:
        ValueError: If the store type is unsupported or a required value for
            it is missing.
    """
    resolved_type = normalize_store_type(store_type)
    if resolved_type is None:
        expected = ", ".join(OBSTORE_STORE_TYPES)
        raise ValueError(f"unsupported store_type: {store_type}. Expected one of: {expected}")

    resolved_key = key if key else ""
    if resolved_type == "file":
        if not resolved_key:
            raise ValueError("key is required when store_type is file")
    elif resolved_type in {"s3", "gs", "az", "http", "https"} and not store_container:
        raise ValueError(f"store_container is required when store_type is {resolved_type}")

    destination = {
        "store_type": resolved_type,
        "store_container": store_container,
        "store_endpoint_url": _endpoint_url(store_endpoint_url),
        "key": resolved_key,
    }
    return _map_asset_locations_to_location(
        asset_lock,
        destination,
        source_prefix=source_prefix,
        layout=layout,
        log_target=href_from_location(destination),
    )
def _map_asset_locations_to_location(
    asset_lock: pa.Table,
    target_location: dict[str, object],
    *,
    source_prefix: str | None,
    layout: str,
    log_target: object,
) -> pa.Table:
    """Remap matching rows of ``asset_lock`` below ``target_location``.

    Rows whose href does not match ``source_prefix`` are kept unchanged. A
    matching row gets the location returned by ``child_location`` for its
    target path, and every asset-lock column other than the identity, location
    and ``size_bytes`` columns is removed from it. A summary is logged at INFO
    level with query strings redacted.

    Args:
        asset_lock: Asset-lock table to remap.
        target_location: Destination location mapping.
        source_prefix: Href prefix selecting the rows to remap; ``None``
            selects every row.
        layout: Layout name passed to ``target_path``.
        log_target: Destination description used only in the log message.

    Returns:
        A new asset-lock table built from the resulting rows.
    """
    from stacpkg.object_store import target_path

    retained = {
        "item_id",
        "asset_key",
        "store_type",
        "store_container",
        "store_endpoint_url",
        "key",
        "size_bytes",
    }
    # Columns cleared on remapped rows.
    droppable = [name for name in ASSET_LOCK_COLUMNS if name not in retained]

    remapped: list[dict[str, object]] = []
    moved = 0
    for original in asset_lock.to_pylist():
        entry = dict(original)
        if _matches_href_prefix(href_from_location(entry), source_prefix):
            relative = target_path(entry, layout=layout)
            entry.update(child_location(target_location, relative))
            for name in droppable:
                entry.pop(name, None)
            moved += 1
        remapped.append(entry)

    LOGGER.info(
        "map asset locations completed: input_rows=%s mapped_rows=%s from=%s target=%s layout=%s",
        asset_lock.num_rows,
        moved,
        _log_location(source_prefix),
        _log_location(log_target),
        layout,
    )
    return asset_lock_table(remapped)
def plan_copy_assets(
    asset_lock: pa.Table,
    *,
    target: str,
    source_prefix: str | None = None,
    layout: str = "item-asset",
    target_endpoint_url: str | None = None,
) -> pa.Table:
    """Return the asset-lock table a copy to ``target`` would produce.

    Delegates to ``map_asset_locations`` with the same arguments.
    """
    options = {
        "target": target,
        "source_prefix": source_prefix,
        "layout": layout,
        "target_endpoint_url": target_endpoint_url,
    }
    return map_asset_locations(asset_lock, **options)
+def _matches_href_prefix(href: object, prefix: str | None) -> bool:
+    if prefix is None:
+        return True
+    if not isinstance(href, str) or not href:
+        return False
+    normalized = prefix.rstrip("/")
+    return href == normalized or href.startswith(f"{normalized}/")