stacpkg 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
stacpkg/__init__.py ADDED
@@ -0,0 +1,6 @@
1
+ # Copyright 2026, Versioneer (https://versioneer.at)
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ """Arrow-native STAC asset lock packaging."""
5
+
6
+ __version__ = "0.1.0"
stacpkg/__main__.py ADDED
@@ -0,0 +1,7 @@
1
+ # Copyright 2026, Versioneer (https://versioneer.at)
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ from stacpkg.cli import main
5
+
6
# Entry point when this file is executed as a script (stacpkg/__main__.py):
# whatever main() returns is handed to SystemExit.
if __name__ == "__main__":
    raise SystemExit(main())
stacpkg/arrow_io.py ADDED
@@ -0,0 +1,286 @@
1
+ # Copyright 2026, Versioneer (https://versioneer.at)
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ from __future__ import annotations
5
+
6
+ import json
7
+ import shutil
8
+ from collections.abc import Callable
9
+ from pathlib import Path
10
+ from typing import BinaryIO, TextIO
11
+
12
+ import pyarrow as pa
13
+ import pyarrow.parquet as pq
14
+
15
# Default number of rows rendered by format_table / read_parquet_preview.
DEFAULT_PRETTY_MAX_ROWS = 20
# Default upper bound on a single rendered column's width (format_table).
DEFAULT_PRETTY_MAX_CELL_WIDTH = 48
# Default total table width; also the fallback passed to
# shutil.get_terminal_size when the terminal size cannot be determined.
DEFAULT_PRETTY_WIDTH = 120
# Default number of rows per batch when iterating Parquet files.
DEFAULT_STREAM_BATCH_SIZE = 64_000
19
+
20
+
21
def read_stream(source: BinaryIO) -> pa.Table:
    """Read every record batch of an Arrow IPC stream into a single table."""
    ipc_reader = pa.ipc.open_stream(source)
    with ipc_reader:
        table = ipc_reader.read_all()
    return table
24
+
25
+
26
def read_stream_path(path: str | Path) -> pa.Table:
    """Open ``path`` in binary mode and read it as an Arrow IPC stream."""
    stream_file = Path(path)
    with stream_file.open("rb") as handle:
        table = read_stream(handle)
    return table
29
+
30
+
31
def write_stream(table: pa.Table, sink: BinaryIO) -> None:
    """Write ``table`` to ``sink`` as an Arrow IPC stream using the table's schema."""
    schema = table.schema
    with pa.ipc.new_stream(sink, schema) as ipc_writer:
        ipc_writer.write_table(table)
34
+
35
+
36
def align_table_to_schema(table: pa.Table, schema: pa.Schema) -> pa.Table:
    """Return ``table`` reshaped to exactly match ``schema``.

    Columns are looked up by field name. A field whose lookup yields no index
    becomes an all-null column of the target type; a column with a different
    type is cast. If any Arrow error occurs on that path, the table is rebuilt
    from Python objects against ``schema`` instead.
    """

    def _aligned_columns():
        for target in schema:
            position = table.schema.get_field_index(target.name)
            if position == -1:
                yield pa.nulls(table.num_rows, type=target.type)
            else:
                existing = table.column(position)
                if existing.type.equals(target.type):
                    yield existing
                else:
                    yield existing.cast(target.type)

    try:
        # The generator is consumed inside the try so cast failures are caught.
        return pa.Table.from_arrays(list(_aligned_columns()), schema=schema)
    except pa.ArrowException:
        return pa.Table.from_pylist(table.to_pylist(), schema=schema)
51
+
52
+
53
def align_record_batch_to_schema(batch: pa.RecordBatch, schema: pa.Schema) -> pa.RecordBatch:
    """Return ``batch`` reshaped to exactly match ``schema``.

    Fields whose name lookup yields no index are filled with nulls of the
    target type; columns whose type differs are cast. Arrow errors propagate.
    """
    aligned = []
    for target in schema:
        position = batch.schema.get_field_index(target.name)
        if position == -1:
            aligned.append(pa.nulls(batch.num_rows, type=target.type))
        else:
            existing = batch.column(position)
            aligned.append(
                existing if existing.type.equals(target.type) else existing.cast(target.type)
            )
    return pa.RecordBatch.from_arrays(aligned, schema=schema)
65
+
66
+
67
def write_transformed_stream(
    source: BinaryIO,
    sink: BinaryIO,
    transform: Callable[[pa.Table], pa.Table],
) -> None:
    """Apply ``transform`` to each batch of an IPC stream and write the results.

    The output schema is taken from the first transformed batch; later batches
    are aligned to it. For an input without batches, ``transform`` is applied
    to an empty table and only the resulting schema is written.
    """
    with pa.ipc.open_stream(source) as reader:
        input_schema = reader.schema
        ipc_writer = None
        target_schema = None
        try:
            for record_batch in reader:
                result = transform(pa.Table.from_batches([record_batch], schema=input_schema))
                if ipc_writer is None:
                    # The first result fixes the schema for the whole output stream.
                    target_schema = result.schema
                    ipc_writer = pa.ipc.new_stream(sink, target_schema)
                assert target_schema is not None
                ipc_writer.write_table(align_table_to_schema(result, target_schema))

            if ipc_writer is None:
                placeholder = transform(pa.Table.from_batches([], schema=input_schema))
                # Opening and closing the writer emits a schema-only stream.
                pa.ipc.new_stream(sink, placeholder.schema).close()
        finally:
            if ipc_writer is not None:
                ipc_writer.close()
92
+
93
+
94
def write_parquet_stream(
    path: str | Path,
    sink: BinaryIO,
    *,
    batch_size: int = DEFAULT_STREAM_BATCH_SIZE,
    transform: Callable[[pa.Table], pa.Table] | None = None,
) -> None:
    """Stream a Parquet file to ``sink`` as an Arrow IPC stream.

    Args:
        path: Parquet file to read.
        sink: Binary destination for the IPC stream.
        batch_size: Rows per batch read from the file; values below 1 are
            treated as 1, matching ``read_parquet_preview``.
        transform: Optional per-batch table transform. The output schema is
            taken from the first transformed batch and later batches are
            aligned to it. For a file without batches the transform is applied
            to an empty table and a schema-only stream is written.
    """
    rows_per_batch = max(batch_size, 1)
    # The context manager closes the underlying file even when writing fails.
    with pq.ParquetFile(path) as parquet:
        source_schema = parquet.schema_arrow
        if transform is None:
            with pa.ipc.new_stream(sink, source_schema) as writer:
                for batch in parquet.iter_batches(batch_size=rows_per_batch):
                    writer.write_batch(batch)
            return

        writer = None
        output_schema = None
        try:
            for batch in parquet.iter_batches(batch_size=rows_per_batch):
                table = pa.Table.from_batches([batch], schema=source_schema)
                transformed = transform(table)
                if writer is None:
                    output_schema = transformed.schema
                    writer = pa.ipc.new_stream(sink, output_schema)
                assert output_schema is not None
                writer.write_table(align_table_to_schema(transformed, output_schema))

            if writer is None:
                empty = transform(pa.Table.from_batches([], schema=source_schema))
                with pa.ipc.new_stream(sink, empty.schema):
                    pass
        finally:
            if writer is not None:
                writer.close()
127
+
128
+
129
def read_parquet_preview(
    path: str | Path,
    *,
    max_rows: int = DEFAULT_PRETTY_MAX_ROWS,
    batch_size: int = DEFAULT_STREAM_BATCH_SIZE,
    transform: Callable[[pa.Table], pa.Table] | None = None,
) -> tuple[pa.Table, int]:
    """Return the first ``max_rows`` rows of a Parquet file plus the total row count.

    Args:
        path: Parquet file to read.
        max_rows: Maximum number of preview rows; negative values mean 0.
        batch_size: Rows per batch read from the file; values below 1 mean 1.
        transform: Optional per-batch table transform. When given, every batch
            is transformed so that the total reflects the transformed rows.

    Returns:
        ``(preview_table, total_rows)``. Without ``transform`` the total comes
        from the Parquet metadata.
    """
    rows_per_batch = max(batch_size, 1)
    remaining = max(max_rows, 0)
    batches = []
    # The context manager closes the underlying file once the preview is built.
    with pq.ParquetFile(path) as parquet:
        source_schema = parquet.schema_arrow
        if transform is None:
            if remaining:
                for batch in parquet.iter_batches(batch_size=rows_per_batch):
                    if batch.num_rows > remaining:
                        batch = batch.slice(0, remaining)
                    batches.append(batch)
                    remaining -= batch.num_rows
                    if remaining == 0:
                        break
            return (
                pa.Table.from_batches(batches, schema=source_schema),
                parquet.metadata.num_rows,
            )

        output_schema = None
        total_rows = 0
        for batch in parquet.iter_batches(batch_size=rows_per_batch):
            transformed = transform(pa.Table.from_batches([batch], schema=source_schema))
            if output_schema is None:
                output_schema = transformed.schema
            total_rows += transformed.num_rows
            if remaining:
                preview = transformed.slice(0, remaining)
                if not preview.schema.equals(output_schema):
                    # Later batches may drift from the first batch's schema;
                    # align them the same way the stream writers do.
                    preview = align_table_to_schema(preview, output_schema)
                batches.extend(preview.to_batches())
                remaining -= preview.num_rows
        if output_schema is None:
            output_schema = transform(pa.Table.from_batches([], schema=source_schema)).schema
        return pa.Table.from_batches(batches, schema=output_schema), total_rows
167
+
168
+
169
def write_parquet_terminal_table(
    path: str | Path,
    sink: TextIO,
    *,
    batch_size: int = DEFAULT_STREAM_BATCH_SIZE,
    max_width: int | None = None,
    transform: Callable[[pa.Table], pa.Table] | None = None,
) -> None:
    """Write a text preview of a Parquet file to ``sink``.

    A falsy ``max_width`` falls back to the detected terminal width.
    """
    preview, row_total = read_parquet_preview(path, batch_size=batch_size, transform=transform)
    if max_width:
        width = max_width
    else:
        width = shutil.get_terminal_size((DEFAULT_PRETTY_WIDTH, 24)).columns
    rendered = format_table(preview, max_width=width, total_rows=row_total)
    sink.write(rendered)
180
+
181
+
182
def write_stream_to_parquet(source: BinaryIO, path: str | Path) -> None:
    """Copy an Arrow IPC stream batch by batch into a Parquet file at ``path``.

    Missing parent directories are created first.
    """
    destination = Path(path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    with (
        pa.ipc.open_stream(source) as ipc_reader,
        pq.ParquetWriter(destination, ipc_reader.schema) as parquet_writer,
    ):
        for record_batch in ipc_reader:
            parquet_writer.write_batch(record_batch)
189
+
190
+
191
+ def _cell_text(value: object) -> str:
192
+ if value is None:
193
+ return ""
194
+ if isinstance(value, dict | list | tuple):
195
+ text = json.dumps(value, sort_keys=True, default=str, separators=(",", ":"))
196
+ else:
197
+ text = str(value)
198
+ return " ".join(text.split())
199
+
200
+
201
+ def _clip_text(text: str, width: int) -> str:
202
+ if len(text) <= width:
203
+ return text
204
+ if width <= 3:
205
+ return "." * width
206
+ return f"{text[: width - 3]}..."
207
+
208
+
209
+ def _column_widths(
210
+ columns: list[str],
211
+ rows: list[list[str]],
212
+ *,
213
+ max_width: int,
214
+ max_cell_width: int,
215
+ ) -> list[int]:
216
+ widths = [
217
+ min(
218
+ max([len(column), *(len(row[index]) for row in rows)]),
219
+ max_cell_width,
220
+ )
221
+ for index, column in enumerate(columns)
222
+ ]
223
+ available = max(max_width, 20) - (3 * max(len(columns) - 1, 0))
224
+ minimums = [min(width, 4) for width in widths]
225
+ while sum(widths) > available and any(
226
+ width > minimum for width, minimum in zip(widths, minimums)
227
+ ):
228
+ widest = max(range(len(widths)), key=lambda index: widths[index] - minimums[index])
229
+ widths[widest] -= 1
230
+ return widths
231
+
232
+
233
def _format_row(values: list[str], widths: list[int]) -> str:
    """Clip and left-pad each value to its column width, joined by `` | ``."""
    cells = []
    for value, width in zip(values, widths):
        cells.append(_clip_text(value, width).ljust(width))
    return " | ".join(cells)
235
+
236
+
237
def format_table(
    table: pa.Table,
    *,
    max_rows: int = DEFAULT_PRETTY_MAX_ROWS,
    max_width: int = DEFAULT_PRETTY_WIDTH,
    max_cell_width: int = DEFAULT_PRETTY_MAX_CELL_WIDTH,
    total_rows: int | None = None,
) -> str:
    """Render ``table`` as a fixed-width text table ending in a summary line.

    At most ``max_rows`` rows are shown. ``total_rows`` overrides the row count
    reported in the summary, for callers that pass in an already truncated
    table. The result always ends with a newline.
    """
    names = table.schema.names
    reported_rows = total_rows if total_rows is not None else table.num_rows
    if not names:
        return f"{reported_rows} rows x 0 columns\n"

    shown = min(max(max_rows, 0), table.num_rows)
    body = []
    for record in table.slice(0, shown).to_pylist():
        body.append([_cell_text(record.get(name)) for name in names])

    widths = _column_widths(
        names,
        body,
        max_width=max_width,
        max_cell_width=max(max_cell_width, 4),
    )
    header = _format_row(names, widths)
    rule = "-+-".join("-" * width for width in widths)

    summary = f"{reported_rows} rows x {table.num_columns} columns"
    if shown < reported_rows:
        summary = f"{summary} (showing first {shown})"

    lines = [header, rule, *(_format_row(cells, widths) for cells in body), summary]
    return "\n".join(lines) + "\n"
272
+
273
+
274
def write_terminal_table(table: pa.Table, sink: TextIO, *, max_width: int | None = None) -> None:
    """Write ``table`` to ``sink`` as text; a falsy ``max_width`` uses the terminal width."""
    if max_width:
        width = max_width
    else:
        width = shutil.get_terminal_size((DEFAULT_PRETTY_WIDTH, 24)).columns
    rendered = format_table(table, max_width=width)
    sink.write(rendered)
277
+
278
+
279
def read_parquet(path: str | Path) -> pa.Table:
    """Load the Parquet file at ``path`` into a table via ``pq.read_table``."""
    table = pq.read_table(path)
    return table
281
+
282
+
283
def write_parquet(table: pa.Table, path: str | Path) -> None:
    """Write ``table`` to a Parquet file at ``path``, creating parent directories."""
    destination = Path(path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    pq.write_table(table, destination)
stacpkg/assets.py ADDED
@@ -0,0 +1,306 @@
1
+ # Copyright 2026, Versioneer (https://versioneer.at)
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ from __future__ import annotations
5
+
6
+ import json
7
+ import logging
8
+ from pathlib import Path
9
+ from typing import BinaryIO
10
+ from urllib.parse import urlparse
11
+
12
+ import pyarrow as pa
13
+ import pyarrow.parquet as pq
14
+
15
+ from stacpkg.arrow_io import align_record_batch_to_schema
16
+ from stacpkg.items import filter_items
17
+ from stacpkg.locators import (
18
+ OBSTORE_STORE_TYPES,
19
+ child_location,
20
+ href_from_location,
21
+ location_from_href,
22
+ normalize_store_type,
23
+ )
24
+ from stacpkg.schemas import (
25
+ ASSET_LOCK_COLUMNS,
26
+ ASSET_LOCK_OPTIONAL_COLUMNS,
27
+ ASSET_LOCK_SCHEMA_VERSION,
28
+ SchemaKind,
29
+ asset_lock_schema,
30
+ with_schema_metadata,
31
+ )
32
+
33
# Module-level logger named after this module.
LOGGER = logging.getLogger(__name__)
# Default for derive_asset_lock's probe_metadata parameter.
DEFAULT_PROBE_METADATA = True
# Asset key that derive_asset_lock skips unless include_metadata_assets is set
# or asset_keys is given.
METADATA_ASSET_KEY = "metadata"
36
+
37
+
38
+ def _log_location(value: object) -> object:
39
+ if not isinstance(value, str):
40
+ return value
41
+ parsed = urlparse(value)
42
+ if parsed.query:
43
+ return parsed._replace(query="<redacted>").geturl()
44
+ return value
45
+
46
+
47
def asset_lock_table(rows: list[dict[str, object]]) -> pa.Table:
    """Build an asset-lock table from row dicts.

    Each row is compacted first, the table is created with the asset-lock
    schema, and its schema metadata is replaced with the metadata produced by
    ``with_schema_metadata`` for the asset-lock kind and version.
    """
    compacted = [_compact_asset_lock_row(row) for row in rows]
    table = pa.Table.from_pylist(compacted, schema=asset_lock_schema())
    tagged_schema = with_schema_metadata(
        table.schema,
        SchemaKind.ASSET_LOCK,
        ASSET_LOCK_SCHEMA_VERSION,
    )
    return table.replace_schema_metadata(tagged_schema.metadata)
57
+
58
+
59
def write_asset_lock_parquet_stream(source: BinaryIO, output_path: str | Path) -> None:
    """Write an Arrow IPC stream to a Parquet file using the asset-lock schema.

    Every incoming batch is aligned to the asset-lock schema before it is
    written. Missing parent directories of ``output_path`` are created.
    """
    destination = Path(output_path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    target_schema = asset_lock_schema()
    with (
        pa.ipc.open_stream(source) as ipc_reader,
        pq.ParquetWriter(destination, target_schema) as parquet_writer,
    ):
        for record_batch in ipc_reader:
            parquet_writer.write_batch(align_record_batch_to_schema(record_batch, target_schema))
67
+
68
+
69
def _compact_asset_lock_row(source: dict[str, object]) -> dict[str, object]:
    """Normalize a raw row dict into the shape used for asset-lock tables.

    The location comes from the explicit ``store_type`` / ``store_container`` /
    ``key`` fields, or from ``href`` when neither a store type nor a key is
    present. The size is read from ``size_bytes``, falling back to ``size``
    when ``size_bytes`` is missing or ``None``. Optional columns are copied
    only when they hold a non-``None`` value.
    """
    location = {
        "store_type": normalize_store_type(source.get("store_type")),
        "store_container": source.get("store_container"),
        "key": source.get("key"),
    }
    if not location["store_type"] and not location["key"]:
        location = location_from_href(source.get("href"))

    row = {
        "item_id": source.get("item_id"),
        "asset_key": source.get("asset_key"),
        "store_type": location.get("store_type"),
        "store_container": location.get("store_container"),
        "store_endpoint_url": _endpoint_url(source.get("store_endpoint_url")),
        "key": location.get("key"),
    }

    # dict.get's default only applies to a missing key, so a present-but-None
    # "size_bytes" has to be checked explicitly before falling back to "size".
    raw_size = source.get("size_bytes")
    if raw_size is None:
        raw_size = source.get("size")
    size = _size(raw_size)
    if size is not None:
        row["size_bytes"] = size

    for column in ASSET_LOCK_OPTIONAL_COLUMNS:
        if column == "size_bytes":
            continue
        value = source.get(column)
        if value is not None:
            row[column] = value
    return row
97
+
98
+
99
+ def _item_assets(item: dict[str, object]) -> dict[str, dict[str, object]]:
100
+ if "assets_json" in item:
101
+ return json.loads(str(item.get("assets_json") or "{}"))
102
+ assets = item.get("assets")
103
+ if isinstance(assets, dict):
104
+ return {
105
+ key: asset
106
+ for key, asset in assets.items()
107
+ if isinstance(asset, dict) and asset.get("href")
108
+ }
109
+ return {}
110
+
111
+
112
+ def _size(value: object) -> int | None:
113
+ if value is None:
114
+ return None
115
+ try:
116
+ return int(value)
117
+ except (TypeError, ValueError):
118
+ return None
119
+
120
+
121
+ def _endpoint_url(value: object) -> str | None:
122
+ if not isinstance(value, str) or not value.strip():
123
+ return None
124
+ endpoint = value.strip().rstrip("/")
125
+ if not urlparse(endpoint).scheme:
126
+ endpoint = f"https://{endpoint}"
127
+ return endpoint
128
+
129
+
130
def derive_asset_lock(
    items: pa.Table,
    *,
    probe_metadata: bool = DEFAULT_PROBE_METADATA,
    item_ids: set[str] | None = None,
    providers: set[str] | None = None,
    asset_keys: set[str] | None = None,
    include_metadata_assets: bool = False,
    keep_going: bool = False,
    max_workers: int | None = None,
) -> pa.Table:
    """Derive an asset-lock table from an items table.

    Items are filtered by ``item_ids`` / ``providers`` when either is given.
    When ``asset_keys`` is non-empty only those asset keys are kept; otherwise
    the ``metadata`` asset is skipped unless ``include_metadata_assets`` is
    set. With ``probe_metadata`` the table is passed through ``stat_assets``
    (forwarding ``keep_going`` and, when set, ``max_workers``).
    """
    if item_ids or providers:
        items = filter_items(items, item_ids=item_ids, providers=providers)

    explicit_selection = bool(asset_keys)
    rows: list[dict[str, object]] = []
    for item in items.to_pylist():
        for asset_key, asset in _item_assets(item).items():
            if explicit_selection:
                if asset_key not in asset_keys:
                    continue
            elif asset_key == METADATA_ASSET_KEY and not include_metadata_assets:
                continue

            row: dict[str, object] = {
                "item_id": item["id"],
                "asset_key": asset_key,
                **location_from_href(asset.get("href")),
            }
            endpoint = asset.get("store_endpoint_url")
            if endpoint:
                row["store_endpoint_url"] = endpoint
            size = _size(asset.get("file:size"))
            if size is not None:
                row["size_bytes"] = size
            rows.append(row)

    assets = asset_lock_table(rows)
    if not probe_metadata:
        return assets

    # Imported only when probing is requested, as in the original.
    from stacpkg.object_store import stat_assets

    stat_options: dict[str, object] = {"keep_going": keep_going}
    if max_workers is not None:
        stat_options["max_workers"] = max_workers
    return stat_assets(assets, **stat_options)
179
+
180
+
181
def map_asset_locations(
    asset_lock: pa.Table,
    *,
    target: str,
    source_prefix: str | None = None,
    layout: str = "item-asset",
    target_endpoint_url: str | None = None,
) -> pa.Table:
    """Map asset-lock rows to locations under the ``target`` href.

    ``target`` is parsed into a location; a non-empty ``target_endpoint_url``
    is normalized and attached to it. Row selection and mapping are delegated
    to ``_map_asset_locations_to_location``.
    """
    destination = location_from_href(target)
    endpoint = _endpoint_url(target_endpoint_url)
    if endpoint is not None:
        destination["store_endpoint_url"] = endpoint
    return _map_asset_locations_to_location(
        asset_lock,
        destination,
        source_prefix=source_prefix,
        layout=layout,
        log_target=target,
    )
200
+
201
+
202
def relocate_asset_locations(
    asset_lock: pa.Table,
    *,
    store_type: str,
    store_container: str | None = None,
    store_endpoint_url: str | None = None,
    key: str | None = None,
    source_prefix: str | None = None,
    layout: str = "item-asset",
) -> pa.Table:
    """Map asset-lock rows to a target given as explicit location fields.

    Raises:
        ValueError: if ``store_type`` is not supported, if ``key`` is missing
            for a ``file`` store, or if ``store_container`` is missing for an
            ``s3``, ``gs``, ``az``, ``http`` or ``https`` store.
    """
    resolved_type = normalize_store_type(store_type)
    if resolved_type is None:
        expected = ", ".join(OBSTORE_STORE_TYPES)
        raise ValueError(f"unsupported store_type: {store_type}. Expected one of: {expected}")

    key = key or ""
    if resolved_type == "file" and not key:
        raise ValueError("key is required when store_type is file")
    needs_container = resolved_type in ("s3", "gs", "az", "http", "https")
    if needs_container and not store_container:
        raise ValueError(f"store_container is required when store_type is {resolved_type}")

    destination = {
        "store_type": resolved_type,
        "store_container": store_container,
        "store_endpoint_url": _endpoint_url(store_endpoint_url),
        "key": key,
    }
    return _map_asset_locations_to_location(
        asset_lock,
        destination,
        source_prefix=source_prefix,
        layout=layout,
        log_target=href_from_location(destination),
    )
236
+
237
+
238
def _map_asset_locations_to_location(
    asset_lock: pa.Table,
    target_location: dict[str, object],
    *,
    source_prefix: str | None,
    layout: str,
    log_target: object,
) -> pa.Table:
    """Rewrite matching asset-lock rows to point below ``target_location``.

    Rows whose href does not match ``source_prefix`` pass through unchanged.
    Matching rows get the child location computed from ``target_path`` and
    lose every asset-lock column other than the identity, location and size
    fields. A summary is logged at INFO level with locations redacted.
    """
    from stacpkg.object_store import target_path

    retained = {
        "item_id",
        "asset_key",
        "store_type",
        "store_container",
        "store_endpoint_url",
        "key",
        "size_bytes",
    }
    # Columns removed from every remapped row.
    discarded = [name for name in ASSET_LOCK_COLUMNS if name not in retained]

    rows: list[dict[str, object]] = []
    mapped_count = 0
    for original in asset_lock.to_pylist():
        row = dict(original)
        if _matches_href_prefix(href_from_location(row), source_prefix):
            relative = target_path(row, layout=layout)
            row.update(child_location(target_location, relative))
            for name in discarded:
                row.pop(name, None)
            mapped_count += 1
        rows.append(row)

    LOGGER.info(
        "map asset locations completed: input_rows=%s mapped_rows=%s from=%s target=%s layout=%s",
        asset_lock.num_rows,
        mapped_count,
        _log_location(source_prefix),
        _log_location(log_target),
        layout,
    )
    return asset_lock_table(rows)
281
+
282
+
283
def plan_copy_assets(
    asset_lock: pa.Table,
    *,
    target: str,
    source_prefix: str | None = None,
    layout: str = "item-asset",
    target_endpoint_url: str | None = None,
) -> pa.Table:
    """Forward all arguments unchanged to ``map_asset_locations`` and return its result."""
    options = {
        "target": target,
        "source_prefix": source_prefix,
        "layout": layout,
        "target_endpoint_url": target_endpoint_url,
    }
    return map_asset_locations(asset_lock, **options)
298
+
299
+
300
+ def _matches_href_prefix(href: object, prefix: str | None) -> bool:
301
+ if prefix is None:
302
+ return True
303
+ if not isinstance(href, str) or not href:
304
+ return False
305
+ normalized = prefix.rstrip("/")
306
+ return href == normalized or href.startswith(f"{normalized}/")