pixelquery-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. pixelquery/__init__.py +118 -0
  2. pixelquery/_internal/__init__.py +11 -0
  3. pixelquery/_internal/codecs.py +77 -0
  4. pixelquery/_internal/storage/__init__.py +7 -0
  5. pixelquery/_internal/storage/arrow_chunk.py +412 -0
  6. pixelquery/_internal/storage/base.py +44 -0
  7. pixelquery/_internal/storage/geoparquet.py +323 -0
  8. pixelquery/_internal/storage/iceberg_schema.py +106 -0
  9. pixelquery/_internal/storage/iceberg_storage.py +336 -0
  10. pixelquery/_internal/storage/icechunk_storage.py +177 -0
  11. pixelquery/_internal/transactions/__init__.py +14 -0
  12. pixelquery/_internal/transactions/base.py +87 -0
  13. pixelquery/catalog/__init__.py +48 -0
  14. pixelquery/catalog/iceberg.py +534 -0
  15. pixelquery/catalog/icechunk_catalog.py +291 -0
  16. pixelquery/catalog/local.py +409 -0
  17. pixelquery/catalog/product_profile.py +302 -0
  18. pixelquery/cli/__init__.py +92 -0
  19. pixelquery/cli/info.py +119 -0
  20. pixelquery/cli/migrate.py +102 -0
  21. pixelquery/cli/recovery.py +126 -0
  22. pixelquery/core/__init__.py +43 -0
  23. pixelquery/core/api.py +460 -0
  24. pixelquery/core/bandmath.py +93 -0
  25. pixelquery/core/dataarray.py +415 -0
  26. pixelquery/core/dataset.py +751 -0
  27. pixelquery/core/exceptions.py +35 -0
  28. pixelquery/core/interfaces.py +132 -0
  29. pixelquery/core/result.py +51 -0
  30. pixelquery/core/timeseries.py +80 -0
  31. pixelquery/grid/__init__.py +13 -0
  32. pixelquery/grid/base.py +60 -0
  33. pixelquery/grid/tile_grid.py +258 -0
  34. pixelquery/io/__init__.py +10 -0
  35. pixelquery/io/auto_ingest.py +284 -0
  36. pixelquery/io/cog.py +234 -0
  37. pixelquery/io/cog_metadata.py +109 -0
  38. pixelquery/io/iceberg_reader.py +575 -0
  39. pixelquery/io/iceberg_writer.py +333 -0
  40. pixelquery/io/icechunk_reader.py +327 -0
  41. pixelquery/io/icechunk_writer.py +200 -0
  42. pixelquery/io/ingest.py +760 -0
  43. pixelquery/products/__init__.py +12 -0
  44. pixelquery/products/base.py +54 -0
  45. pixelquery/products/profiles/__init__.py +15 -0
  46. pixelquery/products/profiles/landsat8.py +188 -0
  47. pixelquery/products/profiles/sentinel2.py +217 -0
  48. pixelquery/query/__init__.py +21 -0
  49. pixelquery/query/executor.py +261 -0
  50. pixelquery/query/spatial.py +318 -0
  51. pixelquery/sample_data.py +156 -0
  52. pixelquery/testing/__init__.py +8 -0
  53. pixelquery/util/__init__.py +13 -0
  54. pixelquery/util/migrate.py +300 -0
  55. pixelquery/util/recovery.py +226 -0
  56. pixelquery-0.1.0.dist-info/METADATA +297 -0
  57. pixelquery-0.1.0.dist-info/RECORD +59 -0
  58. pixelquery-0.1.0.dist-info/WHEEL +4 -0
  59. pixelquery-0.1.0.dist-info/licenses/LICENSE +201 -0
pixelquery/__init__.py ADDED
@@ -0,0 +1,118 @@
+ """
+ PixelQuery - Turn your COG files into an analysis-ready time-series data cube
+
+ Zero-copy virtual references to Cloud-Optimized GeoTIFFs via Icechunk.
+
+ Quick Start:
+     >>> import pixelquery as pq
+     >>>
+     >>> # Ingest COGs from a directory
+     >>> result = pq.ingest("./my_cogs/", band_names=["blue", "green", "red", "nir"])
+     >>>
+     >>> # Query as lazy xarray Dataset
+     >>> ds = pq.open_xarray("./warehouse")
+     >>> ndvi = ds.bandmath("(b3 - b2) / (b3 + b2)")  # by band index
+     >>> ndvi = ds.bandmath("(nir - red) / (nir + red)")  # by name
+     >>>
+     >>> # Point time-series
+     >>> ts = pq.timeseries("./warehouse", lon=127.05, lat=37.55)
+ """
+
+ # Apply imagecodecs compatibility patch for Icechunk/VirtualTIFF
+ from pixelquery._internal.codecs import patch_imagecodecs
+
+ patch_imagecodecs()
+
+ from pixelquery.core import (
+     DataArray,
+     # Classes
+     Dataset,
+     IngestionError,
+     # Protocols
+     PixelQuery,
+     # Exceptions
+     PixelQueryError,
+     QueryError,
+     QueryResult,
+     TransactionError,
+     ValidationError,
+     compute_evi,
+     compute_ndvi,
+     list_tiles,
+     # Functions
+     open_dataset,
+     open_mfdataset,
+     open_xarray,
+ )
+
+ # Legacy imports (may fail if deps not installed)
+ try:
+     from pixelquery.products import BandInfo
+ except ImportError:
+     BandInfo = None  # type: ignore[misc, assignment]
+
+ try:
+     from pixelquery.grid import TileGrid
+ except ImportError:
+     TileGrid = None  # type: ignore[misc, assignment]
+
+ # Register xarray BandMath accessor (ds.bandmath.ndvi(), etc.)
+ import pixelquery.core.bandmath
+
+ __version__ = "0.1.0"
+
+ __all__ = [
+     "DataArray",
+     "Dataset",
+     "IngestionError",
+     "PixelQueryError",
+     "QueryError",
+     "TransactionError",
+     "ValidationError",
+     "__version__",
+     "catalog",
+     "compute_evi",
+     "compute_ndvi",
+     "ingest",
+     "inspect_cog",
+     "inspect_directory",
+     "list_tiles",
+     "open_dataset",
+     "open_mfdataset",
+     "open_xarray",
+     "register_product",
+     "timeseries",
+ ]
+
+
+ # Lazy imports for new Icechunk features (avoids heavy import at startup)
+ def __getattr__(name):
+     if name == "ingest":
+         from pixelquery.io.auto_ingest import ingest
+
+         return ingest
+     elif name == "timeseries":
+         from pixelquery.core.timeseries import timeseries
+
+         return timeseries
+     elif name == "inspect_cog":
+         from pixelquery.io.cog_metadata import inspect_cog
+
+         return inspect_cog
+     elif name == "inspect_directory":
+         from pixelquery.io.cog_metadata import inspect_directory
+
+         return inspect_directory
+     elif name == "catalog":
+         from pixelquery.catalog.icechunk_catalog import IcechunkCatalog
+
+         return IcechunkCatalog
+     elif name == "register_product":
+         from pixelquery.catalog.product_profile import register_product
+
+         return register_product
+     elif name == "ProductProfile":
+         from pixelquery.catalog.product_profile import ProductProfile
+
+         return ProductProfile
+     raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
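
The __getattr__ hook above is the module-level lazy-import pattern from PEP 562: heavy submodules are imported only when the attribute is first accessed. A minimal self-contained sketch of the same technique, using a lookup table instead of an if/elif chain (the table entries here are illustrative stdlib names, not pixelquery's targets):

# lazy_demo.py - PEP 562 module-level lazy attributes, same shape as the
# __getattr__ hook in pixelquery/__init__.py (table entries are illustrative)
import importlib

_LAZY = {
    "sqrt": ("math", "sqrt"),  # attribute -> (module path, attribute name)
    "dataclass": ("dataclasses", "dataclass"),
}

def __getattr__(name):
    try:
        module_path, attr = _LAZY[name]
    except KeyError:
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}") from None
    obj = getattr(importlib.import_module(module_path), attr)
    globals()[name] = obj  # cache: later lookups bypass __getattr__ entirely
    return obj

Imported as a module, lazy_demo.sqrt triggers the import of math on first access only; pixelquery uses the same hook so that attributes like pq.ingest and pq.timeseries defer their heavier I/O dependencies until actually used.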
pixelquery/_internal/__init__.py ADDED
@@ -0,0 +1,11 @@
+ """
+ PixelQuery Internal Module
+
+ ⚠️ PRIVATE API - Do not use directly!
+
+ This module contains internal implementation details.
+ The API may change without notice.
+ """
+
+ # No exports - internal use only
+ __all__: list[str] = []
pixelquery/_internal/codecs.py ADDED
@@ -0,0 +1,77 @@
+ """
+ Codec compatibility layer for Icechunk/VirtualTIFF.
+
+ Fixes an imagecodecs numcodecs bug where from_config() passes
+ 'name'/'configuration' keys that __init__() doesn't accept.
+ Applied once at package import time.
+ """
+
+ import inspect
+ import logging
+
+ logger = logging.getLogger(__name__)
+ _PATCHED = False
+
+
+ def patch_imagecodecs():
+     """
+     Monkeypatch all imagecodecs numcodecs to strip extra keys from from_config().
+
+     Safe to call multiple times; only patches once.
+     """
+     global _PATCHED
+     if _PATCHED:
+         return
+
+     try:
+         import imagecodecs.numcodecs as ic_numcodecs
+     except ImportError:
+         logger.debug("imagecodecs.numcodecs not available, skipping codec patch")
+         return
+
+     patched_count = 0
+     for name in dir(ic_numcodecs):
+         cls = getattr(ic_numcodecs, name)
+         if not isinstance(cls, type) or not hasattr(cls, "from_config"):
+             continue
+
+         try:
+             init_params = set(inspect.signature(cls.__init__).parameters.keys()) - {"self"}  # type: ignore[misc]
+         except (ValueError, TypeError):
+             init_params = set()
+
+         def _make_patched(valid_params):
+             @classmethod  # type: ignore[misc]
+             def patched_from_config(klass, config):
+                 config = dict(config)
+                 for key in ("id", "name", "configuration"):
+                     config.pop(key, None)
+                 if valid_params:
+                     config = {k: v for k, v in config.items() if k in valid_params}
+                 return klass(**config)
+
+             return patched_from_config
+
+         cls.from_config = _make_patched(init_params)
+         patched_count += 1
+
+     # Register virtual_tiff codecs under their short names for zarr v3.
+     # Entry points register them as "virtual_tiff.ChunkyCodec" but zarr
+     # metadata stores just "ChunkyCodec".
+     try:
+         import zarr.registry
+         from virtual_tiff.codecs import ChunkyCodec, HorizontalDeltaCodec
+
+         for codec_cls in (ChunkyCodec, HorizontalDeltaCodec):
+             codec_name = codec_cls.__name__
+             try:
+                 zarr.registry.get_codec_class(codec_name)
+             except KeyError:
+                 zarr.registry.register_codec(codec_name, codec_cls)
+                 patched_count += 1
+     except ImportError:
+         pass
+
+     _PATCHED = True
+     if patched_count:
+         logger.debug("Patched %d codecs for imagecodecs/virtual_tiff compatibility", patched_count)
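
To see what patch_imagecodecs() works around: zarr v3 codec configs carry 'name'/'configuration' (and numcodecs adds 'id') alongside the real constructor arguments, and a naive from_config() forwards all of them to __init__(). A self-contained sketch of the same shim applied to a stand-in DemoCodec class (not a real imagecodecs codec; names here are illustrative):

# from_config_filter.py - sketch of the compatibility shim, on a stand-in codec
import inspect

class DemoCodec:
    codec_id = "demo"

    def __init__(self, level=1):
        self.level = level

    @classmethod
    def from_config(cls, config):
        return cls(**config)  # TypeError if config carries 'name'/'configuration'

# Keep only kwargs that __init__ actually accepts
valid = set(inspect.signature(DemoCodec.__init__).parameters) - {"self"}

@classmethod
def patched_from_config(klass, config):
    cleaned = {k: v for k, v in dict(config).items()
               if k not in ("id", "name", "configuration") and k in valid}
    return klass(**cleaned)

DemoCodec.from_config = patched_from_config

# A zarr-v3-style config that previously raised TypeError now round-trips:
codec = DemoCodec.from_config({"name": "demo", "configuration": {}, "level": 3})
assert codec.level == 3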
pixelquery/_internal/storage/__init__.py ADDED
@@ -0,0 +1,7 @@
+ """
+ Internal Storage Module
+
+ ⚠️ PRIVATE API - Do not use directly!
+ """
+
+ __all__: list[str] = []
pixelquery/_internal/storage/arrow_chunk.py ADDED
@@ -0,0 +1,412 @@
+ """
+ Arrow IPC Chunk Storage
+
+ Implements monthly spatiotemporal chunk storage using the Arrow IPC format.
+
+ NOTE: With Iceberg integration, this module is retained for backwards compatibility
+ with existing Arrow-based warehouses. New warehouses should use Iceberg storage
+ via IcebergStorageManager.
+
+ Performance: Uses Rust extensions when available for 2-35x faster I/O.
+ """
+
+ from datetime import UTC, datetime
+ from pathlib import Path
+
+ import numpy as np
+ import pyarrow as pa
+ import pyarrow.ipc as ipc
+ from numpy.typing import NDArray
+
+ # Try to use Rust Arrow functions (2-35x faster than Python)
+ try:
+     from pixelquery_core import (  # type: ignore[attr-defined]
+         arrow_append_to_chunk,
+         arrow_write_chunk,
+     )
+
+     RUST_ARROW_AVAILABLE = True
+ except ImportError:
+     RUST_ARROW_AVAILABLE = False
+
+
+ class ArrowChunkWriter:
+     """
+     Arrow IPC chunk writer for monthly spatiotemporal data
+
+     Each chunk represents a single tile-month-band combination:
+     - Tile: Geographic tile (e.g., x0024_y0041)
+     - Month: Temporal partition (e.g., 2024-01)
+     - Band: Spectral band (e.g., red, nir)
+
+     File naming convention:
+         {warehouse}/{table}/data/{tile_id}/{year}-{month:02d}/{band}.arrow
+
+     Arrow Schema:
+     - time: timestamp[ms] - Observation timestamps
+     - pixels: list<uint16> - Variable-length pixel arrays (multi-resolution support)
+     - mask: list<bool> - Cloud/invalid pixel masks
+     - metadata: map<string, string> - Additional metadata
+
+     Examples:
+         >>> writer = ArrowChunkWriter()
+         >>> data = {
+         ...     'time': [datetime(2024, 1, 1), datetime(2024, 1, 15)],
+         ...     'pixels': [np.array([1000, 1100, ...]), np.array([1050, 1150, ...])],
+         ...     'mask': [np.array([False, False, ...]), np.array([True, False, ...])]
+         ... }
+         >>> writer.write_chunk(
+         ...     path="warehouse/table/data/x0024_y0041/2024-01/red.arrow",
+         ...     data=data,
+         ...     product_id="sentinel2_l2a",
+         ...     resolution=10.0
+         ... )
+     """
+
+     # Arrow schema for spatiotemporal chunks
+     SCHEMA = pa.schema(
+         [
+             ("time", pa.timestamp("ms", tz="UTC")),
+             ("pixels", pa.list_(pa.uint16())),  # Variable-length arrays
+             ("mask", pa.list_(pa.bool_())),  # Cloud/invalid mask
+         ]
+     )
+
+     def write_chunk(
+         self,
+         path: str,
+         data: dict[str, list],
+         product_id: str,
+         resolution: float,
+         metadata: dict[str, str] | None = None,
+     ) -> None:
+         """
+         Write spatiotemporal chunk to Arrow IPC file
+
+         Args:
+             path: Output file path
+             data: Dictionary with keys 'time', 'pixels', 'mask'
+             product_id: Product identifier (e.g., "sentinel2_l2a")
+             resolution: Spatial resolution in meters
+             metadata: Additional metadata
+
+         Raises:
+             ValueError: If data is invalid
+             IOError: If write fails
+         """
+         # Validate input data
+         self._validate_data(data)
+
+         # Use Rust implementation if available (2-35x faster)
+         if RUST_ARROW_AVAILABLE:
+             self._write_chunk_rust(path, data, product_id, resolution, metadata)
+             return
+
+         # Fall back to Python implementation
+         # Convert to Arrow arrays
+         time_array = pa.array(data["time"], type=pa.timestamp("ms", tz="UTC"))
+         pixels_array = self._convert_pixels_to_arrow(data["pixels"])
+         mask_array = self._convert_mask_to_arrow(data["mask"])
+
+         # Create record batch
+         batch = pa.RecordBatch.from_arrays(
+             [time_array, pixels_array, mask_array], schema=self.SCHEMA
+         )
+
+         # Prepare metadata
+         chunk_metadata = {
+             "product_id": product_id,
+             "resolution": str(resolution),
+             "num_observations": str(len(data["time"])),
+             "creation_time": datetime.now(UTC).isoformat(),
+         }
+         if metadata:
+             chunk_metadata.update(metadata)
+
+         # Create schema with metadata
+         schema_with_metadata = self.SCHEMA.with_metadata(chunk_metadata)
+
+         # Write to file
+         Path(path).parent.mkdir(parents=True, exist_ok=True)
+         with pa.OSFile(path, "wb") as sink, ipc.new_file(sink, schema_with_metadata) as writer:
+             writer.write_batch(batch)
+
+     def _validate_data(self, data: dict[str, list]) -> None:
+         """Validate input data structure"""
+         required_keys = {"time", "pixels", "mask"}
+         if not required_keys.issubset(data.keys()):
+             missing = required_keys - data.keys()
+             raise ValueError(f"Missing required keys: {missing}")
+
+         n_obs = len(data["time"])
+         if len(data["pixels"]) != n_obs:
+             raise ValueError(f"pixels length ({len(data['pixels'])}) != time length ({n_obs})")
+         if len(data["mask"]) != n_obs:
+             raise ValueError(f"mask length ({len(data['mask'])}) != time length ({n_obs})")
+
+     def _convert_pixels_to_arrow(self, pixels: list[NDArray]) -> pa.Array:
+         """Convert list of numpy arrays to Arrow list array"""
+         # Convert each numpy array to list
+         pixels_list = [arr.astype(np.uint16).tolist() for arr in pixels]
+         return pa.array(pixels_list, type=pa.list_(pa.uint16()))
+
+     def _convert_mask_to_arrow(self, masks: list[NDArray]) -> pa.Array:
+         """Convert list of mask arrays to Arrow list array"""
+         masks_list = [arr.astype(bool).tolist() for arr in masks]
+         return pa.array(masks_list, type=pa.list_(pa.bool_()))
+
+     def _write_chunk_rust(
+         self,
+         path: str,
+         data: dict[str, list],
+         product_id: str,
+         resolution: float,
+         metadata: dict[str, str] | None = None,
+     ) -> None:
+         """Write chunk using Rust implementation (2-35x faster)"""
+         # Convert timestamps to milliseconds since epoch
+         times_ms = [int(t.timestamp() * 1000) for t in data["time"]]
+
+         # Convert numpy arrays to lists
+         pixels_list = [arr.astype(np.uint16).flatten().tolist() for arr in data["pixels"]]
+         masks_list = [arr.astype(bool).flatten().tolist() for arr in data["mask"]]
+
+         # Prepare metadata
+         rust_metadata = {
+             "product_id": product_id,
+             "resolution": str(resolution),
+         }
+         if metadata:
+             rust_metadata.update(metadata)
+
+         # Call Rust function
+         arrow_write_chunk(path, times_ms, pixels_list, masks_list, rust_metadata)
+
+     def _append_to_chunk_rust(
+         self,
+         path: str,
+         data: dict[str, list],
+         product_id: str,
+         resolution: float,
+         metadata: dict[str, str] | None = None,
+     ) -> None:
+         """Append to chunk using Rust implementation (2-3x faster)"""
+         # Convert timestamps to milliseconds since epoch
+         times_ms = [int(t.timestamp() * 1000) for t in data["time"]]
+
+         # Convert numpy arrays to lists
+         pixels_list = [arr.astype(np.uint16).flatten().tolist() for arr in data["pixels"]]
+         masks_list = [arr.astype(bool).flatten().tolist() for arr in data["mask"]]
+
+         # Prepare metadata
+         rust_metadata = {
+             "product_id": product_id,
+             "resolution": str(resolution),
+         }
+         if metadata:
+             rust_metadata.update(metadata)
+
+         # Call Rust function
+         arrow_append_to_chunk(path, times_ms, pixels_list, masks_list, rust_metadata)
+
+     def append_to_chunk(
+         self,
+         path: str,
+         data: dict[str, list],
+         product_id: str,
+         resolution: float,
+         metadata: dict[str, str] | None = None,
+     ) -> None:
+         """
+         Append new observations to existing chunk file
+
+         This is more efficient than reading the entire file, concatenating in Python,
+         and rewriting. Uses Rust implementation when available (2-3x faster).
+
+         Args:
+             path: Chunk file path
+             data: Dictionary with keys 'time', 'pixels', 'mask' (single observation each)
+             product_id: Product identifier
+             resolution: Spatial resolution in meters
+             metadata: Additional metadata
+
+         Raises:
+             ValueError: If data is invalid
+             IOError: If read/write fails
+         """
+         path_obj = Path(path)
+
+         # Validate input data
+         self._validate_data(data)
+
+         # Use Rust implementation if available (2-3x faster)
+         if RUST_ARROW_AVAILABLE:
+             self._append_to_chunk_rust(path, data, product_id, resolution, metadata)
+             return
+
+         if not path_obj.exists():
+             # First write - use regular write_chunk
+             self.write_chunk(path, data, product_id, resolution, metadata)
+             return
+
+         # Read existing data as Arrow table (avoid Python conversion)
+         with pa.OSFile(str(path), "rb") as source, ipc.open_file(source) as reader:
+             existing_table = reader.read_all()
+             # Preserve existing metadata
+             existing_metadata = dict(reader.schema.metadata) if reader.schema.metadata else {}
+
+         # Convert new data to Arrow arrays
+         new_time_array = pa.array(data["time"], type=pa.timestamp("ms", tz="UTC"))
+         new_pixels_array = self._convert_pixels_to_arrow(data["pixels"])
+         new_mask_array = self._convert_mask_to_arrow(data["mask"])
+
+         # Create record batch for new data
+         new_batch = pa.RecordBatch.from_arrays(
+             [new_time_array, new_pixels_array, new_mask_array], schema=self.SCHEMA
+         )
+
+         # Convert to table and concatenate with existing
+         new_table = pa.Table.from_batches([new_batch])
+         combined_table = pa.concat_tables([existing_table, new_table])
+
+         # Update metadata
+         chunk_metadata = {
+             k.decode() if isinstance(k, bytes) else k: v.decode() if isinstance(v, bytes) else v
+             for k, v in existing_metadata.items()
+         }
+         chunk_metadata.update(
+             {
+                 "product_id": product_id,
+                 "resolution": str(resolution),
+                 "num_observations": str(len(combined_table)),
+                 "last_updated": datetime.now(UTC).isoformat(),
+             }
+         )
+         if metadata:
+             chunk_metadata.update(metadata)
+
+         # Create schema with updated metadata
+         schema_with_metadata = self.SCHEMA.with_metadata(chunk_metadata)
+
+         # Write combined data atomically
+         # Use temporary file + rename for atomic write
+         temp_path = str(path_obj) + ".tmp"
+         try:
+             with (
+                 pa.OSFile(temp_path, "wb") as sink,
+                 ipc.new_file(sink, schema_with_metadata) as writer,
+             ):
+                 writer.write_table(combined_table)
+
+             # Atomic rename
+             import os
+
+             os.replace(temp_path, str(path_obj))
+         except Exception as e:
+             # Clean up temp file on error
+             if Path(temp_path).exists():
+                 Path(temp_path).unlink()
+             raise e
+
+
+ class ArrowChunkReader:
+     """
+     Arrow IPC chunk reader for monthly spatiotemporal data
+
+     Reads chunks written by ArrowChunkWriter and reconstructs
+     spatiotemporal data.
+
+     Examples:
+         >>> reader = ArrowChunkReader()
+         >>> data, metadata = reader.read_chunk(
+         ...     "warehouse/table/data/x0024_y0041/2024-01/red.arrow", reshape=(256, 256)
+         ... )
+         >>> print(len(data['time']))  # Number of observations
+         2
+         >>> print(data['pixels'][0].shape)  # First observation pixel array
+         (256, 256)
+     """
+
+     def read_chunk(
+         self, path: str, reshape: tuple[int, int] | None = None
+     ) -> tuple[dict[str, list], dict[str, str]]:
+         """
+         Read spatiotemporal chunk from Arrow IPC file
+
+         Args:
+             path: Input file path
+             reshape: Optional (height, width) to reshape pixel arrays
+
+         Returns:
+             Tuple of (data, metadata):
+             - data: Dictionary with 'time', 'pixels', 'mask'
+             - metadata: Chunk metadata (product_id, resolution, etc.)
+
+         Raises:
+             FileNotFoundError: If file doesn't exist
+             IOError: If read fails
+         """
+         if not Path(path).exists():
+             raise FileNotFoundError(f"Chunk file not found: {path}")
+
+         # Read Arrow file
+         with pa.OSFile(path, "rb") as source, ipc.open_file(source) as reader:
+             # Get metadata from schema
+             metadata = dict(reader.schema.metadata) if reader.schema.metadata else {}
+             # Decode bytes to strings
+             metadata = {k.decode(): v.decode() for k, v in metadata.items()}
+
+             # Read all batches (typically just one per chunk)
+             table = reader.read_all()
+
+         # Convert to Python objects
+         data = {
+             "time": table["time"].to_pylist(),
+             "pixels": self._convert_pixels_from_arrow(table["pixels"], reshape),
+             "mask": self._convert_mask_from_arrow(table["mask"], reshape),
+         }
+
+         return data, metadata
+
+     def _convert_pixels_from_arrow(
+         self, arrow_array: pa.Array, reshape: tuple[int, int] | None
+     ) -> list[NDArray]:
+         """Convert Arrow list array to list of numpy arrays"""
+         pixels_list = arrow_array.to_pylist()
+         arrays = [np.array(pixels, dtype=np.uint16) for pixels in pixels_list]
+
+         if reshape:
+             height, width = reshape
+             arrays = [arr.reshape(height, width) for arr in arrays]
+
+         return arrays
+
+     def _convert_mask_from_arrow(
+         self, arrow_array: pa.Array, reshape: tuple[int, int] | None
+     ) -> list[NDArray]:
+         """Convert Arrow list array to list of mask arrays"""
+         mask_list = arrow_array.to_pylist()
+         arrays = [np.array(mask, dtype=bool) for mask in mask_list]
+
+         if reshape:
+             height, width = reshape
+             arrays = [arr.reshape(height, width) for arr in arrays]
+
+         return arrays
+
+     def read_chunk_metadata(self, path: str) -> dict[str, str]:
+         """
+         Read only chunk metadata without loading data
+
+         Args:
+             path: Input file path
+
+         Returns:
+             Chunk metadata dictionary
+         """
+         if not Path(path).exists():
+             raise FileNotFoundError(f"Chunk file not found: {path}")
+
+         with pa.OSFile(path, "rb") as source, ipc.open_file(source) as reader:
+             metadata = dict(reader.schema.metadata) if reader.schema.metadata else {}
+             return {k.decode(): v.decode() for k, v in metadata.items()}
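
A minimal round trip through the writer and reader defined above; the path, band, and pixel values are illustrative, and the import reaches into the private module purely for demonstration (with the Rust extension installed, write_chunk transparently takes the fast path):

# round_trip_demo.py - write one monthly chunk and read it back
from datetime import datetime, timezone

import numpy as np

from pixelquery._internal.storage.arrow_chunk import ArrowChunkReader, ArrowChunkWriter

h = w = 256  # tile dimensions; pixels are stored flat, reshaped on read
data = {
    "time": [datetime(2024, 1, 1, tzinfo=timezone.utc),
             datetime(2024, 1, 15, tzinfo=timezone.utc)],
    "pixels": [np.full(h * w, 1000, dtype=np.uint16),
               np.full(h * w, 1100, dtype=np.uint16)],
    "mask": [np.zeros(h * w, dtype=bool),
             np.zeros(h * w, dtype=bool)],
}

writer = ArrowChunkWriter()
writer.write_chunk(
    path="warehouse/table/data/x0024_y0041/2024-01/red.arrow",
    data=data,
    product_id="sentinel2_l2a",
    resolution=10.0,
)

reader = ArrowChunkReader()
out, meta = reader.read_chunk(
    "warehouse/table/data/x0024_y0041/2024-01/red.arrow", reshape=(h, w)
)
assert out["pixels"][0].shape == (h, w)        # flat arrays reshaped per tile
assert meta["product_id"] == "sentinel2_l2a"   # schema metadata round-trips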
pixelquery/_internal/storage/base.py ADDED
@@ -0,0 +1,44 @@
+ """
+ Storage Backend Protocol
+
+ Abstract storage interface for multiple backends (local, S3, Azure, GCS).
+ """
+
+ from typing import Protocol
+
+
+ class StorageBackend(Protocol):
+     """
+     Abstract storage interface
+
+     Enables multiple storage backends (local filesystem, S3, Azure, GCS)
+     while maintaining a consistent API.
+     """
+
+     def read_bytes(self, path: str) -> bytes:
+         """Read file contents as bytes"""
+         ...
+
+     def write_bytes(self, path: str, data: bytes) -> None:
+         """Write bytes to file"""
+         ...
+
+     def atomic_rename(self, src: str, dest: str) -> None:
+         """
+         Atomically rename a file (critical for transactions)
+
+         This is used for two-phase commit: write to .tmp, then rename.
+         """
+         ...
+
+     def delete(self, path: str) -> None:
+         """Delete file"""
+         ...
+
+     def exists(self, path: str) -> bool:
+         """Check if file exists"""
+         ...
+
+     def list_files(self, prefix: str) -> list[str]:
+         """List files matching prefix (for discovery)"""
+         ...
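
Because StorageBackend is a typing.Protocol, implementations satisfy it structurally, with no inheritance required. For reference, a sketch of a minimal local-filesystem backend that satisfies the protocol (illustrative only; pixelquery's actual local backend may differ):

# local_backend.py - sketch of a local-filesystem StorageBackend (illustrative)
import os
from pathlib import Path


class LocalStorageBackend:
    """Satisfies the StorageBackend protocol for a local filesystem."""

    def read_bytes(self, path: str) -> bytes:
        return Path(path).read_bytes()

    def write_bytes(self, path: str, data: bytes) -> None:
        p = Path(path)
        p.parent.mkdir(parents=True, exist_ok=True)
        p.write_bytes(data)

    def atomic_rename(self, src: str, dest: str) -> None:
        # os.replace is atomic on POSIX and on Windows within one volume,
        # which is what the write-to-.tmp-then-rename commit step relies on
        os.replace(src, dest)

    def delete(self, path: str) -> None:
        Path(path).unlink()

    def exists(self, path: str) -> bool:
        return Path(path).exists()

    def list_files(self, prefix: str) -> list[str]:
        # Treat prefix as a directory or a path prefix within its parent
        base = Path(prefix)
        root = base if base.is_dir() else base.parent
        return sorted(
            str(p) for p in root.rglob("*")
            if p.is_file() and str(p).startswith(str(base))
        )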