pixelquery 0.1.0__py3-none-any.whl
This diff shows the content of package versions publicly released to one of the supported registries. It is provided for informational purposes only and reflects the changes between versions as they appear in their respective public registries.
- pixelquery/__init__.py +118 -0
- pixelquery/_internal/__init__.py +11 -0
- pixelquery/_internal/codecs.py +77 -0
- pixelquery/_internal/storage/__init__.py +7 -0
- pixelquery/_internal/storage/arrow_chunk.py +412 -0
- pixelquery/_internal/storage/base.py +44 -0
- pixelquery/_internal/storage/geoparquet.py +323 -0
- pixelquery/_internal/storage/iceberg_schema.py +106 -0
- pixelquery/_internal/storage/iceberg_storage.py +336 -0
- pixelquery/_internal/storage/icechunk_storage.py +177 -0
- pixelquery/_internal/transactions/__init__.py +14 -0
- pixelquery/_internal/transactions/base.py +87 -0
- pixelquery/catalog/__init__.py +48 -0
- pixelquery/catalog/iceberg.py +534 -0
- pixelquery/catalog/icechunk_catalog.py +291 -0
- pixelquery/catalog/local.py +409 -0
- pixelquery/catalog/product_profile.py +302 -0
- pixelquery/cli/__init__.py +92 -0
- pixelquery/cli/info.py +119 -0
- pixelquery/cli/migrate.py +102 -0
- pixelquery/cli/recovery.py +126 -0
- pixelquery/core/__init__.py +43 -0
- pixelquery/core/api.py +460 -0
- pixelquery/core/bandmath.py +93 -0
- pixelquery/core/dataarray.py +415 -0
- pixelquery/core/dataset.py +751 -0
- pixelquery/core/exceptions.py +35 -0
- pixelquery/core/interfaces.py +132 -0
- pixelquery/core/result.py +51 -0
- pixelquery/core/timeseries.py +80 -0
- pixelquery/grid/__init__.py +13 -0
- pixelquery/grid/base.py +60 -0
- pixelquery/grid/tile_grid.py +258 -0
- pixelquery/io/__init__.py +10 -0
- pixelquery/io/auto_ingest.py +284 -0
- pixelquery/io/cog.py +234 -0
- pixelquery/io/cog_metadata.py +109 -0
- pixelquery/io/iceberg_reader.py +575 -0
- pixelquery/io/iceberg_writer.py +333 -0
- pixelquery/io/icechunk_reader.py +327 -0
- pixelquery/io/icechunk_writer.py +200 -0
- pixelquery/io/ingest.py +760 -0
- pixelquery/products/__init__.py +12 -0
- pixelquery/products/base.py +54 -0
- pixelquery/products/profiles/__init__.py +15 -0
- pixelquery/products/profiles/landsat8.py +188 -0
- pixelquery/products/profiles/sentinel2.py +217 -0
- pixelquery/query/__init__.py +21 -0
- pixelquery/query/executor.py +261 -0
- pixelquery/query/spatial.py +318 -0
- pixelquery/sample_data.py +156 -0
- pixelquery/testing/__init__.py +8 -0
- pixelquery/util/__init__.py +13 -0
- pixelquery/util/migrate.py +300 -0
- pixelquery/util/recovery.py +226 -0
- pixelquery-0.1.0.dist-info/METADATA +297 -0
- pixelquery-0.1.0.dist-info/RECORD +59 -0
- pixelquery-0.1.0.dist-info/WHEEL +4 -0
- pixelquery-0.1.0.dist-info/licenses/LICENSE +201 -0
pixelquery/__init__.py
ADDED
@@ -0,0 +1,118 @@
"""
PixelQuery - Turn your COG files into an analysis-ready time-series data cube

Zero-copy virtual references to Cloud-Optimized GeoTIFFs via Icechunk.

Quick Start:
    >>> import pixelquery as pq
    >>>
    >>> # Ingest COGs from a directory
    >>> result = pq.ingest("./my_cogs/", band_names=["blue", "green", "red", "nir"])
    >>>
    >>> # Query as lazy xarray Dataset
    >>> ds = pq.open_xarray("./warehouse")
    >>> ndvi = ds.bandmath("(b3 - b2) / (b3 + b2)")  # by band index
    >>> ndvi = ds.bandmath("(nir - red) / (nir + red)")  # by name
    >>>
    >>> # Point time-series
    >>> ts = pq.timeseries("./warehouse", lon=127.05, lat=37.55)
"""

# Apply imagecodecs compatibility patch for Icechunk/VirtualTIFF
from pixelquery._internal.codecs import patch_imagecodecs

patch_imagecodecs()

from pixelquery.core import (
    DataArray,
    # Classes
    Dataset,
    IngestionError,
    # Protocols
    PixelQuery,
    # Exceptions
    PixelQueryError,
    QueryError,
    QueryResult,
    TransactionError,
    ValidationError,
    compute_evi,
    compute_ndvi,
    list_tiles,
    # Functions
    open_dataset,
    open_mfdataset,
    open_xarray,
)

# Legacy imports (may fail if deps not installed)
try:
    from pixelquery.products import BandInfo
except ImportError:
    BandInfo = None  # type: ignore[misc, assignment]

try:
    from pixelquery.grid import TileGrid
except ImportError:
    TileGrid = None  # type: ignore[misc, assignment]

# Register xarray BandMath accessor (ds.bandmath.ndvi(), etc.)
import pixelquery.core.bandmath

__version__ = "0.1.0"

__all__ = [
    "DataArray",
    "Dataset",
    "IngestionError",
    "PixelQueryError",
    "QueryError",
    "TransactionError",
    "ValidationError",
    "__version__",
    "catalog",
    "compute_evi",
    "compute_ndvi",
    "ingest",
    "inspect_cog",
    "inspect_directory",
    "list_tiles",
    "open_dataset",
    "open_mfdataset",
    "open_xarray",
    "register_product",
    "timeseries",
]


# Lazy imports for new Icechunk features (avoids heavy import at startup)
def __getattr__(name):
    if name == "ingest":
        from pixelquery.io.auto_ingest import ingest

        return ingest
    elif name == "timeseries":
        from pixelquery.core.timeseries import timeseries

        return timeseries
    elif name == "inspect_cog":
        from pixelquery.io.cog_metadata import inspect_cog

        return inspect_cog
    elif name == "inspect_directory":
        from pixelquery.io.cog_metadata import inspect_directory

        return inspect_directory
    elif name == "catalog":
        from pixelquery.catalog.icechunk_catalog import IcechunkCatalog

        return IcechunkCatalog
    elif name == "register_product":
        from pixelquery.catalog.product_profile import register_product

        return register_product
    elif name == "ProductProfile":
        from pixelquery.catalog.product_profile import ProductProfile

        return ProductProfile
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
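A note on the module-level `__getattr__` above: it implements PEP 562 lazy attribute resolution, so the heavy ingest/catalog modules load only on first use. A small runtime sketch of the behavior (the `sys.modules` assertions assume that `import pixelquery` does not already pull in `pixelquery.io.auto_ingest` through another import path):

import sys

import pixelquery as pq

# Assumption: nothing imported so far has loaded the ingest module.
assert "pixelquery.io.auto_ingest" not in sys.modules

ingest = pq.ingest  # attribute access invokes pixelquery.__getattr__
assert "pixelquery.io.auto_ingest" in sys.modules

# Unknown attributes still fail loudly via the final raise:
try:
    pq.no_such_name
except AttributeError as exc:
    print(exc)  # module 'pixelquery' has no attribute 'no_such_name'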
pixelquery/_internal/codecs.py
ADDED
@@ -0,0 +1,77 @@
"""
Codec compatibility layer for Icechunk/VirtualTIFF.

Fixes imagecodecs numcodecs bug where from_config() passes
'name'/'configuration' keys that __init__() doesn't accept.
Applied once at package import time.
"""

import inspect
import logging

logger = logging.getLogger(__name__)
_PATCHED = False


def patch_imagecodecs():
    """
    Monkeypatch all imagecodecs numcodecs to strip extra keys from from_config().

    Safe to call multiple times; only patches once.
    """
    global _PATCHED
    if _PATCHED:
        return

    try:
        import imagecodecs.numcodecs as ic_numcodecs
    except ImportError:
        logger.debug("imagecodecs.numcodecs not available, skipping codec patch")
        return

    patched_count = 0
    for name in dir(ic_numcodecs):
        cls = getattr(ic_numcodecs, name)
        if not isinstance(cls, type) or not hasattr(cls, "from_config"):
            continue

        try:
            init_params = set(inspect.signature(cls.__init__).parameters.keys()) - {"self"}  # type: ignore[misc]
        except (ValueError, TypeError):
            init_params = set()

        def _make_patched(valid_params):
            @classmethod  # type: ignore[misc]
            def patched_from_config(klass, config):
                config = dict(config)
                for key in ("id", "name", "configuration"):
                    config.pop(key, None)
                if valid_params:
                    config = {k: v for k, v in config.items() if k in valid_params}
                return klass(**config)

            return patched_from_config

        cls.from_config = _make_patched(init_params)
        patched_count += 1

    # Register virtual_tiff codecs under their short names for zarr v3.
    # Entry points register them as "virtual_tiff.ChunkyCodec" but zarr
    # metadata stores just "ChunkyCodec".
    try:
        import zarr.registry
        from virtual_tiff.codecs import ChunkyCodec, HorizontalDeltaCodec

        for codec_cls in (ChunkyCodec, HorizontalDeltaCodec):
            codec_name = codec_cls.__name__
            try:
                zarr.registry.get_codec_class(codec_name)
            except KeyError:
                zarr.registry.register_codec(codec_name, codec_cls)
                patched_count += 1
    except ImportError:
        pass

    _PATCHED = True
    if patched_count:
        logger.debug("Patched %d codecs for imagecodecs/virtual_tiff compatibility", patched_count)
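To illustrate the bug this patch works around, here is a toy reproduction; `ToyCodec` is a hypothetical stand-in that mimics the imagecodecs pattern of `from_config()` forwarding every config key straight to `__init__()`:

import inspect


class ToyCodec:
    def __init__(self, level=1):
        self.level = level

    @classmethod
    def from_config(cls, config):
        return cls(**config)  # breaks if config carries registry keys


# Unpatched: TypeError: __init__() got an unexpected keyword argument 'id'
try:
    ToyCodec.from_config({"id": "toy", "level": 3})
except TypeError as exc:
    print(exc)

# What patched_from_config does: drop registry keys, keep accepted params.
raw = {"id": "toy", "name": "toy", "level": 3}
valid = set(inspect.signature(ToyCodec.__init__).parameters) - {"self"}
clean = {k: v for k, v in raw.items() if k in valid}
print(ToyCodec.from_config(clean).level)  # -> 3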
pixelquery/_internal/storage/arrow_chunk.py
ADDED
@@ -0,0 +1,412 @@
"""
Arrow IPC Chunk Storage

Implements monthly spatiotemporal chunk storage using Arrow IPC format.

NOTE: With Iceberg integration, this module is retained for backwards compatibility
with existing Arrow-based warehouses. New warehouses should use Iceberg storage
via IcebergStorageManager.

Performance: Uses Rust extensions when available for 2-35x faster I/O.
"""

from datetime import UTC, datetime
from pathlib import Path

import numpy as np
import pyarrow as pa
import pyarrow.ipc as ipc
from numpy.typing import NDArray

# Try to use Rust Arrow functions (2-35x faster than Python)
try:
    from pixelquery_core import (  # type: ignore[attr-defined]
        arrow_append_to_chunk,
        arrow_write_chunk,
    )

    RUST_ARROW_AVAILABLE = True
except ImportError:
    RUST_ARROW_AVAILABLE = False


class ArrowChunkWriter:
    """
    Arrow IPC chunk writer for monthly spatiotemporal data

    Each chunk represents a single tile-month-band combination:
    - Tile: Geographic tile (e.g., x0024_y0041)
    - Month: Temporal partition (e.g., 2024-01)
    - Band: Spectral band (e.g., red, nir)

    File naming convention:
        {warehouse}/{table}/data/{tile_id}/{year}-{month:02d}/{band}.arrow

    Arrow Schema:
    - time: timestamp[ms] - Observation timestamps
    - pixels: list<uint16> - Variable-length pixel arrays (multi-resolution support)
    - mask: list<bool> - Cloud/invalid pixel masks
    - metadata: map<string, string> - Additional metadata

    Examples:
        >>> writer = ArrowChunkWriter()
        >>> data = {
        ...     'time': [datetime(2024, 1, 1), datetime(2024, 1, 15)],
        ...     'pixels': [np.array([1000, 1100, ...]), np.array([1050, 1150, ...])],
        ...     'mask': [np.array([False, False, ...]), np.array([True, False, ...])]
        ... }
        >>> writer.write_chunk(
        ...     path="warehouse/table/data/x0024_y0041/2024-01/red.arrow",
        ...     data=data,
        ...     product_id="sentinel2_l2a",
        ...     resolution=10.0
        ... )
    """

    # Arrow schema for spatiotemporal chunks
    SCHEMA = pa.schema(
        [
            ("time", pa.timestamp("ms", tz="UTC")),
            ("pixels", pa.list_(pa.uint16())),  # Variable-length arrays
            ("mask", pa.list_(pa.bool_())),  # Cloud/invalid mask
        ]
    )

    def write_chunk(
        self,
        path: str,
        data: dict[str, list],
        product_id: str,
        resolution: float,
        metadata: dict[str, str] | None = None,
    ) -> None:
        """
        Write spatiotemporal chunk to Arrow IPC file

        Args:
            path: Output file path
            data: Dictionary with keys 'time', 'pixels', 'mask'
            product_id: Product identifier (e.g., "sentinel2_l2a")
            resolution: Spatial resolution in meters
            metadata: Additional metadata

        Raises:
            ValueError: If data is invalid
            IOError: If write fails
        """
        # Validate input data
        self._validate_data(data)

        # Use Rust implementation if available (2-35x faster)
        if RUST_ARROW_AVAILABLE:
            self._write_chunk_rust(path, data, product_id, resolution, metadata)
            return

        # Fall back to Python implementation
        # Convert to Arrow arrays
        time_array = pa.array(data["time"], type=pa.timestamp("ms", tz="UTC"))
        pixels_array = self._convert_pixels_to_arrow(data["pixels"])
        mask_array = self._convert_mask_to_arrow(data["mask"])

        # Create record batch
        batch = pa.RecordBatch.from_arrays(
            [time_array, pixels_array, mask_array], schema=self.SCHEMA
        )

        # Prepare metadata
        chunk_metadata = {
            "product_id": product_id,
            "resolution": str(resolution),
            "num_observations": str(len(data["time"])),
            "creation_time": datetime.now(UTC).isoformat(),
        }
        if metadata:
            chunk_metadata.update(metadata)

        # Create schema with metadata
        schema_with_metadata = self.SCHEMA.with_metadata(chunk_metadata)

        # Write to file
        Path(path).parent.mkdir(parents=True, exist_ok=True)
        with pa.OSFile(path, "wb") as sink, ipc.new_file(sink, schema_with_metadata) as writer:
            writer.write_batch(batch)

    def _validate_data(self, data: dict[str, list]) -> None:
        """Validate input data structure"""
        required_keys = {"time", "pixels", "mask"}
        if not required_keys.issubset(data.keys()):
            missing = required_keys - data.keys()
            raise ValueError(f"Missing required keys: {missing}")

        n_obs = len(data["time"])
        if len(data["pixels"]) != n_obs:
            raise ValueError(f"pixels length ({len(data['pixels'])}) != time length ({n_obs})")
        if len(data["mask"]) != n_obs:
            raise ValueError(f"mask length ({len(data['mask'])}) != time length ({n_obs})")

    def _convert_pixels_to_arrow(self, pixels: list[NDArray]) -> pa.Array:
        """Convert list of numpy arrays to Arrow list array"""
        # Convert each numpy array to list
        pixels_list = [arr.astype(np.uint16).tolist() for arr in pixels]
        return pa.array(pixels_list, type=pa.list_(pa.uint16()))

    def _convert_mask_to_arrow(self, masks: list[NDArray]) -> pa.Array:
        """Convert list of mask arrays to Arrow list array"""
        masks_list = [arr.astype(bool).tolist() for arr in masks]
        return pa.array(masks_list, type=pa.list_(pa.bool_()))

    def _write_chunk_rust(
        self,
        path: str,
        data: dict[str, list],
        product_id: str,
        resolution: float,
        metadata: dict[str, str] | None = None,
    ) -> None:
        """Write chunk using Rust implementation (2-35x faster)"""
        # Convert timestamps to milliseconds since epoch
        times_ms = [int(t.timestamp() * 1000) for t in data["time"]]

        # Convert numpy arrays to lists
        pixels_list = [arr.astype(np.uint16).flatten().tolist() for arr in data["pixels"]]
        masks_list = [arr.astype(bool).flatten().tolist() for arr in data["mask"]]

        # Prepare metadata
        rust_metadata = {
            "product_id": product_id,
            "resolution": str(resolution),
        }
        if metadata:
            rust_metadata.update(metadata)

        # Call Rust function
        arrow_write_chunk(path, times_ms, pixels_list, masks_list, rust_metadata)

    def _append_to_chunk_rust(
        self,
        path: str,
        data: dict[str, list],
        product_id: str,
        resolution: float,
        metadata: dict[str, str] | None = None,
    ) -> None:
        """Append to chunk using Rust implementation (2-3x faster)"""
        # Convert timestamps to milliseconds since epoch
        times_ms = [int(t.timestamp() * 1000) for t in data["time"]]

        # Convert numpy arrays to lists
        pixels_list = [arr.astype(np.uint16).flatten().tolist() for arr in data["pixels"]]
        masks_list = [arr.astype(bool).flatten().tolist() for arr in data["mask"]]

        # Prepare metadata
        rust_metadata = {
            "product_id": product_id,
            "resolution": str(resolution),
        }
        if metadata:
            rust_metadata.update(metadata)

        # Call Rust function
        arrow_append_to_chunk(path, times_ms, pixels_list, masks_list, rust_metadata)

    def append_to_chunk(
        self,
        path: str,
        data: dict[str, list],
        product_id: str,
        resolution: float,
        metadata: dict[str, str] | None = None,
    ) -> None:
        """
        Append new observations to existing chunk file

        This is more efficient than reading the entire file, concatenating in Python,
        and rewriting. Uses Rust implementation when available (2-3x faster).

        Args:
            path: Chunk file path
            data: Dictionary with keys 'time', 'pixels', 'mask' (single observation each)
            product_id: Product identifier
            resolution: Spatial resolution in meters
            metadata: Additional metadata

        Raises:
            ValueError: If data is invalid
            IOError: If read/write fails
        """
        path_obj = Path(path)

        # Validate input data
        self._validate_data(data)

        # Use Rust implementation if available (2-3x faster)
        if RUST_ARROW_AVAILABLE:
            self._append_to_chunk_rust(path, data, product_id, resolution, metadata)
            return

        if not path_obj.exists():
            # First write - use regular write_chunk
            self.write_chunk(path, data, product_id, resolution, metadata)
            return

        # Read existing data as Arrow table (avoid Python conversion)
        with pa.OSFile(str(path), "rb") as source, ipc.open_file(source) as reader:
            existing_table = reader.read_all()
            # Preserve existing metadata
            existing_metadata = dict(reader.schema.metadata) if reader.schema.metadata else {}

        # Convert new data to Arrow arrays
        new_time_array = pa.array(data["time"], type=pa.timestamp("ms", tz="UTC"))
        new_pixels_array = self._convert_pixels_to_arrow(data["pixels"])
        new_mask_array = self._convert_mask_to_arrow(data["mask"])

        # Create record batch for new data
        new_batch = pa.RecordBatch.from_arrays(
            [new_time_array, new_pixels_array, new_mask_array], schema=self.SCHEMA
        )

        # Convert to table and concatenate with existing
        new_table = pa.Table.from_batches([new_batch])
        combined_table = pa.concat_tables([existing_table, new_table])

        # Update metadata
        chunk_metadata = {
            k.decode() if isinstance(k, bytes) else k: v.decode() if isinstance(v, bytes) else v
            for k, v in existing_metadata.items()
        }
        chunk_metadata.update(
            {
                "product_id": product_id,
                "resolution": str(resolution),
                "num_observations": str(len(combined_table)),
                "last_updated": datetime.now(UTC).isoformat(),
            }
        )
        if metadata:
            chunk_metadata.update(metadata)

        # Create schema with updated metadata
        schema_with_metadata = self.SCHEMA.with_metadata(chunk_metadata)

        # Write combined data atomically
        # Use temporary file + rename for atomic write
        temp_path = str(path_obj) + ".tmp"
        try:
            with (
                pa.OSFile(temp_path, "wb") as sink,
                ipc.new_file(sink, schema_with_metadata) as writer,
            ):
                writer.write_table(combined_table)

            # Atomic rename
            import os

            os.replace(temp_path, str(path_obj))
        except Exception as e:
            # Clean up temp file on error
            if Path(temp_path).exists():
                Path(temp_path).unlink()
            raise e


class ArrowChunkReader:
    """
    Arrow IPC chunk reader for monthly spatiotemporal data

    Reads chunks written by ArrowChunkWriter and reconstructs
    spatiotemporal data.

    Examples:
        >>> reader = ArrowChunkReader()
        >>> data, metadata = reader.read_chunk(
        ...     "warehouse/table/data/x0024_y0041/2024-01/red.arrow"
        ... )
        >>> print(len(data['time']))  # Number of observations
        2
        >>> print(data['pixels'][0].shape)  # First observation pixel array
        (256, 256)
    """

    def read_chunk(
        self, path: str, reshape: tuple[int, int] | None = None
    ) -> tuple[dict[str, list], dict[str, str]]:
        """
        Read spatiotemporal chunk from Arrow IPC file

        Args:
            path: Input file path
            reshape: Optional (height, width) to reshape pixel arrays

        Returns:
            Tuple of (data, metadata):
            - data: Dictionary with 'time', 'pixels', 'mask'
            - metadata: Chunk metadata (product_id, resolution, etc.)

        Raises:
            FileNotFoundError: If file doesn't exist
            IOError: If read fails
        """
        if not Path(path).exists():
            raise FileNotFoundError(f"Chunk file not found: {path}")

        # Read Arrow file
        with pa.OSFile(path, "rb") as source, ipc.open_file(source) as reader:
            # Get metadata from schema
            metadata = dict(reader.schema.metadata) if reader.schema.metadata else {}
            # Decode bytes to strings
            metadata = {k.decode(): v.decode() for k, v in metadata.items()}

            # Read all batches (typically just one per chunk)
            table = reader.read_all()

        # Convert to Python objects
        data = {
            "time": table["time"].to_pylist(),
            "pixels": self._convert_pixels_from_arrow(table["pixels"], reshape),
            "mask": self._convert_mask_from_arrow(table["mask"], reshape),
        }

        return data, metadata

    def _convert_pixels_from_arrow(
        self, arrow_array: pa.Array, reshape: tuple[int, int] | None
    ) -> list[NDArray]:
        """Convert Arrow list array to list of numpy arrays"""
        pixels_list = arrow_array.to_pylist()
        arrays = [np.array(pixels, dtype=np.uint16) for pixels in pixels_list]

        if reshape:
            height, width = reshape
            arrays = [arr.reshape(height, width) for arr in arrays]

        return arrays

    def _convert_mask_from_arrow(
        self, arrow_array: pa.Array, reshape: tuple[int, int] | None
    ) -> list[NDArray]:
        """Convert Arrow list array to list of mask arrays"""
        mask_list = arrow_array.to_pylist()
        arrays = [np.array(mask, dtype=bool) for mask in mask_list]

        if reshape:
            height, width = reshape
            arrays = [arr.reshape(height, width) for arr in arrays]

        return arrays

    def read_chunk_metadata(self, path: str) -> dict[str, str]:
        """
        Read only chunk metadata without loading data

        Args:
            path: Input file path

        Returns:
            Chunk metadata dictionary
        """
        if not Path(path).exists():
            raise FileNotFoundError(f"Chunk file not found: {path}")

        with pa.OSFile(path, "rb") as source, ipc.open_file(source) as reader:
            metadata = dict(reader.schema.metadata) if reader.schema.metadata else {}
            return {k.decode(): v.decode() for k, v in metadata.items()}
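A round-trip sketch for the writer/reader pair above. In the Python fallback path each observation is stored as a flat list<uint16>, so 2-D rasters are flattened on write and restored via `reshape` on read; the warehouse path and pixel values here are illustrative:

from datetime import UTC, datetime

import numpy as np

from pixelquery._internal.storage.arrow_chunk import ArrowChunkReader, ArrowChunkWriter

writer = ArrowChunkWriter()
h = w = 256
data = {
    "time": [datetime(2024, 1, 1, tzinfo=UTC), datetime(2024, 1, 15, tzinfo=UTC)],
    "pixels": [np.full(h * w, 1000, dtype=np.uint16),
               np.full(h * w, 1100, dtype=np.uint16)],
    "mask": [np.zeros(h * w, dtype=bool), np.zeros(h * w, dtype=bool)],
}
writer.write_chunk(
    path="warehouse/table/data/x0024_y0041/2024-01/red.arrow",
    data=data,
    product_id="sentinel2_l2a",
    resolution=10.0,
)

reader = ArrowChunkReader()
out, meta = reader.read_chunk(
    "warehouse/table/data/x0024_y0041/2024-01/red.arrow", reshape=(h, w)
)
print(len(out["time"]), out["pixels"][0].shape, meta["product_id"])
# 2 (256, 256) sentinel2_l2a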
pixelquery/_internal/storage/base.py
ADDED
@@ -0,0 +1,44 @@
"""
Storage Backend Protocol

Abstract storage interface for multiple backends (local, S3, Azure, GCS).
"""

from typing import Protocol


class StorageBackend(Protocol):
    """
    Abstract storage interface

    Enables multiple storage backends (local filesystem, S3, Azure, GCS)
    while maintaining consistent API.
    """

    def read_bytes(self, path: str) -> bytes:
        """Read file contents as bytes"""
        ...

    def write_bytes(self, path: str, data: bytes) -> None:
        """Write bytes to file"""
        ...

    def atomic_rename(self, src: str, dest: str) -> None:
        """
        Atomically rename file (critical for transactions)

        This is used for two-phase commit: write to .tmp, then rename.
        """
        ...

    def delete(self, path: str) -> None:
        """Delete file"""
        ...

    def exists(self, path: str) -> bool:
        """Check if file exists"""
        ...

    def list_files(self, prefix: str) -> list[str]:
        """List files matching prefix (for discovery)"""
        ...
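Because `StorageBackend` is a `typing.Protocol`, implementations satisfy it structurally rather than by inheritance. A minimal local-filesystem sketch exercising the two-phase-commit rename mentioned in the docstring (this concrete class is illustrative, not shipped in the wheel):

import os
import tempfile
from pathlib import Path

from pixelquery._internal.storage.base import StorageBackend


class LocalBackend:
    """Illustrative local-filesystem implementation of the protocol."""

    def read_bytes(self, path: str) -> bytes:
        return Path(path).read_bytes()

    def write_bytes(self, path: str, data: bytes) -> None:
        p = Path(path)
        p.parent.mkdir(parents=True, exist_ok=True)
        p.write_bytes(data)

    def atomic_rename(self, src: str, dest: str) -> None:
        os.replace(src, dest)  # atomic when src and dest share a filesystem

    def delete(self, path: str) -> None:
        Path(path).unlink()

    def exists(self, path: str) -> bool:
        return Path(path).exists()

    def list_files(self, prefix: str) -> list[str]:
        root = Path(prefix)
        base = root if root.is_dir() else root.parent
        return sorted(str(p) for p in base.rglob("*")
                      if p.is_file() and str(p).startswith(prefix))


backend: StorageBackend = LocalBackend()  # structural typing; no subclassing
tmp = tempfile.mkdtemp()
backend.write_bytes(f"{tmp}/chunk.arrow.tmp", b"\x00\x01")
backend.atomic_rename(f"{tmp}/chunk.arrow.tmp", f"{tmp}/chunk.arrow")  # commit step
assert backend.list_files(f"{tmp}/chunk") == [f"{tmp}/chunk.arrow"]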