cubexpress 0.1.9__tar.gz → 0.1.10__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cubexpress might be problematic.
- {cubexpress-0.1.9 → cubexpress-0.1.10}/PKG-INFO +2 -1
- {cubexpress-0.1.9 → cubexpress-0.1.10}/cubexpress/cloud_utils.py +19 -12
- {cubexpress-0.1.9 → cubexpress-0.1.10}/cubexpress/cube.py +24 -5
- {cubexpress-0.1.9 → cubexpress-0.1.10}/cubexpress/downloader.py +37 -37
- {cubexpress-0.1.9 → cubexpress-0.1.10}/cubexpress/geotyping.py +5 -15
- {cubexpress-0.1.9 → cubexpress-0.1.10}/cubexpress/request.py +42 -19
- {cubexpress-0.1.9 → cubexpress-0.1.10}/pyproject.toml +2 -1
- {cubexpress-0.1.9 → cubexpress-0.1.10}/LICENSE +0 -0
- {cubexpress-0.1.9 → cubexpress-0.1.10}/README.md +0 -0
- {cubexpress-0.1.9 → cubexpress-0.1.10}/cubexpress/__init__.py +0 -0
- {cubexpress-0.1.9 → cubexpress-0.1.10}/cubexpress/cache.py +0 -0
- {cubexpress-0.1.9 → cubexpress-0.1.10}/cubexpress/conversion.py +0 -0
- {cubexpress-0.1.9 → cubexpress-0.1.10}/cubexpress/geospatial.py +0 -0
--- cubexpress-0.1.9/PKG-INFO
+++ cubexpress-0.1.10/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cubexpress
-Version: 0.1.9
+Version: 0.1.10
 Summary: Efficient processing of cubic Earth-observation (EO) data.
 Home-page: https://github.com/andesdatacube/cubexpress
 License: MIT
@@ -20,6 +20,7 @@ Requires-Dist: earthengine-api (>=1.5.12)
 Requires-Dist: numpy (>=2.0.2)
 Requires-Dist: pandas (>=2.2.2)
 Requires-Dist: pyarrow (>=14.0.0)
+Requires-Dist: pydantic (>=2.11.4)
 Requires-Dist: pygeohash (>=1.2.0)
 Requires-Dist: pyproj (>=3.6.0)
 Requires-Dist: rasterio (>=1.3.9)
--- cubexpress-0.1.9/cubexpress/cloud_utils.py
+++ cubexpress-0.1.10/cubexpress/cloud_utils.py
@@ -46,7 +46,7 @@ def _cloud_table_single_range(
     * ``id`` – Sentinel-2 ID
     * ``cs_cdf`` – Cloud Score Plus CDF (0–1)
     * ``date`` – acquisition date (YYYY-MM-DD)
-    * ``
+    * ``null_flag`` – 1 if cloud score missing

     Notes
     -----
@@ -83,7 +83,7 @@ def _cloud_table_single_range(
     except ee.ee_exception.EEException as e:
         if "No bands in collection" in str(e):
             return pd.DataFrame(
-                columns=["id", "cs_cdf", "date", "
+                columns=["id", "cs_cdf", "date", "null_flag"]
             )
         raise

@@ -95,7 +95,7 @@ def _cloud_table_single_range(
         .merge(df_raw, on="id", how="left")
         .assign(
             date=lambda d: pd.to_datetime(d["id"].str[:8], format="%Y%m%d").dt.strftime("%Y-%m-%d"),
-
+            null_flag=lambda d: d["cs_cdf"].isna().astype(int),
         )
         .drop(columns=["longitude", "latitude", "time"])
     )
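The new ``null_flag`` column simply marks acquisitions whose Cloud Score Plus value came back empty. A minimal sketch of the idiom with toy data (not the package's real Earth Engine query):

    import pandas as pd

    # Toy metadata table: one row per Sentinel-2 acquisition, one score missing.
    df = pd.DataFrame({
        "id": ["20240101T000000_A", "20240106T000000_B"],
        "cs_cdf": [0.93, None],
    })

    df = df.assign(
        # Acquisition date parsed from the first 8 characters of the S2 ID.
        date=lambda d: pd.to_datetime(d["id"].str[:8], format="%Y%m%d").dt.strftime("%Y-%m-%d"),
        # 1 where the cloud score is missing, 0 otherwise.
        null_flag=lambda d: d["cs_cdf"].isna().astype(int),
    )
    # df["null_flag"] is now [0, 1].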
@@ -161,7 +161,7 @@ def s2_cloud_table(
     # ─── 1. Load cached data if present ────────────────────────────────────
     if cache and cache_file.exists():
         if verbose:
-            print("📂 Loading cached
+            print("📂 Loading cached metadata …")
         df_cached = pd.read_parquet(cache_file)
         have_idx = pd.to_datetime(df_cached["date"], errors="coerce").dropna()

@@ -173,7 +173,7 @@ def s2_cloud_table(
             and dt.date.fromisoformat(end) <= cached_end
         ):
             if verbose:
-                print("✅ Served entirely from
+                print("✅ Served entirely from metadata.")
             df_full = df_cached
         else:
             # Identify missing segments and fetch only those.
@@ -192,15 +192,21 @@ def s2_cloud_table(
                     lon, lat, edge_size, a2, b2
                 )
             )
-
-
-
-
-
+            df_new_parts = [df for df in df_new_parts if not df.empty]
+
+            if df_new_parts:
+
+                df_new = pd.concat(df_new_parts, ignore_index=True)
+                df_full = (
+                    pd.concat([df_cached, df_new], ignore_index=True)
+                    .sort_values("date", kind="mergesort")
+                )
+            else:
+                df_full = df_cached
     else:

         if verbose:
-            msg = "Generating
+            msg = "Generating metadata (no cache found)…" if cache else "Generating metadata…"
             print("⏳", msg)
         df_full = _cloud_table_single_range(
             lon, lat, edge_size, start, end
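The rewritten cache branch drops empty fetch results before concatenating and falls back to the cached table when nothing new arrived. A self-contained sketch of the same filter/concat/sort idiom (invented frames; ``mergesort`` is stable, so rows sharing a date keep their cached order):

    import pandas as pd

    df_cached = pd.DataFrame({"date": ["2024-01-01", "2024-03-01"], "cs_cdf": [0.9, 0.8]})
    df_new_parts = [
        pd.DataFrame({"date": ["2024-02-01"], "cs_cdf": [0.7]}),
        pd.DataFrame(columns=["date", "cs_cdf"]),  # an empty fetch for a covered gap
    ]

    # Drop empty segments; concatenating them would only upcast dtypes.
    df_new_parts = [df for df in df_new_parts if not df.empty]

    if df_new_parts:
        df_new = pd.concat(df_new_parts, ignore_index=True)
        df_full = (
            pd.concat([df_cached, df_new], ignore_index=True)
            .sort_values("date", kind="mergesort")
        )
    else:
        df_full = df_cached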
@@ -230,4 +236,5 @@ def s2_cloud_table(
             "collection": collection
         }
     )
-    return result
+    return result
+
--- cubexpress-0.1.9/cubexpress/cube.py
+++ cubexpress-0.1.10/cubexpress/cube.py
@@ -29,7 +29,6 @@ def get_geotiff(
     manifest: Dict[str, Any],
     full_outname: pathlib.Path | str,
     join: bool = True,
-    eraser: bool = True,
    nworks: int = 4,
    verbose: bool = True,
 ) -> None:
@@ -52,7 +51,7 @@ def get_geotiff(
         size = manifest["grid"]["dimensions"]["width"]  # square images assumed
         cell_w, cell_h, power = calculate_cell_size(str(err), size)
         tiled = quadsplit_manifest(manifest, cell_w, cell_h, power)
-        download_manifests(tiled, full_outname, join,
+        download_manifests(tiled, full_outname, join, nworks)

     if verbose:
         print(f"Downloaded {full_outname}")
@@ -61,10 +60,11 @@ def get_geotiff(
 def get_cube(
     table: pd.DataFrame,
     outfolder: pathlib.Path | str,
-    join: bool = True,
-    eraser: bool = True,
     mosaic: bool = True,
+    join: bool = True,
     nworks: int = 4,
+    verbose: bool = True,
+    cache: bool = True
 ) -> None:
     """Download every request in *requests* to *outfolder* using a thread pool.

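The ``eraser`` flag is gone and ``verbose``/``cache`` are new; note the annotation still reads ``-> None`` even though ``get_cube`` now returns a DataFrame (see the end of the next hunk). A hedged usage sketch of the new signature, assuming ``table`` came from ``s2_cloud_table``:

    download_df = get_cube(
        table,                  # metadata table from s2_cloud_table(...)
        outfolder="~/s2_cubes",
        mosaic=True,            # merge same-day scenes into one request
        join=True,              # stitch quadsplit tiles back into one GeoTIFF
        nworks=4,               # thread-pool size
        verbose=True,
        cache=True,             # skip outputs that already exist on disk
    )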
@@ -85,16 +85,35 @@ def get_cube(
         table=table,
         mosaic=mosaic
     )
+
+    outfolder = pathlib.Path(outfolder).expanduser().resolve()

     with concurrent.futures.ThreadPoolExecutor(max_workers=nworks) as pool:
         futures = []
         for _, row in requests._dataframe.iterrows():
             outname = pathlib.Path(outfolder) / f"{row.id}.tif"
+            if outname.exists() and cache:
+                continue
             outname.parent.mkdir(parents=True, exist_ok=True)
-            futures.append(
+            futures.append(
+                pool.submit(
+                    get_geotiff,
+                    row.manifest,
+                    outname,
+                    join,
+                    nworks,
+                    verbose
+                )
+            )

         for fut in concurrent.futures.as_completed(futures):
             try:
                 fut.result()
             except Exception as exc:  # noqa: BLE001 – log and keep going
                 print(f"Download error: {exc}")
+
+    download_df = requests._dataframe[["outname", "cs_cdf", "date"]].copy()
+    download_df["outname"] = outfolder / requests._dataframe["outname"]
+    download_df.rename(columns={"outname": "full_outname"}, inplace=True)
+
+    return download_df
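The ``cache`` flag is a plain skip-if-exists check at the file level; the same idiom in isolation (hypothetical helper name):

    import pathlib

    def should_download(outname: pathlib.Path, cache: bool) -> bool:
        # cache=True treats an existing file as a hit and skips it;
        # cache=False forces a re-download even if the file is present.
        return not (cache and outname.exists())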
--- cubexpress-0.1.9/cubexpress/downloader.py
+++ cubexpress-0.1.10/cubexpress/downloader.py
@@ -25,6 +25,7 @@ from rasterio.merge import merge
 from rasterio.enums import Resampling
 import os
 import shutil
+import tempfile

 os.environ['CPL_LOG_ERRORS'] = 'OFF'
 logging.getLogger('rasterio._env').setLevel(logging.ERROR)
@@ -53,7 +54,7 @@ def download_manifest(ulist: Dict[str, Any], full_outname: pathlib.Path) -> None
         driver="GTiff",
         tiled=True,
         interleave="band",
-        blockxsize=256,
+        blockxsize=256,  # TODO: I think this should be 128 (because of the super-resolution)
         blockysize=256,
         compress="ZSTD",
         # zstd_level=13,
@@ -69,10 +70,9 @@ def download_manifest(ulist: Dict[str, Any], full_outname: pathlib.Path) -> None
             dst.write(src.read())

 def download_manifests(
-    manifests:
+    manifests: list[Dict[str, Any]],
     full_outname: pathlib.Path,
     join: bool = True,
-    eraser: bool = True,
     max_workers: int = 4,
 ) -> None:
     """Download every manifest in *manifests* concurrently.
@@ -81,6 +81,12 @@ def download_manifests(
     ``full_outname.parent/full_outname.stem`` with names ``000000.tif``,
     ``000001.tif`` … according to the list order.
     """
+    # full_outname = pathlib.Path("/home/contreras/Documents/GitHub/cubexpress/cubexpress_test/2017-08-19_6mfrw_18LVN.tif")
+    original_dir = full_outname.parent
+    if join:
+        tmp_dir = pathlib.Path(tempfile.mkdtemp(prefix="s2tmp_"))
+        full_outname = tmp_dir / full_outname.name
+
     with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
         futures = []

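With ``join`` set, tiles are now staged in a throwaway directory from ``tempfile.mkdtemp`` and only the finished mosaic lands next to the requested path. A minimal sketch of that staging pattern (hypothetical paths, not the package's exact flow):

    import pathlib
    import shutil
    import tempfile

    final = pathlib.Path("~/s2_cubes/scene.tif").expanduser()

    tmp_dir = pathlib.Path(tempfile.mkdtemp(prefix="s2tmp_"))
    staged = tmp_dir / final.name  # downloads are written under the temp dir

    try:
        staged.write_bytes(b"")  # stand-in for the real download/merge step
        final.parent.mkdir(parents=True, exist_ok=True)
        shutil.move(str(staged), str(final))  # publish only the finished file
    finally:
        shutil.rmtree(tmp_dir, ignore_errors=True)  # temp tiles never linger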
@@ -95,41 +101,35 @@ def download_manifests(
                 fut.result()
             except Exception as exc:  # noqa: BLE001
                 print(f"Error in one of the downloads: {exc}")  # noqa: T201
-
-    if join:
-
-        dir_path = full_outname.parent / full_outname.stem
-        input_files = sorted(dir_path.glob("*.tif"))
-
-        if dir_path.exists() and len(input_files) > 1:
-
-            with rio.Env(GDAL_NUM_THREADS="8", NUM_THREADS="8"):
-                srcs = [rio.open(fp) for fp in input_files]
-                mosaic, out_transform = merge(
-                    srcs,
-                    nodata=65535,
-                    resampling=Resampling.nearest
-                )

-
-
-                meta.update(
-                    height=mosaic.shape[1],
-                    width=mosaic.shape[2]
-                )
+    dir_path = full_outname.parent / full_outname.stem
+    input_files = sorted(dir_path.glob("*.tif"))

-
-                    dst.write(mosaic)
+    if dir_path.exists() and len(input_files) > 1:

-
-
-
-
-
-
-
-        print("✅ Mosaic generated:", full_outname)
-        return full_outname
+        with rio.Env(GDAL_NUM_THREADS="8", NUM_THREADS="8"):
+            srcs = [rio.open(fp) for fp in input_files]
+            mosaic, out_transform = merge(
+                srcs,
+                nodata=65535,
+                resampling=Resampling.nearest
+            )

-
-
+            meta = srcs[0].profile.copy()
+            meta["transform"] = out_transform
+            meta.update(
+                height=mosaic.shape[1],
+                width=mosaic.shape[2]
+            )
+            outname = original_dir / full_outname.name
+            outname.parent.mkdir(parents=True, exist_ok=True)
+            with rio.open(outname, "w", **meta) as dst:
+                dst.write(mosaic)
+
+            for src in srcs:
+                src.close()
+
+            # Delete a folder with pathlib
+            shutil.rmtree(dir_path)
+    else:
+        return outname
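The mosaicking core is rasterio's ``merge``; a self-contained sketch of the same flow over a hypothetical tile directory:

    import pathlib

    import rasterio as rio
    from rasterio.enums import Resampling
    from rasterio.merge import merge

    input_files = sorted(pathlib.Path("tiles").glob("*.tif"))  # hypothetical tile dir

    with rio.Env(GDAL_NUM_THREADS="8"):
        srcs = [rio.open(fp) for fp in input_files]
        # Paste the tiles into one array; 65535 marks nodata in uint16 S2 data.
        mosaic, out_transform = merge(srcs, nodata=65535, resampling=Resampling.nearest)

        # Reuse the first tile's profile, then patch geometry to the merged extent.
        meta = srcs[0].profile.copy()
        meta.update(transform=out_transform, height=mosaic.shape[1], width=mosaic.shape[2])

        with rio.open("mosaic.tif", "w", **meta) as dst:
            dst.write(mosaic)

        for src in srcs:
            src.close()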
--- cubexpress-0.1.9/cubexpress/geotyping.py
+++ cubexpress-0.1.10/cubexpress/geotyping.py
@@ -306,13 +306,17 @@ class RequestSet(BaseModel):
                         "crsCode": meta.raster_transform.crs,
                     },
                 },
+                "cs_cdf": int(meta.id.split("_")[-1]) / 100,
+                "date": meta.id.split("_")[0],
                 "outname": f"{meta.id}.tif",
             }
+
             for index, meta in enumerate(self.requestset)
         ]
     )


+
     def _validate_dataframe_schema(self) -> None:
         """
         Checks that the `_dataframe` contains the required columns and that each column
@@ -367,21 +371,7 @@ class RequestSet(BaseModel):
                     f"Column '{col_name}' has an invalid type in row {i}. "
                     f"Expected {expected_type}, got {type(value)}"
                 )
-
-        # B) Validation of the `manifest` column structure
-        # - Must contain at least 'assetId' or 'expression'
-        # - Must contain 'grid' with the minimum required sub-keys
-        # - Example:
-        #   {
-        #       "fileFormat": "GEO_TIFF",
-        #       "bandIds": [...],
-        #       "grid": {
-        #           "dimensions": {"width": ..., "height": ...},
-        #           "affineTransform": {...},
-        #           "crsCode": ...
-        #       },
-        #       // Either "assetId" or "expression" must be here
-        #   }
+
         for i, row in self._dataframe.iterrows():
             manifest = row["manifest"]

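Rather than carrying extra state, ``RequestSet`` now re-derives ``cs_cdf`` and ``date`` from the request ``id`` itself, whose last underscore-separated token is the cloud score scaled by 100. A toy round trip (invented id):

    meta_id = "2017-08-19_6mfrw_18LVN_87"  # date_geohash_tile_cdf, hypothetical

    date = meta_id.split("_")[0]                # '2017-08-19'
    cs_cdf = int(meta_id.split("_")[-1]) / 100  # 0.87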
--- cubexpress-0.1.9/cubexpress/request.py
+++ cubexpress-0.1.10/cubexpress/request.py
@@ -32,7 +32,7 @@ def table_to_requestset(

     """

-
+
     df = table.copy()

     if df.empty:
@@ -47,34 +47,57 @@ def table_to_requestset(
     centre_hash = pgh.encode(df.attrs["lat"], df.attrs["lon"], precision=5)
     reqs: list[Request] = []

+
+
     if mosaic:
-        # group all asset IDs per day
         grouped = (
-
-            .
+            df.groupby('date')
+            .agg(
+                id_list = ('id', list),
+                cs_cdf_mean = ('cs_cdf', lambda x: int(round(x.mean(), 2) * 100))
+            )
         )

-        for day,
-
-
-
-
-
-
-
-
-
+        for day, row in grouped.iterrows():
+
+            img_ids = row["id_list"]
+            cdf = row["cs_cdf_mean"]
+
+            if len(img_ids) > 1:
+
+                ee_img = ee.ImageCollection(
+                    [ee.Image(f"{df.attrs['collection']}/{img}") for img in img_ids]
+                ).mosaic()
+
+                reqs.append(
+                    Request(
+                        id=f"{day}_{centre_hash}_{cdf}",
+                        raster_transform=rt,
+                        image=ee_img,
+                        bands=df.attrs["bands"],
+                    )
                 )
-
-
+            else:
+                for img_id in img_ids:
+                    tile = img_id.split("_")[-1][1:]
+                    reqs.append(
+                        Request(
+                            id=f"{day}_{centre_hash}_{tile}_{cdf}",
+                            raster_transform=rt,
+                            image=f"{df.attrs['collection']}/{img_id}",
+                            bands=df.attrs["bands"],
+                        )
+                    )
+    else:
         for _, row in df.iterrows():
             img_id = row["id"]
-
+            tile = img_id.split("_")[-1][1:]
+            day = row["date"]
+            cdf = int(round(row["cs_cdf"], 2) * 100)

             reqs.append(
                 Request(
-                    id=f"{day}_{centre_hash}_{
+                    id=f"{day}_{centre_hash}_{tile}_{cdf}",
                     raster_transform=rt,
                     image=f"{df.attrs['collection']}/{img_id}",
                     bands=df.attrs["bands"],
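The mosaic path first collapses the table to one row per day, collecting scene IDs and averaging their cloud scores into an integer 0–100 tag. The grouping step in isolation (toy table):

    import pandas as pd

    df = pd.DataFrame({
        "date": ["2024-01-01", "2024-01-01", "2024-01-06"],
        "id": ["A", "B", "C"],
        "cs_cdf": [0.90, 0.80, 0.70],
    })

    grouped = df.groupby("date").agg(
        id_list=("id", list),
        # Mean score rounded to 2 decimals, then scaled to an int in 0..100.
        cs_cdf_mean=("cs_cdf", lambda x: int(round(x.mean(), 2) * 100)),
    )
    # 2024-01-01 -> id_list=['A', 'B'], cs_cdf_mean=85
    # 2024-01-06 -> id_list=['C'],      cs_cdf_mean=70

One caveat: ``int(...)`` truncates, so float error can shift the tag down by one (``round(0.29, 2) * 100`` evaluates to ``28.999…``, giving ``28``); ``int(round(x.mean() * 100))`` would avoid that.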
--- cubexpress-0.1.9/pyproject.toml
+++ cubexpress-0.1.10/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "cubexpress"
-version = "0.1.9"
+version = "0.1.10"
 description = "Efficient processing of cubic Earth-observation (EO) data."
 authors = [
     "Julio Contreras <contrerasnetk@gmail.com>",
@@ -32,6 +32,7 @@ rasterio = ">=1.3.9"
 earthengine-api = ">=1.5.12"
 pyarrow = ">=14.0.0"
 pyproj = ">=3.6.0"
+pydantic = ">=2.11.4"


 [tool.poetry.group.dev.dependencies]
The remaining files (LICENSE, README.md, cubexpress/__init__.py, cubexpress/cache.py, cubexpress/conversion.py, cubexpress/geospatial.py) are unchanged between the two versions.