cubexpress 0.1.9__py3-none-any.whl → 0.1.10__py3-none-any.whl

This diff compares the contents of two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
cubexpress/cloud_utils.py CHANGED
@@ -46,7 +46,7 @@ def _cloud_table_single_range(
46
46
  * ``id`` – Sentinel-2 ID
47
47
  * ``cs_cdf`` – Cloud Score Plus CDF (0–1)
48
48
  * ``date`` – acquisition date (YYYY-MM-DD)
49
- * ``high_null_flag`` – 1 if cloud score missing
49
+ * ``null_flag`` – 1 if cloud score missing
50
50
 
51
51
  Notes
52
52
  -----
@@ -83,7 +83,7 @@ def _cloud_table_single_range(
83
83
  except ee.ee_exception.EEException as e:
84
84
  if "No bands in collection" in str(e):
85
85
  return pd.DataFrame(
86
- columns=["id", "cs_cdf", "date", "high_null_flag"]
86
+ columns=["id", "cs_cdf", "date", "null_flag"]
87
87
  )
88
88
  raise
89
89
 
@@ -95,7 +95,7 @@ def _cloud_table_single_range(
95
95
  .merge(df_raw, on="id", how="left")
96
96
  .assign(
97
97
  date=lambda d: pd.to_datetime(d["id"].str[:8], format="%Y%m%d").dt.strftime("%Y-%m-%d"),
98
- high_null_flag=lambda d: d["cs_cdf"].isna().astype(int),
98
+ null_flag=lambda d: d["cs_cdf"].isna().astype(int),
99
99
  )
100
100
  .drop(columns=["longitude", "latitude", "time"])
101
101
  )
@@ -161,7 +161,7 @@ def s2_cloud_table(
161
161
  # ─── 1. Load cached data if present ────────────────────────────────────
162
162
  if cache and cache_file.exists():
163
163
  if verbose:
164
- print("📂 Loading cached table …")
164
+ print("📂 Loading cached metadata …")
165
165
  df_cached = pd.read_parquet(cache_file)
166
166
  have_idx = pd.to_datetime(df_cached["date"], errors="coerce").dropna()
167
167
 
@@ -173,7 +173,7 @@ def s2_cloud_table(
173
173
  and dt.date.fromisoformat(end) <= cached_end
174
174
  ):
175
175
  if verbose:
176
- print("✅ Served entirely from cache.")
176
+ print("✅ Served entirely from metadata.")
177
177
  df_full = df_cached
178
178
  else:
179
179
  # Identify missing segments and fetch only those.
@@ -192,15 +192,21 @@ def s2_cloud_table(
192
192
  lon, lat, edge_size, a2, b2
193
193
  )
194
194
  )
195
- df_new = pd.concat(df_new_parts, ignore_index=True)
196
- df_full = (
197
- pd.concat([df_cached, df_new], ignore_index=True)
198
- .sort_values("date", kind="mergesort")
199
- )
195
+ df_new_parts = [df for df in df_new_parts if not df.empty]
196
+
197
+ if df_new_parts:
198
+
199
+ df_new = pd.concat(df_new_parts, ignore_index=True)
200
+ df_full = (
201
+ pd.concat([df_cached, df_new], ignore_index=True)
202
+ .sort_values("date", kind="mergesort")
203
+ )
204
+ else:
205
+ df_full = df_cached
200
206
  else:
201
207
 
202
208
  if verbose:
203
- msg = "Generating table (no cache found)…" if cache else "Generating table…"
209
+ msg = "Generating metadata (no cache found)…" if cache else "Generating metadata…"
204
210
  print("⏳", msg)
205
211
  df_full = _cloud_table_single_range(
206
212
  lon, lat, edge_size, start, end
@@ -230,4 +236,5 @@ def s2_cloud_table(
230
236
  "collection": collection
231
237
  }
232
238
  )
233
- return result
239
+ return result
240
+
cubexpress/cube.py CHANGED
@@ -29,7 +29,6 @@ def get_geotiff(
29
29
  manifest: Dict[str, Any],
30
30
  full_outname: pathlib.Path | str,
31
31
  join: bool = True,
32
- eraser: bool = True,
33
32
  nworks: int = 4,
34
33
  verbose: bool = True,
35
34
  ) -> None:
@@ -52,7 +51,7 @@ def get_geotiff(
52
51
  size = manifest["grid"]["dimensions"]["width"] # square images assumed
53
52
  cell_w, cell_h, power = calculate_cell_size(str(err), size)
54
53
  tiled = quadsplit_manifest(manifest, cell_w, cell_h, power)
55
- download_manifests(tiled, full_outname, join, eraser, nworks)
54
+ download_manifests(tiled, full_outname, join, nworks)
56
55
 
57
56
  if verbose:
58
57
  print(f"Downloaded {full_outname}")
@@ -61,10 +60,11 @@ def get_geotiff(
61
60
  def get_cube(
62
61
  table: pd.DataFrame,
63
62
  outfolder: pathlib.Path | str,
64
- join: bool = True,
65
- eraser: bool = True,
66
63
  mosaic: bool = True,
64
+ join: bool = True,
67
65
  nworks: int = 4,
66
+ verbose: bool = True,
67
+ cache: bool = True
68
68
  ) -> None:
69
69
  """Download every request in *requests* to *outfolder* using a thread pool.
70
70
 
@@ -85,16 +85,35 @@ def get_cube(
85
85
  table=table,
86
86
  mosaic=mosaic
87
87
  )
88
+
89
+ outfolder = pathlib.Path(outfolder).expanduser().resolve()
88
90
 
89
91
  with concurrent.futures.ThreadPoolExecutor(max_workers=nworks) as pool:
90
92
  futures = []
91
93
  for _, row in requests._dataframe.iterrows():
92
94
  outname = pathlib.Path(outfolder) / f"{row.id}.tif"
95
+ if outname.exists() and cache:
96
+ continue
93
97
  outname.parent.mkdir(parents=True, exist_ok=True)
94
- futures.append(pool.submit(get_geotiff, row.manifest, outname, join, eraser, nworks))
98
+ futures.append(
99
+ pool.submit(
100
+ get_geotiff,
101
+ row.manifest,
102
+ outname,
103
+ join,
104
+ nworks,
105
+ verbose
106
+ )
107
+ )
95
108
 
96
109
  for fut in concurrent.futures.as_completed(futures):
97
110
  try:
98
111
  fut.result()
99
112
  except Exception as exc: # noqa: BLE001 – log and keep going
100
113
  print(f"Download error: {exc}")
114
+
115
+ download_df = requests._dataframe[["outname", "cs_cdf", "date"]].copy()
116
+ download_df["outname"] = outfolder / requests._dataframe["outname"]
117
+ download_df.rename(columns={"outname": "full_outname"}, inplace=True)
118
+
119
+ return download_df
cubexpress/downloader.py CHANGED
@@ -25,6 +25,7 @@ from rasterio.merge import merge
25
25
  from rasterio.enums import Resampling
26
26
  import os
27
27
  import shutil
28
+ import tempfile
28
29
 
29
30
  os.environ['CPL_LOG_ERRORS'] = 'OFF'
30
31
  logging.getLogger('rasterio._env').setLevel(logging.ERROR)
@@ -53,7 +54,7 @@ def download_manifest(ulist: Dict[str, Any], full_outname: pathlib.Path) -> None
53
54
  driver="GTiff",
54
55
  tiled=True,
55
56
  interleave="band",
56
- blockxsize=256,
57
+ blockxsize=256, # TODO: Creo que es 128 (por de la superresolucion)
57
58
  blockysize=256,
58
59
  compress="ZSTD",
59
60
  # zstd_level=13,
@@ -69,10 +70,9 @@ def download_manifest(ulist: Dict[str, Any], full_outname: pathlib.Path) -> None
69
70
  dst.write(src.read())
70
71
 
71
72
  def download_manifests(
72
- manifests: List[Dict[str, Any]],
73
+ manifests: list[Dict[str, Any]],
73
74
  full_outname: pathlib.Path,
74
75
  join: bool = True,
75
- eraser: bool = True,
76
76
  max_workers: int = 4,
77
77
  ) -> None:
78
78
  """Download every manifest in *manifests* concurrently.
@@ -81,6 +81,12 @@ def download_manifests(
81
81
  ``full_outname.parent/full_outname.stem`` with names ``000000.tif``,
82
82
  ``000001.tif`` … according to the list order.
83
83
  """
84
+ # full_outname = pathlib.Path("/home/contreras/Documents/GitHub/cubexpress/cubexpress_test/2017-08-19_6mfrw_18LVN.tif")
85
+ original_dir = full_outname.parent
86
+ if join:
87
+ tmp_dir = pathlib.Path(tempfile.mkdtemp(prefix="s2tmp_"))
88
+ full_outname = tmp_dir / full_outname.name
89
+
84
90
  with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
85
91
  futures = []
86
92
 
@@ -95,41 +101,35 @@ def download_manifests(
95
101
  fut.result()
96
102
  except Exception as exc: # noqa: BLE001
97
103
  print(f"Error en una de las descargas: {exc}") # noqa: T201
98
-
99
- if join:
100
-
101
- dir_path = full_outname.parent / full_outname.stem
102
- input_files = sorted(dir_path.glob("*.tif"))
103
-
104
- if dir_path.exists() and len(input_files) > 1:
105
-
106
- with rio.Env(GDAL_NUM_THREADS="8", NUM_THREADS="8"):
107
- srcs = [rio.open(fp) for fp in input_files]
108
- mosaic, out_transform = merge(
109
- srcs,
110
- nodata=65535,
111
- resampling=Resampling.nearest
112
- )
113
104
 
114
- meta = srcs[0].profile.copy()
115
- meta["transform"] = out_transform
116
- meta.update(
117
- height=mosaic.shape[1],
118
- width=mosaic.shape[2]
119
- )
105
+ dir_path = full_outname.parent / full_outname.stem
106
+ input_files = sorted(dir_path.glob("*.tif"))
120
107
 
121
- with rio.open(full_outname, "w", **meta) as dst:
122
- dst.write(mosaic)
108
+ if dir_path.exists() and len(input_files) > 1:
123
109
 
124
- for src in srcs:
125
- src.close()
126
-
127
- if eraser:
128
- # Delete a folder with pathlib
129
- shutil.rmtree(dir_path)
130
-
131
- print("✅ Mosaico generado:", full_outname)
132
- return full_outname
110
+ with rio.Env(GDAL_NUM_THREADS="8", NUM_THREADS="8"):
111
+ srcs = [rio.open(fp) for fp in input_files]
112
+ mosaic, out_transform = merge(
113
+ srcs,
114
+ nodata=65535,
115
+ resampling=Resampling.nearest
116
+ )
133
117
 
134
- else:
135
- return full_outname
118
+ meta = srcs[0].profile.copy()
119
+ meta["transform"] = out_transform
120
+ meta.update(
121
+ height=mosaic.shape[1],
122
+ width=mosaic.shape[2]
123
+ )
124
+ outname = original_dir / full_outname.name
125
+ outname.parent.mkdir(parents=True, exist_ok=True)
126
+ with rio.open(outname, "w", **meta) as dst:
127
+ dst.write(mosaic)
128
+
129
+ for src in srcs:
130
+ src.close()
131
+
132
+ # Delete a folder with pathlib
133
+ shutil.rmtree(dir_path)
134
+ else:
135
+ return outname
cubexpress/geotyping.py CHANGED
@@ -306,13 +306,17 @@ class RequestSet(BaseModel):
306
306
  "crsCode": meta.raster_transform.crs,
307
307
  },
308
308
  },
309
+ "cs_cdf": int(meta.id.split("_")[-1]) / 100,
310
+ "date": meta.id.split("_")[0],
309
311
  "outname": f"{meta.id}.tif",
310
312
  }
313
+
311
314
  for index, meta in enumerate(self.requestset)
312
315
  ]
313
316
  )
314
317
 
315
318
 
319
+
316
320
  def _validate_dataframe_schema(self) -> None:
317
321
  """
318
322
  Checks that the `_dataframe` contains the required columns and that each column
@@ -367,21 +371,7 @@ class RequestSet(BaseModel):
367
371
  f"Column '{col_name}' has an invalid type in row {i}. "
368
372
  f"Expected {expected_type}, got {type(value)}"
369
373
  )
370
-
371
- # B) Validation of the `manifest` column structure
372
- # - Must contain at least 'assetId' or 'expression'
373
- # - Must contain 'grid' with the minimum required sub-keys
374
- # - Example:
375
- # {
376
- # "fileFormat": "GEO_TIFF",
377
- # "bandIds": [...],
378
- # "grid": {
379
- # "dimensions": {"width": ..., "height": ...},
380
- # "affineTransform": {...},
381
- # "crsCode": ...
382
- # },
383
- # // Either "assetId" or "expression" must be here
384
- # }
374
+
385
375
  for i, row in self._dataframe.iterrows():
386
376
  manifest = row["manifest"]
387
377
 
cubexpress/request.py CHANGED
@@ -32,7 +32,7 @@ def table_to_requestset(
32
32
 
33
33
  """
34
34
 
35
-
35
+
36
36
  df = table.copy()
37
37
 
38
38
  if df.empty:
@@ -47,34 +47,57 @@ def table_to_requestset(
47
47
  centre_hash = pgh.encode(df.attrs["lat"], df.attrs["lon"], precision=5)
48
48
  reqs: list[Request] = []
49
49
 
50
+
51
+
50
52
  if mosaic:
51
- # group all asset IDs per day
52
53
  grouped = (
53
- df.groupby("date")["id"] # Series con listas de ids por día
54
- .apply(list)
54
+ df.groupby('date')
55
+ .agg(
56
+ id_list = ('id', list),
57
+ cs_cdf_mean = ('cs_cdf', lambda x: int(round(x.mean(), 2) * 100))
58
+ )
55
59
  )
56
60
 
57
- for day, img_ids in grouped.items():
58
- ee_img = ee.ImageCollection(
59
- [ee.Image(f"{df.attrs['collection']}/{img}") for img in img_ids]
60
- ).mosaic()
61
-
62
- reqs.append(
63
- Request(
64
- id=f"{day}_{centre_hash}",
65
- raster_transform=rt,
66
- image=ee_img,
67
- bands=df.attrs["bands"],
61
+ for day, row in grouped.iterrows():
62
+
63
+ img_ids = row["id_list"]
64
+ cdf = row["cs_cdf_mean"]
65
+
66
+ if len(img_ids) > 1:
67
+
68
+ ee_img = ee.ImageCollection(
69
+ [ee.Image(f"{df.attrs['collection']}/{img}") for img in img_ids]
70
+ ).mosaic()
71
+
72
+ reqs.append(
73
+ Request(
74
+ id=f"{day}_{centre_hash}_{cdf}",
75
+ raster_transform=rt,
76
+ image=ee_img,
77
+ bands=df.attrs["bands"],
78
+ )
68
79
  )
69
- )
70
- else: # one request per asset
80
+ else:
81
+ for img_id in img_ids:
82
+ tile = img_id.split("_")[-1][1:]
83
+ reqs.append(
84
+ Request(
85
+ id=f"{day}_{centre_hash}_{tile}_{cdf}",
86
+ raster_transform=rt,
87
+ image=f"{df.attrs['collection']}/{img_id}",
88
+ bands=df.attrs["bands"],
89
+ )
90
+ )
91
+ else:
71
92
  for _, row in df.iterrows():
72
93
  img_id = row["id"]
73
- day = row["date"]
94
+ tile = img_id.split("_")[-1][1:]
95
+ day = row["date"]
96
+ cdf = int(round(row["cs_cdf"], 2) * 100)
74
97
 
75
98
  reqs.append(
76
99
  Request(
77
- id=f"{day}_{centre_hash}_{img_id}",
100
+ id=f"{day}_{centre_hash}_{tile}_{cdf}",
78
101
  raster_transform=rt,
79
102
  image=f"{df.attrs['collection']}/{img_id}",
80
103
  bands=df.attrs["bands"],
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: cubexpress
3
- Version: 0.1.9
3
+ Version: 0.1.10
4
4
  Summary: Efficient processing of cubic Earth-observation (EO) data.
5
5
  Home-page: https://github.com/andesdatacube/cubexpress
6
6
  License: MIT
@@ -20,6 +20,7 @@ Requires-Dist: earthengine-api (>=1.5.12)
20
20
  Requires-Dist: numpy (>=2.0.2)
21
21
  Requires-Dist: pandas (>=2.2.2)
22
22
  Requires-Dist: pyarrow (>=14.0.0)
23
+ Requires-Dist: pydantic (>=2.11.4)
23
24
  Requires-Dist: pygeohash (>=1.2.0)
24
25
  Requires-Dist: pyproj (>=3.6.0)
25
26
  Requires-Dist: rasterio (>=1.3.9)
@@ -0,0 +1,13 @@
1
+ cubexpress/__init__.py,sha256=RjyAqwiD0rU_Z5tCJTYNGKXZ1ggpfPB51wzhr0KwweY,570
2
+ cubexpress/cache.py,sha256=EZiR2AJfplaLpqMIVFb5piCAgFqHKF1vgLIrutfz8tA,1425
3
+ cubexpress/cloud_utils.py,sha256=Vr2A1SZDKP_2xNiLYgwmWOUX8P8I-pXQrxBETiUDq60,7441
4
+ cubexpress/conversion.py,sha256=JSaMnswY-2n5E4H2zxb-oEOTJ8UPzXfMeSVCremtvTw,2520
5
+ cubexpress/cube.py,sha256=tU0lqhtQUwEiz33yebYIbw-a0R4zmTAei-b_xqMIcWU,3719
6
+ cubexpress/downloader.py,sha256=gHVNCNTwK9qA5MPaEHB_m0wOPprw010qaTVnszwbuUk,4668
7
+ cubexpress/geospatial.py,sha256=ZbsPIgsYQFnNFXUuQ136rJsL4b2Bf91o0Vsswby2dFc,1812
8
+ cubexpress/geotyping.py,sha256=XoSXQuoq5CfzKndM2Pko5KXIP0vxGNm02LOOMbCWkrs,16692
9
+ cubexpress/request.py,sha256=jy5K9MQEurNlwhF0izFmoIh3o7m9bC97fsTT_7C7Gv0,3051
10
+ cubexpress-0.1.10.dist-info/LICENSE,sha256=XjoS-d76b7Cl-VgCWhQk83tNf2dNldKBN8SrImwGc2Q,1072
11
+ cubexpress-0.1.10.dist-info/METADATA,sha256=Tn_XBaLWbO4xbmzYJCM6vnbwRNZ1d1ABZ6uF4G4REYM,9664
12
+ cubexpress-0.1.10.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
13
+ cubexpress-0.1.10.dist-info/RECORD,,
@@ -1,13 +0,0 @@
1
- cubexpress/__init__.py,sha256=RjyAqwiD0rU_Z5tCJTYNGKXZ1ggpfPB51wzhr0KwweY,570
2
- cubexpress/cache.py,sha256=EZiR2AJfplaLpqMIVFb5piCAgFqHKF1vgLIrutfz8tA,1425
3
- cubexpress/cloud_utils.py,sha256=aamTm-PxbPQ4ARwd5faG1a1sjKegbtkd0LxT7wYZJ60,7238
4
- cubexpress/conversion.py,sha256=JSaMnswY-2n5E4H2zxb-oEOTJ8UPzXfMeSVCremtvTw,2520
5
- cubexpress/cube.py,sha256=fwD_UdH0oBWSK-2-fMPPm3YKxcw1xxnm2g0vrZuChI8,3172
6
- cubexpress/downloader.py,sha256=NoJXxCZ7SXBMzUDcXU6DGa2vce61g716FYYfq17pH0k,4461
7
- cubexpress/geospatial.py,sha256=ZbsPIgsYQFnNFXUuQ136rJsL4b2Bf91o0Vsswby2dFc,1812
8
- cubexpress/geotyping.py,sha256=XuBcJAgNxvXCCIDmWijI70p6dEFlu6UfbqwQlWXSWQw,17155
9
- cubexpress/request.py,sha256=ZWVIXo0_rVkX1fBWREbtvvdYUSZPCv4LIcPdrMKKuLs,2270
10
- cubexpress-0.1.9.dist-info/LICENSE,sha256=XjoS-d76b7Cl-VgCWhQk83tNf2dNldKBN8SrImwGc2Q,1072
11
- cubexpress-0.1.9.dist-info/METADATA,sha256=qplHASBXni3m6kOAFIw8Jy2fBFqY1QfLDaNM3ou6cMk,9628
12
- cubexpress-0.1.9.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
13
- cubexpress-0.1.9.dist-info/RECORD,,