giga-spatial 0.6.4__py3-none-any.whl → 0.6.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,7 +1,7 @@
1
1
  import numpy as np
2
2
  import pandas as pd
3
3
  import geopandas as gpd
4
- from typing import List, Optional, Tuple, Union, Literal
4
+ from typing import List, Optional, Tuple, Union, Literal, Callable
5
5
  from pydantic import ConfigDict
6
6
  from pydantic.dataclasses import dataclass
7
7
  from contextlib import contextmanager
@@ -9,6 +9,9 @@ from shapely.geometry import box, Polygon, MultiPolygon
9
9
  from pathlib import Path
10
10
  import rasterio
11
11
  from rasterio.mask import mask
12
+ from functools import partial
13
+ import multiprocessing
14
+ from tqdm import tqdm
12
15
 
13
16
  from gigaspatial.core.io.data_store import DataStore
14
17
  from gigaspatial.core.io.local_data_store import LocalDataStore
@@ -113,6 +116,9 @@ class TifProcessor:
113
116
  @property
114
117
  def tabular(self) -> pd.DataFrame:
115
118
  """Get the data from the TIF file"""
119
+ self.logger.warning(
120
+ "The `tabular` property is deprecated, use `to_dataframe` instead"
121
+ )
116
122
  if not hasattr(self, "_tabular"):
117
123
  try:
118
124
  if self.mode == "single":
@@ -142,14 +148,56 @@ class TifProcessor:
142
148
 
143
149
  return self._tabular
144
150
 
145
- def to_dataframe(self) -> pd.DataFrame:
146
- return self.tabular
151
+ def to_dataframe(self, drop_nodata=True, **kwargs) -> pd.DataFrame:
152
+ try:
153
+ if self.mode == "single":
154
+ df = self._to_band_dataframe(drop_nodata=drop_nodata, **kwargs)
155
+ elif self.mode == "rgb":
156
+ df = self._to_rgb_dataframe(drop_nodata=drop_nodata)
157
+ elif self.mode == "rgba":
158
+ df = self._to_rgba_dataframe(drop_transparent=drop_nodata)
159
+ elif self.mode == "multi":
160
+ df = self._to_multi_band_dataframe(drop_nodata=drop_nodata, **kwargs)
161
+ else:
162
+ raise ValueError(
163
+ f"Invalid mode: {self.mode}. Must be one of: single, rgb, rgba, multi"
164
+ )
165
+ except Exception as e:
166
+ raise ValueError(
167
+ f"Failed to process TIF file in mode '{self.mode}'. "
168
+ f"Please ensure the file is valid and matches the selected mode. "
169
+ f"Original error: {str(e)}"
170
+ )
171
+
172
+ return df
173
+
174
+ def to_geodataframe(self, **kwargs) -> gpd.GeoDataFrame:
175
+ """
176
+ Convert the processed TIF data into a GeoDataFrame, where each row represents a pixel zone.
177
+ Each zone is defined by its bounding box, based on pixel resolution and coordinates.
178
+ """
179
+ df = self.to_dataframe(**kwargs)
180
+
181
+ x_res, y_res = self.resolution
182
+
183
+ # create bounding box for each pixel
184
+ geometries = [
185
+ box(lon - x_res / 2, lat - y_res / 2, lon + x_res / 2, lat + y_res / 2)
186
+ for lon, lat in zip(df["lon"], df["lat"])
187
+ ]
188
+
189
+ gdf = gpd.GeoDataFrame(df, geometry=geometries, crs=self.crs)
190
+
191
+ return gdf
147
192
 
148
193
  def get_zoned_geodataframe(self) -> gpd.GeoDataFrame:
149
194
  """
150
195
  Convert the processed TIF data into a GeoDataFrame, where each row represents a pixel zone.
151
196
  Each zone is defined by its bounding box, based on pixel resolution and coordinates.
152
197
  """
198
+ self.logger.warning(
199
+ "The `get_zoned_geodataframe` method is deprecated, use `to_geodataframe` instead"
200
+ )
153
201
  self.logger.info("Converting data to GeoDataFrame with zones...")
154
202
 
155
203
  df = self.tabular
@@ -168,7 +216,7 @@ class TifProcessor:
168
216
  return gdf
169
217
 
170
218
  def sample_by_coordinates(
171
- self, coordinate_list: List[Tuple[float, float]]
219
+ self, coordinate_list: List[Tuple[float, float]], **kwargs
172
220
  ) -> Union[np.ndarray, dict]:
173
221
  self.logger.info("Sampling raster values at the coordinates...")
174
222
 
@@ -200,63 +248,188 @@ class TifProcessor:
200
248
  ]
201
249
 
202
250
  return rgb_values
251
+ elif self.count > 1:
252
+ return np.array(
253
+ [vals for vals in src.sample(coordinate_list, **kwargs)]
254
+ )
203
255
  else:
204
- if src.count != 1:
205
- raise ValueError("Single band mode requires a 1-band TIF file")
206
256
  return np.array([vals[0] for vals in src.sample(coordinate_list)])
207
257
 
208
258
  def sample_by_polygons(
209
- self, polygon_list: List[Union[Polygon, MultiPolygon]], stat: str = "mean"
210
- ) -> np.ndarray:
259
+ self,
260
+ polygon_list,
261
+ stat: Union[str, Callable, List[Union[str, Callable]]] = "mean",
262
+ ):
211
263
  """
212
- Sample raster values within each polygon of a GeoDataFrame.
264
+ Sample raster values by polygons and compute statistic(s) for each polygon.
265
+
266
+ Args:
267
+ polygon_list: List of shapely Polygon or MultiPolygon objects.
268
+ stat: Statistic(s) to compute. Can be:
269
+ - Single string: 'mean', 'median', 'sum', 'min', 'max', 'std', 'count'
270
+ - Single callable: custom function that takes array and returns scalar
271
+ - List of strings/callables: multiple statistics to compute
213
272
 
214
- Parameters:
215
- polygon_list: List of polygon geometries (can include MultiPolygons).
216
- stat (str): Aggregation statistic to compute within each polygon.
217
- Options: "mean", "median", "sum", "min", "max".
218
273
  Returns:
219
- A NumPy array of sampled values
274
+ If single stat: np.ndarray of computed statistics for each polygon
275
+ If multiple stats: List of dictionaries with stat names as keys
220
276
  """
221
- self.logger.info("Sampling raster values within polygons...")
222
-
223
- with self.open_dataset() as src:
224
- results = []
277
+ # Determine if single or multiple stats
278
+ single_stat = not isinstance(stat, list)
279
+ stats_list = [stat] if single_stat else stat
280
+
281
+ # Prepare stat functions
282
+ stat_funcs = []
283
+ stat_names = []
284
+
285
+ for s in stats_list:
286
+ if callable(s):
287
+ stat_funcs.append(s)
288
+ stat_names.append(
289
+ s.__name__
290
+ if hasattr(s, "__name__")
291
+ else f"custom_{len(stat_names)}"
292
+ )
293
+ else:
294
+ # Handle string statistics
295
+ if s == "count":
296
+ stat_funcs.append(len)
297
+ else:
298
+ stat_funcs.append(getattr(np, s))
299
+ stat_names.append(s)
225
300
 
226
- for geom in polygon_list:
227
- if geom.is_empty:
228
- results.append(np.nan)
229
- continue
301
+ results = []
230
302
 
303
+ with self.open_dataset() as src:
304
+ for polygon in tqdm(polygon_list):
231
305
  try:
232
- # Mask the raster with the polygon
233
- out_image, _ = mask(src, [geom], crop=True)
234
-
235
- # Flatten the raster values and remove NoData values
236
- values = out_image[out_image != src.nodata].flatten()
306
+ out_image, _ = mask(src, [polygon], crop=True, filled=False)
237
307
 
238
- # Compute the desired statistic
239
- if len(values) == 0:
240
- results.append(np.nan)
308
+ # Use masked arrays for more efficient nodata handling
309
+ if hasattr(out_image, "mask"):
310
+ valid_data = out_image.compressed()
311
+ else:
312
+ valid_data = (
313
+ out_image[out_image != self.nodata]
314
+ if self.nodata
315
+ else out_image.flatten()
316
+ )
317
+
318
+ if len(valid_data) == 0:
319
+ if single_stat:
320
+ results.append(np.nan)
321
+ else:
322
+ results.append({name: np.nan for name in stat_names})
241
323
  else:
242
- if stat == "mean":
243
- results.append(np.mean(values))
244
- elif stat == "median":
245
- results.append(np.median(values))
246
- elif stat == "sum":
247
- results.append(np.sum(values))
248
- elif stat == "min":
249
- results.append(np.min(values))
250
- elif stat == "max":
251
- results.append(np.max(values))
324
+ if single_stat:
325
+ results.append(stat_funcs[0](valid_data))
252
326
  else:
253
- raise ValueError(f"Unknown statistic: {stat}")
327
+ # Compute all statistics for this polygon
328
+ polygon_stats = {}
329
+ for func, name in zip(stat_funcs, stat_names):
330
+ try:
331
+ polygon_stats[name] = func(valid_data)
332
+ except Exception:
333
+ polygon_stats[name] = np.nan
334
+ results.append(polygon_stats)
335
+
336
+ except Exception:
337
+ if single_stat:
338
+ results.append(np.nan)
339
+ else:
340
+ results.append({name: np.nan for name in stat_names})
341
+
342
+ return np.array(results) if single_stat else results
343
+
344
+ def sample_by_polygons_batched(
345
+ self,
346
+ polygon_list: List[Union[Polygon, MultiPolygon]],
347
+ stat: Union[str, Callable] = "mean",
348
+ batch_size: int = 100,
349
+ n_workers: int = 4,
350
+ **kwargs,
351
+ ) -> np.ndarray:
352
+ """
353
+ Sample raster values by polygons in parallel using batching.
354
+ """
355
+
356
+ def _chunk_list(data_list, chunk_size):
357
+ """Yield successive chunks from data_list."""
358
+ for i in range(0, len(data_list), chunk_size):
359
+ yield data_list[i : i + chunk_size]
360
+
361
+ if len(polygon_list) == 0:
362
+ return np.array([])
363
+
364
+ stat_func = stat if callable(stat) else getattr(np, stat)
365
+
366
+ polygon_chunks = list(_chunk_list(polygon_list, batch_size))
367
+
368
+ with multiprocessing.Pool(
369
+ initializer=self._initializer_worker, processes=n_workers
370
+ ) as pool:
371
+ process_func = partial(self._process_polygon_batch, stat_func=stat_func)
372
+ batched_results = list(
373
+ tqdm(
374
+ pool.imap(process_func, polygon_chunks),
375
+ total=len(polygon_chunks),
376
+ desc=f"Sampling polygons",
377
+ )
378
+ )
254
379
 
255
- except Exception as e:
256
- self.logger.error(f"Error processing polygon: {e}")
257
- results.append(np.nan)
380
+ results = [item for sublist in batched_results for item in sublist]
258
381
 
259
382
  return np.array(results)
383
+
384
+ def _initializer_worker(self):
385
+ """
386
+ Initializer function for each worker process.
387
+ Opens the raster dataset and stores it in a process-local variable.
388
+ This function runs once per worker, not for every task.
389
+ """
390
+ global src_handle
391
+ with self.data_store.open(self.dataset_path, "rb") as f:
392
+ with rasterio.MemoryFile(f.read()) as memfile:
393
+ src_handle = memfile.open()
394
+
395
+ def _process_single_polygon(self, polygon, stat_func):
396
+ """
397
+ Helper function to process a single polygon.
398
+ This will be run in a separate process.
399
+ """
400
+ global src_handle
401
+ if src_handle is None:
402
+ # This should not happen if the initializer is set up correctly,
403
+ # but it's a good defensive check.
404
+ raise RuntimeError("Raster dataset not initialized in this process.")
405
+
406
+ try:
407
+ out_image, _ = mask(src_handle, [polygon], crop=True, filled=False)
408
+
409
+ if hasattr(out_image, "mask"):
410
+ valid_data = out_image.compressed()
411
+ else:
412
+ valid_data = (
413
+ out_image[out_image != self.nodata]
414
+ if self.nodata
415
+ else out_image.flatten()
416
+ )
417
+
418
+ if len(valid_data) == 0:
419
+ return np.nan
420
+ else:
421
+ return stat_func(valid_data)
422
+ except Exception:
423
+ return np.nan
424
+
425
+ def _process_polygon_batch(self, polygon_batch, stat_func):
426
+ """
427
+ Processes a batch of polygons.
428
+ """
429
+ return [
430
+ self._process_single_polygon(polygon, stat_func)
431
+ for polygon in polygon_batch
432
+ ]
260
433
 
261
434
  def _to_rgba_dataframe(self, drop_transparent: bool = False) -> pd.DataFrame:
262
435
  """
@@ -554,7 +727,9 @@ def sample_multiple_tifs_by_polygons(
554
727
  sampled_values = np.full(len(polygon_list), np.nan, dtype=np.float32)
555
728
 
556
729
  for tp in tif_processors:
557
- values = tp.sample_by_polygons(polygon_list=polygon_list, stat=stat)
730
+ values = tp.sample_by_polygons(
731
+ polygon_list=polygon_list, stat=stat
732
+ )
558
733
 
559
734
  mask = np.isnan(sampled_values) # replace all NaNs
560
735
 
@@ -1,47 +0,0 @@
1
- giga_spatial-0.6.4.dist-info/licenses/LICENSE,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
2
- gigaspatial/__init__.py,sha256=WMmvm2Keb76yMz8OL_h4fKT34Xpi-1BVfCiTn2QGzz4,22
3
- gigaspatial/config.py,sha256=PR6n6NDDD4560zWEbaFiYSitr9PAKik915cxCCMZNQc,8392
4
- gigaspatial/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
- gigaspatial/core/io/__init__.py,sha256=y4QNWx6US1-adTuAO_NZwLmjzSQj25HNDL5hUGvEHZc,263
6
- gigaspatial/core/io/adls_data_store.py,sha256=Zv-D_8d_2h57HnCUTJb0JWWjXqR_0XH4F8Nu_UFZK9E,11975
7
- gigaspatial/core/io/data_api.py,sha256=3HMstau3zH3JPRUW0t83DZt74N39bt-jsfAyrUUFMoc,3944
8
- gigaspatial/core/io/data_store.py,sha256=mi8fy78Dtwj4dpKkyDM6kTlna1lfCQ5ro2hUAOFr83A,3223
9
- gigaspatial/core/io/local_data_store.py,sha256=hcu7DNYa3AL6sEPMqguzxWal_bnP7CIpbwpoiyf5TCw,2933
10
- gigaspatial/core/io/readers.py,sha256=gqFKGRCsAP_EBXipqGtT8MEV-x0u6SrCqaSiOC5YPTA,9284
11
- gigaspatial/core/io/writers.py,sha256=asb56ZHQEWO2rdilIq7QywDRk8yfebecWv1KwzUpaXI,4367
12
- gigaspatial/core/schemas/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
13
- gigaspatial/core/schemas/entity.py,sha256=QAhEW0-JgdWh9pjKGbB5ArvqtVK85ayYZJPgjdb-IKw,8590
14
- gigaspatial/generators/__init__.py,sha256=dGxmWHwkvjYopdlE9d4tEaQtn4uh391zZ7NhOG5OVTk,153
15
- gigaspatial/generators/poi.py,sha256=meP_bnGnKout8ik1G2XSCyKM0RhegCZg-5woVXLqIog,26509
16
- gigaspatial/generators/zonal/__init__.py,sha256=6bDVRRxim5qpdTE_b2dLeWmrTrDSZBnM7gn5MztCbGk,233
17
- gigaspatial/generators/zonal/base.py,sha256=up3k5ApPn9387DRu34VIJqA1_6lWdi5UpGVjcQQIDsE,15346
18
- gigaspatial/generators/zonal/geometry.py,sha256=XPcX5lT7X7Z1vn72sN-VKLb2hDP9F_w3LwdRNecmJn0,18339
19
- gigaspatial/generators/zonal/mercator.py,sha256=R_KlaqF4lnc0cRqVfcNVO8i0Re21_6w7pnclVKSohcY,3125
20
- gigaspatial/grid/__init__.py,sha256=H8SnNAMDafJXJ9bUp2zU0Z3t6s8niqY5rGP5nFhnbLA,45
21
- gigaspatial/grid/mercator_tiles.py,sha256=Z_3M4sy1tyxywAo2wmBb6niBP3x-IWgwMkmUp8LOSDg,10492
22
- gigaspatial/handlers/__init__.py,sha256=R2rugXR5kF4lLkSO1fjpVDYK_jWdD8U2NbXbW71Ezv8,1523
23
- gigaspatial/handlers/base.py,sha256=rL94c3wDjsqzLp4na8FfYXW6tNjVGX6v4M-Ce4LrAro,26413
24
- gigaspatial/handlers/boundaries.py,sha256=UM0lFcTzy64ADdMnPOkzLGJ-OG5P7KyoZtA91GTWxYs,17242
25
- gigaspatial/handlers/ghsl.py,sha256=GHao8lkmj1C0-QFqNwH9jr0Lqzu6NTj_7ooQdj1h6ok,27760
26
- gigaspatial/handlers/giga.py,sha256=F5ZfcE37a24X-c6Xhyt72C9eZZbyN_gV7w_InxKFMQQ,28348
27
- gigaspatial/handlers/google_open_buildings.py,sha256=Liqk7qJhDtB4Ia4uhBe44LFcf-XVKBjRfj-pWlE5erY,16594
28
- gigaspatial/handlers/hdx.py,sha256=LTEs_xZF1yPhD8dAdZ_YN8Vcan7iB5_tZ8NjF_ip6u0,18001
29
- gigaspatial/handlers/mapbox_image.py,sha256=M_nkJ_b1PD8FG1ajVgSycCb0NRTAI_SLpHdzszNetKA,7786
30
- gigaspatial/handlers/maxar_image.py,sha256=kcc8uGljQB0Yh0MKBA7lT7KwBbNZwFzuyBklR3db1P4,10204
31
- gigaspatial/handlers/microsoft_global_buildings.py,sha256=bQ5WHIv3v0wWrZZUbZkKPRjgdlqIxlK7CV_0zSvdrTw,20292
32
- gigaspatial/handlers/ookla_speedtest.py,sha256=EcvSAxJZ9GPfzYnT_C85Qgy2ecc9ndf70Pklk53OdC8,6506
33
- gigaspatial/handlers/opencellid.py,sha256=KuJqd-5-RO5ZzyDaBSrTgCK2ib5N_m3RUcPlX5heWwI,10683
34
- gigaspatial/handlers/osm.py,sha256=sLNMkOVh1v50jrWw7Z0-HILY5QTQjgKCHCeAfXj5jA8,14084
35
- gigaspatial/handlers/overture.py,sha256=lKeNw00v5Qia7LdWORuYihnlKEqxE9m38tdeRrvag9k,4218
36
- gigaspatial/handlers/rwi.py,sha256=GDpQH9K96QZD3yezJOBiy5yZvYmrj4xbjUNSjYfNAh0,4875
37
- gigaspatial/handlers/unicef_georepo.py,sha256=ODYNvkU_UKgOHXT--0MqmJ4Uk6U1_mp9xgehbTzKpX8,31924
38
- gigaspatial/handlers/worldpop.py,sha256=oJ39NGajXi0rn829ZoFiaeG4_wavyPvljdActpxs12I,9850
39
- gigaspatial/processing/__init__.py,sha256=QDVL-QbLCrIb19lrajP7LrHNdGdnsLeGcvAs_jQpdRM,183
40
- gigaspatial/processing/geo.py,sha256=D-S3IlhQwLIxrCcxy6NhNmKLrOIjoRHfK_eZJGKpe2U,36947
41
- gigaspatial/processing/sat_images.py,sha256=YUbH5MFNzl6NX49Obk14WaFcr1s3SyGJIOk-kRpbBNg,1429
42
- gigaspatial/processing/tif_processor.py,sha256=zqcP_ioo9KHNJ6H0uba4UghW4MToTRwq1iE-nZbb8zA,21101
43
- gigaspatial/processing/utils.py,sha256=HC85vGKQakxlkoQAkZmeAXWHsenAwTIRn7jPKUA7x20,1500
44
- giga_spatial-0.6.4.dist-info/METADATA,sha256=WQUWSdjlmfh09kkX20cgudrGHWmldXlNbh4DNjB0Xgo,7467
45
- giga_spatial-0.6.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
46
- giga_spatial-0.6.4.dist-info/top_level.txt,sha256=LZsccgw6H4zXT7m6Y4XChm-Y5LjHAwZ2hkGN_B3ExmI,12
47
- giga_spatial-0.6.4.dist-info/RECORD,,