giga_spatial-0.6.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. giga_spatial-0.6.0.dist-info/METADATA +141 -0
  2. giga_spatial-0.6.0.dist-info/RECORD +47 -0
  3. giga_spatial-0.6.0.dist-info/WHEEL +5 -0
  4. giga_spatial-0.6.0.dist-info/licenses/LICENSE +661 -0
  5. giga_spatial-0.6.0.dist-info/top_level.txt +1 -0
  6. gigaspatial/__init__.py +1 -0
  7. gigaspatial/config.py +226 -0
  8. gigaspatial/core/__init__.py +0 -0
  9. gigaspatial/core/io/__init__.py +5 -0
  10. gigaspatial/core/io/adls_data_store.py +325 -0
  11. gigaspatial/core/io/data_api.py +113 -0
  12. gigaspatial/core/io/data_store.py +147 -0
  13. gigaspatial/core/io/local_data_store.py +92 -0
  14. gigaspatial/core/io/readers.py +265 -0
  15. gigaspatial/core/io/writers.py +128 -0
  16. gigaspatial/core/schemas/__init__.py +0 -0
  17. gigaspatial/core/schemas/entity.py +244 -0
  18. gigaspatial/generators/__init__.py +2 -0
  19. gigaspatial/generators/poi.py +636 -0
  20. gigaspatial/generators/zonal/__init__.py +3 -0
  21. gigaspatial/generators/zonal/base.py +370 -0
  22. gigaspatial/generators/zonal/geometry.py +439 -0
  23. gigaspatial/generators/zonal/mercator.py +78 -0
  24. gigaspatial/grid/__init__.py +1 -0
  25. gigaspatial/grid/mercator_tiles.py +286 -0
  26. gigaspatial/handlers/__init__.py +40 -0
  27. gigaspatial/handlers/base.py +761 -0
  28. gigaspatial/handlers/boundaries.py +305 -0
  29. gigaspatial/handlers/ghsl.py +772 -0
  30. gigaspatial/handlers/giga.py +145 -0
  31. gigaspatial/handlers/google_open_buildings.py +472 -0
  32. gigaspatial/handlers/hdx.py +241 -0
  33. gigaspatial/handlers/mapbox_image.py +208 -0
  34. gigaspatial/handlers/maxar_image.py +291 -0
  35. gigaspatial/handlers/microsoft_global_buildings.py +548 -0
  36. gigaspatial/handlers/ookla_speedtest.py +199 -0
  37. gigaspatial/handlers/opencellid.py +290 -0
  38. gigaspatial/handlers/osm.py +356 -0
  39. gigaspatial/handlers/overture.py +126 -0
  40. gigaspatial/handlers/rwi.py +157 -0
  41. gigaspatial/handlers/unicef_georepo.py +806 -0
  42. gigaspatial/handlers/worldpop.py +266 -0
  43. gigaspatial/processing/__init__.py +4 -0
  44. gigaspatial/processing/geo.py +1054 -0
  45. gigaspatial/processing/sat_images.py +39 -0
  46. gigaspatial/processing/tif_processor.py +477 -0
  47. gigaspatial/processing/utils.py +49 -0
gigaspatial/processing/sat_images.py
@@ -0,0 +1,39 @@
+ import numpy as np
+ import pyproj
+
+ def calculate_pixels_at_location(gdf, resolution, bbox_size=300, crs="EPSG:3857"):
+     """
+     Calculates the number of pixels required to cover a given bounding box
+     around a geographic coordinate, given a resolution in meters per pixel.
+
+     Parameters:
+         gdf: A GeoDataFrame with Point geometries in geographic coordinates.
+         resolution (float): Desired resolution (meters per pixel).
+         bbox_size (float): Bounding box size in meters (default 300m x 300m).
+         crs (str): Target projection (default EPSG:3857).
+
+     Returns:
+         int: Number of pixels per side (width and height).
+     """
+
+     # Use the mean lat and lon of the input points as the center coordinate
+     lon = gdf.geometry.x.mean()
+     lat = gdf.geometry.y.mean()
+
+     # Define projections
+     wgs84 = pyproj.CRS("EPSG:4326")  # Geographic coordinate system
+     mercator = pyproj.CRS(crs)  # Target CRS (default EPSG:3857)
+
+     # Transform the center coordinate to the target CRS
+     transformer = pyproj.Transformer.from_crs(wgs84, mercator, always_xy=True)
+     x, y = transformer.transform(lon, lat)
+
+     # Calculate the scale factor (distortion) at the given latitude
+     scale_factor = np.cos(np.radians(lat))  # Mercator scale correction
+
+     # Adjust the effective ground resolution
+     effective_resolution = resolution * scale_factor
+
+     # Compute the number of pixels per side
+     pixels = bbox_size / effective_resolution
+     return int(round(pixels))
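
For orientation, a minimal usage sketch of calculate_pixels_at_location follows; the points and the 0.6 m/pixel resolution are invented illustration values. At 45°N the Mercator correction cos(45°) ≈ 0.707 shrinks the effective resolution to 0.6 × 0.707 ≈ 0.424 m/pixel, so the default 300 m box needs about 707 pixels per side.

import geopandas as gpd
from shapely.geometry import Point
from gigaspatial.processing.sat_images import calculate_pixels_at_location

# Hypothetical cluster of points near 45°N (lon/lat order)
points = gpd.GeoDataFrame(
    geometry=[Point(6.08, 45.0), Point(6.09, 45.01)],
    crs="EPSG:4326",
)

# A 300 m box at 0.6 m/pixel nominal resolution -> ~707 pixels per side
n_pixels = calculate_pixels_at_location(points, resolution=0.6)
print(n_pixels)
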
gigaspatial/processing/tif_processor.py
@@ -0,0 +1,477 @@
+ import numpy as np
+ import pandas as pd
+ import geopandas as gpd
+ from typing import List, Optional, Tuple, Union, Literal
+ from pydantic import ConfigDict
+ from pydantic.dataclasses import dataclass
+ from contextlib import contextmanager
+ from shapely.geometry import box, Polygon, MultiPolygon
+ from pathlib import Path
+ import rasterio
+ from rasterio.mask import mask
+
+ from gigaspatial.core.io.data_store import DataStore
+ from gigaspatial.core.io.local_data_store import LocalDataStore
+ from gigaspatial.config import config
+
+
+ @dataclass(config=ConfigDict(arbitrary_types_allowed=True))
+ class TifProcessor:
+     """
+     A class to handle TIF data processing, supporting single-band, RGB, and RGBA data.
+     """
+
+     dataset_path: Union[Path, str]
+     data_store: Optional[DataStore] = None
+     mode: Literal["single", "rgb", "rgba"] = "single"
+
+     def __post_init__(self):
+         """Validate inputs and set up logging."""
+         self.data_store = self.data_store or LocalDataStore()
+         self.logger = config.get_logger(self.__class__.__name__)
+         self._cache = {}
+
+         if not self.data_store.file_exists(self.dataset_path):
+             raise FileNotFoundError(f"Dataset not found at {self.dataset_path}")
+
+         self._load_metadata()
+
+         if self.mode == "rgba" and self.count != 4:
+             raise ValueError("RGBA mode requires a 4-band TIF file")
+         if self.mode == "rgb" and self.count != 3:
+             raise ValueError("RGB mode requires a 3-band TIF file")
+
+     @contextmanager
+     def open_dataset(self):
+         """Context manager for accessing the dataset."""
+         with self.data_store.open(self.dataset_path, "rb") as f:
+             with rasterio.MemoryFile(f.read()) as memfile:
+                 with memfile.open() as src:
+                     yield src
+
+     def _load_metadata(self):
+         """Load metadata from the TIF file if not already cached."""
+         if not self._cache:
+             with self.open_dataset() as src:
+                 self._cache["transform"] = src.transform
+                 self._cache["crs"] = src.crs.to_string()
+                 self._cache["bounds"] = src.bounds
+                 self._cache["width"] = src.width
+                 self._cache["height"] = src.height
+                 self._cache["resolution"] = (abs(src.transform.a), abs(src.transform.e))
+                 self._cache["x_transform"] = src.transform.a
+                 self._cache["y_transform"] = src.transform.e
+                 self._cache["nodata"] = src.nodata
+                 self._cache["count"] = src.count
+                 self._cache["dtype"] = src.dtypes[0]
+
+     @property
+     def transform(self):
+         """Get the affine transform from the TIF file."""
+         return self._cache["transform"]
+
+     @property
+     def crs(self):
+         """Get the coordinate reference system from the TIF file."""
+         return self._cache["crs"]
+
+     @property
+     def bounds(self):
+         """Get the bounds of the TIF file."""
+         return self._cache["bounds"]
+
+     @property
+     def resolution(self) -> Tuple[float, float]:
+         """Get the x and y resolution (pixel size) from the TIF file."""
+         return self._cache["resolution"]
+
+     @property
+     def x_transform(self) -> float:
+         """Get the x pixel size (affine coefficient a) from the TIF file."""
+         return self._cache["x_transform"]
+
+     @property
+     def y_transform(self) -> float:
+         """Get the y pixel size (affine coefficient e) from the TIF file."""
+         return self._cache["y_transform"]
+
+     @property
+     def count(self) -> int:
+         """Get the band count from the TIF file."""
+         return self._cache["count"]
+
+     @property
+     def nodata(self) -> Optional[float]:
+         """Get the value representing no data in the raster, if any."""
+         return self._cache["nodata"]
+
+     @property
+     def tabular(self) -> pd.DataFrame:
+         """Get the data from the TIF file as a DataFrame."""
+         if not hasattr(self, "_tabular"):
+             try:
+                 if self.mode == "single":
+                     self._tabular = self._to_band_dataframe(
+                         drop_nodata=True, drop_values=[]
+                     )
+                 elif self.mode == "rgb":
+                     self._tabular = self._to_rgb_dataframe(drop_nodata=True)
+                 elif self.mode == "rgba":
+                     self._tabular = self._to_rgba_dataframe(drop_transparent=True)
+             except Exception as e:
+                 raise ValueError(
+                     f"Failed to process TIF file in mode '{self.mode}'. "
+                     f"Please ensure the file is valid and matches the selected mode. "
+                     f"Original error: {str(e)}"
+                 ) from e
+
+         return self._tabular
+
+     def to_dataframe(self) -> pd.DataFrame:
+         return self.tabular
+
+     def get_zoned_geodataframe(self) -> gpd.GeoDataFrame:
+         """
+         Convert the processed TIF data into a GeoDataFrame in which each row represents a pixel zone.
+         Each zone is defined by its bounding box, based on the pixel resolution and coordinates.
+         """
+         self.logger.info("Converting data to GeoDataFrame with zones...")
+
+         df = self.tabular
+
+         x_res, y_res = self.resolution
+
+         # Create a bounding box for each pixel
+         geometries = [
+             box(lon - x_res / 2, lat - y_res / 2, lon + x_res / 2, lat + y_res / 2)
+             for lon, lat in zip(df["lon"], df["lat"])
+         ]
+
+         gdf = gpd.GeoDataFrame(df, geometry=geometries, crs=self.crs)
+
+         self.logger.info("Conversion to GeoDataFrame complete!")
+         return gdf
+
+     def sample_by_coordinates(
+         self, coordinate_list: List[Tuple[float, float]]
+     ) -> Union[np.ndarray, dict]:
+         self.logger.info("Sampling raster values at the coordinates...")
+
+         with self.open_dataset() as src:
+             if self.mode == "rgba":
+                 if self.count != 4:
+                     raise ValueError("RGBA mode requires a 4-band TIF file")
+
+                 rgba_values = {"red": [], "green": [], "blue": [], "alpha": []}
+
+                 for band_idx, color in enumerate(["red", "green", "blue", "alpha"], 1):
+                     rgba_values[color] = [
+                         vals[0]
+                         for vals in src.sample(coordinate_list, indexes=band_idx)
+                     ]
+
+                 return rgba_values
+
+             elif self.mode == "rgb":
+                 if self.count != 3:
+                     raise ValueError("RGB mode requires a 3-band TIF file")
+
+                 rgb_values = {"red": [], "green": [], "blue": []}
+
+                 for band_idx, color in enumerate(["red", "green", "blue"], 1):
+                     rgb_values[color] = [
+                         vals[0]
+                         for vals in src.sample(coordinate_list, indexes=band_idx)
+                     ]
+
+                 return rgb_values
+             else:
+                 if src.count != 1:
+                     raise ValueError("Single band mode requires a 1-band TIF file")
+                 return np.array([vals[0] for vals in src.sample(coordinate_list)])
+
+     def sample_by_polygons(
+         self, polygon_list: List[Union[Polygon, MultiPolygon]], stat: str = "mean"
+     ) -> np.ndarray:
+         """
+         Sample raster values within each polygon in a list.
+
+         Parameters:
+             polygon_list: List of polygon geometries (can include MultiPolygons).
+             stat (str): Aggregation statistic to compute within each polygon.
+                 Options: "mean", "median", "sum", "min", "max".
+         Returns:
+             A NumPy array of sampled values, one per polygon.
+         """
+         self.logger.info("Sampling raster values within polygons...")
+
+         with self.open_dataset() as src:
+             results = []
+
+             for geom in polygon_list:
+                 if geom.is_empty:
+                     results.append(np.nan)
+                     continue
+
+                 try:
+                     # Mask the raster with the polygon
+                     out_image, _ = mask(src, [geom], crop=True)
+
+                     # Flatten the raster values and remove nodata values
+                     # (if nodata is None, all pixels are kept)
+                     values = out_image[out_image != src.nodata].flatten()
+
+                     # Compute the desired statistic
+                     if len(values) == 0:
+                         results.append(np.nan)
+                     else:
+                         if stat == "mean":
+                             results.append(np.mean(values))
+                         elif stat == "median":
+                             results.append(np.median(values))
+                         elif stat == "sum":
+                             results.append(np.sum(values))
+                         elif stat == "min":
+                             results.append(np.min(values))
+                         elif stat == "max":
+                             results.append(np.max(values))
+                         else:
+                             raise ValueError(f"Unknown statistic: {stat}")
+
+                 except Exception as e:
+                     self.logger.error(f"Error processing polygon: {e}")
+                     results.append(np.nan)
+
+         return np.array(results)
+
+     def _to_rgba_dataframe(self, drop_transparent: bool = False) -> pd.DataFrame:
+         """
+         Convert an RGBA TIF to a DataFrame with separate columns for R, G, B, A values.
+         """
+         self.logger.info("Processing RGBA dataset...")
+
+         with self.open_dataset() as src:
+             if self.count != 4:
+                 raise ValueError("RGBA mode requires a 4-band TIF file")
+
+             # Read all four bands
+             red, green, blue, alpha = src.read()
+
+             x_coords, y_coords = self._get_pixel_coordinates()
+
+             if drop_transparent:
+                 # Keep only pixels with a non-zero alpha
+                 opaque_mask = alpha > 0
+                 red = np.extract(opaque_mask, red)
+                 green = np.extract(opaque_mask, green)
+                 blue = np.extract(opaque_mask, blue)
+                 alpha = np.extract(opaque_mask, alpha)
+                 lons = np.extract(opaque_mask, x_coords)
+                 lats = np.extract(opaque_mask, y_coords)
+             else:
+                 lons = x_coords.flatten()
+                 lats = y_coords.flatten()
+                 red = red.flatten()
+                 green = green.flatten()
+                 blue = blue.flatten()
+                 alpha = alpha.flatten()
+
+             # Create a DataFrame with RGBA values
+             data = pd.DataFrame(
+                 {
+                     "lon": lons,
+                     "lat": lats,
+                     "red": red,
+                     "green": green,
+                     "blue": blue,
+                     "alpha": alpha,
+                 }
+             )
+
+         # Normalize alpha values if they are not in the [0, 1] range
+         if data["alpha"].max() > 1:
+             data["alpha"] = data["alpha"] / data["alpha"].max()
+
+         self.logger.info("RGBA dataset is processed!")
+         return data
+
+     def _to_rgb_dataframe(self, drop_nodata: bool = True) -> pd.DataFrame:
+         """Convert an RGB TIF to a DataFrame with separate columns for R, G, B values."""
+         if self.mode != "rgb":
+             raise ValueError("Use the appropriate method for the current mode")
+
+         self.logger.info("Processing RGB dataset...")
+
+         with self.open_dataset() as src:
+             if self.count != 3:
+                 raise ValueError("RGB mode requires a 3-band TIF file")
+
+             # Read all three bands
+             red, green, blue = src.read()
+
+             x_coords, y_coords = self._get_pixel_coordinates()
+
+             if drop_nodata:
+                 nodata_value = src.nodata
+                 if nodata_value is not None:
+                     # Keep only pixels where no band equals the nodata value
+                     valid_mask = ~(
+                         (red == nodata_value)
+                         | (green == nodata_value)
+                         | (blue == nodata_value)
+                     )
+                     red = np.extract(valid_mask, red)
+                     green = np.extract(valid_mask, green)
+                     blue = np.extract(valid_mask, blue)
+                     lons = np.extract(valid_mask, x_coords)
+                     lats = np.extract(valid_mask, y_coords)
+                 else:
+                     lons = x_coords.flatten()
+                     lats = y_coords.flatten()
+                     red = red.flatten()
+                     green = green.flatten()
+                     blue = blue.flatten()
+             else:
+                 lons = x_coords.flatten()
+                 lats = y_coords.flatten()
+                 red = red.flatten()
+                 green = green.flatten()
+                 blue = blue.flatten()
+
+             data = pd.DataFrame(
+                 {
+                     "lon": lons,
+                     "lat": lats,
+                     "red": red,
+                     "green": green,
+                     "blue": blue,
+                 }
+             )
+
+         self.logger.info("RGB dataset is processed!")
+         return data
+
+     def _to_band_dataframe(
+         self, band_number: int = 1, drop_nodata: bool = True, drop_values: list = []
+     ) -> pd.DataFrame:
+         """Process a single-band TIF to a DataFrame."""
+         if self.mode != "single":
+             raise ValueError("Use the appropriate method for the current mode")
+
+         self.logger.info("Processing single-band dataset...")
+
+         if band_number <= 0 or band_number > self.count:
+             self.logger.error(
+                 f"Error: Band number {band_number} is out of range. The file has {self.count} bands."
+             )
+             return None
+
+         with self.open_dataset() as src:
+             band = src.read(band_number)
+
+             x_coords, y_coords = self._get_pixel_coordinates()
+
+             values_to_mask = []
+             if drop_nodata:
+                 nodata_value = src.nodata
+                 if nodata_value is not None:
+                     values_to_mask.append(nodata_value)
+
+             if drop_values:
+                 values_to_mask.extend(drop_values)
+
+             if values_to_mask:
+                 data_mask = ~np.isin(band, values_to_mask)
+                 pixel_values = np.extract(data_mask, band)
+                 lons = np.extract(data_mask, x_coords)
+                 lats = np.extract(data_mask, y_coords)
+             else:
+                 pixel_values = band.flatten()
+                 lons = x_coords.flatten()
+                 lats = y_coords.flatten()
+
+         data = pd.DataFrame({"lon": lons, "lat": lats, "pixel_value": pixel_values})
+
+         self.logger.info("Dataset is processed!")
+         return data
+
+     def _get_pixel_coordinates(self):
+         """Helper method to generate coordinate arrays for all pixels."""
+         if "pixel_coords" not in self._cache:
+             # Use cached metadata
+             bounds = self._cache["bounds"]
+             width = self._cache["width"]
+             height = self._cache["height"]
+             pixel_size_x = self._cache["x_transform"]
+             pixel_size_y = self._cache["y_transform"]
+
+             # Note: pixel_size_y (transform.e) is negative for north-up rasters,
+             # so both half-pixel offsets below land on pixel centers.
+             self._cache["pixel_coords"] = np.meshgrid(
+                 np.linspace(
+                     bounds.left + pixel_size_x / 2,
+                     bounds.right - pixel_size_x / 2,
+                     width,
+                 ),
+                 np.linspace(
+                     bounds.top + pixel_size_y / 2,
+                     bounds.bottom - pixel_size_y / 2,
+                     height,
+                 ),
+             )
+
+         return self._cache["pixel_coords"]
+
+
+ def sample_multiple_tifs_by_coordinates(
+     tif_processors: List[TifProcessor], coordinate_list: List[Tuple[float, float]]
+ ):
+     """
+     Sample raster values from multiple TIF files for the given coordinates.
+
+     Parameters:
+     - tif_processors: List of TifProcessor instances (expected to be in "single" mode).
+     - coordinate_list: List of (x, y) coordinates.
+
+     Returns:
+     - A NumPy array of sampled values, taking the first non-nodata value encountered.
+     """
+     sampled_values = np.full(len(coordinate_list), np.nan, dtype=np.float32)
+
+     for tp in tif_processors:
+         values = tp.sample_by_coordinates(coordinate_list=coordinate_list)
+
+         if tp.nodata is not None:
+             # Replace only NaNs, and only with values that are not nodata
+             mask = np.isnan(sampled_values) & (values != tp.nodata)
+         else:
+             # No explicit nodata; replace all remaining NaNs
+             mask = np.isnan(sampled_values)
+
+         sampled_values[mask] = values[mask]  # Update only missing values
+
+     return sampled_values
+
+
+ def sample_multiple_tifs_by_polygons(
+     tif_processors: List[TifProcessor],
+     polygon_list: List[Union[Polygon, MultiPolygon]],
+     stat: str = "mean",
+ ) -> np.ndarray:
+     """
+     Sample raster values from multiple TIF files for the polygons in a list and join the results.
+
+     Parameters:
+     - tif_processors: List of TifProcessor instances.
+     - polygon_list: List of polygon geometries (can include MultiPolygons).
+     - stat: Aggregation statistic to compute within each polygon (mean, median, sum, min, max).
+
+     Returns:
+     - A NumPy array of sampled values, taking the first non-NaN statistic encountered per polygon.
+     """
+     sampled_values = np.full(len(polygon_list), np.nan, dtype=np.float32)
+
+     for tp in tif_processors:
+         values = tp.sample_by_polygons(polygon_list=polygon_list, stat=stat)
+
+         mask = np.isnan(sampled_values)  # Fill only positions that are still NaN
+
+         sampled_values[mask] = values[mask]  # Update only missing values
+
+     return sampled_values
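
A brief, hypothetical sketch of how TifProcessor and the multi-TIF helpers fit together; the paths pop_a.tif and pop_b.tif are placeholders for any raster the configured DataStore can read.

from gigaspatial.processing.tif_processor import (
    TifProcessor,
    sample_multiple_tifs_by_coordinates,
)

# Placeholder single-band rasters; mode defaults to "single"
tp_a = TifProcessor(dataset_path="pop_a.tif")
tp_b = TifProcessor(dataset_path="pop_b.tif")

# (x, y) coordinates expressed in the rasters' CRS
coords = [(6.08, 45.0), (6.09, 45.01)]

# One value per coordinate from a single raster
values_a = tp_a.sample_by_coordinates(coords)

# Across rasters, the first non-nodata value wins, so order the
# processors from the most to the least preferred source
merged = sample_multiple_tifs_by_coordinates([tp_a, tp_b], coords)

Note the design implication: sample_multiple_tifs_by_coordinates fills a NaN-initialized array in processor order, so the list order acts as a priority ranking across overlapping tiles.
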
gigaspatial/processing/utils.py
@@ -0,0 +1,49 @@
+ import pandas as pd
+ import uuid
+ from typing import List
+
+
+ def assign_id(
+     df: pd.DataFrame, required_columns: List[str], id_column: str = "id"
+ ) -> pd.DataFrame:
+     """
+     Generate IDs for any entity type in a pandas DataFrame.
+
+     Args:
+         df (pd.DataFrame): Input DataFrame containing entity data
+         required_columns (List[str]): List of column names required for ID generation
+         id_column (str): Name of the ID column that will be generated
+
+     Returns:
+         pd.DataFrame: DataFrame with the generated ID column
+     """
+     # Create a copy to avoid modifying the original DataFrame
+     df = df.copy()
+
+     # Check if the ID column exists; if not, create it with None values
+     if id_column not in df.columns:
+         df[id_column] = None
+
+     # If any required column is missing, return the DataFrame unchanged
+     if not all(col in df.columns for col in required_columns):
+         return df
+
+     # Concatenate the required fields into a single identifier for UUID generation
+     # (fillna must run before astype(str), or NaN would become the string "nan")
+     df["identifier_concat"] = (
+         df[required_columns].fillna("").astype(str).agg("".join, axis=1)
+     )
+
+     # Generate UUIDs only where all required fields are present and no ID exists
+     mask = df[id_column].isna()
+     for col in required_columns:
+         mask &= df[col].notna()
+
+     # Apply UUID generation only where the mask is True
+     df.loc[mask, id_column] = df.loc[mask, "identifier_concat"].apply(
+         lambda x: str(uuid.uuid3(uuid.NAMESPACE_DNS, x))
+     )
+
+     # Drop the temporary column
+     df = df.drop(columns=["identifier_concat"])
+
+     return df
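
A short sketch of assign_id's behavior, with invented column names: IDs come from uuid3 over the concatenated required fields, so they are deterministic across runs, and rows missing any required field keep a null ID.

import pandas as pd
from gigaspatial.processing.utils import assign_id

# Hypothetical entity table; the third row is missing a required field
df = pd.DataFrame(
    {
        "school_name": ["A", "B", None],
        "country_code": ["TR", "TR", "TR"],
    }
)

out = assign_id(df, required_columns=["school_name", "country_code"])
# Rows 0 and 1 receive stable uuid3-based IDs; row 2 keeps a null "id"
# because "school_name" is missing. Re-running reproduces the same IDs.
print(out["id"])

Because uuid3 hashes the concatenated fields, two rows with identical required values receive the same ID; callers that need row-unique IDs should include a distinguishing column in required_columns.
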