giga_spatial-0.6.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. giga_spatial-0.6.0.dist-info/METADATA +141 -0
  2. giga_spatial-0.6.0.dist-info/RECORD +47 -0
  3. giga_spatial-0.6.0.dist-info/WHEEL +5 -0
  4. giga_spatial-0.6.0.dist-info/licenses/LICENSE +661 -0
  5. giga_spatial-0.6.0.dist-info/top_level.txt +1 -0
  6. gigaspatial/__init__.py +1 -0
  7. gigaspatial/config.py +226 -0
  8. gigaspatial/core/__init__.py +0 -0
  9. gigaspatial/core/io/__init__.py +5 -0
  10. gigaspatial/core/io/adls_data_store.py +325 -0
  11. gigaspatial/core/io/data_api.py +113 -0
  12. gigaspatial/core/io/data_store.py +147 -0
  13. gigaspatial/core/io/local_data_store.py +92 -0
  14. gigaspatial/core/io/readers.py +265 -0
  15. gigaspatial/core/io/writers.py +128 -0
  16. gigaspatial/core/schemas/__init__.py +0 -0
  17. gigaspatial/core/schemas/entity.py +244 -0
  18. gigaspatial/generators/__init__.py +2 -0
  19. gigaspatial/generators/poi.py +636 -0
  20. gigaspatial/generators/zonal/__init__.py +3 -0
  21. gigaspatial/generators/zonal/base.py +370 -0
  22. gigaspatial/generators/zonal/geometry.py +439 -0
  23. gigaspatial/generators/zonal/mercator.py +78 -0
  24. gigaspatial/grid/__init__.py +1 -0
  25. gigaspatial/grid/mercator_tiles.py +286 -0
  26. gigaspatial/handlers/__init__.py +40 -0
  27. gigaspatial/handlers/base.py +761 -0
  28. gigaspatial/handlers/boundaries.py +305 -0
  29. gigaspatial/handlers/ghsl.py +772 -0
  30. gigaspatial/handlers/giga.py +145 -0
  31. gigaspatial/handlers/google_open_buildings.py +472 -0
  32. gigaspatial/handlers/hdx.py +241 -0
  33. gigaspatial/handlers/mapbox_image.py +208 -0
  34. gigaspatial/handlers/maxar_image.py +291 -0
  35. gigaspatial/handlers/microsoft_global_buildings.py +548 -0
  36. gigaspatial/handlers/ookla_speedtest.py +199 -0
  37. gigaspatial/handlers/opencellid.py +290 -0
  38. gigaspatial/handlers/osm.py +356 -0
  39. gigaspatial/handlers/overture.py +126 -0
  40. gigaspatial/handlers/rwi.py +157 -0
  41. gigaspatial/handlers/unicef_georepo.py +806 -0
  42. gigaspatial/handlers/worldpop.py +266 -0
  43. gigaspatial/processing/__init__.py +4 -0
  44. gigaspatial/processing/geo.py +1054 -0
  45. gigaspatial/processing/sat_images.py +39 -0
  46. gigaspatial/processing/tif_processor.py +477 -0
  47. gigaspatial/processing/utils.py +49 -0
@@ -0,0 +1,548 @@
+ from dataclasses import field
+ from pydantic.dataclasses import dataclass
+ from pydantic import ConfigDict
+ from pathlib import Path
+ import functools
+ import multiprocessing
+ from typing import List, Optional, Tuple, Union, Dict, Iterable
+ import numpy as np
+ import pandas as pd
+ from shapely.geometry import Point
+ from shapely.geometry.base import BaseGeometry
+ from difflib import SequenceMatcher
+ import pycountry
+ import requests
+ from tqdm import tqdm
+ import logging
+ import geopandas as gpd
+
+ from gigaspatial.core.io.data_store import DataStore
+ from gigaspatial.grid.mercator_tiles import (
+     MercatorTiles,
+     CountryMercatorTiles,
+ )
+ from gigaspatial.handlers.base import (
+     BaseHandlerReader,
+     BaseHandlerConfig,
+     BaseHandlerDownloader,
+     BaseHandler,
+ )
+ from gigaspatial.config import config as global_config
+
+
+ @dataclass(config=ConfigDict(arbitrary_types_allowed=True))
+ class MSBuildingsConfig(BaseHandlerConfig):
+     """Configuration for Microsoft Global Buildings dataset files."""
+
+     TILE_URLS: str = (
+         "https://minedbuildings.z5.web.core.windows.net/global-buildings/dataset-links.csv"
+     )
+     MERCATOR_ZOOM_LEVEL: int = 9
+     base_path: Path = global_config.get_path("microsoft_global_buildings", "bronze")
+
+     LOCATION_MAPPING_FILE: Path = base_path / "location_mapping.json"
+     SIMILARITY_SCORE: float = 0.8
+     DEFAULT_MAPPING: Dict[str, str] = field(
+         default_factory=lambda: {
+             "Bonaire": "BES",
+             "Brunei": "BRN",
+             "IvoryCoast": "CIV",
+             "CongoDRC": "COD",
+             "DemocraticRepublicoftheCongo": "COD",
+             "RepublicoftheCongo": "COG",
+             "TheGambia": "GMB",
+             "FYROMakedonija": "MKD",
+             "SultanateofOman": "OMN",
+             "StateofQatar": "QAT",
+             "Russia": "RUS",
+             "KingdomofSaudiArabia": "SAU",
+             "Svalbard": "SJM",
+             "Swaziland": "SWZ",
+             "StMartin": "SXM",
+             "leSaint-Martin": "MAF",
+             "Turkey": "TUR",
+             "VaticanCity": "VAT",
+             "BritishVirginIslands": "VGB",
+             "USVirginIslands": "VIR",
+             "RepublicofYemen": "YEM",
+             "CzechRepublic": "CZE",
+             "French-Martinique": "MTQ",
+             "French-Guadeloupe": "GLP",
+             "UnitedStates": "USA",
+         }
+     )
+     CUSTOM_MAPPING: Optional[Dict[str, str]] = None
+
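Because `CUSTOM_MAPPING` is merged over the stored mapping during setup (see `_setup_location_mapping` below), odd location spellings can be patched per-instance without touching the cached JSON file. A minimal sketch, where the spelling being patched is hypothetical:

    # Hypothetical dataset spelling mapped to its ISO alpha-3 code.
    config = MSBuildingsConfig(
        CUSTOM_MAPPING={"RepublicofKorea": "KOR"},
    )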
+     def __post_init__(self):
+         """Initialize the configuration, load tile URLs, and set up location mapping."""
+         super().__post_init__()
+         self._load_tile_urls()
+         self.upload_date = self.df_tiles.upload_date[0]
+         self._setup_location_mapping()
+
+     def _load_tile_urls(self):
+         """Load dataset links from the CSV file."""
+         self.df_tiles = pd.read_csv(
+             self.TILE_URLS,
+             names=["location", "quadkey", "url", "size", "upload_date"],
+             dtype={"quadkey": str},
+             header=0,
+         )
+
+     def _setup_location_mapping(self):
+         """Load or create the mapping between dataset locations and ISO country codes."""
+         from gigaspatial.core.io.readers import read_json
+         from gigaspatial.core.io.writers import write_json
+
+         if self.data_store.file_exists(str(self.LOCATION_MAPPING_FILE)):
+             self.location_mapping = read_json(
+                 self.data_store, str(self.LOCATION_MAPPING_FILE)
+             )
+         else:
+             self.location_mapping = self.create_location_mapping(
+                 similarity_score_threshold=self.SIMILARITY_SCORE
+             )
+             self.location_mapping.update(self.DEFAULT_MAPPING)
+             write_json(
+                 self.location_mapping, self.data_store, str(self.LOCATION_MAPPING_FILE)
+             )
+
+         self.location_mapping.update(self.CUSTOM_MAPPING or {})
+         self._map_locations()
+         self.df_tiles.loc[self.df_tiles.country.isnull(), "country"] = None
+
+     def _map_locations(self):
+         """Map the 'location' column in the tiles DataFrame to ISO country codes."""
+         self.df_tiles["country"] = self.df_tiles.location.map(self.location_mapping)
+
+     def create_location_mapping(self, similarity_score_threshold: float = 0.8):
+         """
+         Create a mapping between the dataset's location names and ISO 3166-1 alpha-3 country codes.
+
+         This function iterates through known countries and attempts to find matching
+         locations in the dataset based on string similarity.
+
+         Args:
+             similarity_score_threshold: The minimum similarity score (between 0 and 1)
+                 for a dataset location to be considered a match for a country.
+                 Defaults to 0.8.
+
+         Returns:
+             A dictionary where keys are dataset location names and values are
+             the corresponding ISO 3166-1 alpha-3 country codes.
+         """
+
+         def similar(a, b):
+             return SequenceMatcher(None, a, b).ratio()
+
+         location_mapping = dict()
+
+         for country in pycountry.countries:
+             if country.name not in self.df_tiles.location.unique():
+                 try:
+                     country_quadkey = CountryMercatorTiles.create(
+                         country.alpha_3, self.MERCATOR_ZOOM_LEVEL
+                     )
+                 except Exception:
+                     self.logger.warning(f"{country.name} is not mapped.")
+                     continue
+                 country_datasets = country_quadkey.filter_quadkeys(
+                     self.df_tiles.quadkey
+                 )
+                 matching_locations = self.df_tiles[
+                     self.df_tiles.quadkey.isin(country_datasets.quadkeys)
+                 ].location.unique()
+                 scores = np.array(
+                     [
+                         (
+                             similar(c, country.common_name)
+                             if hasattr(country, "common_name")
+                             else similar(c, country.name)
+                         )
+                         for c in matching_locations
+                     ]
+                 )
+                 if any(scores > similarity_score_threshold):
+                     matched = matching_locations[scores > similarity_score_threshold]
+                     if len(matched) > 1:
+                         self.logger.warning(
+                             f"Multiple matches exist for {country.name}. {country.name} is not mapped."
+                         )
+                         continue
+                     location_mapping[matched[0]] = country.alpha_3
+                     self.logger.debug(f"{country.name} matched with {matched[0]}!")
+                 else:
+                     self.logger.warning(
+                         f"No direct matches for {country.name}. {country.name} is not mapped."
+                     )
+                     self.logger.debug("Possible matches are: ")
+                     for c, score in zip(matching_locations, scores):
+                         self.logger.debug(f"{c}: {score:.2f}")
+             else:
+                 location_mapping[country.name] = country.alpha_3
+
+         return location_mapping
+
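For intuition, the matching rule above is plain standard-library string similarity from difflib; a standalone sketch of the same check (the example names and approximate scores are illustrative):

    from difflib import SequenceMatcher

    def similar(a: str, b: str) -> float:
        # Ratio in [0, 1]; 1.0 means the strings are identical.
        return SequenceMatcher(None, a, b).ratio()

    similar("UnitedStates", "United States")  # ~0.96, above the 0.8 threshold
    similar("CzechRepublic", "Czechia")       # well below 0.8, hence the DEFAULT_MAPPING entry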
+     def get_relevant_data_units_by_geometry(
+         self, geometry: Union[BaseGeometry, gpd.GeoDataFrame], **kwargs
+     ) -> List[dict]:
+         """
+         Return intersecting tiles for a given geometry or GeoDataFrame.
+         """
+         return self._get_relevant_tiles(geometry)
+
+     def get_relevant_data_units_by_points(
+         self, points: Iterable[Union[Point, tuple]], **kwargs
+     ) -> List[dict]:
+         """
+         Return intersecting tiles for a list of points.
+         """
+         return self._get_relevant_tiles(points)
+
+     def get_relevant_data_units_by_country(
+         self, country: str, **kwargs
+     ) -> List[dict]:
+         """
+         Return intersecting tiles for a given country.
+         """
+         return self._get_relevant_tiles(country)
+
+     def get_data_unit_path(self, unit: Union[pd.Series, dict], **kwargs) -> Path:
+         tile_location = unit["country"] if unit["country"] else unit["location"]
+
+         return (
+             self.base_path
+             / tile_location
+             / self.upload_date
+             / f'{unit["quadkey"]}.csv.gz'
+         )
+
+     def get_data_unit_paths(
+         self, units: Union[pd.DataFrame, Iterable[dict]], **kwargs
+     ) -> List:
+         if isinstance(units, pd.DataFrame):
+             return [self.get_data_unit_path(row) for _, row in units.iterrows()]
+         return super().get_data_unit_paths(units)
+
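Given a config instance, `get_data_unit_path` lays files out as base_path/<country or location>/<upload_date>/<quadkey>.csv.gz. A quick illustration with hypothetical record values:

    # Hypothetical tile record; upload_date comes from the config itself.
    unit = {"country": "KEN", "location": "Kenya", "quadkey": "122030301"}
    config.get_data_unit_path(unit)
    # e.g. <base_path>/KEN/<upload_date>/122030301.csv.gz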
+     def _get_relevant_tiles(
+         self,
+         source: Union[
+             str,  # country
+             BaseGeometry,  # shapely geoms
+             gpd.GeoDataFrame,
+             Iterable[Union[Point, Tuple[float, float]]],  # points
+         ],
+     ) -> List[dict]:
+         """
+         Get the Microsoft Buildings tiles that intersect with a given source spatial geometry.
+
+         When a country is given, this method first tries to find tiles directly
+         mapped to that country. If no directly mapped tiles are found and the
+         country is not in the location mapping, it attempts to find overlapping
+         tiles by creating Mercator tiles for the country and filtering the
+         dataset's tiles.
+
+         Args:
+             source: A country code/name, a Shapely geometry, a GeoDataFrame, or a list of Point
+                 objects or (lat, lon) tuples representing the area of interest.
+                 The coordinates are assumed to be in EPSG:4326.
+
+         Returns:
+             A list of record dicts for the tiles that spatially intersect with
+             the `source`. Returns an empty list if no intersecting tiles are
+             found.
+         """
+         if isinstance(source, str):
+             try:
+                 country_code = pycountry.countries.lookup(source).alpha_3
+             except LookupError:
+                 raise ValueError("Invalid `country` value!")
+
+             mask = self.df_tiles["country"] == country_code
+
+             if any(mask):
+                 return self.df_tiles.loc[
+                     mask, ["quadkey", "url", "country", "location"]
+                 ].to_dict("records")
+
+             self.logger.warning(
+                 f"The country code '{country_code}' is not directly in the location mapping. "
+                 "Manually checking for overlapping locations with the country boundary."
+             )
+
+             source_tiles = CountryMercatorTiles.create(
+                 country_code, self.MERCATOR_ZOOM_LEVEL
+             )
+         else:
+             source_tiles = MercatorTiles.from_spatial(
+                 source=source, zoom_level=self.MERCATOR_ZOOM_LEVEL
+             )
+
+         filtered_tiles = source_tiles.filter_quadkeys(self.df_tiles.quadkey)
+
+         mask = self.df_tiles.quadkey.isin(filtered_tiles.quadkeys)
+
+         return self.df_tiles.loc[
+             mask, ["quadkey", "url", "country", "location"]
+         ].to_dict("records")
+
+
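Putting the config methods together, tile resolution can be exercised without downloading anything; a hedged sketch (the country code is illustrative, and network access is assumed for fetching the dataset links CSV):

    config = MSBuildingsConfig()
    tiles = config.get_relevant_data_units_by_country("KEN")  # list of tile records
    paths = config.get_data_unit_paths(tiles)                 # where each tile would be stored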
+ class MSBuildingsDownloader(BaseHandlerDownloader):
+     """A class to handle downloads of Microsoft's Global ML Building Footprints dataset."""
+
+     def __init__(
+         self,
+         config: Optional[MSBuildingsConfig] = None,
+         data_store: Optional[DataStore] = None,
+         logger: Optional[logging.Logger] = None,
+     ):
+         """
+         Initialize the downloader.
+
+         Args:
+             config: Optional configuration for customizing download behavior and file paths.
+                 If None, a default `MSBuildingsConfig` is used.
+             data_store: Optional instance of a `DataStore` for managing data storage.
+                 If provided, it overrides the `data_store` in the `config`.
+                 If None, the `data_store` from the `config` is used.
+             logger: Optional custom logger instance. If None, a default logger
+                 named after the module is created and used.
+         """
+         config = config or MSBuildingsConfig()
+         super().__init__(config=config, data_store=data_store, logger=logger)
+
+     def download_data_unit(
+         self,
+         tile_info: Union[pd.Series, dict],
+         **kwargs,
+     ) -> Optional[str]:
+         """Download the data file for a single tile."""
+
+         tile_url = tile_info["url"]
+
+         try:
+             response = requests.get(tile_url, stream=True)
+             response.raise_for_status()
+
+             file_path = str(self.config.get_data_unit_path(tile_info))
+
+             with self.data_store.open(file_path, "wb") as file:
+                 for chunk in response.iter_content(chunk_size=8192):
+                     file.write(chunk)
+
+             self.logger.debug(
+                 f"Successfully downloaded tile: {tile_info['quadkey']}"
+             )
+             return file_path
+
+         except requests.exceptions.RequestException as e:
+             self.logger.error(
+                 f"Failed to download tile {tile_info['quadkey']}: {str(e)}"
+             )
+             return None
+         except Exception as e:
+             self.logger.error(f"Unexpected error downloading dataset: {str(e)}")
+             return None
+
+     def download_data_units(
+         self,
+         tiles: Union[pd.DataFrame, List[dict]],
+         **kwargs,
+     ) -> List[str]:
+         """Download data files for multiple tiles."""
+
+         if len(tiles) == 0:
+             self.logger.warning("There is no matching data.")
+             return []
+
+         with multiprocessing.Pool(self.config.n_workers) as pool:
+             file_paths = list(
+                 tqdm(
+                     pool.imap(
+                         self.download_data_unit,
+                         (
+                             [row for _, row in tiles.iterrows()]
+                             if isinstance(tiles, pd.DataFrame)
+                             else tiles
+                         ),
+                     ),
+                     total=len(tiles),
+                     desc="Downloading building polygons",
+                 )
+             )
+
+         return [path for path in file_paths if path is not None]
+
+     def download(
+         self,
+         source: Union[
+             str,  # country
+             List[Union[Tuple[float, float], Point]],  # points
+             BaseGeometry,  # shapely geoms
+             gpd.GeoDataFrame,
+         ],
+         **kwargs,
+     ) -> List[str]:
+         """
+         Download Microsoft Global ML Building Footprints data for a specified geographic region.
+
+         The region can be defined by a country, a list of points,
+         a Shapely geometry, or a GeoDataFrame. This method identifies the
+         relevant data tiles intersecting the region and downloads them in parallel.
+
+         Args:
+             source: Defines the geographic area for which to download data.
+                 Can be:
+                 - A string representing a country code or name.
+                 - A list of (latitude, longitude) tuples or Shapely Point objects.
+                 - A Shapely BaseGeometry object (e.g., Polygon, MultiPolygon).
+                 - A GeoDataFrame with a geometry column in EPSG:4326.
+             **kwargs: Additional parameters passed to data unit resolution methods.
+
+         Returns:
+             A list of local file paths for the successfully downloaded tiles.
+             Returns an empty list if no data is found for the region or if
+             all downloads fail.
+         """
+
+         tiles = self.config.get_relevant_data_units(source, **kwargs)
+         return self.download_data_units(tiles, **kwargs)
+
+     def download_by_country(
+         self,
+         country: str,
+         data_store: Optional[DataStore] = None,
+         country_geom_path: Optional[Union[str, Path]] = None,
+     ) -> List[str]:
+         """
+         Download Microsoft Global ML Building Footprints data for a specific country.
+
+         This is a convenience method to download data for an entire country
+         using its code or name.
+
+         Args:
+             country: The country code (e.g., 'USA', 'GBR') or name.
+             data_store: Optional instance of a `DataStore` to be used by
+                 `AdminBoundaries` for loading country boundaries. If None,
+                 `AdminBoundaries` will use its default data loading.
+             country_geom_path: Optional path to a GeoJSON file containing the
+                 country boundary. If provided, this boundary is used
+                 instead of the default from `AdminBoundaries`.
+
+         Returns:
+             A list of local file paths for the successfully downloaded tiles.
+             Returns an empty list if no data is found for the country or if
+             all downloads fail.
+         """
+         return self.download(
+             source=country, data_store=data_store, path=country_geom_path
+         )
+
+
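As a usage sketch (the country code is illustrative; default configuration, storage, and network access are assumed):

    downloader = MSBuildingsDownloader()
    paths = downloader.download_by_country("KEN")
    print(f"Downloaded {len(paths)} tiles")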
+ class MSBuildingsReader(BaseHandlerReader):
+     """
+     Reader for Microsoft Global Buildings data, supporting country, points, and geometry-based resolution.
+     """
+
+     def __init__(
+         self,
+         config: Optional[MSBuildingsConfig] = None,
+         data_store: Optional[DataStore] = None,
+         logger: Optional[logging.Logger] = None,
+     ):
+         config = config or MSBuildingsConfig()
+         super().__init__(config=config, data_store=data_store, logger=logger)
+
+     def load_from_paths(
+         self, source_data_path: List[Union[str, Path]], **kwargs
+     ) -> gpd.GeoDataFrame:
+         """
+         Load building data from the Microsoft Buildings dataset.
+
+         Args:
+             source_data_path: List of file paths to load.
+
+         Returns:
+             GeoDataFrame containing building data.
+         """
+         from gigaspatial.core.io.readers import read_gzipped_json_or_csv
+         from shapely.geometry import shape
+
+         def read_ms_dataset(data_store: DataStore, file_path: str):
+             df = read_gzipped_json_or_csv(file_path=file_path, data_store=data_store)
+             df["geometry"] = df["geometry"].apply(shape)
+             return gpd.GeoDataFrame(df, crs=4326)
+
+         return self._load_tabular_data(
+             file_paths=source_data_path, read_function=read_ms_dataset
+         )
+
+
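A hedged reading sketch that reuses the config's tile resolution (the country code is illustrative, and the tiles are assumed to have been downloaded beforehand):

    config = MSBuildingsConfig()
    reader = MSBuildingsReader(config=config)
    tiles = config.get_relevant_data_units_by_country("KEN")
    buildings = reader.load_from_paths(config.get_data_unit_paths(tiles))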
+ class MSBuildingsHandler(BaseHandler):
+     """
+     Handler for the Microsoft Global Buildings dataset.
+
+     This class provides a unified interface for downloading and loading Microsoft Global Buildings data.
+     It manages the lifecycle of configuration, downloading, and reading components.
+     """
+
+     def create_config(
+         self, data_store: DataStore, logger: logging.Logger, **kwargs
+     ) -> MSBuildingsConfig:
+         """
+         Create and return a MSBuildingsConfig instance.
+
+         Args:
+             data_store: The data store instance to use
+             logger: The logger instance to use
+             **kwargs: Additional configuration parameters
+
+         Returns:
+             Configured MSBuildingsConfig instance
+         """
+         return MSBuildingsConfig(data_store=data_store, logger=logger, **kwargs)
+
+     def create_downloader(
+         self,
+         config: MSBuildingsConfig,
+         data_store: DataStore,
+         logger: logging.Logger,
+         **kwargs,
+     ) -> MSBuildingsDownloader:
+         """
+         Create and return a MSBuildingsDownloader instance.
+
+         Args:
+             config: The configuration object
+             data_store: The data store instance to use
+             logger: The logger instance to use
+             **kwargs: Additional downloader parameters
+
+         Returns:
+             Configured MSBuildingsDownloader instance
+         """
+         return MSBuildingsDownloader(
+             config=config, data_store=data_store, logger=logger, **kwargs
+         )
+
+     def create_reader(
+         self,
+         config: MSBuildingsConfig,
+         data_store: DataStore,
+         logger: logging.Logger,
+         **kwargs,
+     ) -> MSBuildingsReader:
+         """
+         Create and return a MSBuildingsReader instance.
+
+         Args:
+             config: The configuration object
+             data_store: The data store instance to use
+             logger: The logger instance to use
+             **kwargs: Additional reader parameters
+
+         Returns:
+             Configured MSBuildingsReader instance
+         """
+         return MSBuildingsReader(
+             config=config, data_store=data_store, logger=logger, **kwargs
+         )
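
Finally, an end-to-end sketch combining downloader and reader for an arbitrary area of interest (the bounding box is illustrative; coordinates are in EPSG:4326, and default storage is assumed):

    from shapely.geometry import box

    aoi = box(36.7, -1.4, 37.0, -1.2)  # illustrative bounding box
    downloader = MSBuildingsDownloader()
    paths = downloader.download(aoi)
    buildings = MSBuildingsReader().load_from_paths(paths)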