giga_spatial-0.6.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. giga_spatial-0.6.0.dist-info/METADATA +141 -0
  2. giga_spatial-0.6.0.dist-info/RECORD +47 -0
  3. giga_spatial-0.6.0.dist-info/WHEEL +5 -0
  4. giga_spatial-0.6.0.dist-info/licenses/LICENSE +661 -0
  5. giga_spatial-0.6.0.dist-info/top_level.txt +1 -0
  6. gigaspatial/__init__.py +1 -0
  7. gigaspatial/config.py +226 -0
  8. gigaspatial/core/__init__.py +0 -0
  9. gigaspatial/core/io/__init__.py +5 -0
  10. gigaspatial/core/io/adls_data_store.py +325 -0
  11. gigaspatial/core/io/data_api.py +113 -0
  12. gigaspatial/core/io/data_store.py +147 -0
  13. gigaspatial/core/io/local_data_store.py +92 -0
  14. gigaspatial/core/io/readers.py +265 -0
  15. gigaspatial/core/io/writers.py +128 -0
  16. gigaspatial/core/schemas/__init__.py +0 -0
  17. gigaspatial/core/schemas/entity.py +244 -0
  18. gigaspatial/generators/__init__.py +2 -0
  19. gigaspatial/generators/poi.py +636 -0
  20. gigaspatial/generators/zonal/__init__.py +3 -0
  21. gigaspatial/generators/zonal/base.py +370 -0
  22. gigaspatial/generators/zonal/geometry.py +439 -0
  23. gigaspatial/generators/zonal/mercator.py +78 -0
  24. gigaspatial/grid/__init__.py +1 -0
  25. gigaspatial/grid/mercator_tiles.py +286 -0
  26. gigaspatial/handlers/__init__.py +40 -0
  27. gigaspatial/handlers/base.py +761 -0
  28. gigaspatial/handlers/boundaries.py +305 -0
  29. gigaspatial/handlers/ghsl.py +772 -0
  30. gigaspatial/handlers/giga.py +145 -0
  31. gigaspatial/handlers/google_open_buildings.py +472 -0
  32. gigaspatial/handlers/hdx.py +241 -0
  33. gigaspatial/handlers/mapbox_image.py +208 -0
  34. gigaspatial/handlers/maxar_image.py +291 -0
  35. gigaspatial/handlers/microsoft_global_buildings.py +548 -0
  36. gigaspatial/handlers/ookla_speedtest.py +199 -0
  37. gigaspatial/handlers/opencellid.py +290 -0
  38. gigaspatial/handlers/osm.py +356 -0
  39. gigaspatial/handlers/overture.py +126 -0
  40. gigaspatial/handlers/rwi.py +157 -0
  41. gigaspatial/handlers/unicef_georepo.py +806 -0
  42. gigaspatial/handlers/worldpop.py +266 -0
  43. gigaspatial/processing/__init__.py +4 -0
  44. gigaspatial/processing/geo.py +1054 -0
  45. gigaspatial/processing/sat_images.py +39 -0
  46. gigaspatial/processing/tif_processor.py +477 -0
  47. gigaspatial/processing/utils.py +49 -0
gigaspatial/handlers/base.py
@@ -0,0 +1,761 @@
+ from abc import ABC, abstractmethod
+ from pathlib import Path
+ from typing import Any, List, Optional, Union, Tuple, Callable, Iterable
+ import pandas as pd
+ import geopandas as gpd
+ from shapely.geometry import Point
+ from shapely.geometry.base import BaseGeometry
+ import multiprocessing
+ import logging
+
+ from gigaspatial.config import config as global_config
+ from gigaspatial.core.io.data_store import DataStore
+ from gigaspatial.core.io.local_data_store import LocalDataStore
+ from gigaspatial.core.io.readers import read_dataset
+ from gigaspatial.processing.tif_processor import TifProcessor
+ from dataclasses import dataclass, field
+
+
+ @dataclass
+ class BaseHandlerConfig(ABC):
+     """
+     Abstract base class for handler configuration objects.
+     Provides standard fields for path, parallelism, data store, and logger.
+     Extend this class for dataset-specific configuration.
+     """
+
+     base_path: Optional[Path] = None
+     n_workers: int = multiprocessing.cpu_count()
+     data_store: DataStore = field(default_factory=LocalDataStore)
+     logger: Optional[logging.Logger] = field(default=None, repr=False)
+
+     def __post_init__(self):
+         if self.logger is None:
+             self.logger = global_config.get_logger(self.__class__.__name__)
+
+     def get_relevant_data_units(
+         self,
+         source: Union[
+             str,  # country
+             List[Union[Tuple[float, float], Point]],  # points
+             BaseGeometry,  # geometry
+             gpd.GeoDataFrame,  # geodataframe
+         ],
+         **kwargs,
+     ):
+         if isinstance(source, str):
+             data_units = self.get_relevant_data_units_by_country(source, **kwargs)
+         elif isinstance(source, (BaseGeometry, gpd.GeoDataFrame)):
+             data_units = self.get_relevant_data_units_by_geometry(source, **kwargs)
+         elif isinstance(source, Iterable):
+             if all(isinstance(p, (Iterable, Point)) for p in source):
+                 data_units = self.get_relevant_data_units_by_points(source, **kwargs)
+             else:
+                 raise ValueError(
+                     "List input to get_relevant_data_units must be all points."
+                 )
+         else:
+             raise NotImplementedError(f"Unsupported source type: {type(source)}")
+
+         return data_units
+
+     @abstractmethod
+     def get_relevant_data_units_by_geometry(
+         self, geometry: Union[BaseGeometry, gpd.GeoDataFrame], **kwargs
+     ) -> Any:
+         """
+         Given a geometry, return a list of relevant data unit identifiers (e.g., tiles, files, resources).
+         """
+         pass
+
+     @abstractmethod
+     def get_relevant_data_units_by_points(
+         self, points: Iterable[Union[Point, tuple]], **kwargs
+     ) -> Any:
+         """
+         Given a list of points, return a list of relevant data unit identifiers.
+         """
+         pass
+
+     def get_relevant_data_units_by_country(self, country: str, **kwargs) -> Any:
+         """
+         Given a country code or name, return a list of relevant data unit identifiers.
+         """
+         from gigaspatial.handlers.boundaries import AdminBoundaries
+
+         country_geometry = (
+             AdminBoundaries.create(country_code=country, **kwargs)
+             .boundaries[0]
+             .geometry
+         )
+         return self.get_relevant_data_units_by_geometry(
+             geometry=country_geometry, **kwargs
+         )
+
+     @abstractmethod
+     def get_data_unit_path(self, unit: Any, **kwargs) -> list:
+         """
+         Given a data unit identifier, return the corresponding file path.
+         """
+         pass
+
+     def get_data_unit_paths(self, units: Iterable[Any], **kwargs) -> list:
+         """
+         Given data unit identifiers, return the corresponding file paths.
+         """
+         if not isinstance(units, Iterable):
+             units = [units]
+
+         if not units:
+             return []
+
+         return [self.get_data_unit_path(unit=unit, **kwargs) for unit in units]
+
+
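For orientation, here is a minimal sketch of what a concrete configuration can look like. The TileGridConfig class, its 1-degree tile scheme, and the file layout are hypothetical illustrations, not part of this package:

import math
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Iterable, List, Union

import geopandas as gpd
from shapely.geometry import Point, box
from shapely.geometry.base import BaseGeometry

from gigaspatial.handlers.base import BaseHandlerConfig


@dataclass
class TileGridConfig(BaseHandlerConfig):
    """Hypothetical config that maps sources onto 1-degree tiles."""

    base_path: Path = Path("data/tiles")  # hypothetical layout

    def get_relevant_data_units_by_geometry(
        self, geometry: Union[BaseGeometry, gpd.GeoDataFrame], **kwargs
    ) -> List[tuple]:
        geom = geometry.unary_union if isinstance(geometry, gpd.GeoDataFrame) else geometry
        minx, miny, maxx, maxy = geom.bounds
        # Every 1-degree tile whose cell intersects the geometry is relevant.
        return [
            (lon, lat)
            for lon in range(math.floor(minx), math.floor(maxx) + 1)
            for lat in range(math.floor(miny), math.floor(maxy) + 1)
            if geom.intersects(box(lon, lat, lon + 1, lat + 1))
        ]

    def get_relevant_data_units_by_points(
        self, points: Iterable[Union[Point, tuple]], **kwargs
    ) -> List[tuple]:
        pts = [p if isinstance(p, Point) else Point(p) for p in points]
        return sorted({(math.floor(p.x), math.floor(p.y)) for p in pts})

    def get_data_unit_path(self, unit: Any, **kwargs) -> Path:
        lon, lat = unit
        return self.base_path / f"tile_{lon}_{lat}.parquet"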
+ class BaseHandlerDownloader(ABC):
+     """
+     Abstract base class for handler downloader classes.
+     Standardizes config, data_store, and logger initialization.
+     Extend this class for dataset-specific downloaders.
+     """
+
+     def __init__(
+         self,
+         config: Optional[BaseHandlerConfig] = None,
+         data_store: Optional[DataStore] = None,
+         logger: Optional[logging.Logger] = None,
+     ):
+         self.config = config
+         if data_store:
+             self.data_store = data_store
+         elif config and hasattr(config, "data_store"):
+             self.data_store = config.data_store
+         else:
+             self.data_store = LocalDataStore()
+
+         self.logger = (
+             logger
+             or (getattr(config, "logger", None) if config else None)
+             or global_config.get_logger(self.__class__.__name__)
+         )
+
+     @abstractmethod
+     def download_data_unit(self, *args, **kwargs):
+         """
+         Abstract method to download a single data unit. Implement in subclasses.
+         """
+         pass
+
+     @abstractmethod
+     def download_data_units(self, *args, **kwargs):
+         """
+         Abstract method to download multiple data units. Implement in subclasses.
+         """
+         pass
+
+     @abstractmethod
+     def download(self, *args, **kwargs):
+         """
+         Abstract method to download data for a given source. Implement in subclasses.
+         """
+         pass
+
+
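A concrete downloader then fills in the three abstract methods, roughly as sketched below. The endpoint, the requests usage, and the data store's write_file method are illustrative assumptions, not this package's actual download logic:

import requests  # illustrative HTTP client choice

from gigaspatial.handlers.base import BaseHandlerDownloader


class TileDownloader(BaseHandlerDownloader):
    """Hypothetical downloader for the TileGridConfig sketched above."""

    BASE_URL = "https://example.com/tiles"  # placeholder endpoint

    def download_data_unit(self, unit, **kwargs):
        lon, lat = unit
        path = self.config.get_data_unit_path(unit)
        resp = requests.get(f"{self.BASE_URL}/tile_{lon}_{lat}.parquet", timeout=60)
        resp.raise_for_status()
        self.data_store.write_file(str(path), resp.content)  # assumed DataStore API
        return path

    def download_data_units(self, units, **kwargs):
        # Sequential for clarity; config.n_workers could drive a pool here.
        return [self.download_data_unit(u, **kwargs) for u in units]

    def download(self, source, **kwargs):
        units = self.config.get_relevant_data_units(source, **kwargs)
        return self.download_data_units(units, **kwargs)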
+ class BaseHandlerReader(ABC):
+     """
+     Abstract base class for handler reader classes.
+     Provides common methods for resolving source paths and loading data.
+     Supports resolving by country, points, geometry, GeoDataFrame, or explicit paths.
+     Includes generic loader functions for raster and tabular data.
+     """
+
+     def __init__(
+         self,
+         config: Optional[BaseHandlerConfig] = None,
+         data_store: Optional[DataStore] = None,
+         logger: Optional[logging.Logger] = None,
+     ):
+         self.config = config
+         if data_store:
+             self.data_store = data_store
+         elif config and hasattr(config, "data_store"):
+             self.data_store = config.data_store
+         else:
+             self.data_store = LocalDataStore()
+
+         self.logger = (
+             logger
+             or (getattr(config, "logger", None) if config else None)
+             or global_config.get_logger(self.__class__.__name__)
+         )
+
+     def resolve_source_paths(
+         self,
+         source: Union[
+             str,  # country code
+             List[Union[Tuple[float, float], Point]],  # points
+             BaseGeometry,
+             gpd.GeoDataFrame,
+             Path,  # path
+             str,  # path
+             List[Union[str, Path]],
+         ],
+         **kwargs,
+     ) -> List[Union[str, Path]]:
+         """
+         Resolve source data paths based on the type of source input.
+
+         Args:
+             source: Can be a country code or name (str), list of points, geometry, GeoDataFrame, or explicit path(s)
+             **kwargs: Additional parameters for path resolution
+
+         Returns:
+             List of resolved source paths
+         """
+         if isinstance(source, (str, Path)):
+             # Could be a country code or a path
+             if self.data_store.file_exists(str(source)) or str(source).endswith(
+                 (".csv", ".tif", ".json", ".parquet", ".gz", ".geojson", ".zip")
+             ):
+                 source_data_paths = self.resolve_by_paths(source)
+             else:
+                 source_data_paths = self.resolve_by_country(source, **kwargs)
+         elif isinstance(source, (BaseGeometry, gpd.GeoDataFrame)):
+             source_data_paths = self.resolve_by_geometry(source, **kwargs)
+         elif isinstance(source, Iterable):
+             # List of points or paths; check paths first, since strings are
+             # themselves Iterable and would otherwise match the points branch.
+             if all(isinstance(p, (str, Path)) for p in source):
+                 source_data_paths = self.resolve_by_paths(source)
+             elif all(isinstance(p, (Iterable, Point)) for p in source):
+                 source_data_paths = self.resolve_by_points(source, **kwargs)
+             else:
+                 raise ValueError(
+                     "List input to resolve_source_paths must be all points or all paths."
+                 )
+         else:
+             raise NotImplementedError(f"Unsupported source type: {type(source)}")
+
+         self.logger.info(f"Resolved {len(source_data_paths)} paths!")
+         return source_data_paths
+
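Given the dispatch above, all of the following calls are valid on a reader wired with a config (illustrative sketch; reader stands for any concrete BaseHandlerReader):

from shapely.geometry import Point, box

# Country code or name: resolved via the config's country lookup.
paths = reader.resolve_source_paths("KEN")

# A list of points (Point objects or (lon, lat) tuples).
paths = reader.resolve_source_paths([Point(36.8, -1.3), (6.5, 3.4)])

# A geometry: every intersecting data unit is resolved.
paths = reader.resolve_source_paths(box(29.0, -12.0, 41.0, 5.0))

# Explicit paths bypass the config entirely.
paths = reader.resolve_source_paths(["data/tiles/tile_36_-2.parquet"])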
+     def resolve_by_country(self, country: str, **kwargs) -> List[Union[str, Path]]:
+         """
+         Resolve source paths for a given country code/name.
+         Uses the config's get_relevant_data_units_by_country method.
+         """
+         if not self.config:
+             raise ValueError("Config is required for resolving by country")
+         data_units = self.config.get_relevant_data_units_by_country(country, **kwargs)
+         return self.config.get_data_unit_paths(data_units, **kwargs)
+
+     def resolve_by_points(
+         self, points: List[Union[Tuple[float, float], Point]], **kwargs
+     ) -> List[Union[str, Path]]:
+         """
+         Resolve source paths for a list of points.
+         Uses the config's get_relevant_data_units_by_points method.
+         """
+         if not self.config:
+             raise ValueError("Config is required for resolving by points")
+         data_units = self.config.get_relevant_data_units_by_points(points, **kwargs)
+         return self.config.get_data_unit_paths(data_units, **kwargs)
+
+     def resolve_by_geometry(
+         self, geometry: Union[BaseGeometry, gpd.GeoDataFrame], **kwargs
+     ) -> List[Union[str, Path]]:
+         """
+         Resolve source paths for a geometry or GeoDataFrame.
+         Uses the config's get_relevant_data_units_by_geometry method.
+         """
+         if not self.config:
+             raise ValueError("Config is required for resolving by geometry")
+         data_units = self.config.get_relevant_data_units_by_geometry(geometry, **kwargs)
+         return self.config.get_data_unit_paths(data_units, **kwargs)
+
+     def resolve_by_paths(
+         self, paths: Union[Path, str, List[Union[str, Path]]], **kwargs
+     ) -> List[Union[str, Path]]:
+         """
+         Return explicit paths as a list.
+         """
+         if isinstance(paths, (str, Path)):
+             return [paths]
+         return list(paths)
+
+     def _pre_load_hook(self, source_data_path, **kwargs) -> Any:
+         """Hook called before loading data."""
+         if isinstance(source_data_path, (Path, str)):
+             source_data_path = [source_data_path]
+
+         if not source_data_path:
+             self.logger.warning("No paths found!")
+             return []
+
+         source_data_paths = [str(file_path) for file_path in source_data_path]
+
+         self.logger.info(
+             f"Pre-loading validation complete for {len(source_data_path)} files"
+         )
+         return source_data_paths
+
+     def _post_load_hook(self, data, **kwargs) -> Any:
+         """Hook called after loading data."""
+         if hasattr(data, "__len__"):
+             if len(data) == 0:
+                 self.logger.warning("No data was loaded from the source files")
+                 return data
+
+             self.logger.info(f"{len(data)} valid data records.")
+
+         self.logger.info("Post-load processing complete.")
+
+         return data
+
+     def _check_file_exists(self, file_paths: List[Union[str, Path]]):
+         """
+         Check that all specified files exist in the data store.
+
+         Args:
+             file_paths (List[Union[str, Path]]): List of file paths to check.
+
+         Raises:
+             RuntimeError: If any file does not exist in the data store.
+         """
+         for file_path in file_paths:
+             if not self.data_store.file_exists(str(file_path)):
+                 raise RuntimeError(
+                     f"Source file does not exist in the data store: {file_path}"
+                 )
+
+     def _load_raster_data(
+         self, raster_paths: List[Union[str, Path]]
+     ) -> List[TifProcessor]:
+         """
+         Load raster data from file paths.
+
+         Args:
+             raster_paths (List[Union[str, Path]]): List of file paths to raster files.
+
+         Returns:
+             List[TifProcessor]: List of TifProcessor objects for accessing the raster data.
+         """
+         return [
+             TifProcessor(data_path, self.data_store, mode="single")
+             for data_path in raster_paths
+         ]
+
+     def _load_tabular_data(
+         self, file_paths: List[Union[str, Path]], read_function: Callable = read_dataset
+     ) -> Union[pd.DataFrame, gpd.GeoDataFrame]:
+         """
+         Load and concatenate tabular data from multiple files.
+
+         Args:
+             file_paths (List[Union[str, Path]]): List of file paths to load data from.
+             read_function (Callable): Function to use for reading individual files.
+                 Defaults to read_dataset. Should accept (data_store, file_path) arguments.
+
+         Returns:
+             Union[pd.DataFrame, gpd.GeoDataFrame]: Concatenated data from all files.
+                 Returns empty DataFrame if no data is loaded.
+         """
+         all_data = []
+         for file_path in file_paths:
+             all_data.append(read_function(self.data_store, file_path))
+         if not all_data:
+             return pd.DataFrame()
+         result = pd.concat(all_data, ignore_index=True)
+         return result
+
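Because _load_tabular_data accepts the reader function as a parameter, a subclass can swap in custom parsing logic. A small sketch, assuming (hypothetically) that the files carry latitude/longitude columns:

from gigaspatial.core.io.readers import read_dataset


def read_selected_columns(data_store, file_path):
    # Delegate to the package's generic reader, then keep only two columns.
    df = read_dataset(data_store, file_path)
    return df[["latitude", "longitude"]]  # assumed column names


# Inside a concrete reader subclass:
# combined = self._load_tabular_data(file_paths, read_function=read_selected_columns)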
+     @abstractmethod
+     def load_from_paths(
+         self, source_data_path: List[Union[str, Path]], **kwargs
+     ) -> Any:
+         """
+         Abstract method to load source data from paths.
+
+         Args:
+             source_data_path: List of source paths
+             **kwargs: Additional parameters for data loading
+
+         Returns:
+             Loaded data (DataFrame, GeoDataFrame, etc.)
+         """
+         pass
+
+     def load(
+         self,
+         source: Union[
+             str,  # country
+             List[Union[Tuple[float, float], Point]],  # points
+             BaseGeometry,
+             gpd.GeoDataFrame,
+             Path,
+             str,
+             List[Union[str, Path]],
+         ],
+         **kwargs,
+     ) -> Any:
+         """
+         Load data from the given source.
+
+         Args:
+             source: The data source (country code/name, points, geometry, paths, etc.).
+             **kwargs: Additional parameters to pass to the loading process.
+
+         Returns:
+             The loaded data. The type depends on the subclass implementation.
+         """
+         source_data_paths = self.resolve_source_paths(source, **kwargs)
+         if not source_data_paths:
+             self.logger.warning(
+                 "No source data paths resolved. There's no matching data to load!"
+             )
+             return None
+         processed_paths = self._pre_load_hook(source_data_paths, **kwargs)
+         if not processed_paths:
+             self.logger.warning("No valid paths to load data from.")
+             return None
+
+         loaded_data = self.load_from_paths(processed_paths, **kwargs)
+         return self._post_load_hook(loaded_data, **kwargs)
+
+
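The only abstract piece a concrete reader must supply is load_from_paths; path resolution, hooks, and logging come from the base class. A minimal sketch for a tabular dataset, reusing the hypothetical tile scheme from the earlier examples:

from pathlib import Path
from typing import Any, List, Union

from gigaspatial.handlers.base import BaseHandlerReader


class TileReader(BaseHandlerReader):
    """Hypothetical reader for the tile sketches above."""

    def load_from_paths(
        self, source_data_path: List[Union[str, Path]], **kwargs
    ) -> Any:
        self._check_file_exists(source_data_path)
        # Reuse the generic concatenating loader from the base class.
        return self._load_tabular_data(source_data_path)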
+ class BaseHandler(ABC):
+     """
+     Abstract base class that orchestrates configuration, downloading, and reading functionality.
+
+     This class serves as the main entry point for dataset handlers, providing a unified
+     interface for data acquisition and loading. It manages the lifecycle of config,
+     downloader, and reader components.
+
+     Subclasses should implement the abstract methods to provide specific handler types
+     and define how components are created and interact.
+     """
+
+     def __init__(
+         self,
+         config: Optional[BaseHandlerConfig] = None,
+         downloader: Optional[BaseHandlerDownloader] = None,
+         reader: Optional[BaseHandlerReader] = None,
+         data_store: Optional[DataStore] = None,
+         logger: Optional[logging.Logger] = None,
+     ):
+         """
+         Initialize the BaseHandler with optional components.
+
+         Args:
+             config: Configuration object. If None, will be created via create_config()
+             downloader: Downloader instance. If None, will be created via create_downloader()
+             reader: Reader instance. If None, will be created via create_reader()
+             data_store: Data store instance. Defaults to LocalDataStore if not provided
+             logger: Logger instance. If not provided, creates one based on class name
+         """
+         # Initialize data store first as it's used by other components
+         self.data_store = data_store or LocalDataStore()
+
+         # Initialize logger
+         self.logger = logger or global_config.get_logger(self.__class__.__name__)
+
+         # Initialize or create config
+         self._config = config
+         if self._config is None:
+             self._config = self.create_config(
+                 data_store=self.data_store, logger=self.logger
+             )
+
+         # Initialize or create downloader
+         self._downloader = downloader
+         if self._downloader is None:
+             self._downloader = self.create_downloader(
+                 config=self._config, data_store=self.data_store, logger=self.logger
+             )
+
+         # Initialize or create reader
+         self._reader = reader
+         if self._reader is None:
+             self._reader = self.create_reader(
+                 config=self._config, data_store=self.data_store, logger=self.logger
+             )
+
+     @property
+     def config(self) -> BaseHandlerConfig:
+         """Get the configuration object."""
+         return self._config
+
+     @property
+     def downloader(self) -> BaseHandlerDownloader:
+         """Get the downloader object."""
+         return self._downloader
+
+     @property
+     def reader(self) -> BaseHandlerReader:
+         """Get the reader object."""
+         return self._reader
+
+     # Abstract factory methods for creating components
+     @abstractmethod
+     def create_config(
+         self, data_store: DataStore, logger: logging.Logger, **kwargs
+     ) -> BaseHandlerConfig:
+         """
+         Create and return a configuration object for this handler.
+
+         Args:
+             data_store: The data store instance to use
+             logger: The logger instance to use
+             **kwargs: Additional configuration parameters
+
+         Returns:
+             Configured BaseHandlerConfig instance
+         """
+         pass
+
+     @abstractmethod
+     def create_downloader(
+         self,
+         config: BaseHandlerConfig,
+         data_store: DataStore,
+         logger: logging.Logger,
+         **kwargs,
+     ) -> BaseHandlerDownloader:
+         """
+         Create and return a downloader object for this handler.
+
+         Args:
+             config: The configuration object
+             data_store: The data store instance to use
+             logger: The logger instance to use
+             **kwargs: Additional downloader parameters
+
+         Returns:
+             Configured BaseHandlerDownloader instance
+         """
+         pass
+
+     @abstractmethod
+     def create_reader(
+         self,
+         config: BaseHandlerConfig,
+         data_store: DataStore,
+         logger: logging.Logger,
+         **kwargs,
+     ) -> BaseHandlerReader:
+         """
+         Create and return a reader object for this handler.
+
+         Args:
+             config: The configuration object
+             data_store: The data store instance to use
+             logger: The logger instance to use
+             **kwargs: Additional reader parameters
+
+         Returns:
+             Configured BaseHandlerReader instance
+         """
+         pass
+
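Putting the three factories together, a dataset handler reduces to a thin wiring layer. A sketch reusing the hypothetical TileGridConfig, TileDownloader, and TileReader from the earlier examples:

import logging

from gigaspatial.core.io.data_store import DataStore
from gigaspatial.handlers.base import BaseHandler


class TileHandler(BaseHandler):
    """Hypothetical handler combining the sketched config, downloader, and reader."""

    def create_config(self, data_store: DataStore, logger: logging.Logger, **kwargs):
        return TileGridConfig(data_store=data_store, logger=logger, **kwargs)

    def create_downloader(
        self, config, data_store: DataStore, logger: logging.Logger, **kwargs
    ):
        return TileDownloader(config=config, data_store=data_store, logger=logger, **kwargs)

    def create_reader(
        self, config, data_store: DataStore, logger: logging.Logger, **kwargs
    ):
        return TileReader(config=config, data_store=data_store, logger=logger, **kwargs)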
+     # High-level interface methods
+     def ensure_data_available(
+         self,
+         source: Union[
+             str,  # country
+             List[Union[tuple, Point]],  # points
+             BaseGeometry,  # geometry
+             gpd.GeoDataFrame,  # geodataframe
+             Path,  # path
+             List[Union[str, Path]],  # list of paths
+         ],
+         force_download: bool = False,
+         **kwargs,
+     ) -> bool:
+         """
+         Ensure that data is available for the given source.
+
+         This method checks if the required data exists locally, and if not (or if
+         force_download is True), downloads it using the downloader.
+
+         Args:
+             source: The data source specification
+             force_download: If True, download even if data exists locally
+             **kwargs: Additional parameters passed to download methods
+
+         Returns:
+             bool: True if data is available after this operation
+         """
+         try:
+             # Resolve what data units are needed
+             if hasattr(self.config, "get_relevant_data_units"):
+                 data_units = self.config.get_relevant_data_units(source, **kwargs)
+                 data_paths = self.config.get_data_unit_paths(data_units, **kwargs)
+             else:
+                 # Fallback: try to resolve paths directly
+                 if hasattr(self.reader, "resolve_source_paths"):
+                     data_paths = self.reader.resolve_source_paths(source, **kwargs)
+                 else:
+                     self.logger.warning("Cannot determine required data paths")
+                     return False
+
+             # Check if data exists (unless force download)
+             if not force_download:
+                 missing_paths = [
+                     path
+                     for path in data_paths
+                     if not self.data_store.file_exists(str(path))
+                 ]
+                 if not missing_paths:
+                     self.logger.info("All required data is already available")
+                     return True
+
+             # Download missing or all data
+             if hasattr(self.config, "get_relevant_data_units"):
+                 data_units = self.config.get_relevant_data_units(source, **kwargs)
+                 self.downloader.download_data_units(data_units, **kwargs)
+             else:
+                 self.downloader.download(source, **kwargs)
+
+             return True
+
+         except Exception as e:
+             self.logger.error(f"Failed to ensure data availability: {e}")
+             return False
+
+     def load_data(
+         self,
+         source: Union[
+             str,  # country
+             List[Union[tuple, Point]],  # points
+             BaseGeometry,  # geometry
+             gpd.GeoDataFrame,  # geodataframe
+             Path,  # path
+             List[Union[str, Path]],  # list of paths
+         ],
+         ensure_available: bool = True,
+         **kwargs,
+     ) -> Any:
+         """
+         Load data from the given source.
+
+         Args:
+             source: The data source specification
+             ensure_available: If True, ensure data is downloaded before loading
+             **kwargs: Additional parameters passed to load methods
+
+         Returns:
+             Loaded data (type depends on specific handler implementation)
+         """
+         if ensure_available:
+             if not self.ensure_data_available(source, **kwargs):
+                 raise RuntimeError("Could not ensure data availability for loading")
+
+         return self.reader.load(source, **kwargs)
+
+     def download_and_load(
+         self,
+         source: Union[
+             str,  # country
+             List[Union[tuple, Point]],  # points
+             BaseGeometry,  # geometry
+             gpd.GeoDataFrame,  # geodataframe
+             Path,  # path
+             List[Union[str, Path]],  # list of paths
+         ],
+         force_download: bool = False,
+         **kwargs,
+     ) -> Any:
+         """
+         Convenience method to download (if needed) and load data in one call.
+
+         Args:
+             source: The data source specification
+             force_download: If True, download even if data exists locally
+             **kwargs: Additional parameters
+
+         Returns:
+             Loaded data
+         """
+         self.ensure_data_available(source, force_download=force_download, **kwargs)
+         return self.reader.load(source, **kwargs)
+
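With such a handler in place, the high-level flow collapses to a single call (illustrative, using the hypothetical TileHandler sketched above):

handler = TileHandler()

# Downloads any missing tiles for Kenya, then loads and concatenates them.
df = handler.download_and_load("KEN")

# Or skip the availability check when the data is known to be local.
df = handler.load_data("KEN", ensure_available=False)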
+     def get_available_data_info(
+         self,
+         source: Union[
+             str,  # country
+             List[Union[tuple, Point]],  # points
+             BaseGeometry,  # geometry
+             gpd.GeoDataFrame,  # geodataframe
+         ],
+         **kwargs,
+     ) -> dict:
+         """
+         Get information about available data for the given source.
+
+         Args:
+             source: The data source specification
+             **kwargs: Additional parameters
+
+         Returns:
+             dict: Information about data availability, paths, etc.
+         """
+         try:
+             if hasattr(self.config, "get_relevant_data_units"):
+                 data_units = self.config.get_relevant_data_units(source, **kwargs)
+                 data_paths = self.config.get_data_unit_paths(data_units, **kwargs)
+             else:
+                 data_paths = self.reader.resolve_source_paths(source, **kwargs)
+
+             existing_paths = [
+                 path for path in data_paths if self.data_store.file_exists(str(path))
+             ]
+             missing_paths = [
+                 path
+                 for path in data_paths
+                 if not self.data_store.file_exists(str(path))
+             ]
+
+             return {
+                 "total_data_units": len(data_paths),
+                 "available_data_units": len(existing_paths),
+                 "missing_data_units": len(missing_paths),
+                 "available_paths": existing_paths,
+                 "missing_paths": missing_paths,
+                 "all_available": len(missing_paths) == 0,
+             }
+
+         except Exception as e:
+             self.logger.error(f"Failed to get data info: {e}")
+             return {
+                 "error": str(e),
+                 "total_data_units": 0,
+                 "available_data_units": 0,
+                 "missing_data_units": 0,
+                 "available_paths": [],
+                 "missing_paths": [],
+                 "all_available": False,
+             }
+
+     def cleanup(self):
+         """
+         Cleanup resources used by the handler.
+
+         Override in subclasses if specific cleanup is needed.
+         """
+         self.logger.info(f"Cleaning up {self.__class__.__name__}")
+         # Subclasses can override to add specific cleanup logic
+
+     def __enter__(self):
+         """Context manager entry."""
+         return self
+
+     def __exit__(self, exc_type, exc_val, exc_tb):
+         """Context manager exit."""
+         self.cleanup()
+
+     def __repr__(self) -> str:
+         """String representation of the handler."""
+         return (
+             f"{self.__class__.__name__}("
+             f"config={self.config.__class__.__name__}, "
+             f"downloader={self.downloader.__class__.__name__}, "
+             f"reader={self.reader.__class__.__name__})"
+         )
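Since BaseHandler implements __enter__ and __exit__, cleanup can be tied to a with block (illustrative, again using the hypothetical TileHandler):

with TileHandler() as handler:
    info = handler.get_available_data_info("KEN")
    if not info["all_available"]:
        handler.ensure_data_available("KEN")
    data = handler.load_data("KEN", ensure_available=False)
# cleanup() runs automatically on exit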