giga_spatial-0.6.0-py3-none-any.whl
This diff shows the content of publicly available package versions as released to one of the supported registries, and is provided for informational purposes only.
- giga_spatial-0.6.0.dist-info/METADATA +141 -0
- giga_spatial-0.6.0.dist-info/RECORD +47 -0
- giga_spatial-0.6.0.dist-info/WHEEL +5 -0
- giga_spatial-0.6.0.dist-info/licenses/LICENSE +661 -0
- giga_spatial-0.6.0.dist-info/top_level.txt +1 -0
- gigaspatial/__init__.py +1 -0
- gigaspatial/config.py +226 -0
- gigaspatial/core/__init__.py +0 -0
- gigaspatial/core/io/__init__.py +5 -0
- gigaspatial/core/io/adls_data_store.py +325 -0
- gigaspatial/core/io/data_api.py +113 -0
- gigaspatial/core/io/data_store.py +147 -0
- gigaspatial/core/io/local_data_store.py +92 -0
- gigaspatial/core/io/readers.py +265 -0
- gigaspatial/core/io/writers.py +128 -0
- gigaspatial/core/schemas/__init__.py +0 -0
- gigaspatial/core/schemas/entity.py +244 -0
- gigaspatial/generators/__init__.py +2 -0
- gigaspatial/generators/poi.py +636 -0
- gigaspatial/generators/zonal/__init__.py +3 -0
- gigaspatial/generators/zonal/base.py +370 -0
- gigaspatial/generators/zonal/geometry.py +439 -0
- gigaspatial/generators/zonal/mercator.py +78 -0
- gigaspatial/grid/__init__.py +1 -0
- gigaspatial/grid/mercator_tiles.py +286 -0
- gigaspatial/handlers/__init__.py +40 -0
- gigaspatial/handlers/base.py +761 -0
- gigaspatial/handlers/boundaries.py +305 -0
- gigaspatial/handlers/ghsl.py +772 -0
- gigaspatial/handlers/giga.py +145 -0
- gigaspatial/handlers/google_open_buildings.py +472 -0
- gigaspatial/handlers/hdx.py +241 -0
- gigaspatial/handlers/mapbox_image.py +208 -0
- gigaspatial/handlers/maxar_image.py +291 -0
- gigaspatial/handlers/microsoft_global_buildings.py +548 -0
- gigaspatial/handlers/ookla_speedtest.py +199 -0
- gigaspatial/handlers/opencellid.py +290 -0
- gigaspatial/handlers/osm.py +356 -0
- gigaspatial/handlers/overture.py +126 -0
- gigaspatial/handlers/rwi.py +157 -0
- gigaspatial/handlers/unicef_georepo.py +806 -0
- gigaspatial/handlers/worldpop.py +266 -0
- gigaspatial/processing/__init__.py +4 -0
- gigaspatial/processing/geo.py +1054 -0
- gigaspatial/processing/sat_images.py +39 -0
- gigaspatial/processing/tif_processor.py +477 -0
- gigaspatial/processing/utils.py +49 -0
gigaspatial/handlers/base.py
@@ -0,0 +1,761 @@
from abc import ABC, abstractmethod
from pathlib import Path
from typing import Any, List, Optional, Union, Tuple, Callable, Iterable
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
from shapely.geometry.base import BaseGeometry
import multiprocessing
import logging

from gigaspatial.config import config as global_config
from gigaspatial.core.io.data_store import DataStore
from gigaspatial.core.io.local_data_store import LocalDataStore
from gigaspatial.core.io.readers import read_dataset
from gigaspatial.processing.tif_processor import TifProcessor
from dataclasses import dataclass, field

@dataclass
class BaseHandlerConfig(ABC):
    """
    Abstract base class for handler configuration objects.
    Provides standard fields for path, parallelism, data store, and logger.
    Extend this class for dataset-specific configuration.
    """

    base_path: Optional[Path] = None
    n_workers: int = multiprocessing.cpu_count()
    data_store: DataStore = field(default_factory=LocalDataStore)
    logger: Optional[logging.Logger] = field(default=None, repr=False)

    def __post_init__(self):
        if self.logger is None:
            self.logger = global_config.get_logger(self.__class__.__name__)

    def get_relevant_data_units(
        self,
        source: Union[
            str,  # country
            List[Union[Tuple[float, float], Point]],  # points
            BaseGeometry,  # geometry
            gpd.GeoDataFrame,  # geodataframe
        ],
        **kwargs,
    ):
        if isinstance(source, str):
            data_units = self.get_relevant_data_units_by_country(source, **kwargs)
        elif isinstance(source, (BaseGeometry, gpd.GeoDataFrame)):
            data_units = self.get_relevant_data_units_by_geometry(source, **kwargs)
        elif isinstance(source, Iterable):
            if all(isinstance(p, (Iterable, Point)) for p in source):
                data_units = self.get_relevant_data_units_by_points(source, **kwargs)
            else:
                raise ValueError(
                    "List input to get_relevant_data_units must be all points."
                )
        else:
            raise NotImplementedError(f"Unsupported source type: {type(source)}")

        return data_units

    @abstractmethod
    def get_relevant_data_units_by_geometry(
        self, geometry: Union[BaseGeometry, gpd.GeoDataFrame], **kwargs
    ) -> Any:
        """
        Given a geometry, return a list of relevant data unit identifiers (e.g., tiles, files, resources).
        """
        pass

    @abstractmethod
    def get_relevant_data_units_by_points(
        self, points: Iterable[Union[Point, tuple]], **kwargs
    ) -> Any:
        """
        Given a list of points, return a list of relevant data unit identifiers.
        """
        pass

    def get_relevant_data_units_by_country(self, country: str, **kwargs) -> Any:
        """
        Given a country code or name, return a list of relevant data unit identifiers.
        """
        from gigaspatial.handlers.boundaries import AdminBoundaries

        country_geometry = (
            AdminBoundaries.create(country_code=country, **kwargs)
            .boundaries[0]
            .geometry
        )
        return self.get_relevant_data_units_by_geometry(
            geometry=country_geometry, **kwargs
        )

    @abstractmethod
    def get_data_unit_path(self, unit: Any, **kwargs) -> Union[str, Path]:
        """
        Given a data unit identifier, return the corresponding file path.
        """
        pass

    def get_data_unit_paths(self, units: Union[Any, Iterable[Any]], **kwargs) -> list:
        """
        Given data unit identifiers, return the corresponding file paths.
        """
        if not isinstance(units, Iterable):
            units = [units]

        if not units:
            return []

        return [self.get_data_unit_path(unit=unit, **kwargs) for unit in units]


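# A minimal, hypothetical sketch of a concrete config (illustrative only, not
# part of the released file): data units here are 1-degree grid cells, each
# stored as one parquet file under `base_path` (assumed to be set).
class GridCellConfig(BaseHandlerConfig):
    def get_relevant_data_units_by_geometry(
        self, geometry: Union[BaseGeometry, gpd.GeoDataFrame], **kwargs
    ) -> List[Tuple[int, int]]:
        minx, miny, maxx, maxy = (
            geometry.total_bounds
            if isinstance(geometry, gpd.GeoDataFrame)
            else geometry.bounds
        )
        # Every 1-degree cell whose lower-left corner lies within the bounds.
        return [
            (x, y)
            for x in range(int(minx), int(maxx) + 1)
            for y in range(int(miny), int(maxy) + 1)
        ]

    def get_relevant_data_units_by_points(
        self, points: Iterable[Union[Point, tuple]], **kwargs
    ) -> List[Tuple[int, int]]:
        # Deduplicate cells; Point() accepts both tuples and existing Points.
        return sorted({(int(p.x), int(p.y)) for p in (Point(pt) for pt in points)})

    def get_data_unit_path(self, unit: Any, **kwargs) -> Path:
        return self.base_path / f"cell_{unit[0]}_{unit[1]}.parquet"

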
class BaseHandlerDownloader(ABC):
    """
    Abstract base class for handler downloader classes.
    Standardizes config, data_store, and logger initialization.
    Extend this class for dataset-specific downloaders.
    """

    def __init__(
        self,
        config: Optional[BaseHandlerConfig] = None,
        data_store: Optional[DataStore] = None,
        logger: Optional[logging.Logger] = None,
    ):
        self.config = config
        if data_store:
            self.data_store = data_store
        elif config and hasattr(config, "data_store"):
            self.data_store = config.data_store
        else:
            self.data_store = LocalDataStore()

        self.logger = (
            logger
            or (getattr(config, "logger", None) if config else None)
            or global_config.get_logger(self.__class__.__name__)
        )

    @abstractmethod
    def download_data_unit(self, *args, **kwargs):
        """
        Download a single data unit. Implement in subclasses.
        """
        pass

    @abstractmethod
    def download_data_units(self, *args, **kwargs):
        """
        Download multiple data units. Implement in subclasses.
        """
        pass

    @abstractmethod
    def download(self, *args, **kwargs):
        """
        Download all data for a source specification. Implement in subclasses.
        """
        pass


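# A minimal, hypothetical downloader sketch (illustrative only): the URL
# template is invented, and `write_file` stands in for whatever write API
# the configured DataStore actually exposes.
class GridCellDownloader(BaseHandlerDownloader):
    UNIT_URL = "https://example.com/cells/cell_{x}_{y}.parquet"  # hypothetical

    def download_data_unit(self, unit, **kwargs):
        import requests

        response = requests.get(self.UNIT_URL.format(x=unit[0], y=unit[1]), timeout=60)
        response.raise_for_status()
        path = self.config.get_data_unit_path(unit)
        self.data_store.write_file(str(path), response.content)  # assumed API
        return path

    def download_data_units(self, units, **kwargs):
        # A real implementation could fan out over config.n_workers here.
        return [self.download_data_unit(unit, **kwargs) for unit in units]

    def download(self, source, **kwargs):
        units = self.config.get_relevant_data_units(source, **kwargs)
        return self.download_data_units(units, **kwargs)

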
class BaseHandlerReader(ABC):
    """
    Abstract base class for handler reader classes.
    Provides common methods for resolving source paths and loading data.
    Supports resolving by country, points, geometry, GeoDataFrame, or explicit paths.
    Includes generic loader functions for raster and tabular data.
    """

    def __init__(
        self,
        config: Optional[BaseHandlerConfig] = None,
        data_store: Optional[DataStore] = None,
        logger: Optional[logging.Logger] = None,
    ):
        self.config = config
        if data_store:
            self.data_store = data_store
        elif config and hasattr(config, "data_store"):
            self.data_store = config.data_store
        else:
            self.data_store = LocalDataStore()

        self.logger = (
            logger
            or (getattr(config, "logger", None) if config else None)
            or global_config.get_logger(self.__class__.__name__)
        )

    def resolve_source_paths(
        self,
        source: Union[
            str,  # country code/name or path
            List[Union[Tuple[float, float], Point]],  # points
            BaseGeometry,
            gpd.GeoDataFrame,
            Path,  # path
            List[Union[str, Path]],
        ],
        **kwargs,
    ) -> List[Union[str, Path]]:
        """
        Resolve source data paths based on the type of source input.

        Args:
            source: Can be a country code or name (str), list of points, geometry, GeoDataFrame, or explicit path(s)
            **kwargs: Additional parameters for path resolution

        Returns:
            List of resolved source paths
        """
        if isinstance(source, (str, Path)):
            # Could be a country code or a path
            if self.data_store.file_exists(str(source)) or str(source).endswith(
                (".csv", ".tif", ".json", ".parquet", ".gz", ".geojson", ".zip")
            ):
                source_data_paths = self.resolve_by_paths(source)
            else:
                source_data_paths = self.resolve_by_country(source, **kwargs)
        elif isinstance(source, (BaseGeometry, gpd.GeoDataFrame)):
            source_data_paths = self.resolve_by_geometry(source, **kwargs)
        elif isinstance(source, Iterable):
            # List of points or paths. Check paths first: str and Path are
            # themselves iterable, so the points check would otherwise
            # swallow a list of path strings.
            if all(isinstance(p, (str, Path)) for p in source):
                source_data_paths = self.resolve_by_paths(source)
            elif all(isinstance(p, (Iterable, Point)) for p in source):
                source_data_paths = self.resolve_by_points(source, **kwargs)
            else:
                raise ValueError(
                    "List input to resolve_source_paths must be all points or all paths."
                )
        else:
            raise NotImplementedError(f"Unsupported source type: {type(source)}")

        self.logger.info(f"Resolved {len(source_data_paths)} paths!")
        return source_data_paths

    def resolve_by_country(self, country: str, **kwargs) -> List[Union[str, Path]]:
        """
        Resolve source paths for a given country code/name.
        Uses the config's get_relevant_data_units_by_country method.
        """
        if not self.config:
            raise ValueError("Config is required for resolving by country")
        data_units = self.config.get_relevant_data_units_by_country(country, **kwargs)
        return self.config.get_data_unit_paths(data_units, **kwargs)

    def resolve_by_points(
        self, points: List[Union[Tuple[float, float], Point]], **kwargs
    ) -> List[Union[str, Path]]:
        """
        Resolve source paths for a list of points.
        Uses the config's get_relevant_data_units_by_points method.
        """
        if not self.config:
            raise ValueError("Config is required for resolving by points")
        data_units = self.config.get_relevant_data_units_by_points(points, **kwargs)
        return self.config.get_data_unit_paths(data_units, **kwargs)

    def resolve_by_geometry(
        self, geometry: Union[BaseGeometry, gpd.GeoDataFrame], **kwargs
    ) -> List[Union[str, Path]]:
        """
        Resolve source paths for a geometry or GeoDataFrame.
        Uses the config's get_relevant_data_units_by_geometry method.
        """
        if not self.config:
            raise ValueError("Config is required for resolving by geometry")
        data_units = self.config.get_relevant_data_units_by_geometry(geometry, **kwargs)
        return self.config.get_data_unit_paths(data_units, **kwargs)

    def resolve_by_paths(
        self, paths: Union[Path, str, List[Union[str, Path]]], **kwargs
    ) -> List[Union[str, Path]]:
        """
        Return explicit paths as a list.
        """
        if isinstance(paths, (str, Path)):
            return [paths]
        return list(paths)

    def _pre_load_hook(self, source_data_path, **kwargs) -> Any:
        """Hook called before loading data."""
        if isinstance(source_data_path, (Path, str)):
            source_data_path = [source_data_path]

        if not source_data_path:
            self.logger.warning("No paths found!")
            return []

        source_data_paths = [str(file_path) for file_path in source_data_path]

        self.logger.info(
            f"Pre-loading validation complete for {len(source_data_paths)} files"
        )
        return source_data_paths

    def _post_load_hook(self, data, **kwargs) -> Any:
        """Hook called after loading data."""
        # Only sized containers can be counted; generators and scalars are
        # passed through untouched.
        if hasattr(data, "__len__"):
            if len(data) == 0:
                self.logger.warning("No data was loaded from the source files")
                return data
            self.logger.info(f"{len(data)} valid data records.")

        self.logger.info("Post-load processing complete.")

        return data

    def _check_file_exists(self, file_paths: List[Union[str, Path]]):
        """
        Check that all specified files exist in the data store.

        Args:
            file_paths (List[Union[str, Path]]): List of file paths to check.

        Raises:
            RuntimeError: If any file does not exist in the data store.
        """
        for file_path in file_paths:
            if not self.data_store.file_exists(str(file_path)):
                raise RuntimeError(
                    f"Source file does not exist in the data store: {file_path}"
                )

    def _load_raster_data(
        self, raster_paths: List[Union[str, Path]]
    ) -> List[TifProcessor]:
        """
        Load raster data from file paths.

        Args:
            raster_paths (List[Union[str, Path]]): List of file paths to raster files.

        Returns:
            List[TifProcessor]: List of TifProcessor objects for accessing the raster data.
        """
        return [
            TifProcessor(data_path, self.data_store, mode="single")
            for data_path in raster_paths
        ]

    def _load_tabular_data(
        self, file_paths: List[Union[str, Path]], read_function: Callable = read_dataset
    ) -> Union[pd.DataFrame, gpd.GeoDataFrame]:
        """
        Load and concatenate tabular data from multiple files.

        Args:
            file_paths (List[Union[str, Path]]): List of file paths to load data from.
            read_function (Callable): Function to use for reading individual files.
                Defaults to read_dataset. Should accept (data_store, file_path) arguments.

        Returns:
            Union[pd.DataFrame, gpd.GeoDataFrame]: Concatenated data from all files.
                Returns an empty DataFrame if no data is loaded.
        """
        all_data = []
        for file_path in file_paths:
            all_data.append(read_function(self.data_store, file_path))
        if not all_data:
            return pd.DataFrame()
        result = pd.concat(all_data, ignore_index=True)
        return result

    @abstractmethod
    def load_from_paths(
        self, source_data_path: List[Union[str, Path]], **kwargs
    ) -> Any:
        """
        Abstract method to load source data from paths.

        Args:
            source_data_path: List of source paths
            **kwargs: Additional parameters for data loading

        Returns:
            Loaded data (DataFrame, GeoDataFrame, etc.)
        """
        pass

    def load(
        self,
        source: Union[
            str,  # country or path
            List[Union[Tuple[float, float], Point]],  # points
            BaseGeometry,
            gpd.GeoDataFrame,
            Path,
            List[Union[str, Path]],
        ],
        **kwargs,
    ) -> Any:
        """
        Load data from the given source.

        Args:
            source: The data source (country code/name, points, geometry, paths, etc.).
            **kwargs: Additional parameters to pass to the loading process.

        Returns:
            The loaded data. The type depends on the subclass implementation.
        """
        source_data_paths = self.resolve_source_paths(source, **kwargs)
        if not source_data_paths:
            self.logger.warning(
                "No source data paths resolved. There's no matching data to load!"
            )
            return None
        processed_paths = self._pre_load_hook(source_data_paths, **kwargs)
        if not processed_paths:
            self.logger.warning("No valid paths to load data from.")
            return None

        loaded_data = self.load_from_paths(processed_paths, **kwargs)
        return self._post_load_hook(loaded_data, **kwargs)


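# A minimal, hypothetical reader sketch (illustrative only): it reuses the
# generic tabular loader above, so load_from_paths is the only override.
class GridCellReader(BaseHandlerReader):
    def load_from_paths(
        self, source_data_path: List[Union[str, Path]], **kwargs
    ) -> Union[pd.DataFrame, gpd.GeoDataFrame]:
        self._check_file_exists(source_data_path)
        return self._load_tabular_data(source_data_path)

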
class BaseHandler(ABC):
    """
    Abstract base class that orchestrates configuration, downloading, and reading functionality.

    This class serves as the main entry point for dataset handlers, providing a unified
    interface for data acquisition and loading. It manages the lifecycle of config,
    downloader, and reader components.

    Subclasses should implement the abstract methods to provide specific handler types
    and define how components are created and interact.
    """

    def __init__(
        self,
        config: Optional[BaseHandlerConfig] = None,
        downloader: Optional[BaseHandlerDownloader] = None,
        reader: Optional[BaseHandlerReader] = None,
        data_store: Optional[DataStore] = None,
        logger: Optional[logging.Logger] = None,
    ):
        """
        Initialize the BaseHandler with optional components.

        Args:
            config: Configuration object. If None, will be created via create_config()
            downloader: Downloader instance. If None, will be created via create_downloader()
            reader: Reader instance. If None, will be created via create_reader()
            data_store: Data store instance. Defaults to LocalDataStore if not provided
            logger: Logger instance. If not provided, creates one based on class name
        """
        # Initialize data store first as it's used by other components
        self.data_store = data_store or LocalDataStore()

        # Initialize logger
        self.logger = logger or global_config.get_logger(self.__class__.__name__)

        # Initialize or create config
        self._config = config
        if self._config is None:
            self._config = self.create_config(
                data_store=self.data_store, logger=self.logger
            )

        # Initialize or create downloader
        self._downloader = downloader
        if self._downloader is None:
            self._downloader = self.create_downloader(
                config=self._config, data_store=self.data_store, logger=self.logger
            )

        # Initialize or create reader
        self._reader = reader
        if self._reader is None:
            self._reader = self.create_reader(
                config=self._config, data_store=self.data_store, logger=self.logger
            )

    @property
    def config(self) -> BaseHandlerConfig:
        """Get the configuration object."""
        return self._config

    @property
    def downloader(self) -> BaseHandlerDownloader:
        """Get the downloader object."""
        return self._downloader

    @property
    def reader(self) -> BaseHandlerReader:
        """Get the reader object."""
        return self._reader

    # Abstract factory methods for creating components
    @abstractmethod
    def create_config(
        self, data_store: DataStore, logger: logging.Logger, **kwargs
    ) -> BaseHandlerConfig:
        """
        Create and return a configuration object for this handler.

        Args:
            data_store: The data store instance to use
            logger: The logger instance to use
            **kwargs: Additional configuration parameters

        Returns:
            Configured BaseHandlerConfig instance
        """
        pass

    @abstractmethod
    def create_downloader(
        self,
        config: BaseHandlerConfig,
        data_store: DataStore,
        logger: logging.Logger,
        **kwargs,
    ) -> BaseHandlerDownloader:
        """
        Create and return a downloader object for this handler.

        Args:
            config: The configuration object
            data_store: The data store instance to use
            logger: The logger instance to use
            **kwargs: Additional downloader parameters

        Returns:
            Configured BaseHandlerDownloader instance
        """
        pass

    @abstractmethod
    def create_reader(
        self,
        config: BaseHandlerConfig,
        data_store: DataStore,
        logger: logging.Logger,
        **kwargs,
    ) -> BaseHandlerReader:
        """
        Create and return a reader object for this handler.

        Args:
            config: The configuration object
            data_store: The data store instance to use
            logger: The logger instance to use
            **kwargs: Additional reader parameters

        Returns:
            Configured BaseHandlerReader instance
        """
        pass

    # High-level interface methods
    def ensure_data_available(
        self,
        source: Union[
            str,  # country
            List[Union[tuple, Point]],  # points
            BaseGeometry,  # geometry
            gpd.GeoDataFrame,  # geodataframe
            Path,  # path
            List[Union[str, Path]],  # list of paths
        ],
        force_download: bool = False,
        **kwargs,
    ) -> bool:
        """
        Ensure that data is available for the given source.

        This method checks if the required data exists locally, and if not (or if
        force_download is True), downloads it using the downloader.

        Args:
            source: The data source specification
            force_download: If True, download even if data exists locally
            **kwargs: Additional parameters passed to download methods

        Returns:
            bool: True if data is available after this operation
        """
        try:
            # Resolve what data units are needed
            data_units = None
            if hasattr(self.config, "get_relevant_data_units"):
                data_units = self.config.get_relevant_data_units(source, **kwargs)
                data_paths = self.config.get_data_unit_paths(data_units, **kwargs)
            else:
                # Fallback: try to resolve paths directly
                if hasattr(self.reader, "resolve_source_paths"):
                    data_paths = self.reader.resolve_source_paths(source, **kwargs)
                else:
                    self.logger.warning("Cannot determine required data paths")
                    return False

            # Check if data exists (unless force download)
            if not force_download:
                missing_paths = [
                    path
                    for path in data_paths
                    if not self.data_store.file_exists(str(path))
                ]
                if not missing_paths:
                    self.logger.info("All required data is already available")
                    return True

            # Download the data, reusing the units resolved above
            if data_units is not None:
                self.downloader.download_data_units(data_units, **kwargs)
            else:
                self.downloader.download(source, **kwargs)

            return True

        except Exception as e:
            self.logger.error(f"Failed to ensure data availability: {e}")
            return False

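    # Usage sketch (hypothetical subclass and country code):
    #     handler = GridCellHandler()
    #     if handler.ensure_data_available("KEN"):
    #         df = handler.load_data("KEN", ensure_available=False)
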
    def load_data(
        self,
        source: Union[
            str,  # country
            List[Union[tuple, Point]],  # points
            BaseGeometry,  # geometry
            gpd.GeoDataFrame,  # geodataframe
            Path,  # path
            List[Union[str, Path]],  # list of paths
        ],
        ensure_available: bool = True,
        **kwargs,
    ) -> Any:
        """
        Load data from the given source.

        Args:
            source: The data source specification
            ensure_available: If True, ensure data is downloaded before loading
            **kwargs: Additional parameters passed to load methods

        Returns:
            Loaded data (type depends on specific handler implementation)
        """
        if ensure_available:
            if not self.ensure_data_available(source, **kwargs):
                raise RuntimeError("Could not ensure data availability for loading")

        return self.reader.load(source, **kwargs)

    def download_and_load(
        self,
        source: Union[
            str,  # country
            List[Union[tuple, Point]],  # points
            BaseGeometry,  # geometry
            gpd.GeoDataFrame,  # geodataframe
            Path,  # path
            List[Union[str, Path]],  # list of paths
        ],
        force_download: bool = False,
        **kwargs,
    ) -> Any:
        """
        Convenience method to download (if needed) and load data in one call.

        Args:
            source: The data source specification
            force_download: If True, download even if data exists locally
            **kwargs: Additional parameters

        Returns:
            Loaded data
        """
        self.ensure_data_available(source, force_download=force_download, **kwargs)
        return self.reader.load(source, **kwargs)

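    # Usage sketch (hypothetical): collapse the check-then-load pattern into
    # one call, forcing a fresh download of every data unit.
    #     gdf = handler.download_and_load(my_geometry, force_download=True)
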
    def get_available_data_info(
        self,
        source: Union[
            str,  # country
            List[Union[tuple, Point]],  # points
            BaseGeometry,  # geometry
            gpd.GeoDataFrame,  # geodataframe
        ],
        **kwargs,
    ) -> dict:
        """
        Get information about available data for the given source.

        Args:
            source: The data source specification
            **kwargs: Additional parameters

        Returns:
            dict: Information about data availability, paths, etc.
        """
        try:
            if hasattr(self.config, "get_relevant_data_units"):
                data_units = self.config.get_relevant_data_units(source, **kwargs)
                data_paths = self.config.get_data_unit_paths(data_units, **kwargs)
            else:
                data_paths = self.reader.resolve_source_paths(source, **kwargs)

            existing_paths = [
                path for path in data_paths if self.data_store.file_exists(str(path))
            ]
            missing_paths = [
                path
                for path in data_paths
                if not self.data_store.file_exists(str(path))
            ]

            return {
                "total_data_units": len(data_paths),
                "available_data_units": len(existing_paths),
                "missing_data_units": len(missing_paths),
                "available_paths": existing_paths,
                "missing_paths": missing_paths,
                "all_available": len(missing_paths) == 0,
            }

        except Exception as e:
            self.logger.error(f"Failed to get data info: {e}")
            return {
                "error": str(e),
                "total_data_units": 0,
                "available_data_units": 0,
                "missing_data_units": 0,
                "available_paths": [],
                "missing_paths": [],
                "all_available": False,
            }

    def cleanup(self):
        """
        Cleanup resources used by the handler.

        Override in subclasses if specific cleanup is needed.
        """
        self.logger.info(f"Cleaning up {self.__class__.__name__}")
        # Subclasses can override to add specific cleanup logic

    def __enter__(self):
        """Context manager entry."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit."""
        self.cleanup()

    def __repr__(self) -> str:
        """String representation of the handler."""
        return (
            f"{self.__class__.__name__}("
            f"config={self.config.__class__.__name__}, "
            f"downloader={self.downloader.__class__.__name__}, "
            f"reader={self.reader.__class__.__name__})"
        )
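

# A minimal end-to-end sketch (illustrative only) wiring the hypothetical
# components sketched above into a concrete handler:
class GridCellHandler(BaseHandler):
    def create_config(
        self, data_store: DataStore, logger: logging.Logger, **kwargs
    ) -> BaseHandlerConfig:
        return GridCellConfig(
            base_path=Path("data/grid_cells"),  # assumed layout
            data_store=data_store,
            logger=logger,
        )

    def create_downloader(
        self, config, data_store, logger, **kwargs
    ) -> BaseHandlerDownloader:
        return GridCellDownloader(config=config, data_store=data_store, logger=logger)

    def create_reader(
        self, config, data_store, logger, **kwargs
    ) -> BaseHandlerReader:
        return GridCellReader(config=config, data_store=data_store, logger=logger)


# Used as a context manager, cleanup() runs automatically on exit:
#     with GridCellHandler() as handler:
#         df = handler.load_data("KEN")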