giga-spatial 0.6.2__py3-none-any.whl → 0.6.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {giga_spatial-0.6.2.dist-info → giga_spatial-0.6.4.dist-info}/METADATA +18 -8
- {giga_spatial-0.6.2.dist-info → giga_spatial-0.6.4.dist-info}/RECORD +15 -15
- gigaspatial/__init__.py +1 -1
- gigaspatial/config.py +6 -0
- gigaspatial/handlers/__init__.py +7 -3
- gigaspatial/handlers/boundaries.py +196 -43
- gigaspatial/handlers/ghsl.py +7 -6
- gigaspatial/handlers/giga.py +641 -0
- gigaspatial/handlers/hdx.py +411 -143
- gigaspatial/handlers/maxar_image.py +1 -2
- gigaspatial/handlers/rwi.py +119 -121
- gigaspatial/processing/tif_processor.py +88 -2
- {giga_spatial-0.6.2.dist-info → giga_spatial-0.6.4.dist-info}/WHEEL +0 -0
- {giga_spatial-0.6.2.dist-info → giga_spatial-0.6.4.dist-info}/licenses/LICENSE +0 -0
- {giga_spatial-0.6.2.dist-info → giga_spatial-0.6.4.dist-info}/top_level.txt +0 -0
gigaspatial/handlers/hdx.py
CHANGED
@@ -1,88 +1,115 @@
-import os
 import logging
+from tqdm import tqdm
 from pathlib import Path
-from typing import List, Optional, Union, Dict, Any
+from typing import List, Optional, Union, Dict, Any, Iterable
 import tempfile
 
-import pandas as pd
 import geopandas as gpd
-from pydantic import
+from pydantic import Field, ConfigDict
+from pydantic.dataclasses import dataclass
+from shapely.geometry.base import BaseGeometry
+from shapely.geometry import Point
+import pycountry
 
 from hdx.api.configuration import Configuration
 from hdx.data.dataset import Dataset
 from hdx.data.resource import Resource
 
 from gigaspatial.core.io.data_store import DataStore
-from gigaspatial.core.io.local_data_store import LocalDataStore
 from gigaspatial.core.io.readers import read_dataset
 from gigaspatial.config import config as global_config
+from gigaspatial.handlers.base import (
+    BaseHandlerConfig,
+    BaseHandlerDownloader,
+    BaseHandlerReader,
+    BaseHandler,
+)
 
 
-
+@dataclass(config=ConfigDict(arbitrary_types_allowed=True))
+class HDXConfig(BaseHandlerConfig):
     """Configuration for HDX data access"""
 
     # User configuration
-    dataset_name: str = Field(
+    dataset_name: str = Field(
+        default=..., description="Name of the HDX dataset to download"
+    )
+
+    # Optional configuration with defaults
     base_path: Path = Field(default=global_config.get_path("hdx", "bronze"))
     user_agent: str = Field(
         default="gigaspatial", description="User agent for HDX API requests"
     )
     hdx_site: str = Field(default="prod", description="HDX site to use (prod or test)")
-    resource_filter: Optional[Dict[str, Any]] = Field(
-        default=None, description="Filter to apply to resources"
-    )
-
-    @property
-    def output_dir_path(self) -> Path:
-        """Path to save the downloaded HDX dataset"""
-        return self.base_path / self.dataset_name
-
-    def __repr__(self) -> str:
-        return (
-            f"HDXConfig(\n"
-            f"  dataset_name='{self.dataset_name}'\n"
-            f"  base_path='{self.base_path}'\n"
-            f"  hdx_site='{self.hdx_site}'\n"
-            f"  user_agent='{self.user_agent}'\n"
-            f")"
-        )
 
+    # Internal state
+    _hdx_configured: bool = Field(default=False, init=False)
+    dataset: Optional[Dataset] = Field(default=None, init=False)
+
+    @staticmethod
+    def search_datasets(
+        query: str,
+        rows: int = None,
+        sort: str = "relevance asc, metadata_modified desc",
+        hdx_site: str = "prod",
+        user_agent: str = "gigaspatial",
+    ) -> List[Dict]:
+        """Search for datasets in HDX before initializing the class.
+
+        Args:
+            query: Search query string
+            rows: Number of results per page. Defaults to all datasets (sys.maxsize).
+            sort: Sort order - one of 'relevance', 'views_recent', 'views_total', 'last_modified' (default: 'relevance')
+            hdx_site: HDX site to use - 'prod' or 'test' (default: 'prod')
+            user_agent: User agent for HDX API requests (default: 'gigaspatial')
+
+        Returns:
+            List of dataset dictionaries containing search results
+
+        Example:
+            >>> results = HDXConfig.search_datasets("population", rows=5)
+            >>> for dataset in results:
+            >>>     print(f"Name: {dataset['name']}, Title: {dataset['title']}")
+        """
+        try:
+            Configuration.create(
+                hdx_site=hdx_site,
+                user_agent=user_agent,
+                hdx_read_only=True,
+            )
+        except:
+            pass
 
-
-
+        try:
+            results = Dataset.search_in_hdx(query=query, rows=rows, sort=sort)
 
-
-
-
-
-        logger: Optional[logging.Logger] = None,
-    ):
-        if isinstance(config, dict):
-            self.config = HDXConfig(**config)
-        else:
-            self.config = config
+            return results
+        except Exception as e:
+            logging.error(f"Error searching HDX datasets: {str(e)}")
+            raise
 
-
-
+    def __post_init__(self):
+        super().__post_init__()
         try:
             Configuration.read()
             self._hdx_configured = True
-        except:
+        except Exception:
             self._hdx_configured = False
+        self.configure_hdx()
+        self.dataset = self.fetch_dataset()
 
-    @
-    def
-        """
-
-        return cls(config=config)
+    @property
+    def output_dir_path(self) -> Path:
+        """Path to save the downloaded HDX dataset"""
+        return self.base_path / self.dataset_name
 
-    def
+    def configure_hdx(self):
         """Configure HDX API if not already configured"""
         if not self._hdx_configured:
             try:
                 Configuration.create(
-                    hdx_site=self.
-                    user_agent=self.
+                    hdx_site=self.hdx_site,
+                    user_agent=self.user_agent,
                     hdx_read_only=True,
                 )
                 self._hdx_configured = True
@@ -90,40 +117,108 @@ class HDXDownloader:
                 self.logger.error(f"Error configuring HDX API: {str(e)}")
                 raise
 
-    def
+    def fetch_dataset(self) -> Dataset:
         """Get the HDX dataset"""
-        self._configure_hdx()
-
         try:
-            self.logger.info(f"Fetching HDX dataset: {self.
-            dataset = Dataset.read_from_hdx(self.
+            self.logger.info(f"Fetching HDX dataset: {self.dataset_name}")
+            dataset = Dataset.read_from_hdx(self.dataset_name)
             if not dataset:
                 raise ValueError(
-                    f"Dataset '{self.
+                    f"Dataset '{self.dataset_name}' not found on HDX. "
+                    "Please verify the dataset name or use search_datasets() "
+                    "to find available datasets."
                 )
             return dataset
         except Exception as e:
             self.logger.error(f"Error fetching HDX dataset: {str(e)}")
             raise
 
+    def _match_pattern(self, value: str, pattern: str) -> bool:
+        """Check if a value matches a pattern"""
+        if isinstance(pattern, str):
+            return pattern.lower() in value.lower()
+        return value == pattern
+
+    def _get_patterns_for_value(self, value: Any) -> List[str]:
+        """Generate patterns for a given value or list of values"""
+        if isinstance(value, list):
+            patterns = []
+            for v in value:
+                patterns.extend(self._get_patterns_for_value(v))
+            return patterns
+
+        if not isinstance(value, str):
+            return [value]
+
+        patterns = []
+        value = value.lower()
+
+        # Add exact match
+        patterns.append(value)
+
+        # Add common variations
+        patterns.extend(
+            [
+                f"/{value}_",  # URL path with prefix
+                f"/{value}.",  # URL path with extension
+                f"_{value}_",  # Filename with value in middle
+                f"_{value}.",  # Filename with value at end
+            ]
+        )
+
+        # If value contains spaces, generate additional patterns
+        if " " in value:
+            # Generate patterns for space-less version
+            no_space = value.replace(" ", "")
+            patterns.extend(self._get_patterns_for_value(no_space))
+
+            # Generate patterns for hyphenated version
+            hyphenated = value.replace(" ", "-")
+            patterns.extend(self._get_patterns_for_value(hyphenated))
+
+        return patterns
+
     def get_dataset_resources(
-        self,
+        self, filter: Optional[Dict[str, Any]] = None, exact_match: bool = False
     ) -> List[Resource]:
-        """Get resources from the HDX dataset
-        dataset = dataset or self.get_dataset()
+        """Get resources from the HDX dataset
 
+        Args:
+            filter: Dictionary of key-value pairs to filter resources
+            exact_match: If True, perform exact matching. If False, use pattern matching
+        """
         try:
-            resources = dataset.get_resources()
+            resources = self.dataset.get_resources()
 
             # Apply resource filter if specified
-            if
+            if filter:
                 filtered_resources = []
                 for res in resources:
                     match = True
-                    for key, value in
-                        if key in res.data
+                    for key, value in filter.items():
+                        if key not in res.data:
                             match = False
                             break
+
+                        if exact_match:
+                            # For exact matching, check if value matches or is in list of values
+                            if isinstance(value, list):
+                                if res.data[key] not in value:
+                                    match = False
+                                    break
+                            elif res.data[key] != value:
+                                match = False
+                                break
+                        else:
+                            # For pattern matching, generate patterns for value(s)
+                            patterns = self._get_patterns_for_value(value)
+                            if not any(
+                                self._match_pattern(str(res.data[key]), pattern)
+                                for pattern in patterns
+                            ):
+                                match = False
+                                break
+
                     if match:
                         filtered_resources.append(res)
                 resources = filtered_resources
@@ -133,109 +228,282 @@ class HDXDownloader:
             self.logger.error(f"Error getting dataset resources: {str(e)}")
             raise
 
-    def
-
-
-
-
-
-
-
-
-
-
+    def get_relevant_data_units(
+        self, source: Union[str, Dict], **kwargs
+    ) -> List[Resource]:
+        """Get relevant data units based on the source type
+
+        Args:
+            source: Either a country name/code (str) or a filter dictionary
+            **kwargs: Additional keyword arguments passed to the specific method
+
+        Returns:
+            List of matching resources
+        """
+        if isinstance(source, str):
+            # If source is a string, assume it's a country and use country-based filtering
+            return self.get_relevant_data_units_by_country(source, **kwargs)
+        elif isinstance(source, dict):
+            # If source is a dict, use it directly as a filter
+            return self.get_dataset_resources(filter=source, **kwargs)
+        else:
+            raise ValueError(f"Unsupported source type: {type(source)}")
 
-
-
-
+    def get_relevant_data_units_by_geometry(
+        self, geometry: Union[BaseGeometry, gpd.GeoDataFrame], **kwargs
+    ) -> List[Resource]:
+        raise NotImplementedError(
+            "HDX does not support geometry-based filtering. "
+            "Please use country-based filtering or direct resource filtering instead."
+        )
 
-
-
-
-
-
-
-
-        with tempfile.TemporaryDirectory() as tmpdir:
-            url, local_path = res.download(folder=tmpdir)
-            # Read the file and write to the DataStore
-            with open(local_path, "rb") as f:
-                data = f.read()
-            # Compose the target path in the DataStore
-            target_path = str(
-                self.config.output_dir_path / Path(local_path).name
-            )
-            self.data_store.write_file(target_path, data)
-            downloaded_paths.append(target_path)
-
-            self.logger.info(
-                f"Downloaded resource: {resource_name} to {target_path}"
-            )
-        except Exception as e:
-            self.logger.error(
-                f"Error downloading resource {res.get('name', 'Unknown')}: {str(e)}"
-            )
-
-        return downloaded_paths
+    def get_relevant_data_units_by_points(
+        self, points: List[Union[Point, tuple]], **kwargs
+    ) -> List[Resource]:
+        raise NotImplementedError(
+            "HDX does not support point-based filtering. "
+            "Please use country-based filtering or direct resource filtering instead."
+        )
 
-
-
-
+    def get_relevant_data_units_by_country(
+        self,
+        country: str,
+        key: str = "url",
+        **kwargs,
+    ) -> Any:
+        """Get relevant data units for a country
+
+        Args:
+            country: Country name or code
+            key: The key to filter on in the resource data
+            patterns: List of patterns to match against the resource data
+            **kwargs: Additional keyword arguments
+        """
+        country = pycountry.countries.lookup(country)
+        values = [country.alpha_3, country.alpha_2, country.name]
+        return self.get_dataset_resources(
+            filter={key: values},
+        )
 
+    def get_data_unit_path(self, unit: str, **kwargs) -> str:
+        """Get the path for a data unit"""
+        try:
+            filename = unit.data["name"]
+        except:
+            filename = unit.get("download_url").split("/")[-1]
 
-
-    """Reader for HDX datasets"""
-
-    def __init__(
-        self,
-        dataset_name: str,
-        data_store: Optional[DataStore] = None,
-        base_path: Optional[Path] = None,
-    ):
-        self.dataset_name = dataset_name
-        self.data_store = data_store or LocalDataStore()
-        self.base_path = base_path or global_config.get_path("hdx", "bronze")
-        self.dataset_path = self.base_path / self.dataset_name
+        return self.output_dir_path / filename
 
     def list_resources(self) -> List[str]:
         """List all resources in the dataset directory using the data_store."""
+        dataset_folder = str(self.output_dir_path)
         # Check if the dataset directory exists in the data_store
         if not (
-            self.data_store.is_dir(
-            or self.data_store.file_exists(
+            self.data_store.is_dir(dataset_folder)
+            or self.data_store.file_exists(dataset_folder)
         ):
             raise FileNotFoundError(
-                f"HDX dataset
+                f"HDX dataset not found at {dataset_folder}. "
                 "Download the data first using HDXDownloader."
             )
-
-
+        return self.data_store.list_files(dataset_folder)
+
+    def __repr__(self) -> str:
+        return (
+            f"HDXConfig(\n"
+            f"  dataset_name='{self.dataset_name}'\n"
+            f"  base_path='{self.base_path}'\n"
+            f"  hdx_site='{self.hdx_site}'\n"
+            f"  user_agent='{self.user_agent}'\n"
+            f")"
+        )
 
-    def read_resource(
-        self, resource_file: str
-    ) -> Union[pd.DataFrame, gpd.GeoDataFrame]:
-        """Read a specific resource file from the dataset using the data_store."""
-        file_path = str(self.dataset_path / resource_file)
 
-
-
-
-
+class HDXDownloader(BaseHandlerDownloader):
+    """Downloader for HDX datasets"""
+
+    def __init__(
+        self,
+        config: Union[HDXConfig, dict],
+        data_store: Optional[DataStore] = None,
+        logger: Optional[logging.Logger] = None,
+    ):
+        config = config if isinstance(config, HDXConfig) else HDXConfig(**config)
+        super().__init__(config=config, data_store=data_store, logger=logger)
 
+    def download_data_unit(self, resource: str, **kwargs) -> str:
+        """Download a single resource"""
         try:
-
+            resource_name = resource.get("name", "Unknown")
+            self.logger.info(f"Downloading resource: {resource_name}")
+
+            with tempfile.TemporaryDirectory() as tmpdir:
+                url, local_path = resource.download(folder=tmpdir)
+                with open(local_path, "rb") as f:
+                    data = f.read()
+                # Compose the target path in the DataStore
+                target_path = str(self.config.get_data_unit_path(resource))
+                self.data_store.write_file(target_path, data)
+                self.logger.info(
+                    f"Downloaded resource: {resource_name} to {target_path}"
+                )
+                return target_path
         except Exception as e:
-
+            self.logger.error(f"Error downloading resource {resource_name}: {str(e)}")
+            return None
+
+    def download_data_units(self, resources: List[Resource], **kwargs) -> List[str]:
+        """Download multiple resources sequentially
+
+        Args:
+            resources: List of HDX Resource objects
+            **kwargs: Additional keyword arguments
+
+        Returns:
+            List of paths to downloaded files
+        """
+        if len(resources) == 0:
+            self.logger.warning("There is no resource to download")
+            return []
 
-
-
-
-
+        downloaded_paths = []
+        for resource in tqdm(resources, desc="Downloading resources"):
+            path = self.download_data_unit(resource)
+            if path:
+                downloaded_paths.append(path)
 
-
+        return downloaded_paths
+
+    def download(self, source: Union[Dict, str], **kwargs) -> List[str]:
+        """Download data for a source"""
+        resources = self.config.get_relevant_data_units(source, **kwargs)
+        return self.download_data_units(resources)
+
+
+class HDXReader(BaseHandlerReader):
+    """Reader for HDX datasets"""
+
+    def __init__(
+        self,
+        config: Optional[HDXConfig] = None,
+        data_store: Optional[DataStore] = None,
+        logger: Optional[logging.Logger] = None,
+    ):
+        config = config if isinstance(config, HDXConfig) else HDXConfig(**config)
+        super().__init__(config=config, data_store=data_store, logger=logger)
+
+    def resolve_source_paths(
+        self,
+        source: Union[
+            str,  # country code
+            Dict,  # filter
+            Path,  # path
+            str,  # path
+            List[Union[str, Path]],
+        ],
+        **kwargs,
+    ) -> List[Union[str, Path]]:
+        if isinstance(source, (str, Path)):
+            # Could be a country code or a path
+            if self.data_store.file_exists(str(source)) or str(source).endswith(
+                (".csv", ".tif", ".json", ".parquet", ".gz", ".geojson", ".zip")
+            ):
+                source_data_paths = self.resolve_by_paths(source)
+            else:
+                source_data_paths = self.resolve_by_country(source, **kwargs)
+        elif isinstance(source, Dict):
+            resources = self.config.get_relevant_data_units(source=source, **kwargs)
+            source_data_paths = self.config.get_data_unit_paths(resources, **kwargs)
+        elif isinstance(source, Iterable) and all(
+            isinstance(p, (str, Path)) for p in source
+        ):
+            source_data_paths = self.resolve_by_paths(source)
+        else:
+            raise NotImplementedError(f"Unsupported source type: {type(source)}")
+
+        self.logger.info(f"Resolved {len(source_data_paths)} paths!")
+        return source_data_paths
+
+    def load_from_paths(
+        self, source_data_path: List[Union[str, Path]], **kwargs
+    ) -> Any:
+        """Load data from paths"""
+        if len(source_data_path) == 1:
+            return read_dataset(self.data_store, source_data_path[0])
+
+        all_data = {}
+        for file_path in source_data_path:
             try:
-
+                all_data[file_path] = read_dataset(self.data_store, file_path)
             except Exception as e:
-
+                raise ValueError(f"Could not read file {file_path}: {str(e)}")
+        return all_data
+
+    def load_all_resources(self):
+        resources = self.config.list_resources()
+        return self.load_from_paths(resources)
+
+
+class HDXHandler(BaseHandler):
+    """Handler for HDX datasets"""
 
-
+    def __init__(
+        self,
+        dataset_name: str,
+        config: Optional[HDXConfig] = None,
+        downloader: Optional[HDXDownloader] = None,
+        reader: Optional[HDXReader] = None,
+        data_store: Optional[DataStore] = None,
+        logger: Optional[logging.Logger] = None,
+        **kwargs,
+    ):
+        self._dataset_name = dataset_name
+        super().__init__(
+            config=config,
+            downloader=downloader,
+            reader=reader,
+            data_store=data_store,
+            logger=logger,
+            **kwargs,
+        )
+
+    def create_config(
+        self, data_store: DataStore, logger: logging.Logger, **kwargs
+    ) -> HDXConfig:
+        """Create and return a HDXConfig instance"""
+        return HDXConfig(
+            dataset_name=self._dataset_name,
+            data_store=data_store,
+            logger=logger,
+            **kwargs,
+        )
+
+    def create_downloader(
+        self,
+        config: HDXConfig,
+        data_store: DataStore,
+        logger: logging.Logger,
+        **kwargs,
+    ) -> HDXDownloader:
+        """Create and return a HDXDownloader instance"""
+        return HDXDownloader(
+            config=config,
+            data_store=data_store,
+            logger=logger,
+            **kwargs,
+        )
+
+    def create_reader(
+        self,
+        config: HDXConfig,
+        data_store: DataStore,
+        logger: logging.Logger,
+        **kwargs,
+    ) -> HDXReader:
+        """Create and return a HDXReader instance"""
+        return HDXReader(
+            config=config,
+            data_store=data_store,
+            logger=logger,
+            **kwargs,
+        )
gigaspatial/handlers/maxar_image.py
CHANGED
@@ -14,7 +14,6 @@ from gigaspatial.processing.geo import (
     convert_to_geodataframe,
     buffer_geodataframe,
 )
-from gigaspatial.processing.sat_images import calculate_pixels_at_location
 from gigaspatial.config import config as global_config
 
 
@@ -142,7 +141,7 @@ class MaxarImageDownloader:
                 self.logger.warning(
                     f"Attempt {attempt + 1} of downloading {output_path.name} failed: {str(e)}"
                 )
-                if attempt < self.max_retries - 1:
+                if attempt < self.config.max_retries - 1:
                     sleep(self.config.retry_delay)
                 else:
                     self.logger.warning(