giga-spatial 0.6.3__py3-none-any.whl → 0.6.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: giga-spatial
- Version: 0.6.3
+ Version: 0.6.4
  Summary: A package for spatial data download & processing
  Home-page: https://github.com/unicef/giga-spatial
  Author: Utku Can Ozturk
@@ -1,6 +1,6 @@
- giga_spatial-0.6.3.dist-info/licenses/LICENSE,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
- gigaspatial/__init__.py,sha256=zYiFHqR7JwbvdK9dvKrh-RTNfUqjHUwC4CTcFAPVYLc,22
- gigaspatial/config.py,sha256=yMf1ofOU0_I6iKDqshiFSYmK6TDIVpPm1AZo4e2okHU,8166
+ giga_spatial-0.6.4.dist-info/licenses/LICENSE,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
+ gigaspatial/__init__.py,sha256=WMmvm2Keb76yMz8OL_h4fKT34Xpi-1BVfCiTn2QGzz4,22
+ gigaspatial/config.py,sha256=PR6n6NDDD4560zWEbaFiYSitr9PAKik915cxCCMZNQc,8392
  gigaspatial/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  gigaspatial/core/io/__init__.py,sha256=y4QNWx6US1-adTuAO_NZwLmjzSQj25HNDL5hUGvEHZc,263
  gigaspatial/core/io/adls_data_store.py,sha256=Zv-D_8d_2h57HnCUTJb0JWWjXqR_0XH4F8Nu_UFZK9E,11975
@@ -19,15 +19,15 @@ gigaspatial/generators/zonal/geometry.py,sha256=XPcX5lT7X7Z1vn72sN-VKLb2hDP9F_w3
  gigaspatial/generators/zonal/mercator.py,sha256=R_KlaqF4lnc0cRqVfcNVO8i0Re21_6w7pnclVKSohcY,3125
  gigaspatial/grid/__init__.py,sha256=H8SnNAMDafJXJ9bUp2zU0Z3t6s8niqY5rGP5nFhnbLA,45
  gigaspatial/grid/mercator_tiles.py,sha256=Z_3M4sy1tyxywAo2wmBb6niBP3x-IWgwMkmUp8LOSDg,10492
- gigaspatial/handlers/__init__.py,sha256=pqK3rJtelOAkBaWNhpGy2t-p_zrwO-9BqABLQufTXF0,1449
+ gigaspatial/handlers/__init__.py,sha256=R2rugXR5kF4lLkSO1fjpVDYK_jWdD8U2NbXbW71Ezv8,1523
  gigaspatial/handlers/base.py,sha256=rL94c3wDjsqzLp4na8FfYXW6tNjVGX6v4M-Ce4LrAro,26413
- gigaspatial/handlers/boundaries.py,sha256=hoO-b5MlFYwlCWogApcFyEx6OnxMJG29lqJurNGwOWg,11260
+ gigaspatial/handlers/boundaries.py,sha256=UM0lFcTzy64ADdMnPOkzLGJ-OG5P7KyoZtA91GTWxYs,17242
  gigaspatial/handlers/ghsl.py,sha256=GHao8lkmj1C0-QFqNwH9jr0Lqzu6NTj_7ooQdj1h6ok,27760
- gigaspatial/handlers/giga.py,sha256=2aP1EenDAQXn-h-uCyuVxEVZvAFEvrL17_z0MiS8FDs,4867
+ gigaspatial/handlers/giga.py,sha256=F5ZfcE37a24X-c6Xhyt72C9eZZbyN_gV7w_InxKFMQQ,28348
  gigaspatial/handlers/google_open_buildings.py,sha256=Liqk7qJhDtB4Ia4uhBe44LFcf-XVKBjRfj-pWlE5erY,16594
- gigaspatial/handlers/hdx.py,sha256=DNw-LhxuJU3eNGihQGyPJT0a1PaOCupNHr7BDGal4Zo,18088
+ gigaspatial/handlers/hdx.py,sha256=LTEs_xZF1yPhD8dAdZ_YN8Vcan7iB5_tZ8NjF_ip6u0,18001
  gigaspatial/handlers/mapbox_image.py,sha256=M_nkJ_b1PD8FG1ajVgSycCb0NRTAI_SLpHdzszNetKA,7786
- gigaspatial/handlers/maxar_image.py,sha256=g5YVGV-8JjeG9bGBOp7ZfKani22J4izXX4hnB9A99Jk,10272
+ gigaspatial/handlers/maxar_image.py,sha256=kcc8uGljQB0Yh0MKBA7lT7KwBbNZwFzuyBklR3db1P4,10204
  gigaspatial/handlers/microsoft_global_buildings.py,sha256=bQ5WHIv3v0wWrZZUbZkKPRjgdlqIxlK7CV_0zSvdrTw,20292
  gigaspatial/handlers/ookla_speedtest.py,sha256=EcvSAxJZ9GPfzYnT_C85Qgy2ecc9ndf70Pklk53OdC8,6506
  gigaspatial/handlers/opencellid.py,sha256=KuJqd-5-RO5ZzyDaBSrTgCK2ib5N_m3RUcPlX5heWwI,10683
@@ -41,7 +41,7 @@ gigaspatial/processing/geo.py,sha256=D-S3IlhQwLIxrCcxy6NhNmKLrOIjoRHfK_eZJGKpe2U
  gigaspatial/processing/sat_images.py,sha256=YUbH5MFNzl6NX49Obk14WaFcr1s3SyGJIOk-kRpbBNg,1429
  gigaspatial/processing/tif_processor.py,sha256=zqcP_ioo9KHNJ6H0uba4UghW4MToTRwq1iE-nZbb8zA,21101
  gigaspatial/processing/utils.py,sha256=HC85vGKQakxlkoQAkZmeAXWHsenAwTIRn7jPKUA7x20,1500
- giga_spatial-0.6.3.dist-info/METADATA,sha256=Aw5adPdTcA3AuJBmZgAG4rJQYW4dJqw2GT90mYE7cgU,7467
- giga_spatial-0.6.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- giga_spatial-0.6.3.dist-info/top_level.txt,sha256=LZsccgw6H4zXT7m6Y4XChm-Y5LjHAwZ2hkGN_B3ExmI,12
- giga_spatial-0.6.3.dist-info/RECORD,,
+ giga_spatial-0.6.4.dist-info/METADATA,sha256=WQUWSdjlmfh09kkX20cgudrGHWmldXlNbh4DNjB0Xgo,7467
+ giga_spatial-0.6.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ giga_spatial-0.6.4.dist-info/top_level.txt,sha256=LZsccgw6H4zXT7m6Y4XChm-Y5LjHAwZ2hkGN_B3ExmI,12
+ giga_spatial-0.6.4.dist-info/RECORD,,
gigaspatial/__init__.py CHANGED
@@ -1 +1 @@
- __version__ = "0.6.3"
+ __version__ = "0.6.4"
gigaspatial/config.py CHANGED
@@ -32,6 +32,12 @@ class Config(BaseSettings):
  GIGA_SCHOOL_LOCATION_API_KEY: str = Field(
  default="", alias="GIGA_SCHOOL_LOCATION_API_KEY"
  )
+ GIGA_SCHOOL_PROFILE_API_KEY: str = Field(
+ default="", alias="GIGA_SCHOOL_PROFILE_API_KEY"
+ )
+ GIGA_SCHOOL_MEASUREMENTS_API_KEY: str = Field(
+ default="", alias="GIGA_SCHOOL_MEASUREMENTS_API_KEY"
+ )

  ROOT_DATA_DIR: Path = Field(
  default=Path("."),
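For context (not part of the diff): the two new settings mirror the existing GIGA_SCHOOL_LOCATION_API_KEY field, so they should be injectable through environment variables (or a .env file) read by the Config settings class before gigaspatial.config is imported. A minimal sketch with placeholder key values:

    import os

    # Placeholders only; real keys come from the Giga Maps API portal.
    os.environ["GIGA_SCHOOL_PROFILE_API_KEY"] = "<profile-api-key>"
    os.environ["GIGA_SCHOOL_MEASUREMENTS_API_KEY"] = "<measurements-api-key>"

    from gigaspatial.config import config

    print(config.GIGA_SCHOOL_PROFILE_API_KEY)
    print(config.GIGA_SCHOOL_MEASUREMENTS_API_KEY)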
gigaspatial/handlers/__init__.py CHANGED
@@ -37,4 +37,8 @@ from gigaspatial.handlers.unicef_georepo import (
  GeoRepoClient,
  get_country_boundaries_by_iso3,
  )
- from gigaspatial.handlers.giga import GigaSchoolLocationFetcher
+ from gigaspatial.handlers.giga import (
+ GigaSchoolLocationFetcher,
+ GigaSchoolProfileFetcher,
+ GigaSchoolMeasurementsFetcher,
+ )
gigaspatial/handlers/boundaries.py CHANGED
@@ -4,10 +4,12 @@ import geopandas as gpd
  from pathlib import Path
  from urllib.error import HTTPError
  from shapely.geometry import Polygon, MultiPolygon, shape
+ import tempfile
  import pycountry

  from gigaspatial.core.io.data_store import DataStore
  from gigaspatial.core.io.readers import read_dataset
+ from gigaspatial.handlers.hdx import HDXConfig
  from gigaspatial.config import config

@@ -61,8 +63,31 @@ class AdminBoundaries(BaseModel):
  "name_en": "name_en",
  "country_code": "iso_3166_1_alpha_3",
  },
+ "geoBoundaries": {
+ "id": "shapeID",
+ "name": "shapeName",
+ "country_code": "shapeGroup",
+ },
  }

+ def to_geodataframe(self) -> gpd.GeoDataFrame:
+ """Convert the AdminBoundaries to a GeoDataFrame."""
+ if not self.boundaries:
+ if hasattr(self, "_empty_schema"):
+ columns = self._empty_schema
+ else:
+ columns = ["id", "name", "country_code", "geometry"]
+ if self.level > 0:
+ columns.append("parent_id")
+
+ return gpd.GeoDataFrame(columns=columns, geometry="geometry", crs=4326)
+
+ return gpd.GeoDataFrame(
+ [boundary.model_dump() for boundary in self.boundaries],
+ geometry="geometry",
+ crs=4326,
+ )
+
  @classmethod
  def get_schema_config(cls) -> Dict[str, Dict[str, str]]:
  """Return field mappings for different data sources"""
@@ -100,6 +125,7 @@ class AdminBoundaries(BaseModel):
  cls.logger.warning(
  f"Error loading GADM data for {country_code} at admin level {admin_level}: {str(e)}"
  )
+ cls.logger.info("Falling back to empty instance")
  return cls._create_empty_instance(country_code, admin_level, "gadm")

  @classmethod
@@ -138,6 +164,7 @@ class AdminBoundaries(BaseModel):
  cls.logger.warning(
  f"No data found at {path} for admin level {admin_level}: {str(e)}"
  )
+ cls.logger.info("Falling back to empty instance")
  return cls._create_empty_instance(None, admin_level, "internal")

  @classmethod
@@ -202,6 +229,69 @@ class AdminBoundaries(BaseModel):

  return cls(boundaries=boundaries, level=admin_level)

+ @classmethod
+ def from_geoboundaries(cls, country_code, admin_level: int = 0):
+ cls.logger.info(
+ f"Searching for geoBoundaries data for country: {country_code}, admin level: {admin_level}"
+ )
+
+ country_datasets = HDXConfig.search_datasets(
+ query=f'dataseries_name:"geoBoundaries - Subnational Administrative Boundaries" AND groups:"{country_code.lower()}"',
+ rows=1,
+ )
+ if not country_datasets:
+ cls.logger.error(f"No datasets found for country: {country_code}")
+ raise ValueError(
+ "No resources found for the specified country. Please check your search parameters and try again."
+ )
+
+ cls.logger.info(f"Found dataset: {country_datasets[0].get('title', 'Unknown')}")
+
+ resources = [
+ resource
+ for resource in country_datasets[0].get_resources()
+ if (
+ resource.data["name"]
+ == f"geoBoundaries-{country_code.upper()}-ADM{admin_level}.geojson"
+ )
+ ]
+
+ if not resources:
+ cls.logger.error(
+ f"No resources found for {country_code} at admin level {admin_level}"
+ )
+ raise ValueError(
+ "No resources found for the specified criteria. Please check your search parameters and try again."
+ )
+
+ cls.logger.info(f"Found resource: {resources[0].data.get('name', 'Unknown')}")
+
+ try:
+ cls.logger.info("Downloading and processing boundary data...")
+ with tempfile.TemporaryDirectory() as tmpdir:
+ url, local_path = resources[0].download(folder=tmpdir)
+ cls.logger.debug(f"Downloaded file to temporary path: {local_path}")
+ with open(local_path, "rb") as f:
+ gdf = gpd.read_file(f)
+
+ gdf = cls._map_fields(gdf, "geoBoundaries", admin_level)
+ boundaries = [
+ AdminBoundary(**row_dict) for row_dict in gdf.to_dict("records")
+ ]
+ cls.logger.info(
+ f"Successfully created {len(boundaries)} AdminBoundary objects"
+ )
+ return cls(boundaries=boundaries, level=admin_level)
+
+ except (ValueError, HTTPError, FileNotFoundError) as e:
+ cls.logger.warning(
+ f"Error loading geoBoundaries data for {country_code} at admin level {admin_level}: {str(e)}"
+ )
+ cls.logger.info("Falling back to empty instance")
+ return cls._create_empty_instance(
+ country_code, admin_level, "geoBoundaries"
+ )
+
  @classmethod
  def create(
  cls,
@@ -211,45 +301,126 @@ class AdminBoundaries(BaseModel):
  path: Optional[Union[str, "Path"]] = None,
  **kwargs,
  ) -> "AdminBoundaries":
- """Factory method to create AdminBoundaries instance from either GADM or data store."""
+ """Factory method to create AdminBoundaries instance from either GADM or data store.
+
+ Args:
+ country_code: ISO country code (2 or 3 letter) or country name
+ admin_level: Administrative level (0=country, 1=state/province, etc.)
+ data_store: Optional data store instance for loading from existing data
+ path: Optional path to data file (used with data_store)
+ **kwargs: Additional arguments passed to the underlying creation methods
+
+ Returns:
+ AdminBoundaries: Configured instance
+
+ Raises:
+ ValueError: If neither country_code nor (data_store, path) are provided,
+ or if country_code lookup fails
+
+ Example:
+ # From country code
+ boundaries = AdminBoundaries.create(country_code="USA", admin_level=1)
+
+ # From data store
+ boundaries = AdminBoundaries.create(data_store=store, path="data.shp")
+ """
  cls.logger.info(
- f"Creating AdminBoundaries instance. Country: {country_code}, admin level: {admin_level}, data_store provided: {data_store is not None}, path provided: {path is not None}"
+ f"Creating AdminBoundaries instance. Country: {country_code}, "
+ f"admin level: {admin_level}, data_store provided: {data_store is not None}, "
+ f"path provided: {path is not None}"
  )
- iso3_code = pycountry.countries.lookup(country_code).alpha_3
+
+ # Validate input parameters
+ if not country_code and not data_store:
+ raise ValueError("Either country_code or data_store must be provided.")
+
+ if data_store and not path and not country_code:
+ raise ValueError(
+ "If data_store is provided, either path or country_code must also be specified."
+ )
+
+ # Handle data store path first
  if data_store is not None:
- if path is None:
- if country_code is None:
- ValueError(
- "If data_store is provided, path or country_code must also be specified."
- )
+ iso3_code = None
+ if country_code:
+ try:
+ iso3_code = pycountry.countries.lookup(country_code).alpha_3
+ except LookupError as e:
+ raise ValueError(f"Invalid country code '{country_code}': {e}")
+
+ # Generate path if not provided
+ if path is None and iso3_code:
  path = config.get_admin_path(
  country_code=iso3_code,
  admin_level=admin_level,
  )
+
  return cls.from_data_store(data_store, path, admin_level, **kwargs)
- elif country_code is not None:
- from gigaspatial.handlers.unicef_georepo import GeoRepoClient

+ # Handle country code path
+ if country_code is not None:
  try:
- client = GeoRepoClient()
- if client.check_connection():
- cls.logger.info("GeoRepo connection successful.")
- return cls.from_georepo(
- iso3_code,
- admin_level=admin_level,
- )
- except ValueError as e:
+ iso3_code = pycountry.countries.lookup(country_code).alpha_3
+ except LookupError as e:
+ raise ValueError(f"Invalid country code '{country_code}': {e}")
+
+ # Try GeoRepo first
+ if cls._try_georepo(iso3_code, admin_level):
+ return cls.from_georepo(iso3_code, admin_level=admin_level)
+
+ # Fallback to GADM
+ try:
+ cls.logger.info("Attempting to load from GADM.")
+ return cls.from_gadm(iso3_code, admin_level, **kwargs)
+ except Exception as e:
  cls.logger.warning(
- f"GeoRepo initialization failed: {str(e)}. Falling back to GADM."
+ f"GADM loading failed: {e}. Falling back to geoBoundaries."
  )
+
+ # Final fallback to geoBoundaries
+ try:
+ return cls.from_geoboundaries(iso3_code, admin_level)
  except Exception as e:
- cls.logger.warning(f"GeoRepo error: {str(e)}. Falling back to GADM.")
+ cls.logger.error(f"All data sources failed. geoBoundaries error: {e}")
+ raise RuntimeError(
+ f"Failed to load administrative boundaries for {country_code} "
+ f"from all available sources (GeoRepo, GADM, geoBoundaries)."
+ ) from e

- return cls.from_gadm(iso3_code, admin_level, **kwargs)
- else:
- raise ValueError(
- "Either country_code or (data_store, path) must be provided."
- )
+ # This should never be reached due to validation above
+ raise ValueError("Unexpected error: no valid data source could be determined.")
+
+ @classmethod
+ def _try_georepo(cls, iso3_code: str, admin_level: int) -> bool:
+ """Helper method to test GeoRepo availability.
+
+ Args:
+ iso3_code: ISO3 country code
+ admin_level: Administrative level
+
+ Returns:
+ bool: True if GeoRepo is available and working, False otherwise
+ """
+ try:
+ from gigaspatial.handlers.unicef_georepo import GeoRepoClient
+
+ client = GeoRepoClient()
+ if client.check_connection():
+ cls.logger.info("GeoRepo connection successful.")
+ return True
+ else:
+ cls.logger.info("GeoRepo connection failed.")
+ return False
+
+ except ImportError:
+ cls.logger.info("GeoRepo client not available (import failed).")
+ return False
+ except ValueError as e:
+ cls.logger.warning(f"GeoRepo initialization failed: {e}")
+ return False
+ except Exception as e:
+ cls.logger.warning(f"GeoRepo error: {e}")
+ return False

  @classmethod
  def _create_empty_instance(
@@ -288,21 +459,3 @@ class AdminBoundaries(BaseModel):
  field_mapping[v] = k

  return gdf.rename(columns=field_mapping)
-
- def to_geodataframe(self) -> gpd.GeoDataFrame:
- """Convert the AdminBoundaries to a GeoDataFrame."""
- if not self.boundaries:
- if hasattr(self, "_empty_schema"):
- columns = self._empty_schema
- else:
- columns = ["id", "name", "country_code", "geometry"]
- if self.level > 0:
- columns.append("parent_id")
-
- return gpd.GeoDataFrame(columns=columns, geometry="geometry", crs=4326)
-
- return gpd.GeoDataFrame(
- [boundary.model_dump() for boundary in self.boundaries],
- geometry="geometry",
- crs=4326,
- )
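For orientation (not part of the diff): with this change AdminBoundaries.create() falls back from GeoRepo to GADM to geoBoundaries, and the new from_geoboundaries() path can also be called directly. A minimal usage sketch based on the signatures above; the country code and admin level are illustrative:

    from gigaspatial.handlers.boundaries import AdminBoundaries

    # Tries GeoRepo, then GADM, then geoBoundaries; raises RuntimeError only if all fail.
    admin1 = AdminBoundaries.create(country_code="KEN", admin_level=1)
    gdf = admin1.to_geodataframe()
    print(gdf.head())

    # The geoBoundaries source (fetched via HDX) can also be used directly.
    admin0 = AdminBoundaries.from_geoboundaries("KEN", admin_level=0)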
gigaspatial/handlers/giga.py CHANGED
@@ -1,10 +1,12 @@
  import requests
  import pandas as pd
  import time
+ from datetime import datetime, date
  from pydantic.dataclasses import dataclass, Field
  from pydantic import ConfigDict
  from shapely.geometry import Point
  import pycountry
+ from typing import Optional, Union
  import logging

  from gigaspatial.config import config as global_config
@@ -143,3 +145,642 @@ class GigaSchoolLocationFetcher:
  self.logger.info(f"Created geometry for all {len(df)} records")

  return df
+
+
+ @dataclass(config=ConfigDict(arbitrary_types_allowed=True))
+ class GigaSchoolProfileFetcher:
+ """
+ Fetch and process school profile data from the Giga School Profile API.
+ This includes connectivity information and other school details.
+ """
+
+ country: str = Field(...)
+ api_url: str = Field(
+ default="https://uni-ooi-giga-maps-service.azurewebsites.net/api/v1/schools_profile/",
+ description="Base URL for the Giga School Profile API",
+ )
+ api_key: str = global_config.GIGA_SCHOOL_PROFILE_API_KEY
+ page_size: int = Field(default=1000, description="Number of records per API page")
+ sleep_time: float = Field(
+ default=0.2, description="Sleep time between API requests"
+ )
+ giga_id_school: Optional[str] = Field(
+ default=None, description="Optional specific giga school ID to fetch"
+ )
+
+ logger: logging.Logger = Field(default=None, repr=False)
+
+ def __post_init__(self):
+ try:
+ self.country = pycountry.countries.lookup(self.country).alpha_3
+ except LookupError:
+ raise ValueError(f"Invalid country code provided: {self.country}")
+
+ if self.logger is None:
+ self.logger = global_config.get_logger(self.__class__.__name__)
+
+ def fetch_profiles(self, **kwargs) -> pd.DataFrame:
+ """
+ Fetch and process school profiles including connectivity information.
+
+ Args:
+ **kwargs: Additional parameters for customization
+ - page_size: Override default page size
+ - sleep_time: Override default sleep time between requests
+ - max_pages: Limit the number of pages to fetch
+ - giga_id_school: Override default giga_id_school filter
+
+ Returns:
+ pd.DataFrame: School profiles with connectivity and geospatial info.
+ """
+ # Override defaults with kwargs if provided
+ page_size = kwargs.get("page_size", self.page_size)
+ sleep_time = kwargs.get("sleep_time", self.sleep_time)
+ max_pages = kwargs.get("max_pages", None)
+ giga_id_school = kwargs.get("giga_id_school", self.giga_id_school)
+
+ # Prepare headers
+ headers = {
+ "Authorization": f"Bearer {self.api_key}",
+ "Accept": "application/json",
+ }
+
+ all_data = []
+ page = 1
+
+ self.logger.info(
+ f"Starting to fetch school profiles for country: {self.country}"
+ )
+
+ if giga_id_school:
+ self.logger.info(f"Filtering for specific school ID: {giga_id_school}")
+
+ while True:
+ # Check if we've reached max_pages limit
+ if max_pages and page > max_pages:
+ self.logger.info(f"Reached maximum pages limit: {max_pages}")
+ break
+
+ # Build parameters
+ params = {
+ "country_iso3_code": self.country,
+ "page": page,
+ "size": page_size,
+ }
+
+ # Add giga_id_school filter if specified
+ if giga_id_school:
+ params["giga_id_school"] = giga_id_school
+
+ try:
+ self.logger.debug(f"Fetching page {page} with params: {params}")
+ response = requests.get(self.api_url, headers=headers, params=params)
+ response.raise_for_status()
+
+ parsed = response.json()
+ data = parsed.get("data", [])
+
+ except requests.exceptions.RequestException as e:
+ self.logger.error(f"Request failed on page {page}: {e}")
+ break
+ except ValueError as e:
+ self.logger.error(f"Failed to parse JSON response on page {page}: {e}")
+ break
+
+ # Check if we got any data
+ if not data:
+ self.logger.info(f"No data on page {page}. Stopping.")
+ break
+
+ all_data.extend(data)
+ self.logger.info(f"Fetched page {page} with {len(data)} records")
+
+ # If we got fewer records than page_size, we've reached the end
+ if len(data) < page_size:
+ self.logger.info("Reached end of data (partial page received)")
+ break
+
+ # If filtering by specific school ID, we likely only need one page
+ if giga_id_school:
+ self.logger.info(
+ "Specific school ID requested, stopping after first page"
+ )
+ break
+
+ page += 1
+
+ # Sleep to be respectful to the API
+ if sleep_time > 0:
+ time.sleep(sleep_time)
+
+ self.logger.info(f"Finished fetching. Total records: {len(all_data)}")
+
+ # Convert to DataFrame and process
+ if not all_data:
+ self.logger.warning("No data fetched, returning empty DataFrame")
+ return pd.DataFrame()
+
+ df = pd.DataFrame(all_data)
+
+ return df
+
+ def get_connectivity_summary(self, df: pd.DataFrame) -> dict:
+ """
+ Generate a summary of connectivity statistics from the fetched data.
+
+ Args:
+ df: DataFrame with school profile data
+
+ Returns:
+ dict: Summary statistics about connectivity
+ """
+ if df.empty:
+ return {"error": "No data available"}
+
+ summary = {
+ "total_schools": len(df),
+ "country": (
+ df["country_iso3_code"].iloc[0]
+ if "country_iso3_code" in df.columns
+ else "Unknown"
+ ),
+ }
+
+ # Administrative region analysis
+ if "admin1" in df.columns:
+ admin1_counts = df["admin1"].value_counts().head(10).to_dict()
+ summary["top_admin1_regions"] = admin1_counts
+
+ if "admin2" in df.columns:
+ admin2_counts = df["admin2"].value_counts().head(10).to_dict()
+ summary["top_admin2_regions"] = admin2_counts
+
+ # Connectivity analysis
+ if "connectivity" in df.columns:
+ connected_count = df["connectivity"].sum()
+ summary["schools_with_connectivity"] = int(connected_count)
+ summary["connectivity_percentage"] = connected_count / len(df) * 100
+
+ if "connectivity_RT" in df.columns:
+ rt_connected_count = df["connectivity_RT"].sum()
+ summary["schools_with_realtime_connectivity"] = int(rt_connected_count)
+ summary["realtime_connectivity_percentage"] = (
+ rt_connected_count / len(df) * 100
+ )
+
+ # Connectivity type analysis
+ if "connectivity_type" in df.columns:
+
+ if not all(df.connectivity_type.isna()):
+ from collections import Counter
+
+ type_counts = dict(Counter(df.connectivity_type.dropna().to_list()))
+ summary["connectivity_types_breakdown"] = type_counts
+
+ # Data source analysis
+ if "connectivity_RT_datasource" in df.columns:
+ datasource_counts = (
+ df["connectivity_RT_datasource"].value_counts().to_dict()
+ )
+ summary["realtime_connectivity_datasources"] = datasource_counts
+
+ if "school_data_source" in df.columns:
+ school_datasource_counts = df["school_data_source"].value_counts().to_dict()
+ summary["school_data_sources"] = school_datasource_counts
+
+ self.logger.info("Generated connectivity summary")
+ return summary
+
+
+ @dataclass(config=ConfigDict(arbitrary_types_allowed=True))
+ class GigaSchoolMeasurementsFetcher:
+ """
+ Fetch and process school daily realtime connectivity measurements from the Giga API.
+ This includes download/upload speeds, latency, and connectivity performance data.
+ """
+
+ country: str = Field(...)
+ start_date: Union[str, date, datetime] = Field(...)
+ end_date: Union[str, date, datetime] = Field(...)
+ api_url: str = Field(
+ default="https://uni-ooi-giga-maps-service.azurewebsites.net/api/v1/all_measurements",
+ description="Base URL for the Giga School Measurements API",
+ )
+ api_key: str = global_config.GIGA_SCHOOL_MEASUREMENTS_API_KEY
+ page_size: int = Field(default=1000, description="Number of records per API page")
+ sleep_time: float = Field(
+ default=0.2, description="Sleep time between API requests"
+ )
+ giga_id_school: Optional[str] = Field(
+ default=None, description="Optional specific giga school ID to fetch"
+ )
+
+ logger: logging.Logger = Field(default=None, repr=False)
+
+ def __post_init__(self):
+ try:
+ self.country = pycountry.countries.lookup(self.country).alpha_3
+ except LookupError:
+ raise ValueError(f"Invalid country code provided: {self.country}")
+
+ # Convert dates to string format if needed
+ self.start_date = self._format_date(self.start_date)
+ self.end_date = self._format_date(self.end_date)
+
+ # Validate date range
+ if self.start_date > self.end_date:
+ raise ValueError("start_date must be before or equal to end_date")
+
+ if self.logger is None:
+ self.logger = global_config.get_logger(self.__class__.__name__)
+
+ def _format_date(self, date_input: Union[str, date, datetime]) -> str:
+ """
+ Convert date input to string format expected by API (YYYY-MM-DD).
+
+ Args:
+ date_input: Date in various formats
+
+ Returns:
+ str: Date in YYYY-MM-DD format
+ """
+ if isinstance(date_input, str):
+ # Assume it's already in correct format or parse it
+ try:
+ parsed_date = datetime.strptime(date_input, "%Y-%m-%d")
+ return date_input
+ except ValueError:
+ try:
+ parsed_date = pd.to_datetime(date_input)
+ return parsed_date.strftime("%Y-%m-%d")
+ except:
+ raise ValueError(
+ f"Invalid date format: {date_input}. Expected YYYY-MM-DD"
+ )
+ elif isinstance(date_input, (date, datetime)):
+ return date_input.strftime("%Y-%m-%d")
+ else:
+ raise ValueError(f"Invalid date type: {type(date_input)}")
+
+ def fetch_measurements(self, **kwargs) -> pd.DataFrame:
+ """
+ Fetch and process school connectivity measurements.
+
+ Args:
+ **kwargs: Additional parameters for customization
+ - page_size: Override default page size
+ - sleep_time: Override default sleep time between requests
+ - max_pages: Limit the number of pages to fetch
+ - giga_id_school: Override default giga_id_school filter
+ - start_date: Override default start_date
+ - end_date: Override default end_date
+
+ Returns:
+ pd.DataFrame: School measurements with connectivity performance data.
+ """
+ # Override defaults with kwargs if provided
+ page_size = kwargs.get("page_size", self.page_size)
+ sleep_time = kwargs.get("sleep_time", self.sleep_time)
+ max_pages = kwargs.get("max_pages", None)
+ giga_id_school = kwargs.get("giga_id_school", self.giga_id_school)
+ start_date = kwargs.get("start_date", self.start_date)
+ end_date = kwargs.get("end_date", self.end_date)
+
+ # Format dates if overridden
+ if start_date != self.start_date:
+ start_date = self._format_date(start_date)
+ if end_date != self.end_date:
+ end_date = self._format_date(end_date)
+
+ # Prepare headers
+ headers = {
+ "Authorization": f"Bearer {self.api_key}",
+ "Accept": "application/json",
+ }
+
+ all_data = []
+ page = 1
+
+ self.logger.info(
+ f"Starting to fetch measurements for country: {self.country} "
+ f"from {start_date} to {end_date}"
+ )
+
+ if giga_id_school:
+ self.logger.info(f"Filtering for specific school ID: {giga_id_school}")
+
+ while True:
+ # Check if we've reached max_pages limit
+ if max_pages and page > max_pages:
+ self.logger.info(f"Reached maximum pages limit: {max_pages}")
+ break
+
+ # Build parameters
+ params = {
+ "country_iso3_code": self.country,
+ "start_date": start_date,
+ "end_date": end_date,
+ "page": page,
+ "size": page_size,
+ }
+
+ # Add giga_id_school filter if specified
+ if giga_id_school:
+ params["giga_id_school"] = giga_id_school
+
+ try:
+ self.logger.debug(f"Fetching page {page} with params: {params}")
+ response = requests.get(self.api_url, headers=headers, params=params)
+ response.raise_for_status()
+
+ parsed = response.json()
+ data = parsed.get("data", [])
+
+ except requests.exceptions.RequestException as e:
+ self.logger.error(f"Request failed on page {page}: {e}")
+ break
+ except ValueError as e:
+ self.logger.error(f"Failed to parse JSON response on page {page}: {e}")
+ break
+
+ # Check if we got any data
+ if not data:
+ self.logger.info(f"No data on page {page}. Stopping.")
+ break
+
+ all_data.extend(data)
+ self.logger.info(f"Fetched page {page} with {len(data)} records")
+
+ # If we got fewer records than page_size, we've reached the end
+ if len(data) < page_size:
+ self.logger.info("Reached end of data (partial page received)")
+ break
+
+ # If filtering by specific school ID, we might only need one page
+ if giga_id_school and len(all_data) > 0:
+ self.logger.info(
+ "Specific school ID requested, checking if more data needed"
+ )
+
+ page += 1
+
+ # Sleep to be respectful to the API
+ if sleep_time > 0:
+ time.sleep(sleep_time)
+
+ self.logger.info(f"Finished fetching. Total records: {len(all_data)}")
+
+ # Convert to DataFrame and process
+ if not all_data:
+ self.logger.warning("No data fetched, returning empty DataFrame")
+ return pd.DataFrame()
+
+ df = pd.DataFrame(all_data)
+ df = self._process_measurements_data(df)
+
+ return df
+
+ def _process_measurements_data(self, df: pd.DataFrame) -> pd.DataFrame:
+ """
+ Process and enhance the DataFrame with measurement performance metrics.
+
+ Args:
+ df: Raw DataFrame from API
+
+ Returns:
+ pd.DataFrame: Enhanced DataFrame with processed measurement data
+ """
+ if df.empty:
+ return df
+
+ # Convert date column to datetime
+ if "date" in df.columns:
+ df["date"] = pd.to_datetime(df["date"], errors="coerce")
+ df["date_only"] = df["date"].dt.date
+ df["year"] = df["date"].dt.year
+ df["month"] = df["date"].dt.month
+ df["day_of_week"] = df["date"].dt.day_name()
+ self.logger.info("Processed date fields")
+
+ # Process speed measurements
+ numeric_columns = ["download_speed", "upload_speed", "latency"]
+ for col in numeric_columns:
+ if col in df.columns:
+ df[col] = pd.to_numeric(df[col], errors="coerce")
+
+ # Create performance categories
+ if "download_speed" in df.columns:
+ df["download_speed_category"] = pd.cut(
+ df["download_speed"],
+ bins=[0, 5, 25, 100, float("inf")],
+ labels=[
+ "Very Slow (<5 Mbps)",
+ "Slow (5-25 Mbps)",
+ "Moderate (25-100 Mbps)",
+ "Fast (>100 Mbps)",
+ ],
+ include_lowest=True,
+ )
+
+ if "upload_speed" in df.columns:
+ df["upload_speed_category"] = pd.cut(
+ df["upload_speed"],
+ bins=[0, 1, 10, 50, float("inf")],
+ labels=[
+ "Very Slow (<1 Mbps)",
+ "Slow (1-10 Mbps)",
+ "Moderate (10-50 Mbps)",
+ "Fast (>50 Mbps)",
+ ],
+ include_lowest=True,
+ )
+
+ if "latency" in df.columns:
+ df["latency_category"] = pd.cut(
+ df["latency"],
+ bins=[0, 50, 150, 300, float("inf")],
+ labels=[
+ "Excellent (<50ms)",
+ "Good (50-150ms)",
+ "Fair (150-300ms)",
+ "Poor (>300ms)",
+ ],
+ include_lowest=True,
+ )
+
+ # Create quality flags
+ if "download_speed" in df.columns and "upload_speed" in df.columns:
+ df["has_broadband"] = (df["download_speed"] >= 25) & (
+ df["upload_speed"] >= 3
+ )
+ df["has_basic_connectivity"] = (df["download_speed"] >= 1) & (
+ df["upload_speed"] >= 0.5
+ )
+
+ # Flag measurements with missing data
+ df["has_complete_measurement"] = (
+ df["download_speed"].notna()
+ & df["upload_speed"].notna()
+ & df["latency"].notna()
+ )
+
+ self.logger.info(f"Processed measurement data for {len(df)} records")
+
+ return df
+
+ def get_performance_summary(self, df: pd.DataFrame) -> dict:
+ """
+ Generate a comprehensive summary of connectivity performance metrics.
+
+ Args:
+ df: DataFrame with measurement data
+
+ Returns:
+ dict: Summary statistics about connectivity performance
+ """
+ if df.empty:
+ return {"error": "No data available"}
+
+ summary = {
+ "total_measurements": len(df),
+ "country": (
+ df["country_iso3_code"].iloc[0]
+ if "country_iso3_code" in df.columns
+ else "Unknown"
+ ),
+ "date_range": {
+ "start": (
+ df["date"].min().strftime("%Y-%m-%d")
+ if "date" in df.columns
+ else None
+ ),
+ "end": (
+ df["date"].max().strftime("%Y-%m-%d")
+ if "date" in df.columns
+ else None
+ ),
+ },
+ }
+
+ # School coverage
+ if "giga_id_school" in df.columns:
+ unique_schools = df["giga_id_school"].nunique()
+ summary["unique_schools_measured"] = unique_schools
+ summary["avg_measurements_per_school"] = (
+ len(df) / unique_schools if unique_schools > 0 else 0
+ )
+
+ # Speed statistics
+ for speed_col in ["download_speed", "upload_speed"]:
+ if speed_col in df.columns:
+ speed_data = df[speed_col].dropna()
+ if len(speed_data) > 0:
+ summary[f"{speed_col}_stats"] = {
+ "mean": float(speed_data.mean()),
+ "median": float(speed_data.median()),
+ "min": float(speed_data.min()),
+ "max": float(speed_data.max()),
+ "std": float(speed_data.std()),
+ }
+
+ # Latency statistics
+ if "latency" in df.columns:
+ latency_data = df["latency"].dropna()
+ if len(latency_data) > 0:
+ summary["latency_stats"] = {
+ "mean": float(latency_data.mean()),
+ "median": float(latency_data.median()),
+ "min": float(latency_data.min()),
+ "max": float(latency_data.max()),
+ "std": float(latency_data.std()),
+ }
+
+ # Performance categories
+ for cat_col in [
+ "download_speed_category",
+ "upload_speed_category",
+ "latency_category",
+ ]:
+ if cat_col in df.columns:
+ cat_counts = df[cat_col].value_counts().to_dict()
+ summary[cat_col.replace("_category", "_breakdown")] = cat_counts
+
+ # Quality metrics
+ if "has_broadband" in df.columns:
+ summary["broadband_capable_measurements"] = int(df["has_broadband"].sum())
+ summary["broadband_percentage"] = float(df["has_broadband"].mean() * 100)
+
+ if "has_basic_connectivity" in df.columns:
+ summary["basic_connectivity_measurements"] = int(
+ df["has_basic_connectivity"].sum()
+ )
+ summary["basic_connectivity_percentage"] = float(
+ df["has_basic_connectivity"].mean() * 100
+ )
+
+ # Data completeness
+ if "has_complete_measurement" in df.columns:
+ summary["complete_measurements"] = int(df["has_complete_measurement"].sum())
+ summary["data_completeness_percentage"] = float(
+ df["has_complete_measurement"].mean() * 100
+ )
+
+ # Data sources
+ if "data_source" in df.columns:
+ source_counts = df["data_source"].value_counts().to_dict()
+ summary["data_sources"] = source_counts
+
+ # Temporal patterns
+ if "day_of_week" in df.columns:
+ day_counts = df["day_of_week"].value_counts().to_dict()
+ summary["measurements_by_day_of_week"] = day_counts
+
+ self.logger.info("Generated performance summary")
+ return summary
+
+ def get_school_performance_comparison(
+ self, df: pd.DataFrame, top_n: int = 10
+ ) -> dict:
+ """
+ Compare performance across schools.
+
+ Args:
+ df: DataFrame with measurement data
+ top_n: Number of top/bottom schools to include
+
+ Returns:
+ dict: School performance comparison
+ """
+ if df.empty or "giga_id_school" not in df.columns:
+ return {"error": "No school data available"}
+
+ school_stats = (
+ df.groupby("giga_id_school")
+ .agg(
+ {
+ "download_speed": ["mean", "median", "count"],
+ "upload_speed": ["mean", "median"],
+ "latency": ["mean", "median"],
+ "has_broadband": (
+ "mean" if "has_broadband" in df.columns else lambda x: None
+ ),
+ }
+ )
+ .round(2)
+ )
+
+ # Flatten column names
+ school_stats.columns = ["_".join(col).strip() for col in school_stats.columns]
+
+ # Sort by download speed
+ if "download_speed_mean" in school_stats.columns:
+ top_schools = school_stats.nlargest(top_n, "download_speed_mean")
+ bottom_schools = school_stats.nsmallest(top_n, "download_speed_mean")
+
+ return {
+ "top_performing_schools": top_schools.to_dict("index"),
+ "bottom_performing_schools": bottom_schools.to_dict("index"),
+ "total_schools_analyzed": len(school_stats),
+ }
+
+ return {"error": "Insufficient data for school comparison"}
gigaspatial/handlers/hdx.py CHANGED
@@ -1,13 +1,9 @@
- import os
  import logging
  from tqdm import tqdm
  from pathlib import Path
- from typing import List, Optional, Tuple, Union, Dict, Any, Iterable
+ from typing import List, Optional, Union, Dict, Any, Iterable
  import tempfile
- import functools
- import multiprocessing

- import pandas as pd
  import geopandas as gpd
  from pydantic import Field, ConfigDict
  from pydantic.dataclasses import dataclass
@@ -50,6 +46,48 @@ class HDXConfig(BaseHandlerConfig):
  _hdx_configured: bool = Field(default=False, init=False)
  dataset: Optional[Dataset] = Field(default=None, init=False)

+ @staticmethod
+ def search_datasets(
+ query: str,
+ rows: int = None,
+ sort: str = "relevance asc, metadata_modified desc",
+ hdx_site: str = "prod",
+ user_agent: str = "gigaspatial",
+ ) -> List[Dict]:
+ """Search for datasets in HDX before initializing the class.
+
+ Args:
+ query: Search query string
+ rows: Number of results per page. Defaults to all datasets (sys.maxsize).
+ sort: Sort order - one of 'relevance', 'views_recent', 'views_total', 'last_modified' (default: 'relevance')
+ hdx_site: HDX site to use - 'prod' or 'test' (default: 'prod')
+ user_agent: User agent for HDX API requests (default: 'gigaspatial')
+
+ Returns:
+ List of dataset dictionaries containing search results
+
+ Example:
+ >>> results = HDXConfig.search_datasets("population", rows=5)
+ >>> for dataset in results:
+ >>> print(f"Name: {dataset['name']}, Title: {dataset['title']}")
+ """
+ try:
+ Configuration.create(
+ hdx_site=hdx_site,
+ user_agent=user_agent,
+ hdx_read_only=True,
+ )
+ except:
+ pass
+
+ try:
+ results = Dataset.search_in_hdx(query=query, rows=rows, sort=sort)
+
+ return results
+ except Exception as e:
+ logging.error(f"Error searching HDX datasets: {str(e)}")
+ raise
+
  def __post_init__(self):
  super().__post_init__()
  try:
@@ -85,7 +123,11 @@ class HDXConfig(BaseHandlerConfig):
  self.logger.info(f"Fetching HDX dataset: {self.dataset_name}")
  dataset = Dataset.read_from_hdx(self.dataset_name)
  if not dataset:
- raise ValueError(f"Dataset '{self.dataset_name}' not found on HDX")
+ raise ValueError(
+ f"Dataset '{self.dataset_name}' not found on HDX. "
+ "Please verify the dataset name or use search_datasets() "
+ "to find available datasets."
+ )
  return dataset
  except Exception as e:
  self.logger.error(f"Error fetching HDX dataset: {str(e)}")
@@ -386,9 +428,9 @@ class HDXReader(BaseHandlerReader):
  self, source_data_path: List[Union[str, Path]], **kwargs
  ) -> Any:
  """Load data from paths"""
- if len(source_data_path)==1:
+ if len(source_data_path) == 1:
  return read_dataset(self.data_store, source_data_path[0])
-
+
  all_data = {}
  for file_path in source_data_path:
  try:
@@ -401,49 +443,6 @@ class HDXReader(BaseHandlerReader):
  resources = self.config.list_resources()
  return self.load_from_paths(resources)

- # def read_resource(
- # self, resource_file: str
- # ) -> Union[pd.DataFrame, gpd.GeoDataFrame]:
- # """Read a specific resource file from the dataset using the data_store."""
- # if not self.dataset_path:
- # raise ValueError("No dataset path configured")
-
- # file_path = str(self.dataset_path / resource_file)
-
- # if not self.data_store.file_exists(file_path):
- # raise FileNotFoundError(
- # f"Resource file {resource_file} not found in dataset"
- # )
-
- # try:
- # return read_dataset(self.data_store, file_path)
- # except Exception as e:
- # raise ValueError(f"Could not read file {file_path}: {str(e)}")
-
- # def read_all_resources(self) -> Dict[str, Union[pd.DataFrame, gpd.GeoDataFrame]]:
- # """Read all resources in the dataset directory using the data_store."""
- # resources = self.list_resources()
- # result = {}
-
- # for resource in resources:
- # try:
- # result[resource] = self.read_resource(resource)
- # except Exception as e:
- # self.logger.warning(f"Could not read resource {resource}: {str(e)}")
-
- # return result
-
- # def load_from_paths(
- # self, source_data_path: List[Union[str, Path]], **kwargs
- # ) -> Union[
- # pd.DataFrame, gpd.GeoDataFrame, Dict[str, Union[pd.DataFrame, gpd.GeoDataFrame]]
- # ]:
- # """Load data from paths"""
- # if len(source_data_path) == 1:
- # return self.read_resource(str(source_data_path[0]))
- # else:
- # return self.read_all_resources()
-

  class HDXHandler(BaseHandler):
  """Handler for HDX datasets"""
gigaspatial/handlers/maxar_image.py CHANGED
@@ -14,7 +14,6 @@ from gigaspatial.processing.geo import (
  convert_to_geodataframe,
  buffer_geodataframe,
  )
- from gigaspatial.processing.sat_images import calculate_pixels_at_location
  from gigaspatial.config import config as global_config


@@ -142,7 +141,7 @@ class MaxarImageDownloader:
  self.logger.warning(
  f"Attempt {attempt + 1} of downloading {output_path.name} failed: {str(e)}"
  )
- if attempt < self.max_retries - 1:
+ if attempt < self.config.max_retries - 1:
  sleep(self.config.retry_delay)
  else:
  self.logger.warning(