giga-spatial 0.6.3-py3-none-any.whl → 0.6.4-py3-none-any.whl
This diff shows the changes between publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
- {giga_spatial-0.6.3.dist-info → giga_spatial-0.6.4.dist-info}/METADATA +1 -1
- {giga_spatial-0.6.3.dist-info → giga_spatial-0.6.4.dist-info}/RECORD +12 -12
- gigaspatial/__init__.py +1 -1
- gigaspatial/config.py +6 -0
- gigaspatial/handlers/__init__.py +5 -1
- gigaspatial/handlers/boundaries.py +196 -43
- gigaspatial/handlers/giga.py +641 -0
- gigaspatial/handlers/hdx.py +50 -51
- gigaspatial/handlers/maxar_image.py +1 -2
- {giga_spatial-0.6.3.dist-info → giga_spatial-0.6.4.dist-info}/WHEEL +0 -0
- {giga_spatial-0.6.3.dist-info → giga_spatial-0.6.4.dist-info}/licenses/LICENSE +0 -0
- {giga_spatial-0.6.3.dist-info → giga_spatial-0.6.4.dist-info}/top_level.txt +0 -0
{giga_spatial-0.6.3.dist-info → giga_spatial-0.6.4.dist-info}/RECORD
CHANGED
@@ -1,6 +1,6 @@
-giga_spatial-0.6.
-gigaspatial/__init__.py,sha256=
-gigaspatial/config.py,sha256=
+giga_spatial-0.6.4.dist-info/licenses/LICENSE,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
+gigaspatial/__init__.py,sha256=WMmvm2Keb76yMz8OL_h4fKT34Xpi-1BVfCiTn2QGzz4,22
+gigaspatial/config.py,sha256=PR6n6NDDD4560zWEbaFiYSitr9PAKik915cxCCMZNQc,8392
 gigaspatial/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 gigaspatial/core/io/__init__.py,sha256=y4QNWx6US1-adTuAO_NZwLmjzSQj25HNDL5hUGvEHZc,263
 gigaspatial/core/io/adls_data_store.py,sha256=Zv-D_8d_2h57HnCUTJb0JWWjXqR_0XH4F8Nu_UFZK9E,11975
@@ -19,15 +19,15 @@ gigaspatial/generators/zonal/geometry.py,sha256=XPcX5lT7X7Z1vn72sN-VKLb2hDP9F_w3
 gigaspatial/generators/zonal/mercator.py,sha256=R_KlaqF4lnc0cRqVfcNVO8i0Re21_6w7pnclVKSohcY,3125
 gigaspatial/grid/__init__.py,sha256=H8SnNAMDafJXJ9bUp2zU0Z3t6s8niqY5rGP5nFhnbLA,45
 gigaspatial/grid/mercator_tiles.py,sha256=Z_3M4sy1tyxywAo2wmBb6niBP3x-IWgwMkmUp8LOSDg,10492
-gigaspatial/handlers/__init__.py,sha256=
+gigaspatial/handlers/__init__.py,sha256=R2rugXR5kF4lLkSO1fjpVDYK_jWdD8U2NbXbW71Ezv8,1523
 gigaspatial/handlers/base.py,sha256=rL94c3wDjsqzLp4na8FfYXW6tNjVGX6v4M-Ce4LrAro,26413
-gigaspatial/handlers/boundaries.py,sha256=
+gigaspatial/handlers/boundaries.py,sha256=UM0lFcTzy64ADdMnPOkzLGJ-OG5P7KyoZtA91GTWxYs,17242
 gigaspatial/handlers/ghsl.py,sha256=GHao8lkmj1C0-QFqNwH9jr0Lqzu6NTj_7ooQdj1h6ok,27760
-gigaspatial/handlers/giga.py,sha256=
+gigaspatial/handlers/giga.py,sha256=F5ZfcE37a24X-c6Xhyt72C9eZZbyN_gV7w_InxKFMQQ,28348
 gigaspatial/handlers/google_open_buildings.py,sha256=Liqk7qJhDtB4Ia4uhBe44LFcf-XVKBjRfj-pWlE5erY,16594
-gigaspatial/handlers/hdx.py,sha256=
+gigaspatial/handlers/hdx.py,sha256=LTEs_xZF1yPhD8dAdZ_YN8Vcan7iB5_tZ8NjF_ip6u0,18001
 gigaspatial/handlers/mapbox_image.py,sha256=M_nkJ_b1PD8FG1ajVgSycCb0NRTAI_SLpHdzszNetKA,7786
-gigaspatial/handlers/maxar_image.py,sha256=
+gigaspatial/handlers/maxar_image.py,sha256=kcc8uGljQB0Yh0MKBA7lT7KwBbNZwFzuyBklR3db1P4,10204
 gigaspatial/handlers/microsoft_global_buildings.py,sha256=bQ5WHIv3v0wWrZZUbZkKPRjgdlqIxlK7CV_0zSvdrTw,20292
 gigaspatial/handlers/ookla_speedtest.py,sha256=EcvSAxJZ9GPfzYnT_C85Qgy2ecc9ndf70Pklk53OdC8,6506
 gigaspatial/handlers/opencellid.py,sha256=KuJqd-5-RO5ZzyDaBSrTgCK2ib5N_m3RUcPlX5heWwI,10683
@@ -41,7 +41,7 @@ gigaspatial/processing/geo.py,sha256=D-S3IlhQwLIxrCcxy6NhNmKLrOIjoRHfK_eZJGKpe2U
 gigaspatial/processing/sat_images.py,sha256=YUbH5MFNzl6NX49Obk14WaFcr1s3SyGJIOk-kRpbBNg,1429
 gigaspatial/processing/tif_processor.py,sha256=zqcP_ioo9KHNJ6H0uba4UghW4MToTRwq1iE-nZbb8zA,21101
 gigaspatial/processing/utils.py,sha256=HC85vGKQakxlkoQAkZmeAXWHsenAwTIRn7jPKUA7x20,1500
-giga_spatial-0.6.
-giga_spatial-0.6.
-giga_spatial-0.6.
-giga_spatial-0.6.
+giga_spatial-0.6.4.dist-info/METADATA,sha256=WQUWSdjlmfh09kkX20cgudrGHWmldXlNbh4DNjB0Xgo,7467
+giga_spatial-0.6.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+giga_spatial-0.6.4.dist-info/top_level.txt,sha256=LZsccgw6H4zXT7m6Y4XChm-Y5LjHAwZ2hkGN_B3ExmI,12
+giga_spatial-0.6.4.dist-info/RECORD,,
gigaspatial/__init__.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.6.
+__version__ = "0.6.4"
gigaspatial/config.py
CHANGED
@@ -32,6 +32,12 @@ class Config(BaseSettings):
     GIGA_SCHOOL_LOCATION_API_KEY: str = Field(
         default="", alias="GIGA_SCHOOL_LOCATION_API_KEY"
     )
+    GIGA_SCHOOL_PROFILE_API_KEY: str = Field(
+        default="", alias="GIGA_SCHOOL_PROFILE_API_KEY"
+    )
+    GIGA_SCHOOL_MEASUREMENTS_API_KEY: str = Field(
+        default="", alias="GIGA_SCHOOL_MEASUREMENTS_API_KEY"
+    )
 
     ROOT_DATA_DIR: Path = Field(
         default=Path("."),
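The two new keys mirror the existing GIGA_SCHOOL_LOCATION_API_KEY setting. A minimal setup sketch follows (not part of the diff; the key values are placeholders, and supplying them via environment variables is an assumption based on the field aliases above):

# Sketch (assumption): like the existing location key, the new keys can be
# supplied through the environment before gigaspatial.config is imported.
import os

os.environ["GIGA_SCHOOL_PROFILE_API_KEY"] = "<your-profile-api-key>"            # placeholder
os.environ["GIGA_SCHOOL_MEASUREMENTS_API_KEY"] = "<your-measurements-api-key>"  # placeholder

from gigaspatial.config import config

# Non-empty once the variables are set; the fetchers in handlers/giga.py read these.
assert config.GIGA_SCHOOL_PROFILE_API_KEY
assert config.GIGA_SCHOOL_MEASUREMENTS_API_KEY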
gigaspatial/handlers/__init__.py
CHANGED
@@ -37,4 +37,8 @@ from gigaspatial.handlers.unicef_georepo import (
     GeoRepoClient,
     get_country_boundaries_by_iso3,
 )
-from gigaspatial.handlers.giga import
+from gigaspatial.handlers.giga import (
+    GigaSchoolLocationFetcher,
+    GigaSchoolProfileFetcher,
+    GigaSchoolMeasurementsFetcher,
+)
gigaspatial/handlers/boundaries.py
CHANGED
@@ -4,10 +4,12 @@ import geopandas as gpd
 from pathlib import Path
 from urllib.error import HTTPError
 from shapely.geometry import Polygon, MultiPolygon, shape
+import tempfile
 import pycountry
 
 from gigaspatial.core.io.data_store import DataStore
 from gigaspatial.core.io.readers import read_dataset
+from gigaspatial.handlers.hdx import HDXConfig
 from gigaspatial.config import config
 
 
@@ -61,8 +63,31 @@ class AdminBoundaries(BaseModel):
             "name_en": "name_en",
             "country_code": "iso_3166_1_alpha_3",
         },
+        "geoBoundaries": {
+            "id": "shapeID",
+            "name": "shapeName",
+            "country_code": "shapeGroup",
+        },
     }
 
+    def to_geodataframe(self) -> gpd.GeoDataFrame:
+        """Convert the AdminBoundaries to a GeoDataFrame."""
+        if not self.boundaries:
+            if hasattr(self, "_empty_schema"):
+                columns = self._empty_schema
+            else:
+                columns = ["id", "name", "country_code", "geometry"]
+                if self.level > 0:
+                    columns.append("parent_id")
+
+            return gpd.GeoDataFrame(columns=columns, geometry="geometry", crs=4326)
+
+        return gpd.GeoDataFrame(
+            [boundary.model_dump() for boundary in self.boundaries],
+            geometry="geometry",
+            crs=4326,
+        )
+
     @classmethod
     def get_schema_config(cls) -> Dict[str, Dict[str, str]]:
         """Return field mappings for different data sources"""
@@ -100,6 +125,7 @@ class AdminBoundaries(BaseModel):
             cls.logger.warning(
                 f"Error loading GADM data for {country_code} at admin level {admin_level}: {str(e)}"
             )
+            cls.logger.info("Falling back to empty instance")
             return cls._create_empty_instance(country_code, admin_level, "gadm")
 
     @classmethod
@@ -138,6 +164,7 @@ class AdminBoundaries(BaseModel):
             cls.logger.warning(
                 f"No data found at {path} for admin level {admin_level}: {str(e)}"
             )
+            cls.logger.info("Falling back to empty instance")
             return cls._create_empty_instance(None, admin_level, "internal")
 
     @classmethod
@@ -202,6 +229,69 @@ class AdminBoundaries(BaseModel):
 
         return cls(boundaries=boundaries, level=admin_level)
 
+    @classmethod
+    def from_geoboundaries(cls, country_code, admin_level: int = 0):
+        cls.logger.info(
+            f"Searching for geoBoundaries data for country: {country_code}, admin level: {admin_level}"
+        )
+
+        country_datasets = HDXConfig.search_datasets(
+            query=f'dataseries_name:"geoBoundaries - Subnational Administrative Boundaries" AND groups:"{country_code.lower()}"',
+            rows=1,
+        )
+        if not country_datasets:
+            cls.logger.error(f"No datasets found for country: {country_code}")
+            raise ValueError(
+                "No resources found for the specified country. Please check your search parameters and try again."
+            )
+
+        cls.logger.info(f"Found dataset: {country_datasets[0].get('title', 'Unknown')}")
+
+        resources = [
+            resource
+            for resource in country_datasets[0].get_resources()
+            if (
+                resource.data["name"]
+                == f"geoBoundaries-{country_code.upper()}-ADM{admin_level}.geojson"
+            )
+        ]
+
+        if not resources:
+            cls.logger.error(
+                f"No resources found for {country_code} at admin level {admin_level}"
+            )
+            raise ValueError(
+                "No resources found for the specified criteria. Please check your search parameters and try again."
+            )
+
+        cls.logger.info(f"Found resource: {resources[0].data.get('name', 'Unknown')}")
+
+        try:
+            cls.logger.info("Downloading and processing boundary data...")
+            with tempfile.TemporaryDirectory() as tmpdir:
+                url, local_path = resources[0].download(folder=tmpdir)
+                cls.logger.debug(f"Downloaded file to temporary path: {local_path}")
+                with open(local_path, "rb") as f:
+                    gdf = gpd.read_file(f)
+
+            gdf = cls._map_fields(gdf, "geoBoundaries", admin_level)
+            boundaries = [
+                AdminBoundary(**row_dict) for row_dict in gdf.to_dict("records")
+            ]
+            cls.logger.info(
+                f"Successfully created {len(boundaries)} AdminBoundary objects"
+            )
+            return cls(boundaries=boundaries, level=admin_level)
+
+        except (ValueError, HTTPError, FileNotFoundError) as e:
+            cls.logger.warning(
+                f"Error loading geoBoundaries data for {country_code} at admin level {admin_level}: {str(e)}"
+            )
+            cls.logger.info("Falling back to empty instance")
+            return cls._create_empty_instance(
+                country_code, admin_level, "geoBoundaries"
+            )
+
     @classmethod
     def create(
         cls,
@@ -211,45 +301,126 @@ class AdminBoundaries(BaseModel):
         path: Optional[Union[str, "Path"]] = None,
         **kwargs,
     ) -> "AdminBoundaries":
-        """Factory method to create AdminBoundaries instance from either GADM or data store.
+        """Factory method to create AdminBoundaries instance from either GADM or data store.
+
+        Args:
+            country_code: ISO country code (2 or 3 letter) or country name
+            admin_level: Administrative level (0=country, 1=state/province, etc.)
+            data_store: Optional data store instance for loading from existing data
+            path: Optional path to data file (used with data_store)
+            **kwargs: Additional arguments passed to the underlying creation methods
+
+        Returns:
+            AdminBoundaries: Configured instance
+
+        Raises:
+            ValueError: If neither country_code nor (data_store, path) are provided,
+                or if country_code lookup fails
+
+        Example:
+            # From country code
+            boundaries = AdminBoundaries.create(country_code="USA", admin_level=1)
+
+            # From data store
+            boundaries = AdminBoundaries.create(data_store=store, path="data.shp")
+        """
         cls.logger.info(
-            f"Creating AdminBoundaries instance. Country: {country_code},
+            f"Creating AdminBoundaries instance. Country: {country_code}, "
+            f"admin level: {admin_level}, data_store provided: {data_store is not None}, "
+            f"path provided: {path is not None}"
         )
-
+
+        # Validate input parameters
+        if not country_code and not data_store:
+            raise ValueError("Either country_code or data_store must be provided.")
+
+        if data_store and not path and not country_code:
+            raise ValueError(
+                "If data_store is provided, either path or country_code must also be specified."
+            )
+
+        # Handle data store path first
         if data_store is not None:
-
-
-
-
-
+            iso3_code = None
+            if country_code:
+                try:
+                    iso3_code = pycountry.countries.lookup(country_code).alpha_3
+                except LookupError as e:
+                    raise ValueError(f"Invalid country code '{country_code}': {e}")
+
+            # Generate path if not provided
+            if path is None and iso3_code:
                 path = config.get_admin_path(
                     country_code=iso3_code,
                     admin_level=admin_level,
                 )
+
             return cls.from_data_store(data_store, path, admin_level, **kwargs)
-        elif country_code is not None:
-            from gigaspatial.handlers.unicef_georepo import GeoRepoClient
 
+        # Handle country code path
+        if country_code is not None:
             try:
-
-
-
-
-
-
-
-
+                iso3_code = pycountry.countries.lookup(country_code).alpha_3
+            except LookupError as e:
+                raise ValueError(f"Invalid country code '{country_code}': {e}")
+
+            # Try GeoRepo first
+            if cls._try_georepo(iso3_code, admin_level):
+                return cls.from_georepo(iso3_code, admin_level=admin_level)
+
+            # Fallback to GADM
+            try:
+                cls.logger.info("Attempting to load from GADM.")
+                return cls.from_gadm(iso3_code, admin_level, **kwargs)
+            except Exception as e:
                 cls.logger.warning(
-                    f"
+                    f"GADM loading failed: {e}. Falling back to geoBoundaries."
                 )
+
+            # Final fallback to geoBoundaries
+            try:
+                return cls.from_geoboundaries(iso3_code, admin_level)
             except Exception as e:
-                cls.logger.
+                cls.logger.error(f"All data sources failed. geoBoundaries error: {e}")
+                raise RuntimeError(
+                    f"Failed to load administrative boundaries for {country_code} "
+                    f"from all available sources (GeoRepo, GADM, geoBoundaries)."
+                ) from e
 
-
-
-
-
-
+        # This should never be reached due to validation above
+        raise ValueError("Unexpected error: no valid data source could be determined.")
+
+    @classmethod
+    def _try_georepo(cls, iso3_code: str, admin_level: int) -> bool:
+        """Helper method to test GeoRepo availability.
+
+        Args:
+            iso3_code: ISO3 country code
+            admin_level: Administrative level
+
+        Returns:
+            bool: True if GeoRepo is available and working, False otherwise
+        """
+        try:
+            from gigaspatial.handlers.unicef_georepo import GeoRepoClient
+
+            client = GeoRepoClient()
+            if client.check_connection():
+                cls.logger.info("GeoRepo connection successful.")
+                return True
+            else:
+                cls.logger.info("GeoRepo connection failed.")
+                return False
+
+        except ImportError:
+            cls.logger.info("GeoRepo client not available (import failed).")
+            return False
+        except ValueError as e:
+            cls.logger.warning(f"GeoRepo initialization failed: {e}")
+            return False
+        except Exception as e:
+            cls.logger.warning(f"GeoRepo error: {e}")
+            return False
 
     @classmethod
     def _create_empty_instance(
@@ -288,21 +459,3 @@ class AdminBoundaries(BaseModel):
             field_mapping[v] = k
 
         return gdf.rename(columns=field_mapping)
-
-    def to_geodataframe(self) -> gpd.GeoDataFrame:
-        """Convert the AdminBoundaries to a GeoDataFrame."""
-        if not self.boundaries:
-            if hasattr(self, "_empty_schema"):
-                columns = self._empty_schema
-            else:
-                columns = ["id", "name", "country_code", "geometry"]
-                if self.level > 0:
-                    columns.append("parent_id")
-
-            return gpd.GeoDataFrame(columns=columns, geometry="geometry", crs=4326)
-
-        return gpd.GeoDataFrame(
-            [boundary.model_dump() for boundary in self.boundaries],
-            geometry="geometry",
-            crs=4326,
-        )
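Taken together, the boundaries changes add geoBoundaries (via HDX) as a third source and make create() fall back from GeoRepo to GADM to geoBoundaries. An illustrative sketch of the new entry points (not part of the diff; the country code and admin level are placeholders):

# Illustrative usage sketch, assuming network access to the boundary sources.
from gigaspatial.handlers.boundaries import AdminBoundaries

# create() resolves the country code, tries GeoRepo, then GADM, then geoBoundaries.
boundaries = AdminBoundaries.create(country_code="KEN", admin_level=1)
gdf = boundaries.to_geodataframe()  # GeoDataFrame in EPSG:4326
print(len(gdf), list(gdf.columns))

# The geoBoundaries source can also be used directly, bypassing the fallback chain.
gb = AdminBoundaries.from_geoboundaries("KEN", admin_level=1)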
gigaspatial/handlers/giga.py
CHANGED
@@ -1,10 +1,12 @@
 import requests
 import pandas as pd
 import time
+from datetime import datetime, date
 from pydantic.dataclasses import dataclass, Field
 from pydantic import ConfigDict
 from shapely.geometry import Point
 import pycountry
+from typing import Optional, Union
 import logging
 
 from gigaspatial.config import config as global_config
@@ -143,3 +145,642 @@ class GigaSchoolLocationFetcher:
         self.logger.info(f"Created geometry for all {len(df)} records")
 
         return df
+
+
+@dataclass(config=ConfigDict(arbitrary_types_allowed=True))
+class GigaSchoolProfileFetcher:
+    """
+    Fetch and process school profile data from the Giga School Profile API.
+    This includes connectivity information and other school details.
+    """
+
+    country: str = Field(...)
+    api_url: str = Field(
+        default="https://uni-ooi-giga-maps-service.azurewebsites.net/api/v1/schools_profile/",
+        description="Base URL for the Giga School Profile API",
+    )
+    api_key: str = global_config.GIGA_SCHOOL_PROFILE_API_KEY
+    page_size: int = Field(default=1000, description="Number of records per API page")
+    sleep_time: float = Field(
+        default=0.2, description="Sleep time between API requests"
+    )
+    giga_id_school: Optional[str] = Field(
+        default=None, description="Optional specific giga school ID to fetch"
+    )
+
+    logger: logging.Logger = Field(default=None, repr=False)
+
+    def __post_init__(self):
+        try:
+            self.country = pycountry.countries.lookup(self.country).alpha_3
+        except LookupError:
+            raise ValueError(f"Invalid country code provided: {self.country}")
+
+        if self.logger is None:
+            self.logger = global_config.get_logger(self.__class__.__name__)
+
+    def fetch_profiles(self, **kwargs) -> pd.DataFrame:
+        """
+        Fetch and process school profiles including connectivity information.
+
+        Args:
+            **kwargs: Additional parameters for customization
+                - page_size: Override default page size
+                - sleep_time: Override default sleep time between requests
+                - max_pages: Limit the number of pages to fetch
+                - giga_id_school: Override default giga_id_school filter
+
+        Returns:
+            pd.DataFrame: School profiles with connectivity and geospatial info.
+        """
+        # Override defaults with kwargs if provided
+        page_size = kwargs.get("page_size", self.page_size)
+        sleep_time = kwargs.get("sleep_time", self.sleep_time)
+        max_pages = kwargs.get("max_pages", None)
+        giga_id_school = kwargs.get("giga_id_school", self.giga_id_school)
+
+        # Prepare headers
+        headers = {
+            "Authorization": f"Bearer {self.api_key}",
+            "Accept": "application/json",
+        }
+
+        all_data = []
+        page = 1
+
+        self.logger.info(
+            f"Starting to fetch school profiles for country: {self.country}"
+        )
+
+        if giga_id_school:
+            self.logger.info(f"Filtering for specific school ID: {giga_id_school}")
+
+        while True:
+            # Check if we've reached max_pages limit
+            if max_pages and page > max_pages:
+                self.logger.info(f"Reached maximum pages limit: {max_pages}")
+                break
+
+            # Build parameters
+            params = {
+                "country_iso3_code": self.country,
+                "page": page,
+                "size": page_size,
+            }
+
+            # Add giga_id_school filter if specified
+            if giga_id_school:
+                params["giga_id_school"] = giga_id_school
+
+            try:
+                self.logger.debug(f"Fetching page {page} with params: {params}")
+                response = requests.get(self.api_url, headers=headers, params=params)
+                response.raise_for_status()
+
+                parsed = response.json()
+                data = parsed.get("data", [])
+
+            except requests.exceptions.RequestException as e:
+                self.logger.error(f"Request failed on page {page}: {e}")
+                break
+            except ValueError as e:
+                self.logger.error(f"Failed to parse JSON response on page {page}: {e}")
+                break
+
+            # Check if we got any data
+            if not data:
+                self.logger.info(f"No data on page {page}. Stopping.")
+                break
+
+            all_data.extend(data)
+            self.logger.info(f"Fetched page {page} with {len(data)} records")
+
+            # If we got fewer records than page_size, we've reached the end
+            if len(data) < page_size:
+                self.logger.info("Reached end of data (partial page received)")
+                break
+
+            # If filtering by specific school ID, we likely only need one page
+            if giga_id_school:
+                self.logger.info(
+                    "Specific school ID requested, stopping after first page"
+                )
+                break
+
+            page += 1
+
+            # Sleep to be respectful to the API
+            if sleep_time > 0:
+                time.sleep(sleep_time)
+
+        self.logger.info(f"Finished fetching. Total records: {len(all_data)}")
+
+        # Convert to DataFrame and process
+        if not all_data:
+            self.logger.warning("No data fetched, returning empty DataFrame")
+            return pd.DataFrame()
+
+        df = pd.DataFrame(all_data)
+
+        return df
+
+    def get_connectivity_summary(self, df: pd.DataFrame) -> dict:
+        """
+        Generate a summary of connectivity statistics from the fetched data.
+
+        Args:
+            df: DataFrame with school profile data
+
+        Returns:
+            dict: Summary statistics about connectivity
+        """
+        if df.empty:
+            return {"error": "No data available"}
+
+        summary = {
+            "total_schools": len(df),
+            "country": (
+                df["country_iso3_code"].iloc[0]
+                if "country_iso3_code" in df.columns
+                else "Unknown"
+            ),
+        }
+
+        # Administrative region analysis
+        if "admin1" in df.columns:
+            admin1_counts = df["admin1"].value_counts().head(10).to_dict()
+            summary["top_admin1_regions"] = admin1_counts
+
+        if "admin2" in df.columns:
+            admin2_counts = df["admin2"].value_counts().head(10).to_dict()
+            summary["top_admin2_regions"] = admin2_counts
+
+        # Connectivity analysis
+        if "connectivity" in df.columns:
+            connected_count = df["connectivity"].sum()
+            summary["schools_with_connectivity"] = int(connected_count)
+            summary["connectivity_percentage"] = connected_count / len(df) * 100
+
+        if "connectivity_RT" in df.columns:
+            rt_connected_count = df["connectivity_RT"].sum()
+            summary["schools_with_realtime_connectivity"] = int(rt_connected_count)
+            summary["realtime_connectivity_percentage"] = (
+                rt_connected_count / len(df) * 100
+            )
+
+        # Connectivity type analysis
+        if "connectivity_type" in df.columns:
+
+            if not all(df.connectivity_type.isna()):
+                from collections import Counter
+
+                type_counts = dict(Counter(df.connectivity_type.dropna().to_list()))
+                summary["connectivity_types_breakdown"] = type_counts
+
+        # Data source analysis
+        if "connectivity_RT_datasource" in df.columns:
+            datasource_counts = (
+                df["connectivity_RT_datasource"].value_counts().to_dict()
+            )
+            summary["realtime_connectivity_datasources"] = datasource_counts
+
+        if "school_data_source" in df.columns:
+            school_datasource_counts = df["school_data_source"].value_counts().to_dict()
+            summary["school_data_sources"] = school_datasource_counts
+
+        self.logger.info("Generated connectivity summary")
+        return summary
+
+
+@dataclass(config=ConfigDict(arbitrary_types_allowed=True))
+class GigaSchoolMeasurementsFetcher:
+    """
+    Fetch and process school daily realtime connectivity measurements from the Giga API.
+    This includes download/upload speeds, latency, and connectivity performance data.
+    """
+
+    country: str = Field(...)
+    start_date: Union[str, date, datetime] = Field(...)
+    end_date: Union[str, date, datetime] = Field(...)
+    api_url: str = Field(
+        default="https://uni-ooi-giga-maps-service.azurewebsites.net/api/v1/all_measurements",
+        description="Base URL for the Giga School Measurements API",
+    )
+    api_key: str = global_config.GIGA_SCHOOL_MEASUREMENTS_API_KEY
+    page_size: int = Field(default=1000, description="Number of records per API page")
+    sleep_time: float = Field(
+        default=0.2, description="Sleep time between API requests"
+    )
+    giga_id_school: Optional[str] = Field(
+        default=None, description="Optional specific giga school ID to fetch"
+    )
+
+    logger: logging.Logger = Field(default=None, repr=False)
+
+    def __post_init__(self):
+        try:
+            self.country = pycountry.countries.lookup(self.country).alpha_3
+        except LookupError:
+            raise ValueError(f"Invalid country code provided: {self.country}")
+
+        # Convert dates to string format if needed
+        self.start_date = self._format_date(self.start_date)
+        self.end_date = self._format_date(self.end_date)
+
+        # Validate date range
+        if self.start_date > self.end_date:
+            raise ValueError("start_date must be before or equal to end_date")
+
+        if self.logger is None:
+            self.logger = global_config.get_logger(self.__class__.__name__)
+
+    def _format_date(self, date_input: Union[str, date, datetime]) -> str:
+        """
+        Convert date input to string format expected by API (YYYY-MM-DD).
+
+        Args:
+            date_input: Date in various formats
+
+        Returns:
+            str: Date in YYYY-MM-DD format
+        """
+        if isinstance(date_input, str):
+            # Assume it's already in correct format or parse it
+            try:
+                parsed_date = datetime.strptime(date_input, "%Y-%m-%d")
+                return date_input
+            except ValueError:
+                try:
+                    parsed_date = pd.to_datetime(date_input)
+                    return parsed_date.strftime("%Y-%m-%d")
+                except:
+                    raise ValueError(
+                        f"Invalid date format: {date_input}. Expected YYYY-MM-DD"
+                    )
+        elif isinstance(date_input, (date, datetime)):
+            return date_input.strftime("%Y-%m-%d")
+        else:
+            raise ValueError(f"Invalid date type: {type(date_input)}")
+
+    def fetch_measurements(self, **kwargs) -> pd.DataFrame:
+        """
+        Fetch and process school connectivity measurements.
+
+        Args:
+            **kwargs: Additional parameters for customization
+                - page_size: Override default page size
+                - sleep_time: Override default sleep time between requests
+                - max_pages: Limit the number of pages to fetch
+                - giga_id_school: Override default giga_id_school filter
+                - start_date: Override default start_date
+                - end_date: Override default end_date
+
+        Returns:
+            pd.DataFrame: School measurements with connectivity performance data.
+        """
+        # Override defaults with kwargs if provided
+        page_size = kwargs.get("page_size", self.page_size)
+        sleep_time = kwargs.get("sleep_time", self.sleep_time)
+        max_pages = kwargs.get("max_pages", None)
+        giga_id_school = kwargs.get("giga_id_school", self.giga_id_school)
+        start_date = kwargs.get("start_date", self.start_date)
+        end_date = kwargs.get("end_date", self.end_date)
+
+        # Format dates if overridden
+        if start_date != self.start_date:
+            start_date = self._format_date(start_date)
+        if end_date != self.end_date:
+            end_date = self._format_date(end_date)
+
+        # Prepare headers
+        headers = {
+            "Authorization": f"Bearer {self.api_key}",
+            "Accept": "application/json",
+        }
+
+        all_data = []
+        page = 1
+
+        self.logger.info(
+            f"Starting to fetch measurements for country: {self.country} "
+            f"from {start_date} to {end_date}"
+        )
+
+        if giga_id_school:
+            self.logger.info(f"Filtering for specific school ID: {giga_id_school}")
+
+        while True:
+            # Check if we've reached max_pages limit
+            if max_pages and page > max_pages:
+                self.logger.info(f"Reached maximum pages limit: {max_pages}")
+                break
+
+            # Build parameters
+            params = {
+                "country_iso3_code": self.country,
+                "start_date": start_date,
+                "end_date": end_date,
+                "page": page,
+                "size": page_size,
+            }
+
+            # Add giga_id_school filter if specified
+            if giga_id_school:
+                params["giga_id_school"] = giga_id_school
+
+            try:
+                self.logger.debug(f"Fetching page {page} with params: {params}")
+                response = requests.get(self.api_url, headers=headers, params=params)
+                response.raise_for_status()
+
+                parsed = response.json()
+                data = parsed.get("data", [])
+
+            except requests.exceptions.RequestException as e:
+                self.logger.error(f"Request failed on page {page}: {e}")
+                break
+            except ValueError as e:
+                self.logger.error(f"Failed to parse JSON response on page {page}: {e}")
+                break
+
+            # Check if we got any data
+            if not data:
+                self.logger.info(f"No data on page {page}. Stopping.")
+                break
+
+            all_data.extend(data)
+            self.logger.info(f"Fetched page {page} with {len(data)} records")
+
+            # If we got fewer records than page_size, we've reached the end
+            if len(data) < page_size:
+                self.logger.info("Reached end of data (partial page received)")
+                break
+
+            # If filtering by specific school ID, we might only need one page
+            if giga_id_school and len(all_data) > 0:
+                self.logger.info(
+                    "Specific school ID requested, checking if more data needed"
+                )
+
+            page += 1
+
+            # Sleep to be respectful to the API
+            if sleep_time > 0:
+                time.sleep(sleep_time)
+
+        self.logger.info(f"Finished fetching. Total records: {len(all_data)}")
+
+        # Convert to DataFrame and process
+        if not all_data:
+            self.logger.warning("No data fetched, returning empty DataFrame")
+            return pd.DataFrame()
+
+        df = pd.DataFrame(all_data)
+        df = self._process_measurements_data(df)
+
+        return df
+
+    def _process_measurements_data(self, df: pd.DataFrame) -> pd.DataFrame:
+        """
+        Process and enhance the DataFrame with measurement performance metrics.
+
+        Args:
+            df: Raw DataFrame from API
+
+        Returns:
+            pd.DataFrame: Enhanced DataFrame with processed measurement data
+        """
+        if df.empty:
+            return df
+
+        # Convert date column to datetime
+        if "date" in df.columns:
+            df["date"] = pd.to_datetime(df["date"], errors="coerce")
+            df["date_only"] = df["date"].dt.date
+            df["year"] = df["date"].dt.year
+            df["month"] = df["date"].dt.month
+            df["day_of_week"] = df["date"].dt.day_name()
+            self.logger.info("Processed date fields")
+
+        # Process speed measurements
+        numeric_columns = ["download_speed", "upload_speed", "latency"]
+        for col in numeric_columns:
+            if col in df.columns:
+                df[col] = pd.to_numeric(df[col], errors="coerce")
+
+        # Create performance categories
+        if "download_speed" in df.columns:
+            df["download_speed_category"] = pd.cut(
+                df["download_speed"],
+                bins=[0, 5, 25, 100, float("inf")],
+                labels=[
+                    "Very Slow (<5 Mbps)",
+                    "Slow (5-25 Mbps)",
+                    "Moderate (25-100 Mbps)",
+                    "Fast (>100 Mbps)",
+                ],
+                include_lowest=True,
+            )
+
+        if "upload_speed" in df.columns:
+            df["upload_speed_category"] = pd.cut(
+                df["upload_speed"],
+                bins=[0, 1, 10, 50, float("inf")],
+                labels=[
+                    "Very Slow (<1 Mbps)",
+                    "Slow (1-10 Mbps)",
+                    "Moderate (10-50 Mbps)",
+                    "Fast (>50 Mbps)",
+                ],
+                include_lowest=True,
+            )
+
+        if "latency" in df.columns:
+            df["latency_category"] = pd.cut(
+                df["latency"],
+                bins=[0, 50, 150, 300, float("inf")],
+                labels=[
+                    "Excellent (<50ms)",
+                    "Good (50-150ms)",
+                    "Fair (150-300ms)",
+                    "Poor (>300ms)",
+                ],
+                include_lowest=True,
+            )
+
+        # Create quality flags
+        if "download_speed" in df.columns and "upload_speed" in df.columns:
+            df["has_broadband"] = (df["download_speed"] >= 25) & (
+                df["upload_speed"] >= 3
+            )
+            df["has_basic_connectivity"] = (df["download_speed"] >= 1) & (
+                df["upload_speed"] >= 0.5
+            )
+
+            # Flag measurements with missing data
+            df["has_complete_measurement"] = (
+                df["download_speed"].notna()
+                & df["upload_speed"].notna()
+                & df["latency"].notna()
+            )
+
+        self.logger.info(f"Processed measurement data for {len(df)} records")
+
+        return df
+
+    def get_performance_summary(self, df: pd.DataFrame) -> dict:
+        """
+        Generate a comprehensive summary of connectivity performance metrics.
+
+        Args:
+            df: DataFrame with measurement data
+
+        Returns:
+            dict: Summary statistics about connectivity performance
+        """
+        if df.empty:
+            return {"error": "No data available"}
+
+        summary = {
+            "total_measurements": len(df),
+            "country": (
+                df["country_iso3_code"].iloc[0]
+                if "country_iso3_code" in df.columns
+                else "Unknown"
+            ),
+            "date_range": {
+                "start": (
+                    df["date"].min().strftime("%Y-%m-%d")
+                    if "date" in df.columns
+                    else None
+                ),
+                "end": (
+                    df["date"].max().strftime("%Y-%m-%d")
+                    if "date" in df.columns
+                    else None
+                ),
+            },
+        }
+
+        # School coverage
+        if "giga_id_school" in df.columns:
+            unique_schools = df["giga_id_school"].nunique()
+            summary["unique_schools_measured"] = unique_schools
+            summary["avg_measurements_per_school"] = (
+                len(df) / unique_schools if unique_schools > 0 else 0
+            )
+
+        # Speed statistics
+        for speed_col in ["download_speed", "upload_speed"]:
+            if speed_col in df.columns:
+                speed_data = df[speed_col].dropna()
+                if len(speed_data) > 0:
+                    summary[f"{speed_col}_stats"] = {
+                        "mean": float(speed_data.mean()),
+                        "median": float(speed_data.median()),
+                        "min": float(speed_data.min()),
+                        "max": float(speed_data.max()),
+                        "std": float(speed_data.std()),
+                    }
+
+        # Latency statistics
+        if "latency" in df.columns:
+            latency_data = df["latency"].dropna()
+            if len(latency_data) > 0:
+                summary["latency_stats"] = {
+                    "mean": float(latency_data.mean()),
+                    "median": float(latency_data.median()),
+                    "min": float(latency_data.min()),
+                    "max": float(latency_data.max()),
+                    "std": float(latency_data.std()),
+                }
+
+        # Performance categories
+        for cat_col in [
+            "download_speed_category",
+            "upload_speed_category",
+            "latency_category",
+        ]:
+            if cat_col in df.columns:
+                cat_counts = df[cat_col].value_counts().to_dict()
+                summary[cat_col.replace("_category", "_breakdown")] = cat_counts
+
+        # Quality metrics
+        if "has_broadband" in df.columns:
+            summary["broadband_capable_measurements"] = int(df["has_broadband"].sum())
+            summary["broadband_percentage"] = float(df["has_broadband"].mean() * 100)
+
+        if "has_basic_connectivity" in df.columns:
+            summary["basic_connectivity_measurements"] = int(
+                df["has_basic_connectivity"].sum()
+            )
+            summary["basic_connectivity_percentage"] = float(
+                df["has_basic_connectivity"].mean() * 100
+            )
+
+        # Data completeness
+        if "has_complete_measurement" in df.columns:
+            summary["complete_measurements"] = int(df["has_complete_measurement"].sum())
+            summary["data_completeness_percentage"] = float(
+                df["has_complete_measurement"].mean() * 100
+            )
+
+        # Data sources
+        if "data_source" in df.columns:
+            source_counts = df["data_source"].value_counts().to_dict()
+            summary["data_sources"] = source_counts
+
+        # Temporal patterns
+        if "day_of_week" in df.columns:
+            day_counts = df["day_of_week"].value_counts().to_dict()
+            summary["measurements_by_day_of_week"] = day_counts
+
+        self.logger.info("Generated performance summary")
+        return summary
+
+    def get_school_performance_comparison(
+        self, df: pd.DataFrame, top_n: int = 10
+    ) -> dict:
+        """
+        Compare performance across schools.
+
+        Args:
+            df: DataFrame with measurement data
+            top_n: Number of top/bottom schools to include
+
+        Returns:
+            dict: School performance comparison
+        """
+        if df.empty or "giga_id_school" not in df.columns:
+            return {"error": "No school data available"}
+
+        school_stats = (
+            df.groupby("giga_id_school")
+            .agg(
+                {
+                    "download_speed": ["mean", "median", "count"],
+                    "upload_speed": ["mean", "median"],
+                    "latency": ["mean", "median"],
+                    "has_broadband": (
+                        "mean" if "has_broadband" in df.columns else lambda x: None
+                    ),
+                }
+            )
+            .round(2)
+        )
+
+        # Flatten column names
+        school_stats.columns = ["_".join(col).strip() for col in school_stats.columns]
+
+        # Sort by download speed
+        if "download_speed_mean" in school_stats.columns:
+            top_schools = school_stats.nlargest(top_n, "download_speed_mean")
+            bottom_schools = school_stats.nsmallest(top_n, "download_speed_mean")
+
+            return {
+                "top_performing_schools": top_schools.to_dict("index"),
+                "bottom_performing_schools": bottom_schools.to_dict("index"),
+                "total_schools_analyzed": len(school_stats),
+            }
+
+        return {"error": "Insufficient data for school comparison"}
gigaspatial/handlers/hdx.py
CHANGED
@@ -1,13 +1,9 @@
-import os
 import logging
 from tqdm import tqdm
 from pathlib import Path
-from typing import List, Optional,
+from typing import List, Optional, Union, Dict, Any, Iterable
 import tempfile
-import functools
-import multiprocessing
 
-import pandas as pd
 import geopandas as gpd
 from pydantic import Field, ConfigDict
 from pydantic.dataclasses import dataclass
@@ -50,6 +46,48 @@ class HDXConfig(BaseHandlerConfig):
     _hdx_configured: bool = Field(default=False, init=False)
     dataset: Optional[Dataset] = Field(default=None, init=False)
 
+    @staticmethod
+    def search_datasets(
+        query: str,
+        rows: int = None,
+        sort: str = "relevance asc, metadata_modified desc",
+        hdx_site: str = "prod",
+        user_agent: str = "gigaspatial",
+    ) -> List[Dict]:
+        """Search for datasets in HDX before initializing the class.
+
+        Args:
+            query: Search query string
+            rows: Number of results per page. Defaults to all datasets (sys.maxsize).
+            sort: Sort order - one of 'relevance', 'views_recent', 'views_total', 'last_modified' (default: 'relevance')
+            hdx_site: HDX site to use - 'prod' or 'test' (default: 'prod')
+            user_agent: User agent for HDX API requests (default: 'gigaspatial')
+
+        Returns:
+            List of dataset dictionaries containing search results
+
+        Example:
+            >>> results = HDXConfig.search_datasets("population", rows=5)
+            >>> for dataset in results:
+            >>>     print(f"Name: {dataset['name']}, Title: {dataset['title']}")
+        """
+        try:
+            Configuration.create(
+                hdx_site=hdx_site,
+                user_agent=user_agent,
+                hdx_read_only=True,
+            )
+        except:
+            pass
+
+        try:
+            results = Dataset.search_in_hdx(query=query, rows=rows, sort=sort)
+
+            return results
+        except Exception as e:
+            logging.error(f"Error searching HDX datasets: {str(e)}")
+            raise
+
     def __post_init__(self):
         super().__post_init__()
         try:
@@ -85,7 +123,11 @@ class HDXConfig(BaseHandlerConfig):
             self.logger.info(f"Fetching HDX dataset: {self.dataset_name}")
             dataset = Dataset.read_from_hdx(self.dataset_name)
             if not dataset:
-                raise ValueError(
+                raise ValueError(
+                    f"Dataset '{self.dataset_name}' not found on HDX. "
+                    "Please verify the dataset name or use search_datasets() "
+                    "to find available datasets."
+                )
             return dataset
         except Exception as e:
             self.logger.error(f"Error fetching HDX dataset: {str(e)}")
@@ -386,9 +428,9 @@ class HDXReader(BaseHandlerReader):
         self, source_data_path: List[Union[str, Path]], **kwargs
     ) -> Any:
         """Load data from paths"""
-        if len(source_data_path)==1:
+        if len(source_data_path) == 1:
            return read_dataset(self.data_store, source_data_path[0])
-
+
        all_data = {}
        for file_path in source_data_path:
            try:
@@ -401,49 +443,6 @@ class HDXReader(BaseHandlerReader):
         resources = self.config.list_resources()
         return self.load_from_paths(resources)
 
-    # def read_resource(
-    #     self, resource_file: str
-    # ) -> Union[pd.DataFrame, gpd.GeoDataFrame]:
-    #     """Read a specific resource file from the dataset using the data_store."""
-    #     if not self.dataset_path:
-    #         raise ValueError("No dataset path configured")
-
-    #     file_path = str(self.dataset_path / resource_file)
-
-    #     if not self.data_store.file_exists(file_path):
-    #         raise FileNotFoundError(
-    #             f"Resource file {resource_file} not found in dataset"
-    #         )
-
-    #     try:
-    #         return read_dataset(self.data_store, file_path)
-    #     except Exception as e:
-    #         raise ValueError(f"Could not read file {file_path}: {str(e)}")
-
-    # def read_all_resources(self) -> Dict[str, Union[pd.DataFrame, gpd.GeoDataFrame]]:
-    #     """Read all resources in the dataset directory using the data_store."""
-    #     resources = self.list_resources()
-    #     result = {}
-
-    #     for resource in resources:
-    #         try:
-    #             result[resource] = self.read_resource(resource)
-    #         except Exception as e:
-    #             self.logger.warning(f"Could not read resource {resource}: {str(e)}")
-
-    #     return result
-
-    # def load_from_paths(
-    #     self, source_data_path: List[Union[str, Path]], **kwargs
-    # ) -> Union[
-    #     pd.DataFrame, gpd.GeoDataFrame, Dict[str, Union[pd.DataFrame, gpd.GeoDataFrame]]
-    # ]:
-    #     """Load data from paths"""
-    #     if len(source_data_path) == 1:
-    #         return self.read_resource(str(source_data_path[0]))
-    #     else:
-    #         return self.read_all_resources()
-
 
 class HDXHandler(BaseHandler):
     """Handler for HDX datasets"""
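The new static search_datasets() helper lets callers discover HDX datasets before constructing an HDXConfig, and it is what boundaries.py uses to locate geoBoundaries resources. A short sketch based on the docstring example above (the query string is a placeholder):

# Sketch of the new HDX search helper; requires network access to HDX.
from gigaspatial.handlers.hdx import HDXConfig

results = HDXConfig.search_datasets("population", rows=5)
for dataset in results:
    print(f"Name: {dataset['name']}, Title: {dataset['title']}")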
gigaspatial/handlers/maxar_image.py
CHANGED
@@ -14,7 +14,6 @@ from gigaspatial.processing.geo import (
     convert_to_geodataframe,
     buffer_geodataframe,
 )
-from gigaspatial.processing.sat_images import calculate_pixels_at_location
 from gigaspatial.config import config as global_config
 
 
@@ -142,7 +141,7 @@ class MaxarImageDownloader:
                 self.logger.warning(
                     f"Attempt {attempt + 1} of downloading {output_path.name} failed: {str(e)}"
                 )
-                if attempt < self.max_retries - 1:
+                if attempt < self.config.max_retries - 1:
                     sleep(self.config.retry_delay)
                 else:
                     self.logger.warning(
File without changes
File without changes
File without changes