giga-spatial 0.6.2-py3-none-any.whl → 0.6.4-py3-none-any.whl
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.
- {giga_spatial-0.6.2.dist-info → giga_spatial-0.6.4.dist-info}/METADATA +18 -8
- {giga_spatial-0.6.2.dist-info → giga_spatial-0.6.4.dist-info}/RECORD +15 -15
- gigaspatial/__init__.py +1 -1
- gigaspatial/config.py +6 -0
- gigaspatial/handlers/__init__.py +7 -3
- gigaspatial/handlers/boundaries.py +196 -43
- gigaspatial/handlers/ghsl.py +7 -6
- gigaspatial/handlers/giga.py +641 -0
- gigaspatial/handlers/hdx.py +411 -143
- gigaspatial/handlers/maxar_image.py +1 -2
- gigaspatial/handlers/rwi.py +119 -121
- gigaspatial/processing/tif_processor.py +88 -2
- {giga_spatial-0.6.2.dist-info → giga_spatial-0.6.4.dist-info}/WHEEL +0 -0
- {giga_spatial-0.6.2.dist-info → giga_spatial-0.6.4.dist-info}/licenses/LICENSE +0 -0
- {giga_spatial-0.6.2.dist-info → giga_spatial-0.6.4.dist-info}/top_level.txt +0 -0
gigaspatial/handlers/giga.py
CHANGED
@@ -1,10 +1,12 @@
 import requests
 import pandas as pd
 import time
+from datetime import datetime, date
 from pydantic.dataclasses import dataclass, Field
 from pydantic import ConfigDict
 from shapely.geometry import Point
 import pycountry
+from typing import Optional, Union
 import logging
 
 from gigaspatial.config import config as global_config
@@ -143,3 +145,642 @@ class GigaSchoolLocationFetcher:
         self.logger.info(f"Created geometry for all {len(df)} records")
 
         return df
+
+
+@dataclass(config=ConfigDict(arbitrary_types_allowed=True))
+class GigaSchoolProfileFetcher:
+    """
+    Fetch and process school profile data from the Giga School Profile API.
+    This includes connectivity information and other school details.
+    """
+
+    country: str = Field(...)
+    api_url: str = Field(
+        default="https://uni-ooi-giga-maps-service.azurewebsites.net/api/v1/schools_profile/",
+        description="Base URL for the Giga School Profile API",
+    )
+    api_key: str = global_config.GIGA_SCHOOL_PROFILE_API_KEY
+    page_size: int = Field(default=1000, description="Number of records per API page")
+    sleep_time: float = Field(
+        default=0.2, description="Sleep time between API requests"
+    )
+    giga_id_school: Optional[str] = Field(
+        default=None, description="Optional specific giga school ID to fetch"
+    )
+
+    logger: logging.Logger = Field(default=None, repr=False)
+
+    def __post_init__(self):
+        try:
+            self.country = pycountry.countries.lookup(self.country).alpha_3
+        except LookupError:
+            raise ValueError(f"Invalid country code provided: {self.country}")
+
+        if self.logger is None:
+            self.logger = global_config.get_logger(self.__class__.__name__)
+
+    def fetch_profiles(self, **kwargs) -> pd.DataFrame:
+        """
+        Fetch and process school profiles including connectivity information.
+
+        Args:
+            **kwargs: Additional parameters for customization
+                - page_size: Override default page size
+                - sleep_time: Override default sleep time between requests
+                - max_pages: Limit the number of pages to fetch
+                - giga_id_school: Override default giga_id_school filter
+
+        Returns:
+            pd.DataFrame: School profiles with connectivity and geospatial info.
+        """
+        # Override defaults with kwargs if provided
+        page_size = kwargs.get("page_size", self.page_size)
+        sleep_time = kwargs.get("sleep_time", self.sleep_time)
+        max_pages = kwargs.get("max_pages", None)
+        giga_id_school = kwargs.get("giga_id_school", self.giga_id_school)
+
+        # Prepare headers
+        headers = {
+            "Authorization": f"Bearer {self.api_key}",
+            "Accept": "application/json",
+        }
+
+        all_data = []
+        page = 1
+
+        self.logger.info(
+            f"Starting to fetch school profiles for country: {self.country}"
+        )
+
+        if giga_id_school:
+            self.logger.info(f"Filtering for specific school ID: {giga_id_school}")
+
+        while True:
+            # Check if we've reached max_pages limit
+            if max_pages and page > max_pages:
+                self.logger.info(f"Reached maximum pages limit: {max_pages}")
+                break
+
+            # Build parameters
+            params = {
+                "country_iso3_code": self.country,
+                "page": page,
+                "size": page_size,
+            }
+
+            # Add giga_id_school filter if specified
+            if giga_id_school:
+                params["giga_id_school"] = giga_id_school
+
+            try:
+                self.logger.debug(f"Fetching page {page} with params: {params}")
+                response = requests.get(self.api_url, headers=headers, params=params)
+                response.raise_for_status()
+
+                parsed = response.json()
+                data = parsed.get("data", [])
+
+            except requests.exceptions.RequestException as e:
+                self.logger.error(f"Request failed on page {page}: {e}")
+                break
+            except ValueError as e:
+                self.logger.error(f"Failed to parse JSON response on page {page}: {e}")
+                break
+
+            # Check if we got any data
+            if not data:
+                self.logger.info(f"No data on page {page}. Stopping.")
+                break
+
+            all_data.extend(data)
+            self.logger.info(f"Fetched page {page} with {len(data)} records")
+
+            # If we got fewer records than page_size, we've reached the end
+            if len(data) < page_size:
+                self.logger.info("Reached end of data (partial page received)")
+                break
+
+            # If filtering by specific school ID, we likely only need one page
+            if giga_id_school:
+                self.logger.info(
+                    "Specific school ID requested, stopping after first page"
+                )
+                break
+
+            page += 1
+
+            # Sleep to be respectful to the API
+            if sleep_time > 0:
+                time.sleep(sleep_time)
+
+        self.logger.info(f"Finished fetching. Total records: {len(all_data)}")
+
+        # Convert to DataFrame and process
+        if not all_data:
+            self.logger.warning("No data fetched, returning empty DataFrame")
+            return pd.DataFrame()
+
+        df = pd.DataFrame(all_data)
+
+        return df
+
+    def get_connectivity_summary(self, df: pd.DataFrame) -> dict:
+        """
+        Generate a summary of connectivity statistics from the fetched data.
+
+        Args:
+            df: DataFrame with school profile data
+
+        Returns:
+            dict: Summary statistics about connectivity
+        """
+        if df.empty:
+            return {"error": "No data available"}
+
+        summary = {
+            "total_schools": len(df),
+            "country": (
+                df["country_iso3_code"].iloc[0]
+                if "country_iso3_code" in df.columns
+                else "Unknown"
+            ),
+        }
+
+        # Administrative region analysis
+        if "admin1" in df.columns:
+            admin1_counts = df["admin1"].value_counts().head(10).to_dict()
+            summary["top_admin1_regions"] = admin1_counts
+
+        if "admin2" in df.columns:
+            admin2_counts = df["admin2"].value_counts().head(10).to_dict()
+            summary["top_admin2_regions"] = admin2_counts
+
+        # Connectivity analysis
+        if "connectivity" in df.columns:
+            connected_count = df["connectivity"].sum()
+            summary["schools_with_connectivity"] = int(connected_count)
+            summary["connectivity_percentage"] = connected_count / len(df) * 100
+
+        if "connectivity_RT" in df.columns:
+            rt_connected_count = df["connectivity_RT"].sum()
+            summary["schools_with_realtime_connectivity"] = int(rt_connected_count)
+            summary["realtime_connectivity_percentage"] = (
+                rt_connected_count / len(df) * 100
+            )
+
+        # Connectivity type analysis
+        if "connectivity_type" in df.columns:
+
+            if not all(df.connectivity_type.isna()):
+                from collections import Counter
+
+                type_counts = dict(Counter(df.connectivity_type.dropna().to_list()))
+                summary["connectivity_types_breakdown"] = type_counts
+
+        # Data source analysis
+        if "connectivity_RT_datasource" in df.columns:
+            datasource_counts = (
+                df["connectivity_RT_datasource"].value_counts().to_dict()
+            )
+            summary["realtime_connectivity_datasources"] = datasource_counts
+
+        if "school_data_source" in df.columns:
+            school_datasource_counts = df["school_data_source"].value_counts().to_dict()
+            summary["school_data_sources"] = school_datasource_counts
+
+        self.logger.info("Generated connectivity summary")
+        return summary
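For orientation, here is a minimal usage sketch of the new profile fetcher. It is illustrative, not taken from the package docs, and assumes GIGA_SCHOOL_PROFILE_API_KEY is set in the gigaspatial config:

    from gigaspatial.handlers.giga import GigaSchoolProfileFetcher

    # country accepts anything pycountry can resolve; __post_init__ normalizes it to ISO3
    fetcher = GigaSchoolProfileFetcher(country="Brazil")
    profiles = fetcher.fetch_profiles(max_pages=2)  # cap pages while experimenting
    summary = fetcher.get_connectivity_summary(profiles)
    print(summary.get("connectivity_percentage"))

The hunk continues with the second new class: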
+
+
+@dataclass(config=ConfigDict(arbitrary_types_allowed=True))
+class GigaSchoolMeasurementsFetcher:
+    """
+    Fetch and process school daily realtime connectivity measurements from the Giga API.
+    This includes download/upload speeds, latency, and connectivity performance data.
+    """
+
+    country: str = Field(...)
+    start_date: Union[str, date, datetime] = Field(...)
+    end_date: Union[str, date, datetime] = Field(...)
+    api_url: str = Field(
+        default="https://uni-ooi-giga-maps-service.azurewebsites.net/api/v1/all_measurements",
+        description="Base URL for the Giga School Measurements API",
+    )
+    api_key: str = global_config.GIGA_SCHOOL_MEASUREMENTS_API_KEY
+    page_size: int = Field(default=1000, description="Number of records per API page")
+    sleep_time: float = Field(
+        default=0.2, description="Sleep time between API requests"
+    )
+    giga_id_school: Optional[str] = Field(
+        default=None, description="Optional specific giga school ID to fetch"
+    )
+
+    logger: logging.Logger = Field(default=None, repr=False)
+
+    def __post_init__(self):
+        try:
+            self.country = pycountry.countries.lookup(self.country).alpha_3
+        except LookupError:
+            raise ValueError(f"Invalid country code provided: {self.country}")
+
+        # Convert dates to string format if needed
+        self.start_date = self._format_date(self.start_date)
+        self.end_date = self._format_date(self.end_date)
+
+        # Validate date range
+        if self.start_date > self.end_date:
+            raise ValueError("start_date must be before or equal to end_date")
+
+        if self.logger is None:
+            self.logger = global_config.get_logger(self.__class__.__name__)
+
+    def _format_date(self, date_input: Union[str, date, datetime]) -> str:
+        """
+        Convert date input to string format expected by API (YYYY-MM-DD).
+
+        Args:
+            date_input: Date in various formats
+
+        Returns:
+            str: Date in YYYY-MM-DD format
+        """
+        if isinstance(date_input, str):
+            # Assume it's already in correct format or parse it
+            try:
+                parsed_date = datetime.strptime(date_input, "%Y-%m-%d")
+                return date_input
+            except ValueError:
+                try:
+                    parsed_date = pd.to_datetime(date_input)
+                    return parsed_date.strftime("%Y-%m-%d")
+                except:
+                    raise ValueError(
+                        f"Invalid date format: {date_input}. Expected YYYY-MM-DD"
+                    )
+        elif isinstance(date_input, (date, datetime)):
+            return date_input.strftime("%Y-%m-%d")
+        else:
+            raise ValueError(f"Invalid date type: {type(date_input)}")
+
+    def fetch_measurements(self, **kwargs) -> pd.DataFrame:
+        """
+        Fetch and process school connectivity measurements.
+
+        Args:
+            **kwargs: Additional parameters for customization
+                - page_size: Override default page size
+                - sleep_time: Override default sleep time between requests
+                - max_pages: Limit the number of pages to fetch
+                - giga_id_school: Override default giga_id_school filter
+                - start_date: Override default start_date
+                - end_date: Override default end_date
+
+        Returns:
+            pd.DataFrame: School measurements with connectivity performance data.
+        """
+        # Override defaults with kwargs if provided
+        page_size = kwargs.get("page_size", self.page_size)
+        sleep_time = kwargs.get("sleep_time", self.sleep_time)
+        max_pages = kwargs.get("max_pages", None)
+        giga_id_school = kwargs.get("giga_id_school", self.giga_id_school)
+        start_date = kwargs.get("start_date", self.start_date)
+        end_date = kwargs.get("end_date", self.end_date)
+
+        # Format dates if overridden
+        if start_date != self.start_date:
+            start_date = self._format_date(start_date)
+        if end_date != self.end_date:
+            end_date = self._format_date(end_date)
+
+        # Prepare headers
+        headers = {
+            "Authorization": f"Bearer {self.api_key}",
+            "Accept": "application/json",
+        }
+
+        all_data = []
+        page = 1
+
+        self.logger.info(
+            f"Starting to fetch measurements for country: {self.country} "
+            f"from {start_date} to {end_date}"
+        )
+
+        if giga_id_school:
+            self.logger.info(f"Filtering for specific school ID: {giga_id_school}")
+
+        while True:
+            # Check if we've reached max_pages limit
+            if max_pages and page > max_pages:
+                self.logger.info(f"Reached maximum pages limit: {max_pages}")
+                break
+
+            # Build parameters
+            params = {
+                "country_iso3_code": self.country,
+                "start_date": start_date,
+                "end_date": end_date,
+                "page": page,
+                "size": page_size,
+            }
+
+            # Add giga_id_school filter if specified
+            if giga_id_school:
+                params["giga_id_school"] = giga_id_school
+
+            try:
+                self.logger.debug(f"Fetching page {page} with params: {params}")
+                response = requests.get(self.api_url, headers=headers, params=params)
+                response.raise_for_status()
+
+                parsed = response.json()
+                data = parsed.get("data", [])
+
+            except requests.exceptions.RequestException as e:
+                self.logger.error(f"Request failed on page {page}: {e}")
+                break
+            except ValueError as e:
+                self.logger.error(f"Failed to parse JSON response on page {page}: {e}")
+                break
+
+            # Check if we got any data
+            if not data:
+                self.logger.info(f"No data on page {page}. Stopping.")
+                break
+
+            all_data.extend(data)
+            self.logger.info(f"Fetched page {page} with {len(data)} records")
+
+            # If we got fewer records than page_size, we've reached the end
+            if len(data) < page_size:
+                self.logger.info("Reached end of data (partial page received)")
+                break
+
+            # If filtering by specific school ID, we might only need one page
+            if giga_id_school and len(all_data) > 0:
+                self.logger.info(
+                    "Specific school ID requested, checking if more data needed"
+                )
+
+            page += 1
+
+            # Sleep to be respectful to the API
+            if sleep_time > 0:
+                time.sleep(sleep_time)
+
+        self.logger.info(f"Finished fetching. Total records: {len(all_data)}")
+
+        # Convert to DataFrame and process
+        if not all_data:
+            self.logger.warning("No data fetched, returning empty DataFrame")
+            return pd.DataFrame()
+
+        df = pd.DataFrame(all_data)
+        df = self._process_measurements_data(df)
+
+        return df
+
+    def _process_measurements_data(self, df: pd.DataFrame) -> pd.DataFrame:
+        """
+        Process and enhance the DataFrame with measurement performance metrics.
+
+        Args:
+            df: Raw DataFrame from API
+
+        Returns:
+            pd.DataFrame: Enhanced DataFrame with processed measurement data
+        """
+        if df.empty:
+            return df
+
+        # Convert date column to datetime
+        if "date" in df.columns:
+            df["date"] = pd.to_datetime(df["date"], errors="coerce")
+            df["date_only"] = df["date"].dt.date
+            df["year"] = df["date"].dt.year
+            df["month"] = df["date"].dt.month
+            df["day_of_week"] = df["date"].dt.day_name()
+            self.logger.info("Processed date fields")
+
+        # Process speed measurements
+        numeric_columns = ["download_speed", "upload_speed", "latency"]
+        for col in numeric_columns:
+            if col in df.columns:
+                df[col] = pd.to_numeric(df[col], errors="coerce")
+
+        # Create performance categories
+        if "download_speed" in df.columns:
+            df["download_speed_category"] = pd.cut(
+                df["download_speed"],
+                bins=[0, 5, 25, 100, float("inf")],
+                labels=[
+                    "Very Slow (<5 Mbps)",
+                    "Slow (5-25 Mbps)",
+                    "Moderate (25-100 Mbps)",
+                    "Fast (>100 Mbps)",
+                ],
+                include_lowest=True,
+            )
+
+        if "upload_speed" in df.columns:
+            df["upload_speed_category"] = pd.cut(
+                df["upload_speed"],
+                bins=[0, 1, 10, 50, float("inf")],
+                labels=[
+                    "Very Slow (<1 Mbps)",
+                    "Slow (1-10 Mbps)",
+                    "Moderate (10-50 Mbps)",
+                    "Fast (>50 Mbps)",
+                ],
+                include_lowest=True,
+            )
+
+        if "latency" in df.columns:
+            df["latency_category"] = pd.cut(
+                df["latency"],
+                bins=[0, 50, 150, 300, float("inf")],
+                labels=[
+                    "Excellent (<50ms)",
+                    "Good (50-150ms)",
+                    "Fair (150-300ms)",
+                    "Poor (>300ms)",
+                ],
+                include_lowest=True,
+            )
+
+        # Create quality flags
+        if "download_speed" in df.columns and "upload_speed" in df.columns:
+            df["has_broadband"] = (df["download_speed"] >= 25) & (
+                df["upload_speed"] >= 3
+            )
+            df["has_basic_connectivity"] = (df["download_speed"] >= 1) & (
+                df["upload_speed"] >= 0.5
+            )
+
+        # Flag measurements with missing data
+        df["has_complete_measurement"] = (
+            df["download_speed"].notna()
+            & df["upload_speed"].notna()
+            & df["latency"].notna()
+        )
+
+        self.logger.info(f"Processed measurement data for {len(df)} records")
+
+        return df
+
+    def get_performance_summary(self, df: pd.DataFrame) -> dict:
+        """
+        Generate a comprehensive summary of connectivity performance metrics.
+
+        Args:
+            df: DataFrame with measurement data
+
+        Returns:
+            dict: Summary statistics about connectivity performance
+        """
+        if df.empty:
+            return {"error": "No data available"}
+
+        summary = {
+            "total_measurements": len(df),
+            "country": (
+                df["country_iso3_code"].iloc[0]
+                if "country_iso3_code" in df.columns
+                else "Unknown"
+            ),
+            "date_range": {
+                "start": (
+                    df["date"].min().strftime("%Y-%m-%d")
+                    if "date" in df.columns
+                    else None
+                ),
+                "end": (
+                    df["date"].max().strftime("%Y-%m-%d")
+                    if "date" in df.columns
+                    else None
+                ),
+            },
+        }
+
+        # School coverage
+        if "giga_id_school" in df.columns:
+            unique_schools = df["giga_id_school"].nunique()
+            summary["unique_schools_measured"] = unique_schools
+            summary["avg_measurements_per_school"] = (
+                len(df) / unique_schools if unique_schools > 0 else 0
+            )
+
+        # Speed statistics
+        for speed_col in ["download_speed", "upload_speed"]:
+            if speed_col in df.columns:
+                speed_data = df[speed_col].dropna()
+                if len(speed_data) > 0:
+                    summary[f"{speed_col}_stats"] = {
+                        "mean": float(speed_data.mean()),
+                        "median": float(speed_data.median()),
+                        "min": float(speed_data.min()),
+                        "max": float(speed_data.max()),
+                        "std": float(speed_data.std()),
+                    }
+
+        # Latency statistics
+        if "latency" in df.columns:
+            latency_data = df["latency"].dropna()
+            if len(latency_data) > 0:
+                summary["latency_stats"] = {
+                    "mean": float(latency_data.mean()),
+                    "median": float(latency_data.median()),
+                    "min": float(latency_data.min()),
+                    "max": float(latency_data.max()),
+                    "std": float(latency_data.std()),
+                }
+
+        # Performance categories
+        for cat_col in [
+            "download_speed_category",
+            "upload_speed_category",
+            "latency_category",
+        ]:
+            if cat_col in df.columns:
+                cat_counts = df[cat_col].value_counts().to_dict()
+                summary[cat_col.replace("_category", "_breakdown")] = cat_counts
+
+        # Quality metrics
+        if "has_broadband" in df.columns:
+            summary["broadband_capable_measurements"] = int(df["has_broadband"].sum())
+            summary["broadband_percentage"] = float(df["has_broadband"].mean() * 100)
+
+        if "has_basic_connectivity" in df.columns:
+            summary["basic_connectivity_measurements"] = int(
+                df["has_basic_connectivity"].sum()
+            )
+            summary["basic_connectivity_percentage"] = float(
+                df["has_basic_connectivity"].mean() * 100
+            )
+
+        # Data completeness
+        if "has_complete_measurement" in df.columns:
+            summary["complete_measurements"] = int(df["has_complete_measurement"].sum())
+            summary["data_completeness_percentage"] = float(
+                df["has_complete_measurement"].mean() * 100
+            )
+
+        # Data sources
+        if "data_source" in df.columns:
+            source_counts = df["data_source"].value_counts().to_dict()
+            summary["data_sources"] = source_counts
+
+        # Temporal patterns
+        if "day_of_week" in df.columns:
+            day_counts = df["day_of_week"].value_counts().to_dict()
+            summary["measurements_by_day_of_week"] = day_counts
+
+        self.logger.info("Generated performance summary")
+        return summary
+
+    def get_school_performance_comparison(
+        self, df: pd.DataFrame, top_n: int = 10
+    ) -> dict:
+        """
+        Compare performance across schools.
+
+        Args:
+            df: DataFrame with measurement data
+            top_n: Number of top/bottom schools to include
+
+        Returns:
+            dict: School performance comparison
+        """
+        if df.empty or "giga_id_school" not in df.columns:
+            return {"error": "No school data available"}
+
+        school_stats = (
+            df.groupby("giga_id_school")
+            .agg(
+                {
+                    "download_speed": ["mean", "median", "count"],
+                    "upload_speed": ["mean", "median"],
+                    "latency": ["mean", "median"],
+                    "has_broadband": (
+                        "mean" if "has_broadband" in df.columns else lambda x: None
+                    ),
+                }
+            )
+            .round(2)
+        )
+
+        # Flatten column names
+        school_stats.columns = ["_".join(col).strip() for col in school_stats.columns]
+
+        # Sort by download speed
+        if "download_speed_mean" in school_stats.columns:
+            top_schools = school_stats.nlargest(top_n, "download_speed_mean")
+            bottom_schools = school_stats.nsmallest(top_n, "download_speed_mean")
+
+            return {
+                "top_performing_schools": top_schools.to_dict("index"),
+                "bottom_performing_schools": bottom_schools.to_dict("index"),
+                "total_schools_analyzed": len(school_stats),
+            }
+
+        return {"error": "Insufficient data for school comparison"}
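A matching sketch for the measurements fetcher, again illustrative rather than from the package docs; it assumes GIGA_SCHOOL_MEASUREMENTS_API_KEY is configured, and the date range below is arbitrary:

    from gigaspatial.handlers.giga import GigaSchoolMeasurementsFetcher

    fetcher = GigaSchoolMeasurementsFetcher(
        country="KEN",
        start_date="2024-01-01",  # str, date, or datetime are all accepted
        end_date="2024-03-31",
    )
    # fetch_measurements also runs _process_measurements_data, which adds the
    # speed/latency category and quality-flag columns used by the summaries
    measurements = fetcher.fetch_measurements(max_pages=5)
    perf = fetcher.get_performance_summary(measurements)
    comparison = fetcher.get_school_performance_comparison(measurements, top_n=5)
    print(perf.get("broadband_percentage"), comparison.get("total_schools_analyzed"))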