giga-spatial 0.6.2-py3-none-any.whl → 0.6.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,10 +1,12 @@
  import requests
  import pandas as pd
  import time
+ from datetime import datetime, date
  import requests
  from pydantic.dataclasses import dataclass, Field
  from pydantic import ConfigDict
  from shapely.geometry import Point
  import pycountry
+ from typing import Optional, Union
  import logging
 
  from gigaspatial.config import config as global_config
@@ -143,3 +145,642 @@ class GigaSchoolLocationFetcher:
          self.logger.info(f"Created geometry for all {len(df)} records")
 
          return df
+
+
+ @dataclass(config=ConfigDict(arbitrary_types_allowed=True))
+ class GigaSchoolProfileFetcher:
+     """
+     Fetch and process school profile data from the Giga School Profile API.
+     This includes connectivity information and other school details.
+     """
+
+     country: str = Field(...)
+     api_url: str = Field(
+         default="https://uni-ooi-giga-maps-service.azurewebsites.net/api/v1/schools_profile/",
+         description="Base URL for the Giga School Profile API",
+     )
+     api_key: str = global_config.GIGA_SCHOOL_PROFILE_API_KEY
+     page_size: int = Field(default=1000, description="Number of records per API page")
+     sleep_time: float = Field(
+         default=0.2, description="Sleep time between API requests"
+     )
+     giga_id_school: Optional[str] = Field(
+         default=None, description="Optional specific giga school ID to fetch"
+     )
+
+     logger: Optional[logging.Logger] = Field(default=None, repr=False)
+
+     def __post_init__(self):
+         try:
+             self.country = pycountry.countries.lookup(self.country).alpha_3
+         except LookupError:
+             raise ValueError(f"Invalid country code provided: {self.country}")
+
+         if self.logger is None:
+             self.logger = global_config.get_logger(self.__class__.__name__)
+
+     def fetch_profiles(self, **kwargs) -> pd.DataFrame:
+         """
+         Fetch and process school profiles including connectivity information.
+
+         Args:
+             **kwargs: Additional parameters for customization
+                 - page_size: Override default page size
+                 - sleep_time: Override default sleep time between requests
+                 - max_pages: Limit the number of pages to fetch
+                 - giga_id_school: Override default giga_id_school filter
+
+         Returns:
+             pd.DataFrame: School profiles with connectivity and geospatial info.
+         """
+         # Override defaults with kwargs if provided
+         page_size = kwargs.get("page_size", self.page_size)
+         sleep_time = kwargs.get("sleep_time", self.sleep_time)
+         max_pages = kwargs.get("max_pages", None)
+         giga_id_school = kwargs.get("giga_id_school", self.giga_id_school)
+
+         # Prepare headers
+         headers = {
+             "Authorization": f"Bearer {self.api_key}",
+             "Accept": "application/json",
+         }
+
+         all_data = []
+         page = 1
+
+         self.logger.info(
+             f"Starting to fetch school profiles for country: {self.country}"
+         )
+
+         if giga_id_school:
+             self.logger.info(f"Filtering for specific school ID: {giga_id_school}")
+
+         while True:
+             # Check if we've reached max_pages limit
+             if max_pages and page > max_pages:
+                 self.logger.info(f"Reached maximum pages limit: {max_pages}")
+                 break
+
+             # Build parameters
+             params = {
+                 "country_iso3_code": self.country,
+                 "page": page,
+                 "size": page_size,
+             }
+
+             # Add giga_id_school filter if specified
+             if giga_id_school:
+                 params["giga_id_school"] = giga_id_school
+
+             try:
+                 self.logger.debug(f"Fetching page {page} with params: {params}")
+                 response = requests.get(self.api_url, headers=headers, params=params)
+                 response.raise_for_status()
+
+                 parsed = response.json()
+                 data = parsed.get("data", [])
+
+             except requests.exceptions.RequestException as e:
+                 self.logger.error(f"Request failed on page {page}: {e}")
+                 break
+             except ValueError as e:
+                 self.logger.error(f"Failed to parse JSON response on page {page}: {e}")
+                 break
+
+             # Check if we got any data
+             if not data:
+                 self.logger.info(f"No data on page {page}. Stopping.")
+                 break
+
+             all_data.extend(data)
+             self.logger.info(f"Fetched page {page} with {len(data)} records")
+
+             # If we got fewer records than page_size, we've reached the end
+             if len(data) < page_size:
+                 self.logger.info("Reached end of data (partial page received)")
+                 break
+
+             # If filtering by specific school ID, we likely only need one page
+             if giga_id_school:
+                 self.logger.info(
+                     "Specific school ID requested, stopping after first page"
+                 )
+                 break
+
+             page += 1
+
+             # Sleep to be respectful to the API
+             if sleep_time > 0:
+                 time.sleep(sleep_time)
+
+         self.logger.info(f"Finished fetching. Total records: {len(all_data)}")
+
+         # Convert to DataFrame and process
+         if not all_data:
+             self.logger.warning("No data fetched, returning empty DataFrame")
+             return pd.DataFrame()
+
+         df = pd.DataFrame(all_data)
+
+         return df
+
+     def get_connectivity_summary(self, df: pd.DataFrame) -> dict:
+         """
+         Generate a summary of connectivity statistics from the fetched data.
+
+         Args:
+             df: DataFrame with school profile data
+
+         Returns:
+             dict: Summary statistics about connectivity
+         """
+         if df.empty:
+             return {"error": "No data available"}
+
+         summary = {
+             "total_schools": len(df),
+             "country": (
+                 df["country_iso3_code"].iloc[0]
+                 if "country_iso3_code" in df.columns
+                 else "Unknown"
+             ),
+         }
+
+         # Administrative region analysis
+         if "admin1" in df.columns:
+             admin1_counts = df["admin1"].value_counts().head(10).to_dict()
+             summary["top_admin1_regions"] = admin1_counts
+
+         if "admin2" in df.columns:
+             admin2_counts = df["admin2"].value_counts().head(10).to_dict()
+             summary["top_admin2_regions"] = admin2_counts
+
+         # Connectivity analysis
+         if "connectivity" in df.columns:
+             connected_count = df["connectivity"].sum()
+             summary["schools_with_connectivity"] = int(connected_count)
+             summary["connectivity_percentage"] = connected_count / len(df) * 100
+
+         if "connectivity_RT" in df.columns:
+             rt_connected_count = df["connectivity_RT"].sum()
+             summary["schools_with_realtime_connectivity"] = int(rt_connected_count)
+             summary["realtime_connectivity_percentage"] = (
+                 rt_connected_count / len(df) * 100
+             )
+
+         # Connectivity type analysis
+         if "connectivity_type" in df.columns:
+             if df["connectivity_type"].notna().any():
+                 type_counts = df["connectivity_type"].value_counts().to_dict()
+                 summary["connectivity_types_breakdown"] = type_counts
+
+         # Data source analysis
+         if "connectivity_RT_datasource" in df.columns:
+             datasource_counts = (
+                 df["connectivity_RT_datasource"].value_counts().to_dict()
+             )
+             summary["realtime_connectivity_datasources"] = datasource_counts
+
+         if "school_data_source" in df.columns:
+             school_datasource_counts = df["school_data_source"].value_counts().to_dict()
+             summary["school_data_sources"] = school_datasource_counts
+
+         self.logger.info("Generated connectivity summary")
+         return summary
+
+
+ @dataclass(config=ConfigDict(arbitrary_types_allowed=True))
+ class GigaSchoolMeasurementsFetcher:
+     """
+     Fetch and process school daily realtime connectivity measurements from the Giga API.
+     This includes download/upload speeds, latency, and connectivity performance data.
+     """
+
+     country: str = Field(...)
+     start_date: Union[str, date, datetime] = Field(...)
+     end_date: Union[str, date, datetime] = Field(...)
+     api_url: str = Field(
+         default="https://uni-ooi-giga-maps-service.azurewebsites.net/api/v1/all_measurements",
+         description="Base URL for the Giga School Measurements API",
+     )
+     api_key: str = global_config.GIGA_SCHOOL_MEASUREMENTS_API_KEY
+     page_size: int = Field(default=1000, description="Number of records per API page")
+     sleep_time: float = Field(
+         default=0.2, description="Sleep time between API requests"
+     )
+     giga_id_school: Optional[str] = Field(
+         default=None, description="Optional specific giga school ID to fetch"
+     )
+
+     logger: Optional[logging.Logger] = Field(default=None, repr=False)
+
+     def __post_init__(self):
+         try:
+             self.country = pycountry.countries.lookup(self.country).alpha_3
+         except LookupError:
+             raise ValueError(f"Invalid country code provided: {self.country}")
+
+         # Convert dates to string format if needed
+         self.start_date = self._format_date(self.start_date)
+         self.end_date = self._format_date(self.end_date)
+
+         # Validate date range
+         if self.start_date > self.end_date:
+             raise ValueError("start_date must be before or equal to end_date")
+
+         if self.logger is None:
+             self.logger = global_config.get_logger(self.__class__.__name__)
+
+     def _format_date(self, date_input: Union[str, date, datetime]) -> str:
+         """
+         Convert date input to string format expected by API (YYYY-MM-DD).
+
+         Args:
+             date_input: Date in various formats
+
+         Returns:
+             str: Date in YYYY-MM-DD format
+         """
+         if isinstance(date_input, str):
+             # Assume it's already in correct format or parse it
+             try:
+                 parsed_date = datetime.strptime(date_input, "%Y-%m-%d")
+                 return date_input
+             except ValueError:
+                 try:
+                     parsed_date = pd.to_datetime(date_input)
+                     return parsed_date.strftime("%Y-%m-%d")
+                 except (ValueError, TypeError):
+                     raise ValueError(
+                         f"Invalid date format: {date_input}. Expected YYYY-MM-DD"
+                     )
+         elif isinstance(date_input, (date, datetime)):
+             return date_input.strftime("%Y-%m-%d")
+         else:
+             raise ValueError(f"Invalid date type: {type(date_input)}")
+
+     def fetch_measurements(self, **kwargs) -> pd.DataFrame:
+         """
+         Fetch and process school connectivity measurements.
+
+         Args:
+             **kwargs: Additional parameters for customization
+                 - page_size: Override default page size
+                 - sleep_time: Override default sleep time between requests
+                 - max_pages: Limit the number of pages to fetch
+                 - giga_id_school: Override default giga_id_school filter
+                 - start_date: Override default start_date
+                 - end_date: Override default end_date
+
+         Returns:
+             pd.DataFrame: School measurements with connectivity performance data.
+         """
+         # Override defaults with kwargs if provided
+         page_size = kwargs.get("page_size", self.page_size)
+         sleep_time = kwargs.get("sleep_time", self.sleep_time)
+         max_pages = kwargs.get("max_pages", None)
+         giga_id_school = kwargs.get("giga_id_school", self.giga_id_school)
+         start_date = kwargs.get("start_date", self.start_date)
+         end_date = kwargs.get("end_date", self.end_date)
+
+         # Format dates if overridden
+         if start_date != self.start_date:
+             start_date = self._format_date(start_date)
+         if end_date != self.end_date:
+             end_date = self._format_date(end_date)
+
+         # Prepare headers
+         headers = {
+             "Authorization": f"Bearer {self.api_key}",
+             "Accept": "application/json",
+         }
+
+         all_data = []
+         page = 1
+
+         self.logger.info(
+             f"Starting to fetch measurements for country: {self.country} "
+             f"from {start_date} to {end_date}"
+         )
+
+         if giga_id_school:
+             self.logger.info(f"Filtering for specific school ID: {giga_id_school}")
+
+         while True:
+             # Check if we've reached max_pages limit
+             if max_pages and page > max_pages:
+                 self.logger.info(f"Reached maximum pages limit: {max_pages}")
+                 break
+
+             # Build parameters
+             params = {
+                 "country_iso3_code": self.country,
+                 "start_date": start_date,
+                 "end_date": end_date,
+                 "page": page,
+                 "size": page_size,
+             }
+
+             # Add giga_id_school filter if specified
+             if giga_id_school:
+                 params["giga_id_school"] = giga_id_school
+
+             try:
+                 self.logger.debug(f"Fetching page {page} with params: {params}")
+                 response = requests.get(self.api_url, headers=headers, params=params)
+                 response.raise_for_status()
+
+                 parsed = response.json()
+                 data = parsed.get("data", [])
+
+             except requests.exceptions.RequestException as e:
+                 self.logger.error(f"Request failed on page {page}: {e}")
+                 break
+             except ValueError as e:
+                 self.logger.error(f"Failed to parse JSON response on page {page}: {e}")
+                 break
+
+             # Check if we got any data
+             if not data:
+                 self.logger.info(f"No data on page {page}. Stopping.")
+                 break
+
+             all_data.extend(data)
+             self.logger.info(f"Fetched page {page} with {len(data)} records")
+
+             # If we got fewer records than page_size, we've reached the end
+             if len(data) < page_size:
+                 self.logger.info("Reached end of data (partial page received)")
+                 break
+
+             # Unlike profiles, one school can have many measurement rows,
+             # so keep paging even when a specific giga_id_school is requested
+             if giga_id_school and len(all_data) > 0:
+                 self.logger.info(
+                     "Specific school ID requested, checking if more data needed"
+                 )
+
+             page += 1
+
+             # Sleep to be respectful to the API
+             if sleep_time > 0:
+                 time.sleep(sleep_time)
+
+         self.logger.info(f"Finished fetching. Total records: {len(all_data)}")
+
+         # Convert to DataFrame and process
+         if not all_data:
+             self.logger.warning("No data fetched, returning empty DataFrame")
+             return pd.DataFrame()
+
+         df = pd.DataFrame(all_data)
+         df = self._process_measurements_data(df)
+
+         return df
+
+     def _process_measurements_data(self, df: pd.DataFrame) -> pd.DataFrame:
+         """
+         Process and enhance the DataFrame with measurement performance metrics.
+
+         Args:
+             df: Raw DataFrame from API
+
+         Returns:
+             pd.DataFrame: Enhanced DataFrame with processed measurement data
+         """
+         if df.empty:
+             return df
+
+         # Convert date column to datetime
+         if "date" in df.columns:
+             df["date"] = pd.to_datetime(df["date"], errors="coerce")
+             df["date_only"] = df["date"].dt.date
+             df["year"] = df["date"].dt.year
+             df["month"] = df["date"].dt.month
+             df["day_of_week"] = df["date"].dt.day_name()
+             self.logger.info("Processed date fields")
+
+         # Process speed measurements
+         numeric_columns = ["download_speed", "upload_speed", "latency"]
+         for col in numeric_columns:
+             if col in df.columns:
+                 df[col] = pd.to_numeric(df[col], errors="coerce")
+
+         # Create performance categories
+         if "download_speed" in df.columns:
+             df["download_speed_category"] = pd.cut(
+                 df["download_speed"],
+                 bins=[0, 5, 25, 100, float("inf")],
+                 labels=[
+                     "Very Slow (<5 Mbps)",
+                     "Slow (5-25 Mbps)",
+                     "Moderate (25-100 Mbps)",
+                     "Fast (>100 Mbps)",
+                 ],
+                 include_lowest=True,
+             )
+
+         if "upload_speed" in df.columns:
+             df["upload_speed_category"] = pd.cut(
+                 df["upload_speed"],
+                 bins=[0, 1, 10, 50, float("inf")],
+                 labels=[
+                     "Very Slow (<1 Mbps)",
+                     "Slow (1-10 Mbps)",
+                     "Moderate (10-50 Mbps)",
+                     "Fast (>50 Mbps)",
+                 ],
+                 include_lowest=True,
+             )
+
+         if "latency" in df.columns:
+             df["latency_category"] = pd.cut(
+                 df["latency"],
+                 bins=[0, 50, 150, 300, float("inf")],
+                 labels=[
+                     "Excellent (<50ms)",
+                     "Good (50-150ms)",
+                     "Fair (150-300ms)",
+                     "Poor (>300ms)",
+                 ],
+                 include_lowest=True,
+             )
+
+         # Create quality flags
+         if "download_speed" in df.columns and "upload_speed" in df.columns:
+             df["has_broadband"] = (df["download_speed"] >= 25) & (
+                 df["upload_speed"] >= 3
+             )
+             df["has_basic_connectivity"] = (df["download_speed"] >= 1) & (
+                 df["upload_speed"] >= 0.5
+             )
+
+             # Flag measurements with missing data
+             df["has_complete_measurement"] = (
+                 df["download_speed"].notna()
+                 & df["upload_speed"].notna()
+                 & df["latency"].notna()
+             )
+
+         self.logger.info(f"Processed measurement data for {len(df)} records")
+
+         return df
+
+     def get_performance_summary(self, df: pd.DataFrame) -> dict:
+         """
+         Generate a comprehensive summary of connectivity performance metrics.
+
+         Args:
+             df: DataFrame with measurement data
+
+         Returns:
+             dict: Summary statistics about connectivity performance
+         """
+         if df.empty:
+             return {"error": "No data available"}
+
+         summary = {
+             "total_measurements": len(df),
+             "country": (
+                 df["country_iso3_code"].iloc[0]
+                 if "country_iso3_code" in df.columns
+                 else "Unknown"
+             ),
+             "date_range": {
+                 "start": (
+                     df["date"].min().strftime("%Y-%m-%d")
+                     if "date" in df.columns
+                     else None
+                 ),
+                 "end": (
+                     df["date"].max().strftime("%Y-%m-%d")
+                     if "date" in df.columns
+                     else None
+                 ),
+             },
+         }
+
+         # School coverage
+         if "giga_id_school" in df.columns:
+             unique_schools = df["giga_id_school"].nunique()
+             summary["unique_schools_measured"] = unique_schools
+             summary["avg_measurements_per_school"] = (
+                 len(df) / unique_schools if unique_schools > 0 else 0
+             )
+
+         # Speed statistics
+         for speed_col in ["download_speed", "upload_speed"]:
+             if speed_col in df.columns:
+                 speed_data = df[speed_col].dropna()
+                 if len(speed_data) > 0:
+                     summary[f"{speed_col}_stats"] = {
+                         "mean": float(speed_data.mean()),
+                         "median": float(speed_data.median()),
+                         "min": float(speed_data.min()),
+                         "max": float(speed_data.max()),
+                         "std": float(speed_data.std()),
+                     }
+
+         # Latency statistics
+         if "latency" in df.columns:
+             latency_data = df["latency"].dropna()
+             if len(latency_data) > 0:
+                 summary["latency_stats"] = {
+                     "mean": float(latency_data.mean()),
+                     "median": float(latency_data.median()),
+                     "min": float(latency_data.min()),
+                     "max": float(latency_data.max()),
+                     "std": float(latency_data.std()),
+                 }
+
+         # Performance categories
+         for cat_col in [
+             "download_speed_category",
+             "upload_speed_category",
+             "latency_category",
+         ]:
+             if cat_col in df.columns:
+                 cat_counts = df[cat_col].value_counts().to_dict()
+                 summary[cat_col.replace("_category", "_breakdown")] = cat_counts
+
+         # Quality metrics
+         if "has_broadband" in df.columns:
+             summary["broadband_capable_measurements"] = int(df["has_broadband"].sum())
+             summary["broadband_percentage"] = float(df["has_broadband"].mean() * 100)
+
+         if "has_basic_connectivity" in df.columns:
+             summary["basic_connectivity_measurements"] = int(
+                 df["has_basic_connectivity"].sum()
+             )
+             summary["basic_connectivity_percentage"] = float(
+                 df["has_basic_connectivity"].mean() * 100
+             )
+
+         # Data completeness
+         if "has_complete_measurement" in df.columns:
+             summary["complete_measurements"] = int(df["has_complete_measurement"].sum())
+             summary["data_completeness_percentage"] = float(
+                 df["has_complete_measurement"].mean() * 100
+             )
+
+         # Data sources
+         if "data_source" in df.columns:
+             source_counts = df["data_source"].value_counts().to_dict()
+             summary["data_sources"] = source_counts
+
+         # Temporal patterns
+         if "day_of_week" in df.columns:
+             day_counts = df["day_of_week"].value_counts().to_dict()
+             summary["measurements_by_day_of_week"] = day_counts
+
+         self.logger.info("Generated performance summary")
+         return summary
+
+     def get_school_performance_comparison(
+         self, df: pd.DataFrame, top_n: int = 10
+     ) -> dict:
+         """
+         Compare performance across schools.
+
+         Args:
+             df: DataFrame with measurement data
+             top_n: Number of top/bottom schools to include
+
+         Returns:
+             dict: School performance comparison
+         """
+         if df.empty or "giga_id_school" not in df.columns:
+             return {"error": "No school data available"}
+
+         # Build the aggregation spec conditionally: listing "has_broadband"
+         # unconditionally would raise a KeyError in .agg() when the column is absent
+         agg_spec = {
+             "download_speed": ["mean", "median", "count"],
+             "upload_speed": ["mean", "median"],
+             "latency": ["mean", "median"],
+         }
+         if "has_broadband" in df.columns:
+             agg_spec["has_broadband"] = "mean"
+         school_stats = df.groupby("giga_id_school").agg(agg_spec).round(2)
+
+         # Flatten column names
+         school_stats.columns = ["_".join(col).strip() for col in school_stats.columns]
+
+         # Sort by download speed
+         if "download_speed_mean" in school_stats.columns:
+             top_schools = school_stats.nlargest(top_n, "download_speed_mean")
+             bottom_schools = school_stats.nsmallest(top_n, "download_speed_mean")
+
+             return {
+                 "top_performing_schools": top_schools.to_dict("index"),
+                 "bottom_performing_schools": bottom_schools.to_dict("index"),
+                 "total_schools_analyzed": len(school_stats),
+             }
+
+         return {"error": "Insufficient data for school comparison"}