giga-spatial 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. giga_spatial-0.6.0.dist-info/METADATA +141 -0
  2. giga_spatial-0.6.0.dist-info/RECORD +47 -0
  3. giga_spatial-0.6.0.dist-info/WHEEL +5 -0
  4. giga_spatial-0.6.0.dist-info/licenses/LICENSE +661 -0
  5. giga_spatial-0.6.0.dist-info/top_level.txt +1 -0
  6. gigaspatial/__init__.py +1 -0
  7. gigaspatial/config.py +226 -0
  8. gigaspatial/core/__init__.py +0 -0
  9. gigaspatial/core/io/__init__.py +5 -0
  10. gigaspatial/core/io/adls_data_store.py +325 -0
  11. gigaspatial/core/io/data_api.py +113 -0
  12. gigaspatial/core/io/data_store.py +147 -0
  13. gigaspatial/core/io/local_data_store.py +92 -0
  14. gigaspatial/core/io/readers.py +265 -0
  15. gigaspatial/core/io/writers.py +128 -0
  16. gigaspatial/core/schemas/__init__.py +0 -0
  17. gigaspatial/core/schemas/entity.py +244 -0
  18. gigaspatial/generators/__init__.py +2 -0
  19. gigaspatial/generators/poi.py +636 -0
  20. gigaspatial/generators/zonal/__init__.py +3 -0
  21. gigaspatial/generators/zonal/base.py +370 -0
  22. gigaspatial/generators/zonal/geometry.py +439 -0
  23. gigaspatial/generators/zonal/mercator.py +78 -0
  24. gigaspatial/grid/__init__.py +1 -0
  25. gigaspatial/grid/mercator_tiles.py +286 -0
  26. gigaspatial/handlers/__init__.py +40 -0
  27. gigaspatial/handlers/base.py +761 -0
  28. gigaspatial/handlers/boundaries.py +305 -0
  29. gigaspatial/handlers/ghsl.py +772 -0
  30. gigaspatial/handlers/giga.py +145 -0
  31. gigaspatial/handlers/google_open_buildings.py +472 -0
  32. gigaspatial/handlers/hdx.py +241 -0
  33. gigaspatial/handlers/mapbox_image.py +208 -0
  34. gigaspatial/handlers/maxar_image.py +291 -0
  35. gigaspatial/handlers/microsoft_global_buildings.py +548 -0
  36. gigaspatial/handlers/ookla_speedtest.py +199 -0
  37. gigaspatial/handlers/opencellid.py +290 -0
  38. gigaspatial/handlers/osm.py +356 -0
  39. gigaspatial/handlers/overture.py +126 -0
  40. gigaspatial/handlers/rwi.py +157 -0
  41. gigaspatial/handlers/unicef_georepo.py +806 -0
  42. gigaspatial/handlers/worldpop.py +266 -0
  43. gigaspatial/processing/__init__.py +4 -0
  44. gigaspatial/processing/geo.py +1054 -0
  45. gigaspatial/processing/sat_images.py +39 -0
  46. gigaspatial/processing/tif_processor.py +477 -0
  47. gigaspatial/processing/utils.py +49 -0
@@ -0,0 +1,199 @@
1
+ import numpy as np
2
+ import pandas as pd
3
+ import geopandas as gpd
4
+ from shapely import wkt
5
+ from datetime import datetime
6
+ import json
7
+ import requests
8
+ from pathlib import Path
9
+ from pydantic import BaseModel, ConfigDict, Field
10
+ from typing import List, Literal, Optional
11
+
12
+ from gigaspatial.grid.mercator_tiles import CountryMercatorTiles
13
+ from gigaspatial.core.io.readers import read_dataset
14
+ from gigaspatial.core.io.data_store import DataStore
15
+ from gigaspatial.core.io.local_data_store import LocalDataStore
16
+ from gigaspatial.config import config
17
+
18
+
19
class OoklaSpeedtestTileConfig(BaseModel):
    """Configuration for a single Ookla speedtest tile dataset.

    Identifies one parquet file in the Ookla open-data S3 bucket by service
    type, year and quarter, and provides helpers to download and read it.
    """

    service_type: Literal["fixed", "mobile"]
    year: int
    quarter: int
    # Storage backend and destination directory; excluded from serialization.
    data_store: DataStore = Field(default_factory=LocalDataStore, exclude=True)
    base_path: Path = Field(
        default=config.get_path("ookla_speedtest", "bronze"), exclude=True
    )

    # Pydantic v2 configuration (replaces the deprecated inner `class Config`,
    # consistent with OoklaSpeedtestTile below); arbitrary types are needed
    # because DataStore is not a pydantic-aware type.
    model_config = ConfigDict(arbitrary_types_allowed=True)

    @property
    def quarter_start(self) -> datetime:
        """Return the first day of the configured quarter.

        Raises:
            ValueError: If `quarter` is not in 1..4.
        """
        if not 1 <= self.quarter <= 4:
            raise ValueError("Quarter must be within [1, 2, 3, 4]")
        # Quarters start in January, April, July and October.
        month = [1, 4, 7, 10]
        return datetime(self.year, month[self.quarter - 1], 1)

    @property
    def tile_name(self) -> str:
        """File name Ookla uses for this dataset, e.g. `2021-04-01_performance_fixed_tiles.parquet`."""
        return f"{self.quarter_start:%Y-%m-%d}_performance_{self.service_type}_tiles.parquet"

    @property
    def tile_url(self) -> str:
        """Public S3 URL of the parquet file for this dataset."""
        base_url = "https://ookla-open-data.s3.amazonaws.com/parquet/performance"
        qs_dt = self.quarter_start
        return f"{base_url}/type={self.service_type}/year={qs_dt:%Y}/quarter={self.quarter}/{qs_dt:%Y-%m-%d}_performance_{self.service_type}_tiles.parquet"

    def download_tile(self) -> None:
        """Download the tile parquet into the data store unless it already exists.

        Raises:
            requests.HTTPError: If the HTTP request for the tile fails.
        """
        path = str(self.base_path / self.tile_name)
        if not self.data_store.file_exists(path):
            response = requests.get(self.tile_url)
            response.raise_for_status()
            self.data_store.write_file(path, response.content)

    def read_tile(self):
        """Read the tile dataset, downloading it first when missing.

        Replaces the previous download-then-recurse pattern with a single
        linear flow: ensure the file exists, then read it once.
        """
        path = str(self.base_path / self.tile_name)
        if not self.data_store.file_exists(path):
            self.download_tile()
        return read_dataset(self.data_store, path)
66
+
67
+
68
class OoklaSpeedtestConfig(BaseModel):
    """Collection of Ookla speedtest tile configurations."""

    tiles: List[OoklaSpeedtestTileConfig] = Field(default_factory=list)

    @classmethod
    def from_available_ookla_tiles(
        cls, data_store: Optional[DataStore] = None, base_path: Optional[Path] = None
    ):
        """Build a config covering every published (year, quarter, service type).

        Enumerates all combinations from the first data year (2019) up to the
        most recently completed quarter, for both fixed and mobile tiles.

        Args:
            data_store: Storage backend for the tiles (defaults to local disk).
            base_path: Destination directory (defaults to the bronze tier path).

        Returns:
            OoklaSpeedtestConfig with one tile config per combination.
        """
        data_store = data_store or LocalDataStore()
        base_path = base_path or config.get_path("ookla_speedtest", "bronze")

        # First year Ookla published open data.
        start_year = 2019
        # Most recently *completed* quarter, via plain integer arithmetic
        # (previously np.floor, which produced a float for an int comparison).
        today = datetime.today()
        max_year = today.year
        max_quarter = (today.month - 1) // 3
        if max_quarter == 0:
            # No quarter completed yet this year; fall back to Q4 of last year.
            max_year -= 1
            max_quarter = 4

        ookla_tiles = []
        for year in range(start_year, max_year + 1):
            for quarter in range(1, 5):
                # Skip quarters that have not completed in the current year.
                if year == max_year and quarter > max_quarter:
                    continue
                # `service_type` rather than shadowing the builtin `type`.
                for service_type in ("fixed", "mobile"):
                    ookla_tiles.append(
                        OoklaSpeedtestTileConfig(
                            service_type=service_type,
                            year=year,
                            quarter=quarter,
                            data_store=data_store,
                            base_path=base_path,
                        )
                    )
        return cls(tiles=ookla_tiles)
103
+
104
+
105
class OoklaSpeedtestTile(BaseModel):
    """Single Ookla speedtest tile record.

    Mirrors the columns of the Ookla open-data parquet files; the same column
    names appear in the empty-frame fallbacks of CountryOoklaTiles below.
    """

    # Mercator quadkey identifying the tile (joined against zoom-16 country
    # tiles in CountryOoklaTiles.from_country).
    quadkey: str
    # Tile geometry as a WKT string (parsed with shapely.wkt in to_geodataframe).
    tile: str
    # Average download / upload speeds — presumably kbps per the field names;
    # units not asserted anywhere in this module.
    avg_d_kbps: float
    avg_u_kbps: float
    # Average latency in milliseconds.
    avg_lat_ms: float
    # Loaded-latency columns are optional: they default to None, so rows
    # lacking them still validate.
    avg_lat_down_ms: Optional[float] = None
    avg_lat_up_ms: Optional[float] = None
    # Counts of tests and devices aggregated into this tile.
    tests: int
    devices: int

    # Accept and retain any extra columns present in the source data.
    model_config = ConfigDict(extra="allow")
117
+
118
+
119
class CountryOoklaTiles(BaseModel):
    """Ookla speedtest tiles clipped to a single country."""

    country: str
    service_type: str
    year: int
    quarter: int
    quadkeys: List[OoklaSpeedtestTile]

    @staticmethod
    def from_country(country, ookla_tile_config: OoklaSpeedtestTileConfig):
        """Build country-level tiles for one Ookla dataset.

        Intersects the country's zoom-16 Mercator quadkeys with the quadkeys
        present in the configured Ookla tile file.

        Args:
            country: Country identifier accepted by CountryMercatorTiles.create.
            ookla_tile_config: Which service type / year / quarter to read.

        Returns:
            CountryOoklaTiles; `quadkeys` is empty when no tiles intersect.
        """
        # load country zoom level 16 quadkeys
        country_tiles = CountryMercatorTiles.create(country, 16)

        # read ookla tiles for the config
        ookla_tiles = ookla_tile_config.read_tile()

        # filter country tiles by ookla tile quadkeys
        country_ookla_tiles = country_tiles.filter_quadkeys(ookla_tiles.quadkey)

        # Single construction path (the constructor call was previously
        # duplicated across both branches): an empty filter result simply
        # yields an empty quadkey list.
        quadkeys = []
        if len(country_ookla_tiles):
            df_quadkeys = country_ookla_tiles.to_dataframe().merge(
                ookla_tiles, on="quadkey", how="left"
            )
            quadkeys = [
                OoklaSpeedtestTile(**tile_dict)
                for tile_dict in df_quadkeys.to_dict("records")
            ]
        return CountryOoklaTiles(
            country=country,
            service_type=ookla_tile_config.service_type,
            year=ookla_tile_config.year,
            quarter=ookla_tile_config.quarter,
            quadkeys=quadkeys,
        )

    def to_dataframe(self) -> pd.DataFrame:
        """Return the tiles as a DataFrame; an empty result keeps the expected columns."""
        if len(self):
            return pd.DataFrame([q.model_dump() for q in self.quadkeys])
        return pd.DataFrame(
            columns=[
                "quadkey",
                "tile",
                "avg_d_kbps",
                "avg_u_kbps",
                "avg_lat_ms",
                "avg_lat_down_ms",
                "avg_lat_up_ms",
                "tests",
                "devices",
            ]
        )

    def to_geodataframe(self) -> gpd.GeoDataFrame:
        """Return the tiles as a GeoDataFrame with the WKT `tile` column parsed into geometry."""
        if len(self):
            df = self.to_dataframe()
            # `tile` holds WKT polygons; replace the column with real geometries.
            df["geometry"] = df.tile.apply(wkt.loads)
            df.drop(columns="tile", inplace=True)
            return gpd.GeoDataFrame(df, geometry="geometry", crs="EPSG:4326")
        return gpd.GeoDataFrame(
            columns=[
                "quadkey",
                "avg_d_kbps",
                "avg_u_kbps",
                "avg_lat_ms",
                "avg_lat_down_ms",
                "avg_lat_up_ms",
                "tests",
                "devices",
                "geometry",
            ]
        )

    def __len__(self) -> int:
        return len(self.quadkeys)
@@ -0,0 +1,290 @@
1
+ import pandas as pd
2
+ import numpy as np
3
+ import geopandas as gpd
4
+ import requests
5
+ import logging
6
+ import gzip
7
+ import os
8
+ import tempfile
9
+ from datetime import datetime
10
+ from typing import List, Optional, Union
11
+ from pathlib import Path
12
+ from bs4 import BeautifulSoup
13
+ import pycountry
14
+ from pydantic import BaseModel, Field, HttpUrl, field_validator
15
+
16
+ from gigaspatial.core.io.data_store import DataStore
17
+ from gigaspatial.core.io.local_data_store import LocalDataStore
18
+ from gigaspatial.core.io.readers import read_dataset
19
+ from gigaspatial.config import config as global_config
20
+
21
+
22
class OpenCellIDConfig(BaseModel):
    """Configuration for OpenCellID data access"""

    # Base URLs
    BASE_URL: HttpUrl = Field(default="https://opencellid.org/")
    DOWNLOAD_URL: HttpUrl = Field(default="https://opencellid.org/downloads.php?token=")

    # User configuration
    country: str = Field(...)
    api_token: str = Field(
        default=global_config.OPENCELLID_ACCESS_TOKEN,
        description="OpenCellID API Access Token",
    )
    base_path: Path = Field(default=global_config.get_path("opencellid", "bronze"))
    created_newer: int = Field(
        default=2003, description="Filter out cell towers added before this year"
    )
    # default_factory so the current year is evaluated when the model is
    # created, not once at import time (the previous `default=datetime.now().year`
    # went stale in long-running processes spanning a year boundary).
    created_before: int = Field(
        default_factory=lambda: datetime.now().year,
        description="Filter out cell towers added after this year",
    )
    drop_duplicates: bool = Field(
        default=True,
        description="Drop cells that are in the exact same location and radio technology",
    )

    @field_validator("country")
    def validate_country(cls, value: str) -> str:
        """Normalize any recognizable country name/code to its ISO alpha-3 code."""
        try:
            return pycountry.countries.lookup(value).alpha_3
        except LookupError:
            raise ValueError(f"Invalid country code provided: {value}")

    @property
    def output_file_path(self) -> Path:
        """Path to save the downloaded OpenCellID data"""
        return self.base_path / f"opencellid_{self.country.lower()}.csv.gz"

    def __repr__(self) -> str:
        return (
            f"OpenCellIDConfig(\n"
            f" country='{self.country}'\n"
            f" created_newer={self.created_newer}\n"
            f" created_before={self.created_before}\n"
            f" drop_duplicates={self.drop_duplicates}\n"
            f")"
        )
69
+
70
+
71
class OpenCellIDDownloader:
    """Downloader for OpenCellID data.

    Scrapes the per-country download links from the OpenCellID downloads page,
    fetches each gzipped CSV, filters by creation year, optionally drops
    duplicate (radio, lon, lat) cells, and writes one gzipped CSV per country
    into the configured data store.
    """

    def __init__(
        self,
        config: Union[OpenCellIDConfig, dict],
        data_store: Optional[DataStore] = None,
        logger: Optional[logging.Logger] = None,
    ):
        """Initialize with a config (model or plain dict), optional store and logger."""
        # Accept a raw dict for convenience; validate it through the model.
        if isinstance(config, dict):
            self.config = OpenCellIDConfig(**config)
        else:
            self.config = config

        self.data_store = data_store or LocalDataStore()
        self.logger = logger or global_config.get_logger(self.__class__.__name__)

    @classmethod
    def from_country(
        cls,
        country: str,
        api_token: str = global_config.OPENCELLID_ACCESS_TOKEN,
        **kwargs,
    ):
        """Create a downloader for a specific country"""
        config = OpenCellIDConfig(country=country, api_token=api_token, **kwargs)
        return cls(config=config)

    def get_download_links(self) -> List[str]:
        """Get download links for the country from OpenCellID website.

        Scrapes the `regions` table on the downloads page and returns the list
        of file URLs from the "Files (grouped by MCC)" column for this country.

        Raises:
            ValueError: If the table is missing or the country code is absent.
        """
        url = f"{self.config.DOWNLOAD_URL}{self.config.api_token}"
        # The downloads table is keyed by alpha-2 codes, while config stores alpha-3.
        country_alpha2 = pycountry.countries.get(
            alpha_3=self.config.country.upper()
        ).alpha_2

        try:
            # Find table with cell tower data links
            self.logger.info(f"Fetching download links for {self.config.country}")
            html_content = requests.get(url).text
            soup = BeautifulSoup(html_content, "lxml")
            table = soup.find("table", {"id": "regions"})

            if not table:
                raise ValueError(
                    "Could not find cell tower data table on OpenCellID website"
                )

            # Parse table headers
            t_headers = []
            for th in table.find_all("th"):
                t_headers.append(th.text.replace("\n", " ").strip())

            # Parse table data
            table_data = []
            for tr in table.tbody.find_all("tr"):
                t_row = {}

                for td, th in zip(tr.find_all("td"), t_headers):
                    if "Files" in th:
                        # The files column holds one <a> per MCC; collect hrefs.
                        t_row[th] = []
                        for a in td.find_all("a"):
                            t_row[th].append(a.get("href"))
                    else:
                        t_row[th] = td.text.replace("\n", "").strip()

                table_data.append(t_row)

            cell_dict = pd.DataFrame(table_data)

            # Get links for the country code
            if country_alpha2 not in cell_dict["Country Code"].values:
                raise ValueError(
                    f"Country code {country_alpha2} not found in OpenCellID database"
                )
            else:
                links = cell_dict[cell_dict["Country Code"] == country_alpha2][
                    "Files (grouped by MCC)"
                ].values[0]

            return links

        except Exception as e:
            # Log at this boundary, then re-raise the original exception.
            self.logger.error(f"Error fetching download links: {str(e)}")
            raise

    def download_and_process(self) -> str:
        """Download and process OpenCellID data for the configured country.

        Returns:
            The data-store path of the written gzipped CSV.

        Raises:
            RuntimeError: On API rate limiting, token rejection, or unreadable
                downloaded data.
            ValueError: If no rows were found for the country.
        """

        try:
            links = self.get_download_links()
            self.logger.info(f"Found {len(links)} data files for {self.config.country}")

            dfs = []

            for link in links:
                self.logger.info(f"Downloading data from {link}")
                response = requests.get(link, stream=True)
                response.raise_for_status()

                # Use a temporary file for download
                with tempfile.NamedTemporaryFile(delete=False, suffix=".gz") as tmpfile:
                    for chunk in response.iter_content(chunk_size=1024):
                        if chunk:
                            tmpfile.write(chunk)
                    temp_file = tmpfile.name

                try:
                    # Read the downloaded gzipped CSV data
                    # (the source files carry no header row, hence explicit names).
                    with gzip.open(temp_file, "rt") as feed_data:
                        dfs.append(
                            pd.read_csv(
                                feed_data,
                                names=[
                                    "radio",
                                    "mcc",
                                    "net",
                                    "area",
                                    "cell",
                                    "unit",
                                    "lon",
                                    "lat",
                                    "range",
                                    "samples",
                                    "changeable",
                                    "created",
                                    "updated",
                                    "average_signal",
                                ],
                            )
                        )
                except IOError as e:
                    # A failed gunzip usually means the API returned a plain-text
                    # error body instead of data; inspect its first line.
                    with open(temp_file, "r") as error_file:
                        contents = error_file.readline()

                    if "RATE_LIMITED" in contents:
                        raise RuntimeError(
                            "API rate limit exceeded. You're rate-limited!"
                        )
                    elif "INVALID_TOKEN" in contents:
                        raise RuntimeError("API token rejected by OpenCellID!")
                    else:
                        raise RuntimeError(
                            f"Error processing downloaded data: {str(e)}"
                        )
                finally:
                    # Clean up temporary file
                    if os.path.exists(temp_file):
                        os.remove(temp_file)

            df_cell = pd.concat(dfs, ignore_index=True)

            # Process the data
            if not df_cell.empty:
                # Convert timestamps to datetime
                df_cell["created"] = pd.to_datetime(
                    df_cell["created"], unit="s", origin="unix"
                )
                df_cell["updated"] = pd.to_datetime(
                    df_cell["updated"], unit="s", origin="unix"
                )

                # Filter by year
                # NOTE(review): lower bound is inclusive, upper bound exclusive
                # (dt.year < created_before) — so created_before's own year is
                # dropped despite its "added after this year" description; confirm
                # this is intended.
                df_cell = df_cell[
                    (df_cell.created.dt.year >= self.config.created_newer)
                    & (df_cell.created.dt.year < self.config.created_before)
                ]

                # Drop duplicates if configured
                # (keeps the first row per (radio, lon, lat) group).
                if self.config.drop_duplicates:
                    df_cell = (
                        df_cell.groupby(["radio", "lon", "lat"]).first().reset_index()
                    )

                # Save processed data using data_store
                output_path = str(self.config.output_file_path)
                self.logger.info(f"Saving processed data to {output_path}")
                with self.data_store.open(output_path, "wb") as f:
                    df_cell.to_csv(f, compression="gzip", index=False)

                return output_path
            else:
                raise ValueError(f"No data found for {self.config.country}")

        except Exception as e:
            # Log at this boundary, then re-raise the original exception.
            self.logger.error(f"Error downloading and processing data: {str(e)}")
            raise
257
+
258
+
259
class OpenCellIDReader:
    """Reader for OpenCellID data"""

    def __init__(
        self,
        country: str,
        data_store: Optional[DataStore] = None,
        base_path: Optional[Path] = None,
    ):
        """Store the normalized country code, data store and base path."""
        # Normalize whatever identifier was given to an ISO alpha-3 code.
        self.country = pycountry.countries.lookup(country).alpha_3
        self.data_store = data_store or LocalDataStore()
        self.base_path = base_path or global_config.get_path("opencellid", "bronze")

    def read_data(self) -> pd.DataFrame:
        """Read OpenCellID data for the specified country"""
        file_path = str(self.base_path / f"opencellid_{self.country.lower()}.csv.gz")

        # Guard clause: fail fast with guidance when the file was never downloaded.
        if not self.data_store.file_exists(file_path):
            raise FileNotFoundError(
                f"OpenCellID data for {self.country} not found at {file_path}. "
                "Download the data first using OpenCellIDDownloader."
            )

        return read_dataset(self.data_store, file_path)

    def to_geodataframe(self) -> gpd.GeoDataFrame:
        """Convert OpenCellID data to a GeoDataFrame"""
        records = self.read_data()
        # Build point geometries from the lon/lat columns (WGS84).
        points = gpd.points_from_xy(records.lon, records.lat)
        return gpd.GeoDataFrame(records, geometry=points, crs="EPSG:4326")