giga-spatial 0.7.0__py3-none-any.whl → 0.7.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,350 @@
1
+ import requests
2
+ import pandas as pd
3
+ import geopandas as gpd
4
+ import time
5
+ from typing import List, Optional, Union, Tuple
6
+ from pydantic.dataclasses import dataclass, Field
7
+ from pydantic import ConfigDict
8
+ import pycountry
9
+
10
+ from gigaspatial.config import config
11
+ from gigaspatial.handlers import OSMLocationFetcher
12
+
13
+
14
@dataclass(config=ConfigDict(arbitrary_types_allowed=True))
class HealthSitesFetcher:
    """
    Fetch and process health facility location data from the Healthsites.io API.

    The fetcher pages through the facilities endpoint and returns the
    accumulated records as a ``GeoDataFrame`` (``geojson`` output) or a
    ``DataFrame`` (``json`` output). A country given at construction time is
    converted once to its OSM English name and stored back on ``country``.
    """

    country: Optional[str] = Field(default=None, description="Country to filter")
    api_url: str = Field(
        default="https://healthsites.io/api/v3/facilities/",
        description="Base URL for the Healthsites API",
    )
    api_key: str = config.HEALTHSITES_API_KEY
    extent: Optional[Tuple[float, float, float, float]] = Field(
        default=None, description="Bounding box as (minLng, minLat, maxLng, maxLat)"
    )
    # NOTE(review): page_size is never sent to the API; it is only compared
    # with the number of records received to detect the last page.
    page_size: int = Field(default=100, description="Number of records per API page")
    flat_properties: bool = Field(
        default=True, description="Show properties in flat format"
    )
    tag_format: str = Field(default="osm", description="Tag format (osm/hxl)")
    output_format: str = Field(
        default="geojson", description="Output format (json/geojson)"
    )
    sleep_time: float = Field(
        default=0.2, description="Sleep time between API requests"
    )
    # Appended last so existing positional construction keeps working.
    request_timeout: Optional[float] = Field(
        default=60.0,
        description="Timeout in seconds for each API request (None disables it)",
    )

    def __post_init__(self):
        self.logger = config.get_logger(self.__class__.__name__)
        # Maps every country string already resolved (inputs and results) to
        # its OSM English name, so a resolved name is never looked up again.
        self._resolved_countries = {}
        # Convert country code to OSM English name if provided
        if self.country:
            self.country = self._convert_country(self.country)

    def fetch_facilities(self, **kwargs) -> Union[pd.DataFrame, gpd.GeoDataFrame]:
        """
        Fetch and process health facility locations.

        Args:
            **kwargs: Additional parameters for customization
                - country: Override country filter
                - extent: Override extent filter
                - from_date: Get data modified from this timestamp (datetime or string)
                - to_date: Get data modified to this timestamp (datetime or string)
                - page_size: Override default page size
                - sleep_time: Override default sleep time between requests
                - max_pages: Limit the number of pages to fetch
                - output_format: Override output format ('json' or 'geojson')
                - flat_properties: Override flat properties setting

        Returns:
            Union[pd.DataFrame, gpd.GeoDataFrame]: Health facilities data.
                Returns GeoDataFrame for geojson format, DataFrame for json format.
                If a request fails part-way, the records fetched so far are
                returned and the failure is logged.

        Raises:
            ValueError: If the extent does not have exactly 4 values or the
                country cannot be resolved.
        """
        # Override defaults with kwargs if provided
        country = kwargs.get("country", self.country)
        extent = kwargs.get("extent", self.extent)
        from_date = kwargs.get("from_date", None)
        to_date = kwargs.get("to_date", None)
        page_size = kwargs.get("page_size", self.page_size)
        sleep_time = kwargs.get("sleep_time", self.sleep_time)
        max_pages = kwargs.get("max_pages", None)
        output_format = kwargs.get("output_format", self.output_format)
        flat_properties = kwargs.get("flat_properties", self.flat_properties)

        # Already-resolved names (e.g. self.country) are served from the cache.
        if country:
            country = self._convert_country(country)

        # Prepare base parameters
        base_params = {
            "api-key": self.api_key,
            "tag-format": self.tag_format,
            "output": output_format,
        }

        # Only add flat-properties if True (don't send it as false, as that makes it flat anyway)
        if flat_properties:
            base_params["flat-properties"] = "true"

        # Add optional filters
        base_params.update(
            self._build_filter_params(country, extent, from_date, to_date)
        )

        all_data = []
        page = 1

        self.logger.info(
            f"Starting to fetch health facilities for country: {country or 'all countries'}"
        )
        self.logger.info(
            f"Output format: {output_format}, Flat properties: {flat_properties}"
        )

        while True:
            # Check if we've reached max_pages limit
            if max_pages and page > max_pages:
                self.logger.info(f"Reached maximum pages limit: {max_pages}")
                break

            # Add page parameter
            params = base_params.copy()
            params["page"] = page

            try:
                self.logger.debug(f"Fetching page {page} with params: {params}")
                response = requests.get(
                    self.api_url, params=params, timeout=self.request_timeout
                )
                response.raise_for_status()

                parsed = response.json()

                # Handle different response structures based on output format
                if output_format == "geojson":
                    # GeoJSON returns FeatureCollection with features list
                    data = parsed.get("features", [])
                else:
                    # JSON returns direct list
                    data = parsed if isinstance(parsed, list) else []

            except requests.exceptions.RequestException as e:
                # Best effort: keep what was fetched so far.
                self.logger.error(f"Request failed on page {page}: {e}")
                break
            except ValueError as e:
                self.logger.error(f"Failed to parse JSON response on page {page}: {e}")
                break

            # Check if we got any data
            if not data or not isinstance(data, list):
                self.logger.info(f"No data on page {page}. Stopping.")
                break

            all_data.extend(data)
            self.logger.info(f"Fetched page {page} with {len(data)} records")

            # If we got fewer records than page_size, we've reached the end
            if len(data) < page_size:
                self.logger.info("Reached end of data (partial page received)")
                break

            page += 1

            # Sleep to be respectful to the API
            if sleep_time > 0:
                time.sleep(sleep_time)

        self.logger.info(f"Finished fetching. Total records: {len(all_data)}")

        # Convert to DataFrame/GeoDataFrame based on format
        if not all_data:
            self.logger.warning("No data fetched, returning empty DataFrame")
            if output_format == "geojson":
                return gpd.GeoDataFrame()
            return pd.DataFrame()

        if output_format == "geojson":
            # Use GeoDataFrame.from_features for GeoJSON format
            gdf = gpd.GeoDataFrame.from_features(all_data)
            self.logger.info(f"Created GeoDataFrame with {len(gdf)} records")
            return gdf

        # For JSON format, handle nested structure if flat_properties is False
        if not flat_properties:
            df = self._process_json_with_centroid(all_data)
        else:
            df = pd.DataFrame(all_data)

        self.logger.info(f"Created DataFrame with {len(df)} records")
        return df

    def fetch_statistics(self, **kwargs) -> dict:
        """
        Fetch statistics for health facilities.

        Args:
            **kwargs: Same filtering parameters as fetch_facilities
                (country, extent, from_date, to_date).

        Returns:
            dict: Statistics data

        Raises:
            ValueError: If the extent does not have exactly 4 values or the
                country cannot be resolved.
            requests.exceptions.RequestException: If the request fails
                (logged, then re-raised).
        """
        country = kwargs.get("country", self.country)
        extent = kwargs.get("extent", self.extent)
        from_date = kwargs.get("from_date", None)
        to_date = kwargs.get("to_date", None)

        # Already-resolved names (e.g. self.country) are served from the cache.
        if country:
            country = self._convert_country(country)

        params = {"api-key": self.api_key}
        params.update(self._build_filter_params(country, extent, from_date, to_date))

        # Strip the trailing slash so the default api_url does not yield "//".
        url = f"{self.api_url.rstrip('/')}/statistic/"

        try:
            response = requests.get(url, params=params, timeout=self.request_timeout)
            response.raise_for_status()
            return response.json()
        except requests.exceptions.RequestException as e:
            self.logger.error(f"Request failed for statistics: {e}")
            raise

    def fetch_facility_by_id(self, osm_type: str, osm_id: str) -> dict:
        """
        Fetch a specific facility by OSM type and ID.

        Args:
            osm_type: OSM type (node, way, relation)
            osm_id: OSM ID

        Returns:
            dict: Facility details

        Raises:
            requests.exceptions.RequestException: If the request fails
                (logged, then re-raised).
        """
        params = {"api-key": self.api_key}

        # Strip the trailing slash so the default api_url does not yield "//".
        url = f"{self.api_url.rstrip('/')}/{osm_type}/{osm_id}"

        try:
            response = requests.get(url, params=params, timeout=self.request_timeout)
            response.raise_for_status()
            return response.json()
        except requests.exceptions.RequestException as e:
            self.logger.error(f"Request failed for facility {osm_type}/{osm_id}: {e}")
            raise

    def _build_filter_params(self, country, extent, from_date, to_date) -> dict:
        """
        Build the optional filter query parameters shared by the endpoints.

        Args:
            country: Country name to filter on, or a falsy value for none.
            extent: Bounding box as (minLng, minLat, maxLng, maxLat), or falsy.
            from_date: Lower modification bound (datetime or string), or falsy.
            to_date: Upper modification bound (datetime or string), or falsy.

        Returns:
            dict: Only the filters that were supplied, keyed by API parameter name.

        Raises:
            ValueError: If extent is given but does not have exactly 4 values.
        """
        params = {}
        if country:
            params["country"] = country
        if extent:
            if len(extent) != 4:
                raise ValueError(
                    "Extent must be a tuple of 4 values: (minLng, minLat, maxLng, maxLat)"
                )
            params["extent"] = ",".join(map(str, extent))
        if from_date:
            params["from"] = self._format_timestamp(from_date)
        if to_date:
            params["to"] = self._format_timestamp(to_date)
        return params

    @staticmethod
    def _format_timestamp(timestamp) -> str:
        """
        Convert a timestamp filter value to the string sent to the API.

        Args:
            timestamp: A datetime-like object (anything with ``isoformat``)
                or a value that is already formatted.

        Returns:
            str: ISO 8601 text for datetime-like values, ``str(timestamp)``
                otherwise.
        """
        # NOTE(review): the exact timestamp format expected by the Healthsites
        # API is not visible here - confirm against the API documentation.
        if hasattr(timestamp, "isoformat"):
            return timestamp.isoformat()
        return str(timestamp)

    def _create_dataframe(self, data: List[dict]) -> pd.DataFrame:
        """
        Create DataFrame from API response data.

        Args:
            data: List of facility records

        Returns:
            pd.DataFrame: Processed DataFrame. For the geojson output format
                each row holds the feature properties plus longitude/latitude
                taken from the first two geometry coordinates.
        """
        if self.output_format != "geojson":
            # Handle regular JSON format
            return pd.DataFrame(data)

        # Handle GeoJSON format
        records = []
        for feature in data:
            # "or {}" also covers keys that are present with a null value.
            record = dict(feature.get("properties") or {})
            geometry = feature.get("geometry") or {}
            coordinates = geometry.get("coordinates", [])

            if coordinates and len(coordinates) >= 2:
                record["longitude"] = coordinates[0]
                record["latitude"] = coordinates[1]

            records.append(record)
        return pd.DataFrame(records)

    def _process_json_with_centroid(self, data: List[dict]) -> pd.DataFrame:
        """
        Process JSON data to flatten 'attributes' and 'centroid' fields,
        and extract longitude/latitude from centroid.

        Args:
            data: List of facility records, where each record might contain
                  nested 'attributes' and 'centroid' dictionaries.

        Returns:
            pd.DataFrame: Processed DataFrame with flattened data. Records
                without a 2-element centroid get None for longitude/latitude.
        """
        processed_records = []
        for record in data:
            # Keep top-level keys other than the two nested containers
            new_record = {
                key: value
                for key, value in record.items()
                if key not in ("attributes", "centroid")
            }

            # Flatten 'attributes' ("or {}" also covers a null value)
            for attr_key, attr_value in (record.get("attributes") or {}).items():
                new_record[f"{attr_key}"] = attr_value

            # Extract centroid coordinates
            centroid = record.get("centroid") or {}
            coordinates = centroid.get("coordinates", [])
            if coordinates and len(coordinates) == 2:
                new_record["longitude"] = coordinates[0]
                new_record["latitude"] = coordinates[1]
            else:
                new_record["longitude"] = None
                new_record["latitude"] = None

            processed_records.append(new_record)

        return pd.DataFrame(processed_records)

    def _convert_country(self, country: str) -> str:
        """
        Convert a country identifier to its OSM English name.

        Results are cached per instance, and a name that was itself produced
        by this method is returned unchanged without another lookup.

        Args:
            country: Any identifier accepted by ``pycountry.countries.lookup``.

        Returns:
            str: The ``name:en`` value reported by OSMLocationFetcher.

        Raises:
            ValueError: If the country is unknown to pycountry, the OSM lookup
                fails, or no English name is available.
        """
        cached = self._resolved_countries.get(country)
        if cached:
            return cached

        try:
            # First convert to ISO3 format if needed
            iso3_code = pycountry.countries.lookup(country).alpha_3
        except LookupError as e:
            raise ValueError(f"Invalid country code provided: {country}") from e

        try:
            # Get OSM English name using OSMLocationFetcher
            osm_data = OSMLocationFetcher.get_osm_countries(iso3_code=iso3_code)
            osm_name_en = osm_data.get("name:en")
        except Exception as e:
            raise ValueError(f"Failed to get OSM English name: {e}") from e

        if not osm_name_en:
            raise ValueError(f"Could not find OSM English name for country: {country}")

        self.logger.info(f"Converted country code to OSM English name: {osm_name_en}")

        # Remember both the input and the result so neither is resolved again.
        self._resolved_countries[country] = osm_name_en
        self._resolved_countries[osm_name_en] = osm_name_en
        return osm_name_en