giga-spatial 0.6.9 (py3-none-any.whl) → 0.7.1 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -610,19 +610,27 @@ class GHSLDataReader(BaseHandlerReader):
         super().__init__(config=config, data_store=data_store, logger=logger)
 
     def load_from_paths(
-        self, source_data_path: List[Union[str, Path]], **kwargs
-    ) -> List[TifProcessor]:
+        self,
+        source_data_path: List[Union[str, Path]],
+        merge_rasters: bool = False,
+        **kwargs,
+    ) -> Union[List[TifProcessor], TifProcessor]:
         """
         Load TifProcessors from GHSL dataset.
         Args:
             source_data_path: List of file paths to load
+            merge_rasters: If True, all rasters will be merged into a single TifProcessor.
+                Defaults to False.
         Returns:
-            List[TifProcessor]: List of TifProcessor objects for accessing the raster data.
+            Union[List[TifProcessor], TifProcessor]: List of TifProcessor objects for accessing
+                the raster data, or a single TifProcessor if merge_rasters is True.
         """
-        return self._load_raster_data(raster_paths=source_data_path)
+        return self._load_raster_data(
+            raster_paths=source_data_path, merge_rasters=merge_rasters
+        )
 
-    def load(self, source, **kwargs):
-        return super().load(source=source, file_ext=".tif")
+    def load(self, source, merge_rasters: bool = False, **kwargs):
+        return super().load(source=source, file_ext=".tif", merge_rasters=merge_rasters)
 
 
 class GHSLDataHandler(BaseHandler):
@@ -763,6 +771,7 @@ class GHSLDataHandler(BaseHandler):
             List[Union[str, Path]],  # list of paths
         ],
         ensure_available: bool = True,
+        merge_rasters: bool = False,
         **kwargs,
     ):
         return super().load_data(
@@ -771,6 +780,7 @@ class GHSLDataHandler(BaseHandler):
             file_ext=".tif",
             extract=True,
             file_pattern=r".*\.tif$",
+            merge_rasters=merge_rasters,
             **kwargs,
         )
 
@@ -801,8 +811,10 @@ class GHSLDataHandler(BaseHandler):
         tif_processors = self.load_data(
             source=source, ensure_available=ensure_available, **kwargs
         )
+        if isinstance(tif_processors, TifProcessor):
+            return tif_processors.to_dataframe(**kwargs)
         return pd.concat(
-            [tp.to_dataframe() for tp in tif_processors], ignore_index=True
+            [tp.to_dataframe(**kwargs) for tp in tif_processors], ignore_index=True
         )
 
     def load_into_geodataframe(
@@ -832,8 +844,10 @@ class GHSLDataHandler(BaseHandler):
         tif_processors = self.load_data(
             source=source, ensure_available=ensure_available, **kwargs
         )
+        if isinstance(tif_processors, TifProcessor):
+            return tif_processors.to_geodataframe(**kwargs)
         return pd.concat(
-            [tp.to_geodataframe() for tp in tif_processors], ignore_index=True
+            [tp.to_geodataframe(**kwargs) for tp in tif_processors], ignore_index=True
        )
 
     def get_available_data_info(
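Taken together, the GHSL hunks thread a new `merge_rasters` flag from `GHSLDataHandler.load_data` down through the reader, and teach `load_into_dataframe` / `load_into_geodataframe` to short-circuit when a single merged `TifProcessor` comes back (they also now forward `**kwargs` to `to_dataframe` / `to_geodataframe`, which 0.6.9 dropped). A minimal usage sketch; the constructor arguments and the `source` value are illustrative assumptions, not taken from this diff:

```python
# Sketch under assumptions: the constructor arguments and the "MWI"
# source value are hypothetical, not confirmed by this diff.
from gigaspatial.handlers import GHSLDataHandler  # assumed export

handler = GHSLDataHandler(product="GHS_BUILT_S", year=2020)  # hypothetical args

# Default (matches 0.6.9): one TifProcessor per tile, each converted
# to a DataFrame and concatenated.
df_tiles = handler.load_into_dataframe(source="MWI")

# New in 0.7.1: merge tiles into a single TifProcessor first; the
# handler then calls to_dataframe() once on the merged raster.
df_merged = handler.load_into_dataframe(source="MWI", merge_rasters=True)
```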
@@ -8,6 +8,7 @@ from shapely.geometry import Point
 import pycountry
 from typing import Optional, Union
 import logging
+import geopandas as gpd
 
 from gigaspatial.config import config as global_config
 
@@ -40,11 +41,14 @@ class GigaSchoolLocationFetcher:
         if self.logger is None:
             self.logger = global_config.get_logger(self.__class__.__name__)
 
-    def fetch_locations(self, **kwargs) -> pd.DataFrame:
+    def fetch_locations(
+        self, process_geospatial: bool = False, **kwargs
+    ) -> Union[pd.DataFrame, gpd.GeoDataFrame]:
         """
         Fetch and process school locations.
 
         Args:
+            process_geospatial (bool): Whether to process geospatial data and return a GeoDataFrame. Defaults to False.
             **kwargs: Additional parameters for customization
                 - page_size: Override default page size
                 - sleep_time: Override default sleep time between requests
@@ -122,11 +126,12 @@ class GigaSchoolLocationFetcher:
 
         df = pd.DataFrame(all_data)
 
-        df = self._process_geospatial_data(df)
+        if process_geospatial:
+            df = self._process_geospatial_data(df)
 
         return df
 
-    def _process_geospatial_data(self, df: pd.DataFrame) -> pd.DataFrame:
+    def _process_geospatial_data(self, df: pd.DataFrame) -> gpd.GeoDataFrame:
         """
         Process and enhance the DataFrame with geospatial information.
 
@@ -144,7 +149,7 @@
         )
         self.logger.info(f"Created geometry for all {len(df)} records")
 
-        return df
+        return gpd.GeoDataFrame(df, geometry="geometry", crs="EPSG:4326")
 
 
 @dataclass(config=ConfigDict(arbitrary_types_allowed=True))
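Two behavioral changes land in this file: `fetch_locations` no longer runs geospatial processing by default (it is now opt-in via `process_geospatial`), and when it does run, `_process_geospatial_data` returns a true `gpd.GeoDataFrame` with CRS EPSG:4326 rather than a plain DataFrame carrying a geometry column. A sketch of both call paths; the constructor argument is a guess from the class name, not shown in these hunks:

```python
# Sketch under assumptions: the constructor signature is guessed from
# the class name; only fetch_locations() appears in this diff.
from gigaspatial.handlers import GigaSchoolLocationFetcher  # assumed export

fetcher = GigaSchoolLocationFetcher(country="BRA")  # hypothetical arg

# 0.7.1 default: plain pandas DataFrame, no geometry built.
df = fetcher.fetch_locations()

# Opt in to the processing that 0.6.9 always performed; now returns a
# GeoDataFrame with point geometry in EPSG:4326.
gdf = fetcher.fetch_locations(process_geospatial=True)
```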
@@ -0,0 +1,350 @@
+import requests
+import pandas as pd
+import geopandas as gpd
+import time
+from typing import List, Optional, Union, Tuple
+from pydantic.dataclasses import dataclass, Field
+from pydantic import ConfigDict
+import pycountry
+
+from gigaspatial.config import config
+from gigaspatial.handlers import OSMLocationFetcher
+
+
+@dataclass(config=ConfigDict(arbitrary_types_allowed=True))
+class HealthSitesFetcher:
+    """
+    Fetch and process health facility location data from the Healthsites.io API.
+    """
+
+    country: Optional[str] = Field(default=None, description="Country to filter")
+    api_url: str = Field(
+        default="https://healthsites.io/api/v3/facilities/",
+        description="Base URL for the Healthsites API",
+    )
+    api_key: str = config.HEALTHSITES_API_KEY
+    extent: Optional[Tuple[float, float, float, float]] = Field(
+        default=None, description="Bounding box as (minLng, minLat, maxLng, maxLat)"
+    )
+    page_size: int = Field(default=100, description="Number of records per API page")
+    flat_properties: bool = Field(
+        default=True, description="Show properties in flat format"
+    )
+    tag_format: str = Field(default="osm", description="Tag format (osm/hxl)")
+    output_format: str = Field(
+        default="geojson", description="Output format (json/geojson)"
+    )
+    sleep_time: float = Field(
+        default=0.2, description="Sleep time between API requests"
+    )
+
+    def __post_init__(self):
+        self.logger = config.get_logger(self.__class__.__name__)
+        # Convert country code to OSM English name if provided
+        if self.country:
+            self.country = self._convert_country(self.country)
+
+    def fetch_facilities(self, **kwargs) -> Union[pd.DataFrame, gpd.GeoDataFrame]:
+        """
+        Fetch and process health facility locations.
+
+        Args:
+            **kwargs: Additional parameters for customization
+                - country: Override country filter
+                - extent: Override extent filter
+                - from_date: Get data modified from this timestamp (datetime or string)
+                - to_date: Get data modified to this timestamp (datetime or string)
+                - page_size: Override default page size
+                - sleep_time: Override default sleep time between requests
+                - max_pages: Limit the number of pages to fetch
+                - output_format: Override output format ('json' or 'geojson')
+                - flat_properties: Override flat properties setting
+
+        Returns:
+            Union[pd.DataFrame, gpd.GeoDataFrame]: Health facilities data.
+                Returns GeoDataFrame for geojson format, DataFrame for json format.
+        """
+        # Override defaults with kwargs if provided
+        country = kwargs.get("country", self.country)
+        extent = kwargs.get("extent", self.extent)
+        from_date = kwargs.get("from_date", None)
+        to_date = kwargs.get("to_date", None)
+        page_size = kwargs.get("page_size", self.page_size)
+        sleep_time = kwargs.get("sleep_time", self.sleep_time)
+        max_pages = kwargs.get("max_pages", None)
+        output_format = kwargs.get("output_format", self.output_format)
+        flat_properties = kwargs.get("flat_properties", self.flat_properties)
+
+        # Convert country if provided in kwargs
+        if country:
+            country = self._convert_country(country)
+
+        # Prepare base parameters
+        base_params = {
+            "api-key": self.api_key,
+            "tag-format": self.tag_format,
+            "output": output_format,
+        }
+
+        # Only add flat-properties if True (don't send it as false, as that makes it flat anyway)
+        if flat_properties:
+            base_params["flat-properties"] = "true"
+
+        # Add optional filters
+        if country:
+            base_params["country"] = country
+
+        if extent:
+            if len(extent) != 4:
+                raise ValueError(
+                    "Extent must be a tuple of 4 values: (minLng, minLat, maxLng, maxLat)"
+                )
+            base_params["extent"] = ",".join(map(str, extent))
+
+        if from_date:
+            base_params["from"] = self._format_timestamp(from_date)
+
+        if to_date:
+            base_params["to"] = self._format_timestamp(to_date)
+
+        all_data = []
+        page = 1
+
+        self.logger.info(
+            f"Starting to fetch health facilities for country: {country or 'all countries'}"
+        )
+        self.logger.info(
+            f"Output format: {output_format}, Flat properties: {flat_properties}"
+        )
+
+        while True:
+            # Check if we've reached max_pages limit
+            if max_pages and page > max_pages:
+                self.logger.info(f"Reached maximum pages limit: {max_pages}")
+                break
+
+            # Add page parameter
+            params = base_params.copy()
+            params["page"] = page
+
+            try:
+                self.logger.debug(f"Fetching page {page} with params: {params}")
+                response = requests.get(self.api_url, params=params)
+                response.raise_for_status()
+
+                parsed = response.json()
+
+                # Handle different response structures based on output format
+                if output_format == "geojson":
+                    # GeoJSON returns FeatureCollection with features list
+                    data = parsed.get("features", [])
+                else:
+                    # JSON returns direct list
+                    data = parsed if isinstance(parsed, list) else []
+
+            except requests.exceptions.RequestException as e:
+                self.logger.error(f"Request failed on page {page}: {e}")
+                break
+            except ValueError as e:
+                self.logger.error(f"Failed to parse JSON response on page {page}: {e}")
+                break
+
+            # Check if we got any data
+            if not data or not isinstance(data, list):
+                self.logger.info(f"No data on page {page}. Stopping.")
+                break
+
+            all_data.extend(data)
+            self.logger.info(f"Fetched page {page} with {len(data)} records")
+
+            # If we got fewer records than page_size, we've reached the end
+            if len(data) < page_size:
+                self.logger.info("Reached end of data (partial page received)")
+                break
+
+            page += 1
+
+            # Sleep to be respectful to the API
+            if sleep_time > 0:
+                time.sleep(sleep_time)
+
+        self.logger.info(f"Finished fetching. Total records: {len(all_data)}")
+
+        # Convert to DataFrame/GeoDataFrame based on format
+        if not all_data:
+            self.logger.warning("No data fetched, returning empty DataFrame")
+            if output_format == "geojson":
+                return gpd.GeoDataFrame()
+            return pd.DataFrame()
+
+        if output_format == "geojson":
+            # Use GeoDataFrame.from_features for GeoJSON format
+            gdf = gpd.GeoDataFrame.from_features(all_data)
+            self.logger.info(f"Created GeoDataFrame with {len(gdf)} records")
+            return gdf
+        else:
+            # For JSON format, handle nested structure if flat_properties is False
+            if not flat_properties:
+                df = self._process_json_with_centroid(all_data)
+            else:
+                df = pd.DataFrame(all_data)
+
+            self.logger.info(f"Created DataFrame with {len(df)} records")
+            return df
+
+    def fetch_statistics(self, **kwargs) -> dict:
+        """
+        Fetch statistics for health facilities.
+
+        Args:
+            **kwargs: Same filtering parameters as fetch_facilities
+
+        Returns:
+            dict: Statistics data
+        """
+        country = kwargs.get("country", self.country)
+        extent = kwargs.get("extent", self.extent)
+        from_date = kwargs.get("from_date", None)
+        to_date = kwargs.get("to_date", None)
+
+        # Convert country if provided
+        if country:
+            country = self._convert_country(country)
+
+        params = {
+            "api-key": self.api_key,
+        }
+
+        # Add optional filters
+        if country:
+            params["country"] = country
+        if extent:
+            params["extent"] = ",".join(map(str, extent))
+        if from_date:
+            params["from"] = self._format_timestamp(from_date)
+        if to_date:
+            params["to"] = self._format_timestamp(to_date)
+
+        try:
+            response = requests.get(f"{self.api_url}/statistic/", params=params)
+            response.raise_for_status()
+            return response.json()
+        except requests.exceptions.RequestException as e:
+            self.logger.error(f"Request failed for statistics: {e}")
+            raise
+
+    def fetch_facility_by_id(self, osm_type: str, osm_id: str) -> dict:
+        """
+        Fetch a specific facility by OSM type and ID.
+
+        Args:
+            osm_type: OSM type (node, way, relation)
+            osm_id: OSM ID
+
+        Returns:
+            dict: Facility details
+        """
+        params = {"api-key": self.api_key}
+
+        try:
+            url = f"{self.api_url}/{osm_type}/{osm_id}"
+            response = requests.get(url, params=params)
+            response.raise_for_status()
+            return response.json()
+        except requests.exceptions.RequestException as e:
+            self.logger.error(f"Request failed for facility {osm_type}/{osm_id}: {e}")
+            raise
+
+    def _create_dataframe(self, data: List[dict]) -> pd.DataFrame:
+        """
+        Create DataFrame from API response data.
+
+        Args:
+            data: List of facility records
+
+        Returns:
+            pd.DataFrame: Processed DataFrame
+        """
+        if self.output_format == "geojson":
+            # Handle GeoJSON format
+            records = []
+            for feature in data:
+                record = feature.get("properties", {}).copy()
+                geometry = feature.get("geometry", {})
+                coordinates = geometry.get("coordinates", [])
+
+                if coordinates and len(coordinates) >= 2:
+                    record["longitude"] = coordinates[0]
+                    record["latitude"] = coordinates[1]
+
+                records.append(record)
+            return pd.DataFrame(records)
+        else:
+            # Handle regular JSON format
+            return pd.DataFrame(data)
+
+    def _process_json_with_centroid(self, data: List[dict]) -> pd.DataFrame:
+        """
+        Process JSON data to flatten 'attributes' and 'centroid' fields,
+        and extract longitude/latitude from centroid.
+
+        Args:
+            data: List of facility records, where each record might contain
+                nested 'attributes' and 'centroid' dictionaries.
+
+        Returns:
+            pd.DataFrame: Processed DataFrame with flattened data.
+        """
+        processed_records = []
+        for record in data:
+            new_record = {}
+
+            # Flatten top-level keys
+            for key, value in record.items():
+                if key not in ["attributes", "centroid"]:
+                    new_record[key] = value
+
+            # Flatten 'attributes'
+            attributes = record.get("attributes", {})
+            for attr_key, attr_value in attributes.items():
+                new_record[f"{attr_key}"] = attr_value
+
+            # Extract centroid coordinates
+            centroid = record.get("centroid", {})
+            coordinates = centroid.get("coordinates", [])
+            if coordinates and len(coordinates) == 2:
+                new_record["longitude"] = coordinates[0]
+                new_record["latitude"] = coordinates[1]
+            else:
+                new_record["longitude"] = None
+                new_record["latitude"] = None
+
+            processed_records.append(new_record)
+
+        return pd.DataFrame(processed_records)
+
+    def _convert_country(self, country: str) -> str:
+        try:
+            # First convert to ISO3 format if needed
+            country_obj = pycountry.countries.lookup(country)
+            iso3_code = country_obj.alpha_3
+
+            # Get OSM English name using OSMLocationFetcher
+            osm_data = OSMLocationFetcher.get_osm_countries(iso3_code=iso3_code)
+            osm_name_en = osm_data.get("name:en")
+
+            if not osm_name_en:
+                raise ValueError(
+                    f"Could not find OSM English name for country: {country}"
+                )
+
+            self.logger.info(
+                f"Converted country code to OSM English name: {osm_name_en}"
+            )
+
+            return osm_name_en
+
+        except LookupError:
+            raise ValueError(f"Invalid country code provided: {country}")
+        except Exception as e:
+            raise ValueError(f"Failed to get OSM English name: {e}")