giga_spatial-0.6.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. giga_spatial-0.6.0.dist-info/METADATA +141 -0
  2. giga_spatial-0.6.0.dist-info/RECORD +47 -0
  3. giga_spatial-0.6.0.dist-info/WHEEL +5 -0
  4. giga_spatial-0.6.0.dist-info/licenses/LICENSE +661 -0
  5. giga_spatial-0.6.0.dist-info/top_level.txt +1 -0
  6. gigaspatial/__init__.py +1 -0
  7. gigaspatial/config.py +226 -0
  8. gigaspatial/core/__init__.py +0 -0
  9. gigaspatial/core/io/__init__.py +5 -0
  10. gigaspatial/core/io/adls_data_store.py +325 -0
  11. gigaspatial/core/io/data_api.py +113 -0
  12. gigaspatial/core/io/data_store.py +147 -0
  13. gigaspatial/core/io/local_data_store.py +92 -0
  14. gigaspatial/core/io/readers.py +265 -0
  15. gigaspatial/core/io/writers.py +128 -0
  16. gigaspatial/core/schemas/__init__.py +0 -0
  17. gigaspatial/core/schemas/entity.py +244 -0
  18. gigaspatial/generators/__init__.py +2 -0
  19. gigaspatial/generators/poi.py +636 -0
  20. gigaspatial/generators/zonal/__init__.py +3 -0
  21. gigaspatial/generators/zonal/base.py +370 -0
  22. gigaspatial/generators/zonal/geometry.py +439 -0
  23. gigaspatial/generators/zonal/mercator.py +78 -0
  24. gigaspatial/grid/__init__.py +1 -0
  25. gigaspatial/grid/mercator_tiles.py +286 -0
  26. gigaspatial/handlers/__init__.py +40 -0
  27. gigaspatial/handlers/base.py +761 -0
  28. gigaspatial/handlers/boundaries.py +305 -0
  29. gigaspatial/handlers/ghsl.py +772 -0
  30. gigaspatial/handlers/giga.py +145 -0
  31. gigaspatial/handlers/google_open_buildings.py +472 -0
  32. gigaspatial/handlers/hdx.py +241 -0
  33. gigaspatial/handlers/mapbox_image.py +208 -0
  34. gigaspatial/handlers/maxar_image.py +291 -0
  35. gigaspatial/handlers/microsoft_global_buildings.py +548 -0
  36. gigaspatial/handlers/ookla_speedtest.py +199 -0
  37. gigaspatial/handlers/opencellid.py +290 -0
  38. gigaspatial/handlers/osm.py +356 -0
  39. gigaspatial/handlers/overture.py +126 -0
  40. gigaspatial/handlers/rwi.py +157 -0
  41. gigaspatial/handlers/unicef_georepo.py +806 -0
  42. gigaspatial/handlers/worldpop.py +266 -0
  43. gigaspatial/processing/__init__.py +4 -0
  44. gigaspatial/processing/geo.py +1054 -0
  45. gigaspatial/processing/sat_images.py +39 -0
  46. gigaspatial/processing/tif_processor.py +477 -0
  47. gigaspatial/processing/utils.py +49 -0
gigaspatial/config.py ADDED
@@ -0,0 +1,226 @@
+ from pathlib import Path
+ from pydantic import Field, field_validator
+ from pydantic_settings import BaseSettings, SettingsConfigDict
+ from typing import Optional, Union, Literal, Dict, Any
+ import io
+ from functools import lru_cache
+ import logging
+
+
+ class Config(BaseSettings):
+     """
+     Unified configuration with environment variable loading.
+     Manages file system paths for different data tiers (bronze, silver, gold)
+     and their subpaths. All paths can be overridden through environment variables.
+     """
+
+     ADLS_CONNECTION_STRING: str = Field(default="", alias="ADLS_CONNECTION_STRING")
+     ADLS_CONTAINER_NAME: str = Field(default="", alias="ADLS_CONTAINER_NAME")
+     GOOGLE_SERVICE_ACCOUNT: str = Field(default="", alias="GOOGLE_SERVICE_ACCOUNT")
+     API_PROFILE_FILE_PATH: Path = Field(
+         default=Path("profile.share"), alias="API_PROFILE_FILE_PATH"
+     )
+     API_SHARE_NAME: str = Field(default="", alias="API_SHARE_NAME")
+     API_SCHEMA_NAME: str = Field(default="", alias="API_SCHEMA_NAME")
+     MAPBOX_ACCESS_TOKEN: str = Field(default="", alias="MAPBOX_ACCESS_TOKEN")
+     MAXAR_USERNAME: str = Field(default="", alias="MAXAR_USERNAME")
+     MAXAR_PASSWORD: str = Field(default="", alias="MAXAR_PASSWORD")
+     MAXAR_CONNECTION_STRING: str = Field(default="", alias="MAXAR_CONNECTION_STRING")
+     OPENCELLID_ACCESS_TOKEN: str = Field(default="", alias="OPENCELLID_ACCESS_TOKEN")
+     GEOREPO_API_KEY: str = Field(default="", alias="GEOREPO_API_KEY")
+     GEOREPO_USER_EMAIL: str = Field(default="", alias="GEOREPO_USER_EMAIL")
+     GIGA_SCHOOL_LOCATION_API_KEY: str = Field(
+         default="", alias="GIGA_SCHOOL_LOCATION_API_KEY"
+     )
+
+     BRONZE_DATA_DIR: Path = Field(
+         default=Path("bronze"),
+         description="Root directory for raw/bronze tier data",
+         alias="BRONZE_DIR",
+     )
+     SILVER_DATA_DIR: Path = Field(
+         default=Path("silver"),
+         description="Root directory for processed/silver tier data",
+         alias="SILVER_DIR",
+     )
+     GOLD_DATA_DIR: Path = Field(
+         default=Path("gold"),
+         description="Root directory for final/gold tier data",
+         alias="GOLD_DIR",
+     )
+     VIEWS_DATA_DIR: Path = Field(
+         default=Path("views"),
+         description="Root directory for views data",
+         alias="VIEWS_DIR",
+     )
+     CACHE_DIR: Path = Field(
+         default=Path("cache"),
+         description="Directory for temporary/cache files",
+         alias="CACHE_DIR",
+     )
+     ADMIN_BOUNDARIES_DATA_DIR: Path = Field(
+         default=Path("admin_boundaries"),
+         description="Root directory for administrative boundary data",
+         alias="ADMIN_BOUNDARIES_DIR",
+     )
+
+     DATA_TYPES: Dict[str, str] = Field(
+         default={
+             "google_open_buildings": "google_open_buildings",
+             "mapbox_image": "mapbox_images",
+             "microsoft_global_buildings": "microsoft_global_buildings",
+             "ookla_speedtest": "ookla",
+             "srtm": "srtm",
+             "worldpop": "worldpop",
+             "ghsl": "ghsl",
+             "opencellid": "opencellid",
+             "hdx": "hdx",
+             "poi": "poi",
+             "zonal": "zonal",
+         },
+         description="Mapping of data types to directory names",
+     )
+
+     def get_logger(self, name="GigaSpatial", console_level=logging.INFO):
+         logger = logging.getLogger(name)
+         logger.setLevel(logging.INFO)
+
+         console_handler = logging.StreamHandler()
+         console_handler.setLevel(console_level)
+
+         LOG_FORMAT = "%(levelname) -10s %(name) -10s %(asctime) -30s: %(message)s"
+
+         formatter = logging.Formatter(LOG_FORMAT)
+         console_handler.setFormatter(formatter)
+
+         if not logger.hasHandlers():
+             logger.addHandler(console_handler)
+
+         return logger
+
+     def get_tqdm_logger_stream(self, logger: logging.Logger, level=logging.INFO):
+         return TqdmToLogger(logger, level=level)
+
+     def set_path(
+         self,
+         tier: Literal["bronze", "silver", "gold", "views"],
+         path: Union[str, Path],
+     ) -> None:
+         """Dynamically set the base path for a given tier."""
+         if tier not in ["bronze", "silver", "gold", "views"]:
+             raise ValueError(
+                 f"Invalid tier: {tier}. Must be one of 'bronze', 'silver', 'gold', or 'views'."
+             )
+
+         if isinstance(path, str):
+             path = Path(path)
+
+         setattr(self, f"{tier.upper()}_DATA_DIR", path)
+
+     def get_path(
+         self,
+         data_type: str,
+         tier: Literal["bronze", "silver", "gold", "views"],
+         version: Optional[str] = None,
+     ) -> Path:
+         """Dynamic path construction based on data type and tier."""
+         if tier not in ["bronze", "silver", "gold", "views"]:
+             raise ValueError(
+                 f"Invalid tier: {tier}. Must be one of 'bronze', 'silver', 'gold', or 'views'."
+             )
+
+         base_dir = getattr(self, f"{tier.upper()}_DATA_DIR")
+         type_dir = self.DATA_TYPES[data_type]
+         if version:
+             return base_dir / type_dir / version
+         else:
+             return base_dir / type_dir
+
+     def get_admin_path(
+         self,
+         country_code,
+         admin_level: Literal[0, 1, 2, 3, 4],
+         file_suffix: str = ".geojson",
+     ) -> Path:
+         """Dynamic path construction for administrative boundary data based on admin level."""
+         base_dir = getattr(self, "ADMIN_BOUNDARIES_DATA_DIR")
+         level_dir = f"admin{admin_level}"
+         file = f"{country_code}_{level_dir}{file_suffix}"
+
+         return base_dir / level_dir / file
+
+     model_config = SettingsConfigDict(
+         env_file=".env",
+         env_prefix="",
+         validate_assignment=True,
+         case_sensitive=True,
+         extra="allow",
+     )
+
+     @field_validator(
+         "BRONZE_DATA_DIR",
+         "SILVER_DATA_DIR",
+         "GOLD_DATA_DIR",
+         "CACHE_DIR",
+         "ADMIN_BOUNDARIES_DATA_DIR",
+         mode="before",
+     )
+     def resolve_and_validate_paths(
+         cls, value: Union[str, Path], resolve=False
+     ) -> Union[Path, Any]:
+         """Smart validator that only processes Path fields"""
+
+         if isinstance(value, str):
+             path = Path(value)
+         elif isinstance(value, Path):
+             path = value
+         else:
+             raise ValueError(f"Invalid path type: {type(value)}")
+
+         resolved = path.expanduser().resolve()
+         return resolved if resolve else path
+
+     def ensure_directories_exist(self, create: bool = False) -> None:
+         """Ensures all configured directories exist."""
+         for field_name, field_value in self.__dict__.items():
+             if isinstance(field_value, Path) and not field_value.exists():
+                 if create:
+                     field_value.mkdir(parents=True, exist_ok=True)
+                 else:
+                     raise FileNotFoundError(f"Directory does not exist: {field_value}")
+
+
+ class TqdmToLogger(io.StringIO):
+     """
+     File-like object to redirect tqdm output to a logger.
+     """
+
+     def __init__(self, logger, level=logging.INFO):
+         super().__init__()
+         self.logger = logger
+         self.level = level
+         self.buf = ""  # To store partial writes
+
+     def write(self, buf):
+         # tqdm often writes partial lines, and then a full line with \r
+         # We accumulate the buffer and only log when a full line (or significant update) is received
+         self.buf += buf
+         if "\r" in buf or "\n" in buf:  # Heuristic for a "full" update
+             self.logger.log(self.level, self.buf.strip("\r\n"))
+             self.buf = ""  # Reset buffer after logging
+
+     def flush(self):
+         # Ensure any remaining buffer is logged on flush
+         if self.buf:
+             self.logger.log(self.level, self.buf.strip("\r\n"))
+             self.buf = ""
+
+
+ @lru_cache()
+ def get_default_config() -> Config:
+     """Returns a singleton instance of Config."""
+     return Config()
+
+
+ # Singleton instance
+ config = get_default_config()
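The rest of the package consumes this module through the `config` singleton, which turns data type, tier, and admin level into concrete paths. A minimal usage sketch (not part of the package; the version string, country code, and override path below are illustrative values, not defaults):

from gigaspatial.config import config

# Tier/data-type directory, e.g. bronze/ghsl/R2023A ("R2023A" is an illustrative version)
ghsl_dir = config.get_path("ghsl", tier="bronze", version="R2023A")

# Admin-boundary file path, e.g. admin_boundaries/admin2/KEN_admin2.geojson ("KEN" is illustrative)
adm2_path = config.get_admin_path("KEN", admin_level=2)

# Point the bronze tier somewhere else at runtime; later get_path calls pick it up
config.set_path("bronze", "/data/bronze")
print(config.get_path("worldpop", tier="bronze"))  # /data/bronze/worldpop

Because `model_config` sets `env_file=".env"` and empty `env_prefix`, each of these fields can also be overridden by an environment variable matching its alias (e.g. BRONZE_DIR).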
gigaspatial/core/__init__.py ADDED
File without changes
gigaspatial/core/io/__init__.py ADDED
@@ -0,0 +1,5 @@
+ from gigaspatial.core.io.adls_data_store import ADLSDataStore
+ from gigaspatial.core.io.local_data_store import LocalDataStore
+ from gigaspatial.core.io.data_api import GigaDataAPI
+ from gigaspatial.core.io.readers import *
+ from gigaspatial.core.io.writers import *
gigaspatial/core/io/adls_data_store.py ADDED
@@ -0,0 +1,325 @@
+ from azure.storage.blob import BlobServiceClient
+ import io
+ import contextlib
+ import logging
+ import os
+ from typing import Union, Optional
+
+ from .data_store import DataStore
+ from gigaspatial.config import config
+
+ logging.getLogger("azure.core.pipeline.policies.http_logging_policy").setLevel(
+     logging.WARNING
+ )
+
+
+ class ADLSDataStore(DataStore):
+     """
+     An implementation of DataStore for Azure Data Lake Storage.
+     """
+
+     def __init__(
+         self,
+         container: str = config.ADLS_CONTAINER_NAME,
+         connection_string: str = config.ADLS_CONNECTION_STRING,
+     ):
+         """
+         Create a new instance of ADLSDataStore.
+
+         :param container: The name of the container in ADLS to interact with.
+         """
+         self.blob_service_client = BlobServiceClient.from_connection_string(
+             connection_string
+         )
+         self.container_client = self.blob_service_client.get_container_client(
+             container=container
+         )
+         self.container = container
+
+     def read_file(self, path: str, encoding: Optional[str] = None) -> Union[str, bytes]:
+         """
+         Read file with flexible encoding support.
+
+         :param path: Path to the file in blob storage
+         :param encoding: File encoding (optional)
+         :return: File contents as string or bytes
+         """
+         try:
+             blob_client = self.container_client.get_blob_client(path)
+             blob_data = blob_client.download_blob().readall()
+
+             # If no encoding specified, return raw bytes
+             if encoding is None:
+                 return blob_data
+
+             # If encoding is specified, decode the bytes
+             return blob_data.decode(encoding)
+
+         except Exception as e:
+             raise IOError(f"Error reading file {path}: {e}")
+
+     def write_file(self, path: str, data) -> None:
+         """
+         Write file with support for content type and improved type handling.
+
+         :param path: Destination path in blob storage
+         :param data: File contents
+         """
+         blob_client = self.blob_service_client.get_blob_client(
+             container=self.container, blob=path, snapshot=None
+         )
+
+         if isinstance(data, str):
+             binary_data = data.encode()
+         elif isinstance(data, bytes):
+             binary_data = data
+         else:
+             raise Exception('Unsupported data type. Only "bytes" or "string" accepted')
+
+         blob_client.upload_blob(binary_data, overwrite=True)
+
+     def upload_file(self, file_path, blob_path):
+         """Uploads a single file to Azure Blob Storage."""
+         try:
+             blob_client = self.container_client.get_blob_client(blob_path)
+             with open(file_path, "rb") as data:
+                 blob_client.upload_blob(data, overwrite=True)
+             print(f"Uploaded {file_path} to {blob_path}")
+         except Exception as e:
+             print(f"Failed to upload {file_path}: {e}")
+
+     def upload_directory(self, dir_path, blob_dir_path):
+         """Uploads all files from a directory to Azure Blob Storage."""
+         for root, dirs, files in os.walk(dir_path):
+             for file in files:
+                 local_file_path = os.path.join(root, file)
+                 relative_path = os.path.relpath(local_file_path, dir_path)
+                 blob_file_path = os.path.join(blob_dir_path, relative_path).replace(
+                     "\\", "/"
+                 )
+
+                 self.upload_file(local_file_path, blob_file_path)
+
+     def download_directory(self, blob_dir_path: str, local_dir_path: str):
+         """Downloads all files from a directory in Azure Blob Storage to a local directory."""
+         try:
+             # Ensure the local directory exists
+             os.makedirs(local_dir_path, exist_ok=True)
+
+             # List all files in the blob directory
+             blob_items = self.container_client.list_blobs(
+                 name_starts_with=blob_dir_path
+             )
+
+             for blob_item in blob_items:
+                 # Get the relative path of the blob file
+                 relative_path = os.path.relpath(blob_item.name, blob_dir_path)
+                 # Construct the local file path
+                 local_file_path = os.path.join(local_dir_path, relative_path)
+                 # Create directories if needed
+                 os.makedirs(os.path.dirname(local_file_path), exist_ok=True)
+
+                 # Download the blob to the local file
+                 blob_client = self.container_client.get_blob_client(blob_item.name)
+                 with open(local_file_path, "wb") as file:
+                     file.write(blob_client.download_blob().readall())
+
+             print(f"Downloaded directory {blob_dir_path} to {local_dir_path}")
+         except Exception as e:
+             print(f"Failed to download directory {blob_dir_path}: {e}")
+
+     def copy_directory(self, source_dir: str, destination_dir: str):
+         """
+         Copies all files from a source directory to a destination directory within the same container.
+
+         :param source_dir: The source directory path in the blob storage
+         :param destination_dir: The destination directory path in the blob storage
+         """
+         try:
+             # Ensure directory paths end with a trailing slash
+             source_dir = source_dir.rstrip("/") + "/"
+             destination_dir = destination_dir.rstrip("/") + "/"
+
+             # List all blobs in the source directory
+             source_blobs = self.container_client.list_blobs(name_starts_with=source_dir)
+
+             for blob in source_blobs:
+                 # Get the relative path of the blob
+                 relative_path = os.path.relpath(blob.name, source_dir)
+
+                 # Construct the new blob path
+                 new_blob_path = os.path.join(destination_dir, relative_path).replace(
+                     "\\", "/"
+                 )
+
+                 # Create a source blob client
+                 source_blob_client = self.container_client.get_blob_client(blob.name)
+
+                 # Create a destination blob client
+                 destination_blob_client = self.container_client.get_blob_client(
+                     new_blob_path
+                 )
+
+                 # Start the copy operation
+                 destination_blob_client.start_copy_from_url(source_blob_client.url)
+
+             print(f"Copied directory from {source_dir} to {destination_dir}")
+         except Exception as e:
+             print(f"Failed to copy directory {source_dir}: {e}")
+
+     def exists(self, path: str) -> bool:
+         blob_client = self.blob_service_client.get_blob_client(
+             container=self.container, blob=path, snapshot=None
+         )
+         return blob_client.exists()
+
+     def file_exists(self, path: str) -> bool:
+         return self.exists(path) and not self.is_dir(path)
+
+     def file_size(self, path: str) -> float:
+         blob_client = self.blob_service_client.get_blob_client(
+             container=self.container, blob=path, snapshot=None
+         )
+         properties = blob_client.get_blob_properties()
+
+         # The size is in bytes, convert it to kilobytes
+         size_in_bytes = properties.size
+         size_in_kb = size_in_bytes / 1024.0
+         return size_in_kb
+
+     def list_files(self, path: str):
+         blob_items = self.container_client.list_blobs(name_starts_with=path)
+         return [item["name"] for item in blob_items]
+
+     def walk(self, top: str):
+         top = top.rstrip("/") + "/"
+         blob_items = self.container_client.list_blobs(name_starts_with=top)
+         blobs = [item["name"] for item in blob_items]
+         for blob in blobs:
+             dirpath, filename = os.path.split(blob)
+             yield (dirpath, [], [filename])
+
+     def list_directories(self, path: str) -> list:
+         """List only directory names (not files) from a given path in ADLS."""
+         search_path = path.rstrip("/") + "/" if path else ""
+
+         blob_items = self.container_client.list_blobs(name_starts_with=search_path)
+
+         directories = set()
+
+         for blob_item in blob_items:
+             # Get the relative path from the search path
+             relative_path = blob_item.name[len(search_path):]
+
+             # Skip if it's empty (shouldn't happen but just in case)
+             if not relative_path:
+                 continue
+
+             # If there's a "/" in the relative path, it means there's a subdirectory
+             if "/" in relative_path:
+                 # Get the first directory name
+                 dir_name = relative_path.split("/")[0]
+                 directories.add(dir_name)
+
+         return sorted(list(directories))
+
+     @contextlib.contextmanager
+     def open(self, path: str, mode: str = "r"):
+         """
+         Context manager for file operations with enhanced mode support.
+
+         :param path: File path in blob storage
+         :param mode: File open mode (r, rb, w, wb)
+         """
+         if mode == "w":
+             file = io.StringIO()
+             yield file
+             self.write_file(path, file.getvalue())
+
+         elif mode == "wb":
+             file = io.BytesIO()
+             yield file
+             self.write_file(path, file.getvalue())
+
+         elif mode == "r":
+             data = self.read_file(path, encoding="UTF-8")
+             file = io.StringIO(data)
+             yield file
+
+         elif mode == "rb":
+             data = self.read_file(path)
+             file = io.BytesIO(data)
+             yield file
+
+     def get_file_metadata(self, path: str) -> dict:
+         """
+         Retrieve comprehensive file metadata.
+
+         :param path: File path in blob storage
+         :return: File metadata dictionary
+         """
+         blob_client = self.container_client.get_blob_client(path)
+         properties = blob_client.get_blob_properties()
+
+         return {
+             "name": path,
+             "size_bytes": properties.size,
+             "content_type": properties.content_settings.content_type,
+             "last_modified": properties.last_modified,
+             "etag": properties.etag,
+         }
+
+     def is_file(self, path: str) -> bool:
+         return self.file_exists(path)
+
+     def is_dir(self, path: str) -> bool:
+         dir_path = path.rstrip("/") + "/"
+
+         existing_blobs = self.list_files(dir_path)
+
+         if len(existing_blobs) > 1:
+             return True
+         elif len(existing_blobs) == 1:
+             if existing_blobs[0] != path.rstrip("/"):
+                 return True
+
+         return False
+
+     def rmdir(self, dir: str) -> None:
+         blobs = self.list_files(dir)
+         self.container_client.delete_blobs(*blobs)
+
+     def mkdir(self, path: str, exist_ok: bool = False) -> None:
+         """
+         Create a directory in Azure Blob Storage.
+
+         In ADLS, directories are conceptual and created by adding a placeholder blob.
+
+         :param path: Path of the directory to create
+         :param exist_ok: If False, raise an error if the directory already exists
+         """
+         dir_path = path.rstrip("/") + "/"
+
+         existing_blobs = list(self.list_files(dir_path))
+
+         if existing_blobs and not exist_ok:
+             raise FileExistsError(f"Directory {path} already exists")
+
+         # Create a placeholder blob to represent the directory
+         placeholder_blob_path = os.path.join(dir_path, ".placeholder")
+
+         # Only create placeholder if it doesn't already exist
+         if not self.file_exists(placeholder_blob_path):
+             placeholder_content = (
+                 b"This is a placeholder blob to represent a directory."
+             )
+             blob_client = self.blob_service_client.get_blob_client(
+                 container=self.container, blob=placeholder_blob_path
+             )
+             blob_client.upload_blob(placeholder_content, overwrite=True)
+
+     def remove(self, path: str) -> None:
+         blob_client = self.blob_service_client.get_blob_client(
+             container=self.container, blob=path, snapshot=None
+         )
+         if blob_client.exists():
+             blob_client.delete_blob()
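The store is usually driven through its file-like `open` context manager rather than the raw blob helpers. A minimal sketch (not part of the package), assuming ADLS_CONNECTION_STRING and ADLS_CONTAINER_NAME are set in the environment or `.env`; the blob paths are placeholders:

from gigaspatial.core.io.adls_data_store import ADLSDataStore

store = ADLSDataStore()  # container and connection string come from config

# Text round-trip through the context-manager interface
with store.open("bronze/example/notes.txt", "w") as f:
    f.write("hello")

with store.open("bronze/example/notes.txt", "r") as f:
    print(f.read())

# Blob-level helpers
print(store.list_files("bronze/example"))
print(store.is_dir("bronze/example"))
print(store.get_file_metadata("bronze/example/notes.txt")["size_bytes"])

Note that write modes buffer the whole payload in memory (StringIO/BytesIO) and only upload when the `with` block exits, which follows from how `open` delegates to `write_file`.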
gigaspatial/core/io/data_api.py ADDED
@@ -0,0 +1,113 @@
+ import pandas as pd
+ import delta_sharing
+ from typing import Union
+ from pathlib import Path
+
+ from gigaspatial.config import config
+
+
+ class GigaDataAPI:
+
+     def __init__(
+         self,
+         profile_file: Union[str, Path] = config.API_PROFILE_FILE_PATH,
+         share_name: str = config.API_SHARE_NAME,
+         schema_name: str = config.API_SCHEMA_NAME,
+     ):
+         """
+         Initialize the GigaDataAPI class with the profile file, share name, and schema name.
+
+         profile_file: Path to the delta-sharing profile file.
+         share_name: Name of the share (e.g., "gold").
+         schema_name: Name of the schema (e.g., "school-master").
+         """
+         self.profile_file = profile_file
+         self.share_name = share_name
+         self.schema_name = schema_name
+         self.client = delta_sharing.SharingClient(profile_file)
+
+         self._cache = {}
+
+     def get_country_list(self, sort=True):
+         """
+         Retrieve a list of available countries in the dataset.
+
+         :param sort: Whether to sort the country list alphabetically (default is True).
+         """
+         country_list = [
+             t.name for t in self.client.list_all_tables() if t.share == self.share_name
+         ]
+         if sort:
+             country_list.sort()
+         return country_list
+
+     def load_country_data(self, country, filters=None, use_cache=True):
+         """
+         Load the dataset for the specified country with optional filtering and caching.
+
+         country: The country code (e.g., "MWI").
+         filters: A dictionary with column names as keys and filter values as values.
+         use_cache: Whether to use cached data if available (default is True).
+         """
+         # Check if data is cached
+         if use_cache and country in self._cache:
+             df_country = self._cache[country]
+         else:
+             # Load data from the API
+             table_url = (
+                 f"{self.profile_file}#{self.share_name}.{self.schema_name}.{country}"
+             )
+             df_country = delta_sharing.load_as_pandas(table_url)
+             self._cache[country] = df_country  # Cache the data
+
+         # Apply filters if provided
+         if filters:
+             for column, value in filters.items():
+                 df_country = df_country[df_country[column] == value]
+
+         return df_country
+
+     def load_multiple_countries(self, countries):
+         """
+         Load data for multiple countries and combine them into a single DataFrame.
+
+         countries: A list of country codes.
+         """
+         df_list = []
+         for country in countries:
+             df_list.append(self.load_country_data(country))
+         return pd.concat(df_list, ignore_index=True)
+
+     def get_country_metadata(self, country):
+         """
+         Retrieve metadata (e.g., column names and data types) for a country's dataset.
+
+         country: The country code (e.g., "MWI").
+         """
+         df_country = self.load_country_data(country)
+         metadata = {
+             "columns": df_country.columns.tolist(),
+             "data_types": df_country.dtypes.to_dict(),
+             "num_records": len(df_country),
+         }
+         return metadata
+
+     def get_all_cached_data_as_dict(self):
+         """
+         Retrieve all cached data in a dictionary format, where each key is a country code,
+         and the value is the DataFrame of that country.
+         """
+         return self._cache if self._cache else {}
+
+     def get_all_cached_data_as_json(self):
+         """
+         Retrieve all cached data in a JSON-like format. Each country is represented as a key,
+         and the value is a list of records (i.e., the DataFrame's `to_dict(orient='records')` format).
+         """
+         if not self._cache:
+             return {}
+
+         # Convert each DataFrame in the cache to a JSON-like format (list of records)
+         return {
+             country: df.to_dict(orient="records") for country, df in self._cache.items()
+         }
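A minimal sketch of the delta-sharing flow (not part of the package), assuming a valid profile file plus API_SHARE_NAME and API_SCHEMA_NAME are configured; "MWI" mirrors the docstring example and the filter column is hypothetical:

from gigaspatial.core.io.data_api import GigaDataAPI

api = GigaDataAPI()  # profile file, share and schema come from config

countries = api.get_country_list()

# Country tables are cached per country code; filters are exact-match on columns
df_mwi = api.load_country_data("MWI", filters={"education_level": "Primary"})
print(api.get_country_metadata("MWI")["num_records"])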