giga-spatial 0.6.0 (giga_spatial-0.6.0-py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- giga_spatial-0.6.0.dist-info/METADATA +141 -0
- giga_spatial-0.6.0.dist-info/RECORD +47 -0
- giga_spatial-0.6.0.dist-info/WHEEL +5 -0
- giga_spatial-0.6.0.dist-info/licenses/LICENSE +661 -0
- giga_spatial-0.6.0.dist-info/top_level.txt +1 -0
- gigaspatial/__init__.py +1 -0
- gigaspatial/config.py +226 -0
- gigaspatial/core/__init__.py +0 -0
- gigaspatial/core/io/__init__.py +5 -0
- gigaspatial/core/io/adls_data_store.py +325 -0
- gigaspatial/core/io/data_api.py +113 -0
- gigaspatial/core/io/data_store.py +147 -0
- gigaspatial/core/io/local_data_store.py +92 -0
- gigaspatial/core/io/readers.py +265 -0
- gigaspatial/core/io/writers.py +128 -0
- gigaspatial/core/schemas/__init__.py +0 -0
- gigaspatial/core/schemas/entity.py +244 -0
- gigaspatial/generators/__init__.py +2 -0
- gigaspatial/generators/poi.py +636 -0
- gigaspatial/generators/zonal/__init__.py +3 -0
- gigaspatial/generators/zonal/base.py +370 -0
- gigaspatial/generators/zonal/geometry.py +439 -0
- gigaspatial/generators/zonal/mercator.py +78 -0
- gigaspatial/grid/__init__.py +1 -0
- gigaspatial/grid/mercator_tiles.py +286 -0
- gigaspatial/handlers/__init__.py +40 -0
- gigaspatial/handlers/base.py +761 -0
- gigaspatial/handlers/boundaries.py +305 -0
- gigaspatial/handlers/ghsl.py +772 -0
- gigaspatial/handlers/giga.py +145 -0
- gigaspatial/handlers/google_open_buildings.py +472 -0
- gigaspatial/handlers/hdx.py +241 -0
- gigaspatial/handlers/mapbox_image.py +208 -0
- gigaspatial/handlers/maxar_image.py +291 -0
- gigaspatial/handlers/microsoft_global_buildings.py +548 -0
- gigaspatial/handlers/ookla_speedtest.py +199 -0
- gigaspatial/handlers/opencellid.py +290 -0
- gigaspatial/handlers/osm.py +356 -0
- gigaspatial/handlers/overture.py +126 -0
- gigaspatial/handlers/rwi.py +157 -0
- gigaspatial/handlers/unicef_georepo.py +806 -0
- gigaspatial/handlers/worldpop.py +266 -0
- gigaspatial/processing/__init__.py +4 -0
- gigaspatial/processing/geo.py +1054 -0
- gigaspatial/processing/sat_images.py +39 -0
- gigaspatial/processing/tif_processor.py +477 -0
- gigaspatial/processing/utils.py +49 -0
gigaspatial/config.py
ADDED
@@ -0,0 +1,226 @@
from pathlib import Path
from pydantic import Field, field_validator
from pydantic_settings import BaseSettings, SettingsConfigDict
from typing import Optional, Union, Literal, Dict, Any
import io
from functools import lru_cache
import logging


class Config(BaseSettings):
    """
    Unified configuration with environment variable loading.
    Manages file system paths for different data tiers (bronze, silver, gold)
    and their subpaths. All paths can be overridden through environment variables.
    """

    ADLS_CONNECTION_STRING: str = Field(default="", alias="ADLS_CONNECTION_STRING")
    ADLS_CONTAINER_NAME: str = Field(default="", alias="ADLS_CONTAINER_NAME")
    GOOGLE_SERVICE_ACCOUNT: str = Field(default="", alias="GOOGLE_SERVICE_ACCOUNT")
    API_PROFILE_FILE_PATH: Path = Field(
        default=Path("profile.share"), alias="API_PROFILE_FILE_PATH"
    )
    API_SHARE_NAME: str = Field(default="", alias="API_SHARE_NAME")
    API_SCHEMA_NAME: str = Field(default="", alias="API_SCHEMA_NAME")
    MAPBOX_ACCESS_TOKEN: str = Field(default="", alias="MAPBOX_ACCESS_TOKEN")
    MAXAR_USERNAME: str = Field(default="", alias="MAXAR_USERNAME")
    MAXAR_PASSWORD: str = Field(default="", alias="MAXAR_PASSWORD")
    MAXAR_CONNECTION_STRING: str = Field(default="", alias="MAXAR_CONNECTION_STRING")
    OPENCELLID_ACCESS_TOKEN: str = Field(default="", alias="OPENCELLID_ACCESS_TOKEN")
    GEOREPO_API_KEY: str = Field(default="", alias="GEOREPO_API_KEY")
    GEOREPO_USER_EMAIL: str = Field(default="", alias="GEOREPO_USER_EMAIL")
    GIGA_SCHOOL_LOCATION_API_KEY: str = Field(
        default="", alias="GIGA_SCHOOL_LOCATION_API_KEY"
    )

    BRONZE_DATA_DIR: Path = Field(
        default=Path("bronze"),
        description="Root directory for raw/bronze tier data",
        alias="BRONZE_DIR",
    )
    SILVER_DATA_DIR: Path = Field(
        default=Path("silver"),
        description="Root directory for processed/silver tier data",
        alias="SILVER_DIR",
    )
    GOLD_DATA_DIR: Path = Field(
        default=Path("gold"),
        description="Root directory for final/gold tier data",
        alias="GOLD_DIR",
    )
    VIEWS_DATA_DIR: Path = Field(
        default=Path("views"),
        description="Root directory for views data",
        alias="VIEWS_DIR",
    )
    CACHE_DIR: Path = Field(
        default=Path("cache"),
        description="Directory for temporary/cache files",
        alias="CACHE_DIR",
    )
    ADMIN_BOUNDARIES_DATA_DIR: Path = Field(
        default=Path("admin_boundaries"),
        description="Root directory for administrative boundary data",
        alias="ADMIN_BOUNDARIES_DIR",
    )

    DATA_TYPES: Dict[str, str] = Field(
        default={
            "google_open_buildings": "google_open_buildings",
            "mapbox_image": "mapbox_images",
            "microsoft_global_buildings": "microsoft_global_buildings",
            "ookla_speedtest": "ookla",
            "srtm": "srtm",
            "worldpop": "worldpop",
            "ghsl": "ghsl",
            "opencellid": "opencellid",
            "hdx": "hdx",
            "poi": "poi",
            "zonal": "zonal",
        },
        description="Mapping of data types to directory names",
    )

    def get_logger(self, name="GigaSpatial", console_level=logging.INFO):
        logger = logging.getLogger(name)
        logger.setLevel(logging.INFO)

        console_handler = logging.StreamHandler()
        console_handler.setLevel(console_level)

        LOG_FORMAT = "%(levelname) -10s %(name) -10s %(asctime) " "-30s: %(message)s"

        formatter = logging.Formatter(LOG_FORMAT)
        console_handler.setFormatter(formatter)

        if not logger.hasHandlers():
            logger.addHandler(console_handler)

        return logger

    def get_tqdm_logger_stream(self, logger: logging.Logger, level=logging.INFO):
        return TqdmToLogger(logger, level=level)

    def set_path(
        self,
        tier: Literal["bronze", "silver", "gold", "views"],
        path: Union[str, Path],
    ) -> None:
        """Dynamically set the base path for a given tier."""
        if tier not in ["bronze", "silver", "gold", "views"]:
            raise ValueError(
                f"Invalid tier: {tier}. Must be one of 'bronze', 'silver', 'gold', or 'views'."
            )

        if isinstance(path, str):
            path = Path(path)

        setattr(self, f"{tier.upper()}_DATA_DIR", path)

    def get_path(
        self,
        data_type: str,
        tier: Literal["bronze", "silver", "gold", "views"],
        version: Optional[str] = None,
    ) -> Path:
        """Dynamic path construction based on data type and tier."""
        if tier not in ["bronze", "silver", "gold", "views"]:
            raise ValueError(
                f"Invalid tier: {tier}. Must be one of 'bronze', 'silver', 'gold', or 'views'."
            )

        base_dir = getattr(self, f"{tier.upper()}_DATA_DIR")
        type_dir = self.DATA_TYPES[data_type]
        if version:
            return base_dir / type_dir / version
        else:
            return base_dir / type_dir

    def get_admin_path(
        self,
        country_code,
        admin_level: Literal[0, 1, 2, 3, 4],
        file_suffix: str = ".geojson",
    ) -> Path:
        """Dynamic path construction for administrative boundary data based on admin level."""
        base_dir = getattr(self, "ADMIN_BOUNDARIES_DATA_DIR")
        level_dir = f"admin{admin_level}"
        file = f"{country_code}_{level_dir}{file_suffix}"

        return base_dir / level_dir / file

    model_config = SettingsConfigDict(
        env_file=".env",
        env_prefix="",
        validate_assignment=True,
        case_sensitive=True,
        extra="allow",
    )

    @field_validator(
        "BRONZE_DATA_DIR",
        "SILVER_DATA_DIR",
        "GOLD_DATA_DIR",
        "CACHE_DIR",
        "ADMIN_BOUNDARIES_DATA_DIR",
        mode="before",
    )
    def resolve_and_validate_paths(
        cls, value: Union[str, Path], resolve=False
    ) -> Union[Path, Any]:
        """Smart validator that only processes Path fields"""

        if isinstance(value, str):
            path = Path(value)
        elif isinstance(value, Path):
            path = value
        else:
            raise ValueError(f"Invalid path type for {field.name}: {type(value)}")

        resolved = path.expanduser().resolve()
        return resolved if resolve else path

    def ensure_directories_exist(self, create: bool = False) -> None:
        """Ensures all configured directories exist."""
        for field_name, field_value in self.__dict__.items():
            if isinstance(field_value, Path) and not field_value.exists():
                if create:
                    field_value.mkdir(parents=True, exist_ok=True)
                else:
                    raise FileNotFoundError(f"Directory does not exist: {field_value}")


class TqdmToLogger(io.StringIO):
    """
    File-like object to redirect tqdm output to a logger.
    """

    def __init__(self, logger, level=logging.INFO):
        super().__init__()
        self.logger = logger
        self.level = level
        self.buf = ""  # To store partial writes

    def write(self, buf):
        # tqdm often writes partial lines, and then a full line with \r
        # We accumulate buffer and only log when a full line (or significant update) is received
        self.buf += buf
        if "\r" in buf or "\n" in buf:  # Heuristic for a "full" update
            self.logger.log(self.level, self.buf.strip("\r\n"))
            self.buf = ""  # Reset buffer after logging

    def flush(self):
        # Ensure any remaining buffer is logged on flush
        if self.buf:
            self.logger.log(self.level, self.buf.strip("\r\n"))
            self.buf = ""


@lru_cache()
def get_default_config() -> Config:
    """Returns a singleton instance of Config."""
    return Config()


# Singleton instance
config = get_default_config()
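For orientation, a minimal usage sketch of the Config API above (illustrative only: the /data/giga path, the R2023A version string, and the KEN country code are assumptions, not values taken from the package):

from pathlib import Path

from gigaspatial.config import config  # singleton created by get_default_config()

# Point the bronze tier at a custom location, then build a dataset path.
config.set_path("bronze", Path("/data/giga/bronze"))  # /data/giga is an assumed local path
ghsl_dir = config.get_path("ghsl", tier="bronze", version="R2023A")  # version string is illustrative
print(ghsl_dir)  # /data/giga/bronze/ghsl/R2023A (on POSIX)

# Admin-boundary file path for an assumed ISO3 code.
print(config.get_admin_path("KEN", admin_level=1))  # admin_boundaries/admin1/KEN_admin1.geojson

# Logger plus a stream that redirects tqdm output into it.
logger = config.get_logger("GigaSpatial")
tqdm_stream = config.get_tqdm_logger_stream(logger)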
File without changes

gigaspatial/core/io/adls_data_store.py
ADDED
@@ -0,0 +1,325 @@
from azure.storage.blob import BlobServiceClient
import io
import contextlib
import logging
import os
from typing import Union, Optional

from .data_store import DataStore
from gigaspatial.config import config

logging.getLogger("azure.core.pipeline.policies.http_logging_policy").setLevel(
    logging.WARNING
)


class ADLSDataStore(DataStore):
    """
    An implementation of DataStore for Azure Data Lake Storage.
    """

    def __init__(
        self,
        container: str = config.ADLS_CONTAINER_NAME,
        connection_string: str = config.ADLS_CONNECTION_STRING,
    ):
        """
        Create a new instance of ADLSDataStore
        :param container: The name of the container in ADLS to interact with.
        """
        self.blob_service_client = BlobServiceClient.from_connection_string(
            connection_string
        )
        self.container_client = self.blob_service_client.get_container_client(
            container=container
        )
        self.container = container

    def read_file(self, path: str, encoding: Optional[str] = None) -> Union[str, bytes]:
        """
        Read file with flexible encoding support.

        :param path: Path to the file in blob storage
        :param encoding: File encoding (optional)
        :return: File contents as string or bytes
        """
        try:
            blob_client = self.container_client.get_blob_client(path)
            blob_data = blob_client.download_blob().readall()

            # If no encoding specified, return raw bytes
            if encoding is None:
                return blob_data

            # If encoding is specified, decode the bytes
            return blob_data.decode(encoding)

        except Exception as e:
            raise IOError(f"Error reading file {path}: {e}")

    def write_file(self, path: str, data) -> None:
        """
        Write file with support for content type and improved type handling.

        :param path: Destination path in blob storage
        :param data: File contents
        """
        blob_client = self.blob_service_client.get_blob_client(
            container=self.container, blob=path, snapshot=None
        )

        if isinstance(data, str):
            binary_data = data.encode()
        elif isinstance(data, bytes):
            binary_data = data
        else:
            raise Exception(f'Unsupported data type. Only "bytes" or "string" accepted')

        blob_client.upload_blob(binary_data, overwrite=True)

    def upload_file(self, file_path, blob_path):
        """Uploads a single file to Azure Blob Storage."""
        try:
            blob_client = self.container_client.get_blob_client(blob_path)
            with open(file_path, "rb") as data:
                blob_client.upload_blob(data, overwrite=True)
            print(f"Uploaded {file_path} to {blob_path}")
        except Exception as e:
            print(f"Failed to upload {file_path}: {e}")

    def upload_directory(self, dir_path, blob_dir_path):
        """Uploads all files from a directory to Azure Blob Storage."""
        for root, dirs, files in os.walk(dir_path):
            for file in files:
                local_file_path = os.path.join(root, file)
                relative_path = os.path.relpath(local_file_path, dir_path)
                blob_file_path = os.path.join(blob_dir_path, relative_path).replace(
                    "\\", "/"
                )

                self.upload_file(local_file_path, blob_file_path)

    def download_directory(self, blob_dir_path: str, local_dir_path: str):
        """Downloads all files from a directory in Azure Blob Storage to a local directory."""
        try:
            # Ensure the local directory exists
            os.makedirs(local_dir_path, exist_ok=True)

            # List all files in the blob directory
            blob_items = self.container_client.list_blobs(
                name_starts_with=blob_dir_path
            )

            for blob_item in blob_items:
                # Get the relative path of the blob file
                relative_path = os.path.relpath(blob_item.name, blob_dir_path)
                # Construct the local file path
                local_file_path = os.path.join(local_dir_path, relative_path)
                # Create directories if needed
                os.makedirs(os.path.dirname(local_file_path), exist_ok=True)

                # Download the blob to the local file
                blob_client = self.container_client.get_blob_client(blob_item.name)
                with open(local_file_path, "wb") as file:
                    file.write(blob_client.download_blob().readall())

            print(f"Downloaded directory {blob_dir_path} to {local_dir_path}")
        except Exception as e:
            print(f"Failed to download directory {blob_dir_path}: {e}")

    def copy_directory(self, source_dir: str, destination_dir: str):
        """
        Copies all files from a source directory to a destination directory within the same container.

        :param source_dir: The source directory path in the blob storage
        :param destination_dir: The destination directory path in the blob storage
        """
        try:
            # Ensure source directory path ends with a trailing slash
            source_dir = source_dir.rstrip("/") + "/"
            destination_dir = destination_dir.rstrip("/") + "/"

            # List all blobs in the source directory
            source_blobs = self.container_client.list_blobs(name_starts_with=source_dir)

            for blob in source_blobs:
                # Get the relative path of the blob
                relative_path = os.path.relpath(blob.name, source_dir)

                # Construct the new blob path
                new_blob_path = os.path.join(destination_dir, relative_path).replace(
                    "\\", "/"
                )

                # Create a source blob client
                source_blob_client = self.container_client.get_blob_client(blob.name)

                # Create a destination blob client
                destination_blob_client = self.container_client.get_blob_client(
                    new_blob_path
                )

                # Start the copy operation
                destination_blob_client.start_copy_from_url(source_blob_client.url)

            print(f"Copied directory from {source_dir} to {destination_dir}")
        except Exception as e:
            print(f"Failed to copy directory {source_dir}: {e}")

    def exists(self, path: str) -> bool:
        blob_client = self.blob_service_client.get_blob_client(
            container=self.container, blob=path, snapshot=None
        )
        return blob_client.exists()

    def file_exists(self, path: str) -> bool:
        return self.exists(path) and not self.is_dir(path)

    def file_size(self, path: str) -> float:
        blob_client = self.blob_service_client.get_blob_client(
            container=self.container, blob=path, snapshot=None
        )
        properties = blob_client.get_blob_properties()

        # The size is in bytes, convert it to kilobytes
        size_in_bytes = properties.size
        size_in_kb = size_in_bytes / 1024.0
        return size_in_kb

    def list_files(self, path: str):
        blob_items = self.container_client.list_blobs(name_starts_with=path)
        return [item["name"] for item in blob_items]

    def walk(self, top: str):
        top = top.rstrip("/") + "/"
        blob_items = self.container_client.list_blobs(name_starts_with=top)
        blobs = [item["name"] for item in blob_items]
        for blob in blobs:
            dirpath, filename = os.path.split(blob)
            yield (dirpath, [], [filename])

    def list_directories(self, path: str) -> list:
        """List only directory names (not files) from a given path in ADLS."""
        search_path = path.rstrip("/") + "/" if path else ""

        blob_items = self.container_client.list_blobs(name_starts_with=search_path)

        directories = set()

        for blob_item in blob_items:
            # Get the relative path from the search path
            relative_path = blob_item.name[len(search_path) :]

            # Skip if it's empty (shouldn't happen but just in case)
            if not relative_path:
                continue

            # If there's a "/" in the relative path, it means there's a subdirectory
            if "/" in relative_path:
                # Get the first directory name
                dir_name = relative_path.split("/")[0]
                directories.add(dir_name)

        return sorted(list(directories))

    @contextlib.contextmanager
    def open(self, path: str, mode: str = "r"):
        """
        Context manager for file operations with enhanced mode support.

        :param path: File path in blob storage
        :param mode: File open mode (r, rb, w, wb)
        """
        if mode == "w":
            file = io.StringIO()
            yield file
            self.write_file(path, file.getvalue())

        elif mode == "wb":
            file = io.BytesIO()
            yield file
            self.write_file(path, file.getvalue())

        elif mode == "r":
            data = self.read_file(path, encoding="UTF-8")
            file = io.StringIO(data)
            yield file

        elif mode == "rb":
            data = self.read_file(path)
            file = io.BytesIO(data)
            yield file

    def get_file_metadata(self, path: str) -> dict:
        """
        Retrieve comprehensive file metadata.

        :param path: File path in blob storage
        :return: File metadata dictionary
        """
        blob_client = self.container_client.get_blob_client(path)
        properties = blob_client.get_blob_properties()

        return {
            "name": path,
            "size_bytes": properties.size,
            "content_type": properties.content_settings.content_type,
            "last_modified": properties.last_modified,
            "etag": properties.etag,
        }

    def is_file(self, path: str) -> bool:
        return self.file_exists(path)

    def is_dir(self, path: str) -> bool:
        dir_path = path.rstrip("/") + "/"

        existing_blobs = self.list_files(dir_path)

        if len(existing_blobs) > 1:
            return True
        elif len(existing_blobs) == 1:
            if existing_blobs[0] != path.rstrip("/"):
                return True

        return False

    def rmdir(self, dir: str) -> None:
        blobs = self.list_files(dir)
        self.container_client.delete_blobs(*blobs)

    def mkdir(self, path: str, exist_ok: bool = False) -> None:
        """
        Create a directory in Azure Blob Storage.

        In ADLS, directories are conceptual and created by adding a placeholder blob.

        :param path: Path of the directory to create
        :param exist_ok: If False, raise an error if the directory already exists
        """
        dir_path = path.rstrip("/") + "/"

        existing_blobs = list(self.list_files(dir_path))

        if existing_blobs and not exist_ok:
            raise FileExistsError(f"Directory {path} already exists")

        # Create a placeholder blob to represent the directory
        placeholder_blob_path = os.path.join(dir_path, ".placeholder")

        # Only create placeholder if it doesn't already exist
        if not self.file_exists(placeholder_blob_path):
            placeholder_content = (
                b"This is a placeholder blob to represent a directory."
            )
            blob_client = self.blob_service_client.get_blob_client(
                container=self.container, blob=placeholder_blob_path
            )
            blob_client.upload_blob(placeholder_content, overwrite=True)

    def remove(self, path: str) -> None:
        blob_client = self.blob_service_client.get_blob_client(
            container=self.container, blob=path, snapshot=None
        )
        if blob_client.exists():
            blob_client.delete_blob()
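A minimal sketch of how ADLSDataStore might be used, assuming ADLS_CONNECTION_STRING and ADLS_CONTAINER_NAME resolve to a real container; the blob paths are illustrative, not taken from the package:

from gigaspatial.core.io.adls_data_store import ADLSDataStore

# Defaults come from config.ADLS_CONTAINER_NAME / config.ADLS_CONNECTION_STRING.
store = ADLSDataStore()

# Text round-trip through the context-manager interface.
with store.open("bronze/example/notes.txt", "w") as f:  # path is illustrative
    f.write("hello from gigaspatial")

with store.open("bronze/example/notes.txt", "r") as f:
    print(f.read())

# Listing and metadata helpers.
print(store.list_files("bronze/example"))
print(store.get_file_metadata("bronze/example/notes.txt")["size_bytes"])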
gigaspatial/core/io/data_api.py
ADDED
@@ -0,0 +1,113 @@
import pandas as pd
import delta_sharing
from typing import Union
from pathlib import Path

from gigaspatial.config import config


class GigaDataAPI:

    def __init__(
        self,
        profile_file: Union[str, Path] = config.API_PROFILE_FILE_PATH,
        share_name: str = config.API_SHARE_NAME,
        schema_name: str = config.API_SCHEMA_NAME,
    ):
        """
        Initialize the GigaDataAPI class with the profile file, share name, and schema name.

        profile_file: Path to the delta-sharing profile file.
        share_name: Name of the share (e.g., "gold").
        schema_name: Name of the schema (e.g., "school-master").
        """
        self.profile_file = profile_file
        self.share_name = share_name
        self.schema_name = schema_name
        self.client = delta_sharing.SharingClient(profile_file)

        self._cache = {}

    def get_country_list(self, sort=True):
        """
        Retrieve a list of available countries in the dataset.

        :param sort: Whether to sort the country list alphabetically (default is True).
        """
        country_list = [
            t.name for t in self.client.list_all_tables() if t.share == self.share_name
        ]
        if sort:
            country_list.sort()
        return country_list

    def load_country_data(self, country, filters=None, use_cache=True):
        """
        Load the dataset for the specified country with optional filtering and caching.

        country: The country code (e.g., "MWI").
        filters: A dictionary with column names as keys and filter values as values.
        use_cache: Whether to use cached data if available (default is True).
        """
        # Check if data is cached
        if use_cache and country in self._cache:
            df_country = self._cache[country]
        else:
            # Load data from the API
            table_url = (
                f"{self.profile_file}#{self.share_name}.{self.schema_name}.{country}"
            )
            df_country = delta_sharing.load_as_pandas(table_url)
            self._cache[country] = df_country  # Cache the data

        # Apply filters if provided
        if filters:
            for column, value in filters.items():
                df_country = df_country[df_country[column] == value]

        return df_country

    def load_multiple_countries(self, countries):
        """
        Load data for multiple countries and combine them into a single DataFrame.

        countries: A list of country codes.
        """
        df_list = []
        for country in countries:
            df_list.append(self.load_country_data(country))
        return pd.concat(df_list, ignore_index=True)

    def get_country_metadata(self, country):
        """
        Retrieve metadata (e.g., column names and data types) for a country's dataset.

        country: The country code (e.g., "MWI").
        """
        df_country = self.load_country_data(country)
        metadata = {
            "columns": df_country.columns.tolist(),
            "data_types": df_country.dtypes.to_dict(),
            "num_records": len(df_country),
        }
        return metadata

    def get_all_cached_data_as_dict(self):
        """
        Retrieve all cached data in a dictionary format, where each key is a country code,
        and the value is the DataFrame of that country.
        """
        return self._cache if self._cache else {}

    def get_all_cached_data_as_json(self):
        """
        Retrieve all cached data in a JSON-like format. Each country is represented as a key,
        and the value is a list of records (i.e., the DataFrame's `to_dict(orient='records')` format).
        """
        if not self._cache:
            return {}

        # Convert each DataFrame in the cache to a JSON-like format (list of records)
        return {
            country: df.to_dict(orient="records") for country, df in self._cache.items()
        }
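A minimal sketch of how GigaDataAPI might be called, assuming the delta-sharing profile, share, and schema are configured through the Config fields above; the filter column and the extra country code are assumptions, only "MWI" appears in the package's own docstrings:

from gigaspatial.core.io.data_api import GigaDataAPI

# Defaults come from config.API_PROFILE_FILE_PATH / API_SHARE_NAME / API_SCHEMA_NAME.
api = GigaDataAPI()

# Tables in the share are exposed as country codes.
print(api.get_country_list())

# Load one country; "admin1" is an assumed column name in the shared table.
df = api.load_country_data("MWI", filters={"admin1": "Lilongwe"})
print(api.get_country_metadata("MWI")["num_records"])

# Combine several countries into one DataFrame (cached tables are reused).
combined = api.load_multiple_countries(["MWI", "KEN"])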
|